3 # Copyright (C) 2013, 2014 Intra2net AG
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU Lesser General Public License as published
7 # by the Free Software Foundation; either version 3 of the License, or
8 # (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU Lesser General Public License for more details.
# You should have received a copy of the GNU Lesser General Public License
16 # along with this program. If not, see
17 # <http://www.gnu.org/licenses/lgpl-3.0.html>
19 DELTATAR_HEADER_VERSION = 1
20 DELTATAR_PARAMETER_VERSION = 1
34 from functools import partial
39 class NullHandler(logging.Handler):
40 def emit(self, record):
44 logging.getLogger("deltatar.DeltaTar").addHandler(NullHandler())
52 # encryption direction
53 CRYPTO_MODE_ENCRYPT = 0
54 CRYPTO_MODE_DECRYPT = 1
56 # The canonical extension for encrypted backup files regardless of the actual
57 # encryption parameters is “.pdtcrypt”. This is analogous to the encryption
58 # header which starts with the eight ASCII bytes “PDTCRYPT”. Historical note:
59 # Since the introduction of the versioned header there no longer any need
60 # for encoding encryption parameters in the file extensions (“.aes128” and
62 PDTCRYPT_EXTENSION = "pdtcrypt"
66 AUXILIARY_FILE_INDEX = 0
67 AUXILIARY_FILE_INFO = 1
69 class DeltaTar(object):
71 Backup class used to create backups
74 # list of files to exclude in the backup creation or restore operation. It
75 # can contain python regular expressions.
78 # list of files to include in the backup creation or restore operation. It
79 # can contain python regular expressions. If empty, all files in the source
80 # path will be backed up (when creating a backup) or all the files in the
81 # backup will be restored (when restoring a backup), but if included_files
82 # is set then only the files include in the list will be processed.
85 # custom filter of files to be backed up (or restored). Unused and unset
86 # by default. The function receives a file path and must return a boolean.
89 # mode in which the delta will be created (when creating a backup) or
90 # opened (when restoring). Accepts modes analog to the tarfile library.
93 # used together with aes modes to encrypt and decrypt backups.
98 # parameter version to use when encrypting; note that this has no effect
99 # on decryption since the required settings are determined from the headers
100 crypto_version = DELTATAR_HEADER_VERSION
101 crypto_paramversion = None
103 # when encrypting or decrypting, these hold crypto handlers; created before
104 # establishing the Tarfile stream iff a password is supplied.
108 # python logger object.
111 # specifies the index mode in the same format as @param mode, but without
112 # the ':', '|' or '#' at the begining. It doesn't make sense to specify
113 # that the index is encrypted if no password is given in the constructor.
116 # current time for this backup. Used for file names and file creation checks
119 # extra data to included in the header of the index file when creating a
123 # valid tarfile modes and their corresponding default file extension
124 __file_extensions_dict = {
133 '#gz.pdtcrypt': '.gz',
138 # valid index modes and their corresponding default file extension
139 __index_extensions_dict = {
143 'gz.pdtcrypt': '.gz',
147 # valid path prefixes
148 __path_prefix_list = [
154 def __init__(self, excluded_files=[], included_files=[],
155 filter_func=None, mode="", password=None,
156 crypto_key=None, nacl=None,
157 crypto_version=DELTATAR_HEADER_VERSION,
158 crypto_paramversion=DELTATAR_PARAMETER_VERSION,
159 logger=None, index_mode=None, index_name_func=None,
160 volume_name_func=None):
162 Constructor. Configures the diff engine.
165 - excluded_files: list of files to exclude in the backup creation or
166 restore operation. It can contain python regular expressions.
168 - included_files: list of files to include in the backup creation or
169 restore operation. It can contain python regular expressions. If
170 empty, all files in the source path will be backed up (when creating a
171 backup) or all the files in the backup will be restored (when
172 restoring a backup), but if included_files is set then only the files
173 include in the list will be processed.
175 - filter_func: custom filter of files to be backed up (or restored).
176 Unused and unset by default. The function receives a file path and
177 must return a boolean.
179 - mode: mode in which the delta will be created (when creating a backup)
180 or opened (when restoring). Accepts the same modes as the tarfile
181 library. Valid modes are:
184 ':' open uncompressed
185 ':gz' open with gzip compression
186 ':bz2' open with bzip2 compression
187 '|' open an uncompressed stream of tar blocks
188 '|gz' open a gzip compressed stream of tar blocks
189 '|bz2' open a bzip2 compressed stream of tar blocks
190 '#gz' open a stream of gzip compressed tar blocks
192 - crypto_key: used to encrypt and decrypt backups. Encryption will
193 be enabled automatically if a key is supplied. Requires a salt to be
196 - nacl: salt that was used to derive the encryption key for embedding
197 in the PDTCRYPT header. Not needed when decrypting and when
198 encrypting with password.
200 - password: used to encrypt and decrypt backups. Encryption will be
201 enabled automatically if a password is supplied.
203 - crypto_version: version of the format, determining the kind of PDT
206 - crypto_paramversion: optionally request encryption conforming to
207 a specific parameter version. Defaults to the standard PDT value
208 which as of 2017 is the only one available.
210 - logger: python logger object. Optional.
212 - index_mode: specifies the index mode in the same format as @param
213 mode, but without the ':', '|' or '#' at the begining. If encryption
214 is requested it will extend to the auxiliary (index, info) files as
215 well. This is an optional parameter that will automatically mimic
216 @param mode by default if not provided. Valid modes are:
219 'gz' open with gzip compression
220 'bz2' open with bzip2 compression
222 - index_name_func: function that sets a custom name for the index file.
223 This function receives a flag to indicate whether the name will be
224 used for a full or diff backup. The backup path will be prepended to
227 - volume_name_func: function that defines the name of tar volumes. It
228 receives the backup_path, if it's a full backup and the volume number,
229 and must return the name for the corresponding volume name. Optional,
230 DeltaTar has default names for tar volumes.
233 if mode not in self.__file_extensions_dict:
234 raise Exception('Unrecognized extension mode=[%s] requested for files'
237 self.excluded_files = excluded_files
238 self.included_files = included_files
239 self.filter_func = filter_func
240 self.logger = logging.getLogger('deltatar.DeltaTar')
242 self.logger.addHandler(logger)
245 if crypto_key is not None:
246 self.crypto_key = crypto_key
247 self.nacl = nacl # encryption only
249 if password is not None:
250 self.password = password
252 if crypto_version is not None:
253 self.crypto_version = crypto_version
255 if crypto_paramversion is not None:
256 self.crypto_paramversion = crypto_paramversion
258 # generate index_mode
259 if index_mode is None:
265 elif mode not in self.__index_extensions_dict:
266 raise Exception('Unrecognized extension mode=[%s] requested for index'
269 self.index_mode = index_mode
270 self.current_time = datetime.datetime.now()
272 if index_name_func is not None:
273 self.index_name_func = index_name_func
275 if volume_name_func is not None:
276 self.volume_name_func = volume_name_func
278 def pick_extension(self, kind, mode=None):
280 Choose the extension depending on a) the kind of file given, b) the
281 processing mode, and c) the current encryption settings.
284 if kind == PDT_TYPE_ARCHIVE:
287 mode = self.__index_extensions_dict [self.index_mode]
289 if self.crypto_key is not None or self.password is not None:
290 ret += "." + PDTCRYPT_EXTENSION
293 def index_name_func(self, is_full): # pylint: disable=method-hidden
295 Callback for setting a custom name for the index file. Depending on
296 whether *is_full* is set, it will create a suitable name for a full
299 prefix = "bfull" if is_full else "bdiff"
300 date_str = self.current_time.strftime("%Y-%m-%d-%H%M")
301 extension = self.pick_extension \
303 self.__index_extensions_dict [self.index_mode])
305 return "%s-%s.index%s" % (prefix, date_str, extension)
307 def volume_name_func(self, backup_path, # pylint: disable=method-hidden
308 is_full, volume_number,
311 function that defines the name of tar volumes. It receives the
312 backup_path, if it's a full backup and the volume number, and must return
313 the name for the corresponding volume name. Optional, DeltaTar has default
314 names for tar volumes.
316 If guess_name is activated, the file is intended not to be created but
317 to be found, and thus the date will be guessed.
319 prefix = "bfull" if is_full else "bdiff"
320 extension = self.pick_extension \
322 self.__file_extensions_dict [self.mode])
325 date_str = self.current_time.strftime("%Y-%m-%d-%H%M")
326 return "%s-%s-%03d%s" % (prefix, date_str, volume_number + 1, extension)
328 prefix = prefix + "-"
329 postfix = "-%03d%s" % (volume_number + 1, extension)
330 for f in os.listdir(backup_path):
331 if f.startswith(prefix) and f.endswith(postfix):
333 raise Exception("volume not found")
336 def filter_path(self, path, source_path="", is_dir=None):
338 Filters a path, given the source_path, using the filtering properties
339 set in the constructor.
340 The filtering order is:
341 1. included_files (if any)
343 3. filter_func (which must return whether the file is accepted or not)
346 if len(source_path) > 0:
347 # ensure that exactly one '/' at end of dir is also removed
348 source_path = source_path.rstrip(os.sep) + os.sep
349 path = path[len(source_path):]
351 # 1. filter included_files
353 if len(self.included_files) > 0:
355 for i in self.included_files:
356 # it can be either a regexp or a string
357 if isinstance(i, str):
358 # if the string matches, then continue
363 # if the string ends with / it's a directory, and if the
364 # path is contained in it, it is included
365 if i.endswith('/') and path.startswith(i):
369 # if the string doesn't end with /, add it and do the same
371 elif path.startswith(i + '/'):
375 # check for PARENT_MATCH
378 if not dir_path.endswith('/'):
381 if i.startswith(dir_path):
384 # if it's a reg exp, then we just check if it matches
385 elif isinstance(i, typing.Pattern):
390 self.logger.warning('Invalid pattern in included_files: %s' % str(i))
392 if match == NO_MATCH:
395 # when a directory is in PARENT_MATCH, it doesn't matter if it's
396 # excluded. It's subfiles will be excluded, but the directory itself
398 if match != PARENT_MATCH:
399 for e in self.excluded_files:
400 # it can be either a regexp or a string
401 if isinstance(e, str):
402 # if the string matches, then exclude
406 # if the string ends with / it's a directory, and if the
407 # path starts with the directory, then exclude
408 if e.endswith('/') and path.startswith(e):
411 # if the string doesn't end with /, do the same check with
413 elif path.startswith(e + '/'):
416 # if it's a reg exp, then we just check if it matches
417 elif isinstance(e, typing.Pattern):
421 self.logger.warning('Invalid pattern in excluded_files: %s' % str(e))
424 return self.filter_func(path)
428 def _recursive_walk_dir(self, source_path, keep_base_dir=False):
430 Walk a directory recursively, yielding each file/directory
432 Returns the path of an entity. If ``keep_base_dir`` is set,
433 the path returned contains the prefix ``source_path``; otherwise it is
434 relative to the prefix.
437 source_path = source_path.rstrip(os.sep)
442 beginning_size = len(source_path) + 1 # +1 for os.sep
444 queue = [source_path]
447 cur_path = queue.pop(0)
450 dfd = os.open (cur_path, os.O_DIRECTORY)
451 except FileNotFoundError as exn:
452 self.logger.warning ("failed to open entity [%s] as directory; "
453 "file system (error: %s); skipping"
454 % (cur_path, str (exn)))
458 for filename in sorted(os.listdir(dfd)):
459 child = os.path.join(cur_path, filename)
460 is_dir = os.path.isdir(child)
461 status = self.filter_path(child, source_path, is_dir)
462 if status == NO_MATCH:
464 if not os.access(child, os.R_OK):
465 self.logger.warning('Error accessing possibly locked file %s' % child)
469 yield child[beginning_size:]
471 if is_dir and (status == MATCH or status == PARENT_MATCH):
476 def _stat_dict(self, path):
478 Returns a dict with the stat data used to compare files
480 stinfo = os.stat(path)
481 mode = stinfo.st_mode
484 if stat.S_ISDIR(mode):
486 elif stat.S_ISREG(mode):
488 elif stat.S_ISLNK(mode):
495 u'mtime': int(stinfo.st_mtime),
496 u'ctime': int(stinfo.st_ctime),
497 u'uid': stinfo.st_uid,
498 u'gid': stinfo.st_gid,
499 u'inode': stinfo.st_ino,
500 u'size': stinfo.st_size
503 def _equal_stat_dicts(self, d1, d2, listsnapshot_equal=False):
505 Return if the dicts are equal in the stat keys
507 keys = [u'type', u'mode',u'size', u'mtime',
508 # not restored: u'inode', u'ctime'
511 # only if user is root, then also check gid/uid. otherwise do not check it,
512 # because tarfile can chown in case of being superuser only
514 # also, skip the check in rpmbuild since the sources end up with the
515 # uid:gid of the packager while the extracted files are 0:0.
516 if hasattr(os, "geteuid") and os.geteuid() == 0 \
517 and os.getenv ("RPMBUILD_OPTIONS") is None:
521 if (not d1 and d2 != None) or (d1 != None and not d2):
524 if self.prefixed(d1.get('path', -1), listsnapshot_equal) != self.prefixed(d2.get('path', -2), listsnapshot_equal):
527 type = d1.get('type', '')
530 # size doesn't matter for directories
531 if type == 'directory' and key == 'size':
533 if d1.get(key, -1) != d2.get(key, -2):
537 def prefixed(self, path, listsnapshot_equal=False):
539 if a path is not prefixed, return it prefixed
541 for prefix in self.__path_prefix_list:
542 if path.startswith(prefix):
543 if listsnapshot_equal and prefix == u'list://':
544 return u'snapshot://' + path[len(prefix):]
546 return u'snapshot://' + path
548 def unprefixed(self, path):
550 remove a path prefix if any
552 for prefix in self.__path_prefix_list:
553 if path.startswith(prefix):
554 return path[len(prefix):]
558 def initialize_encryption (self, mode, strict_validation=True):
560 :type strict_validation: bool
561 :param strict_validation: Enable strict IV checking in the crypto
562 layer. Should be disabled when dealing with
563 potentially corrupted data.
565 password = self.password
566 key = self.crypto_key
569 if key is None and password is None:
571 if mode == CRYPTO_MODE_ENCRYPT:
572 return crypto.Encrypt (password=password,
575 version=self.crypto_version,
576 paramversion=self.crypto_paramversion)
577 if mode == CRYPTO_MODE_DECRYPT:
578 return crypto.Decrypt (password=password, key=key,
579 strict_ivs=strict_validation)
581 raise Exception ("invalid encryption mode [%r]" % mode)
584 def open_auxiliary_file(self, path, mode='r', kind=AUXILIARY_FILE_INDEX,
585 strict_validation=True):
587 Given the specified configuration, opens a file for reading or writing,
588 inheriting the encryption and compression settings from the backup.
589 Returns a file object ready to use.
591 :param mode: IO mode (read or write, ``"r"`` and ``"w"``,
594 :param kind: Role of the file, see AUXILIARY_FILE_* constants.
595 Both the info and the auxiliary file have a globally
596 unique, constant counter value.
599 if self.index_mode.startswith('gz'):
601 elif self.index_mode.startswith('bz2'):
609 crypto_ctx = self.initialize_encryption (CRYPTO_MODE_ENCRYPT)
611 crypto_ctx = self.initialize_encryption (CRYPTO_MODE_DECRYPT,
612 strict_validation=strict_validation)
614 if crypto_ctx is not None:
615 if kind == AUXILIARY_FILE_INFO:
616 enccounter = crypto.AES_GCM_IV_CNT_INFOFILE
617 elif kind == AUXILIARY_FILE_INDEX:
618 enccounter = crypto.AES_GCM_IV_CNT_INDEX
620 raise Exception ("invalid kind of aux file %r" % kind)
622 sink = tarfile._Stream(name=path, mode=mode, comptype=comptype,
623 bufsize=tarfile.RECORDSIZE, fileobj=None,
624 encryption=crypto_ctx, enccounter=enccounter)
629 def create_full_backup(self, source_path, backup_path,
630 max_volume_size=None, extra_data=dict()):
632 Creates a full backup.
635 - source_path: source path to the directory to back up.
636 - backup_path: path where the back up will be stored. Backup path will
637 be created if not existent.
638 - max_volume_size: maximum volume size in megabytes. Used to split the
639 backup in volumes. Optional (won't split in volumes by default).
640 - extra_data: a json-serializable dictionary with information that you
641 want to be included in the header of the index file
644 if not isinstance(source_path, str):
645 raise Exception('Source path must be a string')
647 if not isinstance(backup_path, str):
648 raise Exception('Backup path must be a string')
650 if not os.path.exists(source_path) or not os.path.isdir(source_path):
651 raise Exception('Source path "%s" does not exist or is not a '\
652 'directory' % source_path)
654 if max_volume_size != None and (not isinstance(max_volume_size, int) or\
655 max_volume_size < 1):
656 raise Exception('max_volume_size must be a positive integer')
657 if max_volume_size != None:
658 max_volume_size = max_volume_size*1024*1024
660 if not isinstance(extra_data, dict):
661 raise Exception('extra_data must be a dictionary')
664 extra_data_str = json.dumps(extra_data)
666 raise Exception('extra_data is not json-serializable')
668 if not os.access(source_path, os.R_OK):
669 raise Exception('Source path "%s" is not readable' % source_path)
671 # try to create backup path if needed
672 os.makedirs(backup_path, exist_ok=True)
674 if not os.access(backup_path, os.W_OK):
675 raise Exception('Backup path "%s" is not writeable' % backup_path)
677 if source_path.endswith('/'):
678 source_path = source_path[:-1]
680 if backup_path.endswith('/'):
681 backup_path = backup_path[:-1]
683 # update current time
684 self.current_time = datetime.datetime.now()
686 if self.mode not in self.__file_extensions_dict:
687 raise Exception('Unrecognized extension')
689 # setup for encrypting payload
690 if self.encryptor is None:
691 self.encryptor = self.initialize_encryption (CRYPTO_MODE_ENCRYPT)
693 # some initialization
696 # generate the first volume name
697 vol_name = self.volume_name_func(backup_path, True, 0)
698 tarfile_path = os.path.join(backup_path, vol_name)
701 index_name = self.index_name_func(True)
702 index_path = os.path.join(backup_path, index_name)
703 index_sink = self.open_auxiliary_file(index_path, 'w')
707 def new_volume_handler(deltarobj, cwd, backup_path, encryption, tarobj, base_name, volume_number):
709 Handles the new volumes
711 volume_name = deltarobj.volume_name_func(backup_path, True, volume_number)
712 volume_path = os.path.join(backup_path, volume_name)
713 deltarobj.vol_no = volume_number
715 # we convert relative paths into absolute because CWD is changed
716 if not os.path.isabs(volume_path):
717 volume_path = os.path.join(cwd, volume_path)
719 if tarobj.fileobj is not None:
720 tarobj.fileobj.close()
722 deltarobj.logger.debug("opening volume %s" % volume_path)
724 tarobj.open_volume(volume_path, encryption=encryption)
726 # wraps some args from context into the handler
727 new_volume_handler = partial(new_volume_handler, self, cwd, backup_path, self.encryptor)
729 index_sink.write(bytes('{"type": "python-delta-tar-index", "version": 1, "backup-type": "full", "extra_data": %s}\n' % extra_data_str, 'UTF-8'))
731 s = bytes('{"type": "BEGIN-FILE-LIST"}\n', 'UTF-8')
732 # calculate checksum and write into the stream
733 crc = binascii.crc32(s) & 0xFFFFffff
736 # start creating the tarfile
737 tarobj = tarfile.TarFile.open(tarfile_path,
738 mode='w' + self.mode,
739 format=tarfile.GNU_FORMAT,
740 concat='#' in self.mode,
741 encryption=self.encryptor,
742 max_volume_size=max_volume_size,
743 new_volume_handler=new_volume_handler,
744 save_to_members=False,
746 os.chdir(source_path)
748 # for each file to be in the backup, do:
749 for path in self._recursive_walk_dir('.'):
752 # calculate stat dict for current file
753 statd = self._stat_dict(path)
754 statd['path'] = u'snapshot://' + statd['path']
755 statd['volume'] = self.vol_no
758 tarobj.add(path, arcname = statd['path'], recursive=False)
759 except FileNotFoundError as exn:
760 # file vanished since the call to access(3) above
761 self.logger.warning ("object [%s] no longer available in "
762 "file system (error: %s); skipping"
764 continue # prevent indexing
766 # retrieve file offset
767 statd['offset'] = tarobj.get_last_member_offset()
768 self.logger.debug("backup %s" % statd['path'])
770 # store the stat dict in the index
771 s = bytes(json.dumps(statd) + '\n', 'UTF-8')
772 crc = binascii.crc32(s, crc) & 0xffffffff
775 s = bytes('{"type": "END-FILE-LIST"}\n', 'UTF-8')
776 crc = binascii.crc32(s, crc) & 0xffffffff
778 s = bytes('{"type": "file-list-checksum", "checksum": %d}\n' % crc, 'UTF-8')
783 index_sink.close (close_fileobj=True)
785 def create_diff_backup(self, source_path, backup_path, previous_index_path,
786 max_volume_size=None, extra_data=dict()):
791 - source_path: source path to the directory to back up.
792 - backup_path: path where the back up will be stored. Backup path will
793 be created if not existent.
794 - previous_index_path: index of the previous backup, needed to know
795 which files changed since then.
796 - max_volume_size: maximum volume size in megabytes (MB). Used to split
797 the backup in volumes. Optional (won't split in volumes by default).
799 NOTE: previous index is assumed to follow exactly the same format as
800 the index_mode setup in the constructor.
802 # check/sanitize input
803 if not isinstance(source_path, str):
804 raise Exception('Source path must be a string')
806 if not isinstance(backup_path, str):
807 raise Exception('Backup path must be a string')
809 if not os.path.exists(source_path) or not os.path.isdir(source_path):
810 raise Exception('Source path "%s" does not exist or is not a '\
811 'directory' % source_path)
813 if not isinstance(extra_data, dict):
814 raise Exception('extra_data must be a dictionary')
817 extra_data_str = json.dumps(extra_data)
819 raise Exception('extra_data is not json-serializable')
821 if not os.access(source_path, os.R_OK):
822 raise Exception('Source path "%s" is not readable' % source_path)
824 if max_volume_size != None and (not isinstance(max_volume_size, int) or\
825 max_volume_size < 1):
826 raise Exception('max_volume_size must be a positive integer')
827 if max_volume_size != None:
828 max_volume_size = max_volume_size*1024*1024
830 if not isinstance(previous_index_path, str):
831 raise Exception('previous_index_path must be A string')
833 if not os.path.exists(previous_index_path) or not os.path.isfile(previous_index_path):
834 raise Exception('Index path "%s" does not exist or is not a '\
835 'file' % previous_index_path)
837 if not os.access(previous_index_path, os.R_OK):
838 raise Exception('Index path "%s" is not readable' % previous_index_path)
840 # try to create backup path if needed
841 os.makedirs(backup_path, exist_ok=True)
843 if not os.access(backup_path, os.W_OK):
844 raise Exception('Backup path "%s" is not writeable' % backup_path)
846 if source_path.endswith('/'):
847 source_path = source_path[:-1]
849 if backup_path.endswith('/'):
850 backup_path = backup_path[:-1]
852 # update current time
853 self.current_time = datetime.datetime.now()
855 if self.mode not in self.__file_extensions_dict:
856 raise Exception('Unrecognized extension')
858 # setup for encrypting payload
859 if self.encryptor is None:
860 self.encryptor = self.initialize_encryption (CRYPTO_MODE_ENCRYPT)
862 # some initialization
865 # generate the first volume name
866 vol_name = self.volume_name_func(backup_path, is_full=False,
868 tarfile_path = os.path.join(backup_path, vol_name)
873 index_name = self.index_name_func(is_full=False)
874 index_path = os.path.join(backup_path, index_name)
875 index_sink = self.open_auxiliary_file(index_path, 'w')
877 def new_volume_handler(deltarobj, cwd, backup_path, tarobj, base_name, volume_number):
879 Handles the new volumes
881 volume_name = deltarobj.volume_name_func(backup_path, is_full=False,
882 volume_number=volume_number)
883 volume_path = os.path.join(backup_path, volume_name)
884 deltarobj.vol_no = volume_number
886 # we convert relative paths into absolute because CWD is changed
887 if not os.path.isabs(volume_path):
888 volume_path = os.path.join(cwd, volume_path)
890 deltarobj.logger.debug("opening volume %s" % volume_path)
891 tarobj.open_volume(volume_path)
893 # wraps some args from context into the handler
894 new_volume_handler = partial(new_volume_handler, self, cwd, backup_path)
896 index_sink.write(bytes('{"type": "python-delta-tar-index", "version": 1, "backup-type": "diff", "extra_data": %s}\n' % extra_data_str, 'UTF-8'))
898 s = bytes('{"type": "BEGIN-FILE-LIST"}\n', 'UTF-8')
899 # calculate checksum and write into the stream
900 crc = binascii.crc32(s) & 0xFFFFffff
903 # start creating the tarfile
904 tarobj = tarfile.TarFile.open(tarfile_path,
905 mode='w' + self.mode,
906 format=tarfile.GNU_FORMAT,
907 concat='#' in self.mode,
908 encryption=self.encryptor,
909 max_volume_size=max_volume_size,
910 new_volume_handler=new_volume_handler,
911 save_to_members=False,
915 # create the iterators, first the previous index iterator, then the
916 # source path directory iterator and collate and iterate them
917 if not os.path.isabs(previous_index_path):
918 previous_index_path = os.path.join(cwd, previous_index_path)
919 index_it = self.iterate_index_path(previous_index_path)
921 os.chdir(source_path)
922 dir_it = self._recursive_walk_dir('.')
923 dir_path_it = self.jsonize_path_iterator(dir_it)
931 # for each file to be in the backup, do:
932 for ipath, dpath, l_no in self.collate_iterators(index_it, dir_path_it):
934 # if file is not in the index, it means it's a new file, so we have
939 # if the file is not in the directory iterator, it means that it has
940 # been deleted, so we need to mark it as such
943 # if the file is in both iterators, it means it might have either
944 # not changed (in which case we will just list it in our index but
945 # it will not be included in the tar file), or it might have
946 # changed, in which case we will snapshot it.
947 elif ipath and dpath:
948 if self._equal_stat_dicts(ipath, dpath):
952 # TODO: when creating chained backups (i.e. diffing from another
953 # diff), we will need to detect the type of action in the previous
954 # index, because if it was delete and dpath is None, we should
957 if action == 'snapshot':
958 # calculate stat dict for current file
960 stat['path'] = "snapshot://" + dpath['path']
961 stat['volume'] = self.vol_no
963 self.logger.debug("[STORE] %s" % dpath['path'])
966 tarobj.add(dpath['path'], arcname=stat['path'], recursive=False)
967 # retrieve file offset
968 stat['offset'] = tarobj.get_last_member_offset()
969 except FileNotFoundError as exn:
970 # file vanished since the call to access(3) above
971 self.logger.warning ("object [%s] no longer available in "
972 "file system (error: %s); skipping"
973 % (dpath ["path"], str (exn)))
974 stat = None # prevent indexing
976 elif action == 'delete':
977 path = self.unprefixed(ipath['path'])
979 u'path': u'delete://' + path,
980 u'type': ipath['type']
982 self.logger.debug("[DELETE] %s" % path)
984 # mark it as deleted in the backup
985 tarobj.add("/dev/null", arcname=stat['path'])
986 elif action == 'list':
988 path = self.unprefixed(ipath['path'])
989 stat['path'] = u'list://' + path
990 # unchanged files do not enter in the backup, only in the index
991 self.logger.debug("[UNCHANGED] %s" % path)
994 self.logger.warning('unknown action in create_diff_backup: {0}'
999 # store the stat dict in the index
1000 s = bytes(json.dumps(stat) + '\n', 'UTF-8')
1001 crc = binascii.crc32(s, crc) & 0xffffffff
1004 s = bytes('{"type": "END-FILE-LIST"}\n', 'UTF-8')
1005 crc = binascii.crc32(s, crc) & 0xffffffff
1007 s = bytes('{"type": "file-list-checksum", "checksum": %d}\n' % crc, 'UTF-8')
1016 def iterate_index_path(self, index_path, strict_validation=True):
1018 Returns an index iterator. Internally, it uses a classic iterator class.
1019 We do that instead of just yielding so that the iterator object can have
1020 an additional function to close the file descriptor that is opened in
1024 class IndexPathIterator(object):
1025 def __init__(self, delta_tar, index_path):
1026 self.delta_tar = delta_tar
1027 self.index_path = index_path
1029 self.extra_data = dict()
1039 def __enter__(self):
1041 Allows this iterator to be used with the "with" statement
1044 self.f = self.delta_tar.open_auxiliary_file \
1047 strict_validation=strict_validation)
1048 # check index header
1049 j, l_no = self.delta_tar._parse_json_line(self.f, 0)
1050 if j.get("type", '') != 'python-delta-tar-index' or\
1051 j.get('version', -1) != 1:
1052 raise Exception("invalid index file format: %s" % json.dumps(j))
1054 self.extra_data = j.get('extra_data', dict())
1056 # find BEGIN-FILE-LIST, ignore other headers
1058 j, l_no = self.delta_tar._parse_json_line(self.f, l_no)
1059 if j.get('type', '') == 'BEGIN-FILE-LIST':
1063 def __exit__(self, type, value, tb):
1065 Allows this iterator to be used with the "with" statement
1072 # read each file in the index and process it to do the restore
1076 j, l_no = self.delta_tar._parse_json_line(self.f, l_no)
1077 except Exception as e:
1082 op_type = j.get('type', '')
1084 # when we detect the end of the list, break the loop
1085 if op_type == 'END-FILE-LIST':
1091 if op_type not in ['directory', 'file', 'link']:
1092 self.delta_tar.logger.warning('unrecognized type to be '
1093 'restored: %s, line %d' % (op_type, l_no))
1095 return self.__next__()
1099 return IndexPathIterator(self, index_path)
# Iterate the members of the tar archive at *tar_path*, yielding for each
# member a stat-like dict (path, mode, mtime, uid, gid, size, ...) with an
# extra "member" field consumed by RestoreHelper.  The returned iterator
# supports the "with" statement.
# NOTE(review): interior lines of this method are elided in this excerpt;
# comments annotate only the visible code.
1101 def iterate_tar_path(self, tar_path, new_volume_handler=None):
1103 Returns a tar iterator that iterates jsonized member items that contain
1104 an additional "member" field, used by RestoreHelper.
1106 class TarPathIterator(object):
1107 def __init__(self, delta_tar, tar_path, new_volume_handler=None):
1108 self.delta_tar = delta_tar
1109 self.tar_path = tar_path
# last_member remembers the previously yielded member so that duplicate
# multivolume entries for the same path can be collapsed in __next__.
1111 self.last_member = None
1112 self.new_volume_handler = new_volume_handler
1120 self.tar_obj.close()
1122 def __enter__(self):
1124 Allows this iterator to be used with the "with" statement
# Lazily open the tar object on first entry; a Decrypt object is built
# only when a password is configured on the owning DeltaTar.
1126 if self.tar_obj is None:
1128 if self.delta_tar.password is not None:
1129 decryptor = crypto.Decrypt \
1130 (password=self.delta_tar.password,
1131 key=self.delta_tar.crypto_key,
# concat mode is enabled whenever the deltatar mode string contains '#'.
1133 self.tar_obj = tarfile.TarFile.open(self.tar_path,
1134 mode='r' + self.delta_tar.mode,
1135 format=tarfile.GNU_FORMAT,
1136 concat='#' in self.delta_tar.mode,
1137 encryption=decryptor,
1138 new_volume_handler=self.new_volume_handler,
1139 save_to_members=False,
1143 def __exit__(self, type, value, tb):
1145 Allows this iterator to be used with the "with" statement
1148 self.tar_obj.close()
1153 Read each member and return it as a stat dict
1155 tarinfo = self.tar_obj.__iter__().__next__()
1156 # NOTE: here we compare if tarinfo.path is the same as before
1157 # instead of comparing the tarinfo object itself because the
1158 # object itself might change for multivol tarinfos
1159 if tarinfo is None or (self.last_member is not None and\
1160 self.delta_tar.unprefixed(tarinfo.path) == self.delta_tar.unprefixed(self.last_member.path)):
1163 self.last_member = tarinfo
# Map the member's tar type onto the index entry 'type' string
# (file / directory / link); ctime and inode cannot be restored, so
# they are reported as -1 below.
1166 if tarinfo.isfile():
1168 elif tarinfo.isdir():
1170 elif tarinfo.islnk() or tarinfo.issym():
1175 u'path': tarinfo.path,
1176 u'mode': tarinfo.mode,
1177 u'mtime': tarinfo.mtime,
1178 u'ctime': -1, # cannot restore
1179 u'uid': tarinfo.uid,
1180 u'gid': tarinfo.gid,
1181 u'inode': -1, # cannot restore
1182 u'size': tarinfo.size,
1186 return TarPathIterator(self, tar_path, new_volume_handler)
# Adapt a plain path iterator into (stat_dict, line_number) pairs suitable
# for collate_iterators().  *strip* removes that many leading path
# components from each yielded path.
1188 def jsonize_path_iterator(self, iter, strip=0):
1190 converts the yielded items of an iterator into json path lines.
1192 strip: Strip the smallest prefix containing num leading slashes from
1197 path = iter.__next__()
1199 yield self._stat_dict(path), 0
# When stripping, rebuild the path without its first *strip* components.
1201 st = self._stat_dict(path)
1202 st['path'] = "/".join(path.split("/")[strip:])
1204 except StopIteration:
# Wrap an in-memory index (as produced for disaster/rescue restores) in the
# same iterator protocol as the file-based index iterators, so
# restore_backup() can consume either transparently.
1207 def iterate_disaster_index (self, index):
1209 Mimic the behavior of the other object iterators, just with the inputs
1210 supplied directly as *index*.
1213 class RawIndexIterator(object):
1214 def __init__(self, delta_tar, index):
1215 self.delta_tar = delta_tar
1225 def __enter__(self):
1227 Allows this iterator to be used with the "with" statement
1229 self.iter = self.index.__iter__ ()
1232 def __exit__(self, type, value, tb):
1234 Allows this iterator to be used with the "with" statement
1238 idxent = self.iter.__next__ ()
1241 return RawIndexIterator(self, index)
# Merge-join two path-ordered iterators, yielding (elem1, elem2, l_no)
# triples; one side is None when the other side has no entry with a
# matching path.  Both inputs must be sorted identically.
1243 def collate_iterators(self, it1, it2):
1245 Collate two iterators, so that it returns pairs of the items of each
1246 iterator (if the items are the same), or (None, elem2) or (elem1, None)
1247 when there's no match for the items in the other iterator.
1249 It assumes that the items in both lists are ordered in the same way.
1252 elem1, elem2 = None, None
# it1 yields (element, line_number) pairs; when either side runs out,
# the remainder of the other side is drained unpaired.
1256 elem1, l_no = it1.__next__()
1257 except StopIteration:
1259 yield (None, elem2, l_no)
1261 if isinstance(elem2, tuple):
1263 yield (None, elem2, l_no)
1267 elem2 = it2.__next__()
1268 if isinstance(elem2, tuple):
1270 except StopIteration:
1272 yield (elem1, None, l_no)
1273 for elem1, l_no in it1:
1274 yield (elem1, None, l_no)
# Compare the unprefixed paths to decide which side(s) advance next.
1277 index1 = self.unprefixed(elem1['path'])
1278 index2 = self.unprefixed(elem2['path'])
1279 i1, i2 = self.compare_indexes(index1, index2)
1281 yield1 = yield2 = None
1288 yield (yield1, yield2, l_no)
# Order two unprefixed paths component-wise.  Used by collate_iterators()
# and find_path_in_index() to decide which stream is ahead.
1290 def compare_indexes(self, index1, index2):
1292 Compare iterator indexes and return a tuple in the following form:
1293 if index1 < index2, returns (index1, None)
1294 if index1 == index2 returns (index1, index2)
1295 else: returns (None, index2)
1297 l1 = index1.split('/')
1298 l2 = index2.split('/')
# Depth difference decides first; paths of equal depth are compared
# component by component below.
1299 length = len(l2) - len(l1)
1302 return (index1, None)
1304 return (None, index2)
1306 for i1, i2 in zip(l1, l2):
1308 return (index1, None)
1310 return (None, index2)
1312 return (index1, index2)
# List the members of a backup tar archive.  Each member path is either
# logged via self.logger or handed to *list_func* when one is given.
# Multivolume archives and encrypted archives are handled transparently.
1314 def list_backup(self, backup_tar_path, list_func=None):
1315 if not isinstance(backup_tar_path, str):
1316 raise Exception('Backup tar path must be a string')
1318 if not os.path.exists(backup_tar_path) or not os.path.isfile(backup_tar_path):
1319 raise Exception('Source path "%s" does not exist or is not a '\
1320 'file' % backup_tar_path)
1322 if not os.access(backup_tar_path, os.R_OK):
1323 raise Exception('Source path "%s" is not readable' % backup_tar_path)
# Volume handler: derive the next volume's file name with
# volume_name_func and reopen the tar object on it.
1327 def new_volume_handler(deltarobj, cwd, backup_path, encryption, tarobj, base_name, volume_number):
1329 Handles the new volumes
1331 volume_name = deltarobj.volume_name_func(backup_path, True,
1332 volume_number, guess_name=True)
1333 volume_path = os.path.join(backup_path, volume_name)
1335 # we convert relative paths into absolute because CWD is changed
1336 if not os.path.isabs(volume_path):
1337 volume_path = os.path.join(cwd, volume_path)
1338 tarobj.open_volume(volume_path, encryption=encryption)
# Set up decryption lazily; listing uses non-strict IV validation.
1340 if self.decryptor is None:
1342 self.initialize_encryption (CRYPTO_MODE_DECRYPT,
1343 strict_validation=False)
1345 backup_path = os.path.dirname(backup_tar_path)
1346 if not os.path.isabs(backup_path):
1347 backup_path = os.path.join(cwd, backup_path)
1348 new_volume_handler = partial(new_volume_handler, self, cwd, backup_path, self.decryptor)
1350 tarobj = tarfile.TarFile.open(backup_tar_path,
1351 mode='r' + self.mode,
1352 format=tarfile.GNU_FORMAT,
1353 concat='#' in self.mode,
1354 encryption=self.decryptor,
1355 new_volume_handler=new_volume_handler,
1356 save_to_members=False,
# The extraction filter only reports paths; by default it logs them.
1359 def filter(cls, list_func, tarinfo):
1360 if list_func is None:
1361 self.logger.info(tarinfo.path)
1365 filter = partial(filter, self, list_func)
1367 tarobj.extractall(filter=filter, unlink=True)
# Restore a backup into *target_path*, either index-driven (full/diff
# backups via backup_indexes_paths), directly from a tar archive
# (backup_tar_path), or from an in-memory index (backup_index, disaster
# mode).  Returns the list of (path, exception) pairs that could not be
# restored.
# NOTE(review): interior lines of this method are elided in this excerpt.
1370 def restore_backup(self, target_path, backup_indexes_paths=[],
1371 backup_tar_path=None, restore_callback=None,
1372 disaster=tarfile.TOLERANCE_STRICT, backup_index=None,
1373 strict_validation=True):
1378 - target_path: path to restore.
1379 - backup_indexes_paths: path to backup indexes, in descending date order.
1380 The indexes indicate the location of their respective backup volumes,
1381 and multiple indexes are needed to be able to restore diff backups.
1382 Note that this is an optional parameter: if not supplied, it will
1383 try to restore directly from backup_tar_path.
1384 - backup_tar_path: path to the backup tar file. Used as an alternative
1385 to backup_indexes_paths to restore directly from a tar file without
1386 using any file index. If it's a multivol tarfile, volume_name_func
1388 - restore_callback: callback function to be called during restore.
1389 This is passed to the helper and gets called for every file.
1391 NOTE: If you want to use an index to restore a backup, this function
1392 only supports to do so when the tarfile mode is either uncompressed or
1393 uses concat compress mode, because otherwise it would be very slow.
1395 NOTE: Indices are assumed to follow the same format as the index_mode
1396 specified in the constructor.
1398 Returns the list of files that could not be restored, if there were
1401 # check/sanitize input
1402 if not isinstance(target_path, str):
1403 raise Exception('Target path must be a string')
1405 if backup_indexes_paths is None and backup_tar_path == []:
1406 raise Exception("You have to either provide index paths or a tar path")
1408 if isinstance (backup_index, list) is True:
1410 elif len(backup_indexes_paths) == 0:
1416 if not isinstance(backup_tar_path, str):
1417 raise Exception('Backup tar path must be a string')
1419 if not os.path.exists(backup_tar_path) or not os.path.isfile(backup_tar_path):
1420 raise Exception('Source path "%s" does not exist or is not a '\
1421 'file' % backup_tar_path)
1423 if not os.access(backup_tar_path, os.R_OK):
1424 raise Exception('Source path "%s" is not readable' % backup_tar_path)
1426 if not isinstance(backup_indexes_paths, list):
1427 raise Exception('backup_indexes_paths must be a list')
# Index-driven restore requires a seekable stream; stream (':') and
# pipe ('|') modes are refused.
1429 if self.mode.startswith(':') or self.mode.startswith('|'):
1430 raise Exception('Restore only supports either uncompressed tars'
1431 ' or concat compression when restoring from an index, and '
1432 ' the open mode you provided is "%s"' % self.mode)
1434 for index in backup_indexes_paths:
1435 if not isinstance(index, str):
1436 raise Exception('indices must be strings')
1438 if not os.path.exists(index) or not os.path.isfile(index):
1439 raise Exception('Index path "%s" does not exist or is not a '\
1442 if not os.access(index, os.R_OK):
1443 raise Exception('Index path "%s" is not readable' % index)
1445 # try to create backup path if needed
1446 os.makedirs(target_path, exist_ok=True)
1448 # make backup_tar_path absolute so that iterate_tar_path works fine
1449 if backup_tar_path and not os.path.isabs(backup_tar_path):
1450 backup_tar_path = os.path.abspath(backup_tar_path)
1453 os.chdir(target_path)
1455 # setup for decrypting payload
1456 if self.decryptor is None:
1458 self.initialize_encryption (CRYPTO_MODE_DECRYPT,
1459 strict_validation=strict_validation)
# Dispatch on restore mode: direct tar, index-driven diff, or disaster
# (in-memory index).  Each branch builds a matching RestoreHelper.
1462 index_it = self.iterate_tar_path(backup_tar_path)
1463 helper = RestoreHelper(self, cwd, backup_path=backup_tar_path,
1464 tarobj=index_it.tar_obj)
1465 elif mode == "diff":
1466 helper = RestoreHelper(self, cwd, backup_indexes_paths,
1469 # get iterator from newest index at _data[0]
1470 index1 = helper._data[0]["path"]
1472 self.iterate_index_path(index1,
1473 strict_validation=strict_validation)
1474 except tarfile.DecryptionError as exn:
1475 self.logger.error("failed to decrypt file [%s]: %s; is this an "
1476 "actual encrypted index file?"
1477 % (index1, str (exn)))
1478 return [(index1, exn)]
1479 except Exception as exn:
1481 self.logger.error("failed to read file [%s]: %s; is this an "
1482 "actual index file?" % (index1, str (exn)))
1483 return [(index1, exn)]
1484 elif mode == "disaster":
1485 index_it = self.iterate_disaster_index (backup_index)
1486 helper = RestoreHelper (self, cwd, backup_path=backup_tar_path,
1487 backup_index=backup_index,
1490 index_decryptor = helper._data[0]["decryptor"]
# Walk the current contents of the target directory so it can be
# collated against the index.
1492 dir_it = self._recursive_walk_dir('.')
1493 dir_path_it = self.jsonize_path_iterator(dir_it)
1495 failed = [] # irrecoverable files
1497 # for each file to be restored, do:
1498 for ipath, dpath, l_no in self.collate_iterators(index_it, dir_path_it):
1500 upath = dpath['path']
1501 op_type = dpath['type']
1503 upath = self.unprefixed(ipath['path'])
1504 op_type = ipath['type']
1507 if self.filter_path(upath, '', op_type == 'directory') == NO_MATCH:
1510 # if types of the file mismatch, the file needs to be deleted
1512 if ipath is not None and dpath is not None and\
1513 dpath['type'] != ipath['type']:
1514 helper.delete(upath)
1516 # if file not found in dpath, we can directly restore from index
1518 # if the file doesn't exist and it needs to be deleted, it
1519 # means that work is already done
1520 if ipath['path'].startswith('delete://'):
1523 self.logger.debug("restore %s" % ipath['path'])
1524 helper.restore(ipath, l_no, restore_callback)
1525 except Exception as e:
1526 iipath = ipath.get ("path", "")
1527 self.logger.error("FAILED to restore: {} ({})"
# In non-strict tolerance modes, record the failure and carry on
# instead of aborting the whole restore.
1529 if disaster != tarfile.TOLERANCE_STRICT:
1530 failed.append ((iipath, e))
1533 # if both files are equal, we have nothing to restore
1534 if self._equal_stat_dicts(ipath, dpath, listsnapshot_equal=True):
1537 # we have to restore the file, but first we need to delete the
1538 # current existing file.
1539 # we don't delete the file if it's a directory, because it might
1540 # just have changed mtime, so it's quite inefficient to remove
1543 if ipath['type'] != 'directory' or ipath['path'].startswith('delete://'):
1544 helper.delete(upath)
1545 self.logger.debug("restore %s" % ipath['path'])
1547 helper.restore(ipath, l_no, restore_callback)
1548 except Exception as e:
1549 if disaster == tarfile.TOLERANCE_STRICT:
1551 failed.append ((ipath.get ("path", ""), e))
1554 # if the file is not in the index (so it comes from the target
1555 # directory) then we have to delete it
1557 self.logger.debug("delete %s" % upath)
1558 helper.delete(upath)
# Directory metadata is applied last, mirroring tarfile.extractall.
1560 helper.restore_directories_permissions()
# Convenience wrapper around restore_backup() running with recovery
# tolerance (TOLERANCE_RECOVER) and without strict crypto validation.
1568 def recover_backup(self, target_path, backup_indexes_paths=[],
1569 restore_callback=None):
1571 Walk the index, extracting objects in disaster mode. Bad files are
1572 reported along with a reason.
1574 *Security considerations*: In *recovery mode* the headers of encrypted
1575 objects are assumed damaged and GCM tags are not validated so
1576 modification of cryptographically relevant parts of the header (more
1577 specifically, the initialization vectors) can no longer be detected. If
1578 an attacker can manipulate the encrypted backup set and has access to
1579 the plaintext of some of the contents, they may be able to obtain the
1580 plaintext of other encrypted objects by injecting initialization
1581 vectors. For this reason *recovery mode* should only be used in
1582 emergency situations and the contents of the resulting files should be
1583 validated manually if possible and not be disclosed to untrusted
1586 return self.restore_backup(target_path,
1587 backup_indexes_paths=backup_indexes_paths,
1588 disaster=tarfile.TOLERANCE_RECOVER,
1589 strict_validation=False)
# Most aggressive recovery path: ignore the on-disk index entirely,
# scan the volumes for header-like data (gen_rescue_index) and restore
# from the reconstructed index with rescue tolerance.
1592 def rescue_backup(self, target_path, backup_tar_path,
1593 restore_callback=None):
1595 More aggressive “unfsck” mode: do not rely on the index data as the
1596 files may be corrupt; skim files for header-like information and
1597 attempt to retrieve the data.
1599 *Security considerations*: As with *recovery mode*, in *rescue mode*
1600 the headers of encrypted objects are assumed damaged and GCM tags are
1601 not validated so modification of cryptographically relevant parts of
1602 the header (more specifically, the initialization vectors) can no longer
1603 be detected. If an attacker can manipulate the encrypted backup set and
1604 has access to the plaintext of some of the contents, they may be able
1605 to obtain the plaintext of other encrypted objects by injecting
1606 initialization vectors. For this reason *rescue mode* should only be
1607 used in emergency situations and the contents of the resulting files
1608 should be validated manually if possible and not be disclosed to
# Map a volume number to the on-disk volume path next to the tar file.
1611 def gen_volume_name (nvol):
1612 return os.path.join (os.path.dirname (backup_tar_path),
1613 self.volume_name_func (backup_tar_path,
# Build an in-memory index by skimming the volumes, then restore from it.
1617 backup_index = tarfile.gen_rescue_index (gen_volume_name,
1619 password=self.password,
1620 key=self.crypto_key)
1622 return self.restore_backup(target_path,
1623 backup_index=backup_index,
1624 backup_tar_path=backup_tar_path,
1625 disaster=tarfile.TOLERANCE_RESCUE,
1626 strict_validation=False)
# Parse one line of an index file as JSON, raising a descriptive
# exception when the data is compressed, binary, or malformed JSON.
1629 def _parse_json_line(self, f, l_no):
1631 Read line from file like object and process it as JSON.
1636 j = json.loads(l.decode('UTF-8'))
# Undecodable bytes: distinguish a gzip stream (magic bytes) from
# arbitrary binary junk in the error message.
1637 except UnicodeDecodeError as e:
1638 if tuple (l [0:2]) == tarfile.GZ_MAGIC:
1640 ("error parsing line #%d as json: looks like a compressed file (%d B: [%s..])"
1641 % (l_no, len (l), binascii.hexlify (l [:16]).decode ())) \
1644 ("error parsing line #%d as json: not a text file (%d B: [%s..])"
1645 % (l_no, len (l), binascii.hexlify (l [:16]).decode ())) \
1647 except ValueError as e:
1648 raise Exception("error parsing this json line "
1649 "(line number %d): %s" % (l_no, l))
# Helper object driving the actual per-file restore work for DeltaTar.
1653 class RestoreHelper(object):
1655 Class used to help to restore files from indices
1658 # holds the dicts of data
1665 # list of directories to be restored. This is done as a last step, see
1666 # tarfile.extractall for details.
# Default damage tolerance; __init__ overrides this per instance.
1669 _disaster = tarfile.TOLERANCE_STRICT
# Build the per-backup state dicts in self._data: one entry per index
# (diff restore), or a single entry for direct-tar / disaster restores.
# NOTE(review): interior lines of this method are elided in this excerpt.
1671 def __init__(self, deltatar, cwd, index_list=None, backup_path=False,
1672 backup_index=None, tarobj=None,
1673 disaster=tarfile.TOLERANCE_STRICT):
1675 Constructor opens the tars and init the data structures.
1679 - Index list must be provided in reverse order (newer first).
1680 - “newer first” apparently means that if there are n backups
1681 provided, the last full backup is at index n-1 and the most recent
1682 diff backup is at index 0.
1683 - Only the first, the second, and the last elements of
1684 ``index_list`` are relevant, others will not be accessed.
1685 - If no ``index_list`` is provided, both ``tarobj`` and
1686 ``backup_path`` must be passed.
1687 - If ``index_list`` is provided, the values of ``tarobj`` and
1688 ``backup_path`` are ignored.
1691 self._directories = []
1692 self._deltatar = deltatar
1694 self._password = deltatar.password
1695 self._crypto_key = deltatar.crypto_key
1696 self._decryptors = []
1697 self._disaster = disaster
1699 # Disable strict checking for linearly increasing IVs when running
1700 # in rescue or recover mode.
1701 strict_validation = disaster == tarfile.TOLERANCE_STRICT
# Ownership can only be restored when running as root (and the pwd
# module is available on this platform).
1708 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
1709 self.canchown = True
1711 self.canchown = False
# Disaster mode: a single state entry built from the in-memory index.
1713 if isinstance (backup_index, list) is True:
1714 decryptor = self._deltatar.decryptor
1716 [{ "curr_vol_no" : None
1720 , "path" : backup_path
1723 , "last_itelement" : None
1725 , "new_volume_handler" :
1726 partial(self.new_volume_handler,
1727 self._deltatar, self._cwd, True,
1728 os.path.dirname(backup_path), decryptor)
1729 , "decryptor" : decryptor
# Diff restore: one state entry per index file; the last list element
# is the full backup.
1731 elif index_list is not None:
1732 for index in index_list:
1733 is_full = index == index_list[-1]
1736 if self._password is not None:
1737 decryptor = crypto.Decrypt (password=self._password,
1738 key=self._crypto_key,
1739 strict_ivs=strict_validation)
1741 # make paths absolute to avoid cwd problems
1742 if not os.path.isabs(index):
1743 index = os.path.normpath(os.path.join(cwd, index))
1753 last_itelement = None,
1755 new_volume_handler = partial(self.new_volume_handler,
1756 self._deltatar, self._cwd, is_full,
1757 os.path.dirname(index), decryptor),
1758 decryptor = decryptor
1760 self._data.append(s)
1762 # make paths absolute to avoid cwd problems
1763 if not os.path.isabs(backup_path):
1764 backup_path = os.path.normpath(os.path.join(cwd, backup_path))
1766 # update the new_volume_handler of tar_obj
1767 tarobj.new_volume_handler = partial(self.new_volume_handler,
1768 self._deltatar, self._cwd, True, os.path.dirname(backup_path),
1769 self._deltatar.decryptor)
1778 last_itelement = None,
1780 new_volume_handler = tarobj.new_volume_handler,
1781 decryptor = self._deltatar.decryptor
1783 self._data.append(s)
# (method header elided in this excerpt) Release resources: close every
# per-index volume file descriptor and tar object held in self._data and
# reset the slots so they can be reopened later.
1788 Closes all open files
1790 for data in self._data:
1792 data['vol_fd'].close()
1793 data['vol_fd'] = None
1795 data['tarobj'].close()
1796 data['tarobj'] = None
# Remove *path* from the target tree (no-op when it does not exist),
# preserving the parent directory's mtime across the deletion.
1798 def delete(self, path):
1802 if not os.path.exists(path):
1805 # to preserve parent directory mtime, we save it
1806 parent_dir = os.path.dirname(path) or os.getcwd()
1807 parent_dir_mtime = int(os.stat(parent_dir).st_mtime)
# Directories (but not symlinks to them) take the directory-removal path.
1809 if os.path.isdir(path) and not os.path.islink(path):
1814 # now we restore parent_directory mtime
1815 os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime))
# Restore one index entry, choosing between the diff index (_data[0])
# and the full backup index (_data[1]) depending on where the most
# recent snapshot of the path lives.
1817 def restore(self, itpath, l_no, callback=None):
1819 Restore the path from the appropriate backup. Receives the current path
1820 from the newest (=first) index iterator. itpath must be not null.
1821 callback is a custom function that gets called for every file.
1823 NB: This function takes the attribute ``_data`` as input but will only
1824 ever use its first and, if available, second element. Anything else in
1825 ``._data[]`` will be ignored.
1827 path = itpath['path']
1829 # Calls the callback function
# delete:// entries were already handled by restore_backup; nothing to do.
1833 if path.startswith('delete://'):
1834 # the file has previously been deleted already in restore_backup in
1835 # all cases so we just need to finish
1838 # get data from newest index (_data[0])
1839 data = self._data[0]
1840 upath = self._deltatar.unprefixed(path)
1842 # to preserve parent directory mtime, we save it
1843 parent_dir = os.path.dirname(upath) or os.getcwd()
1844 os.makedirs(parent_dir, exist_ok=True)
1845 parent_dir_mtime = int(os.stat(parent_dir).st_mtime)
1847 # if path is found in the newest index as to be snapshotted, deal with it
1849 if path.startswith('snapshot://'):
1850 self.restore_file(itpath, data, path, l_no, upath)
1852 # now we restore parent_directory mtime
1853 os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime))
1856 # we go from index to index, finding the path in the index, then finding
1857 # the index with the most recent snapshot of the file being restored
1859 # Right now we support diff backups, only. No incremental backups.
1860 # As a result _data[0] is always the diff backup index
1861 # and _data[1] the full backup index.
1862 if len(self._data) == 2:
1863 data = self._data[1]
1864 d, l_no, dpath = self.find_path_in_index(data, upath)
1866 self._deltatar.logger.warning('Error restoring file %s from '
1867 'index, not found in index %s' % (path, data['path']))
1870 cur_path = d.get('path', '')
1871 if cur_path.startswith('delete://'):
1872 self._deltatar.logger.warning(('Strange thing happened, file '
1873 '%s was listed in first index but deleted by another '
1874 'one. Path was ignored and untouched.') % path)
1876 elif cur_path.startswith('snapshot://'):
1877 # this code path is reached when the file is unchanged
1878 # in the newest index and therefore of type 'list://'
1879 self.restore_file(d, data, path, l_no, dpath)
1881 # now we restore parent_directory mtime
1882 os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime))
1885 # error code path is reached when:
1886 # a) we have more than two indexes (unsupported atm)
1887 # b) both indexes contain a list:// entry (logic error)
1888 # c) we have just one index and it also contains list://
1889 self._deltatar.logger.warning(('Error restoring file %s from index, '
1890 'snapshot not found in any index') % path)
# Scan the (path-ordered) index belonging to *data* for *upath*.
# Returns (entry_dict, line_no, unprefixed_path); the last seen element
# and line number are cached in *data* so a later lookup can resume.
1892 def find_path_in_index(self, data, upath):
1893 # NOTE: we restart the iterator sometimes because the iterator can be
1894 # walked over completely multiple times, for example if one path is not
1895 # found in one index and we have to go to the next index.
1896 it = data['iterator']
1898 it = data['iterator'] = self._deltatar.iterate_index_path(data["path"])
1899 d, l_no = it.__next__()
1901 d = data['last_itelement']
1902 l_no = data['last_lno']
1905 dpath = self._deltatar.unprefixed(d.get('path', ''))
1907 data['last_itelement'] = d
1908 data['last_lno'] = l_no
1909 return d, l_no, dpath
# Ordered comparison tells us whether upath can still appear later in
# this index or the search can stop early.
1911 up, dp = self._deltatar.compare_indexes(upath, dpath)
1912 # any time upath should have appeared before current dpath, it means
1913 # upath is just not in this index and we should stop
1915 data['last_itelement'] = d
1916 data['last_lno'] = l_no
1920 d, l_no = it.__next__()
1921 except StopIteration:
1922 data['last_itelement'] = d
1923 data['last_lno'] = l_no
# Final pass: apply mode, mtime and (when running as root) ownership to
# the directories queued by add_member_dir(), deepest paths first.
1926 def restore_directories_permissions(self):
1928 Restore directory permissions when everything has been restored
# Reverse-sorted by name so children are handled before their parents,
# mirroring tarfile.extractall's directory handling.
1935 self._directories.sort(key=operator.attrgetter('name'))
1936 self._directories.reverse()
1938 # Set correct owner, mtime and filemode on directories.
1939 for member in self._directories:
1940 dirpath = member.name
1942 os.chmod(dirpath, member.mode)
1943 os.utime(dirpath, (member.mtime, member.mtime))
1945 # We have to be root to do so.
1947 g = grp.getgrnam(member.gname)[2]
1951 u = pwd.getpwnam(member.uname)[2]
# `member` is a DirItem here; `issym` is stored as a plain attribute
# (add_member_dir passes issym=member.issym()), hence no call.
1955 if member.issym and hasattr(os, "lchown"):
1956 os.lchown(dirpath, u, g)
1958 os.chown(dirpath, u, g)
1959 except EnvironmentError:
1960 raise tarfile.ExtractError("could not change owner")
1962 except tarfile.ExtractError as e:
1963 self._deltatar.logger.warning('tarfile: %s' % e)
# Volume-change hook handed (via functools.partial) to the tar objects:
# derive the next volume's path and reopen the tar object on it.
1966 def new_volume_handler(deltarobj, cwd, is_full, backup_path, decryptor, tarobj, base_name, volume_number):
1968 Set up a new volume and perform the tasks necessary for transitioning
1971 volume_name = deltarobj.volume_name_func(backup_path, is_full,
1972 volume_number, guess_name=True)
1973 volume_path = os.path.join(backup_path, volume_name)
1975 # we convert relative paths into absolute because CWD is changed
1976 if not os.path.isabs(volume_path):
1977 volume_path = os.path.join(cwd, volume_path)
1979 tarobj.open_volume(volume_path, encryption=decryptor)
# Extract one member from a backup volume: select the right volume,
# seek to the member's offset, (re)open the tar object, then extract
# under the unprefixed target path.
# NOTE(review): interior lines of this method are elided in this excerpt.
1981 def restore_file(self, file_data, index_data, path, l_no, unprefixed_path):
1983 Restores a snapshot of a file from a specific backup
1985 op_type = file_data.get('type', -1)
1986 member = file_data.get('member', None)
1987 ismember = bool(member)
1989 # when member is set, then we can assume everything is right and we
1990 # just have to restore the path
1992 vol_no = file_data.get('volume', -1)
# Sanity check: a missing/negative volume number means the index entry
# is not restorable.
1994 if not isinstance(vol_no, int) or vol_no < 0:
1995 self._deltatar.logger.warning('unrecognized type to be restored: '
1996 '%s, line %d' % (op_type, l_no))
1998 # setup the volume that needs to be read. only needed when member is
2000 if index_data['curr_vol_no'] != vol_no:
2001 index_data['curr_vol_no'] = vol_no
2002 backup_path = os.path.dirname(index_data['path'])
2003 vol_name = self._deltatar.volume_name_func(backup_path,
2004 index_data['is_full'], vol_no, guess_name=True)
2005 vol_path = os.path.join(backup_path, vol_name)
2006 if index_data['vol_fd']:
2007 index_data['vol_fd'].close()
2008 index_data['vol_fd'] = open(vol_path, 'rb')
2010 # force reopen of the tarobj because of new volume
2011 if index_data['tarobj']:
2012 index_data['tarobj'].close()
2013 index_data['tarobj'] = None
2015 # seek tarfile if needed
2016 offset = file_data.get('offset', -1)
2017 if index_data['tarobj']:
# In rescue mode never trust the open tar object's position: always
# reopen at the indexed offset.
2018 if self._disaster == tarfile.TOLERANCE_RESCUE:
2019 # force a seek and reopen
2020 index_data['tarobj'].close()
2021 index_data['tarobj'] = None
2024 member = index_data['tarobj'].__iter__().__next__()
2025 except tarfile.DecryptionError:
2027 except tarfile.CompressionError:
2030 if not member or member.path != file_data['path']:
2031 # force a seek and reopen
2032 index_data['tarobj'].close()
2033 index_data['tarobj'] = None
2036 # open the tarfile if needed
2037 if not index_data['tarobj']:
2038 index_data['vol_fd'].seek(offset)
2039 index_data['tarobj'] = tarfile.open(mode="r" + self._deltatar.mode,
2040 fileobj=index_data['vol_fd'],
2041 format=tarfile.GNU_FORMAT,
2042 concat='#' in self._deltatar.mode,
2043 encryption=index_data["decryptor"],
2044 new_volume_handler=index_data['new_volume_handler'],
2045 save_to_members=False,
2046 tolerance=self._disaster)
2048 member = index_data['tarobj'].__iter__().__next__()
# Extract under the unprefixed path (strip snapshot:// etc.).
2050 member.path = unprefixed_path
2051 member.name = unprefixed_path
2053 if op_type == 'directory':
2054 self.add_member_dir(member)
# Extract with restrictive 0700 mode; the real mode is applied later
# by restore_directories_permissions().
2055 member = copy.copy(member)
2056 member.mode = 0o0700
2058 # if it's an existing directory, we then don't need to recreate it
2059 # just set the right permissions, mtime and that kind of stuff
2060 if os.path.exists(member.path):
2064 # set current volume number in tarobj, otherwise the extraction of the
2065 # file might fail when trying to extract a multivolume member
2066 index_data['tarobj'].volume_number = index_data['curr_vol_no']
# Symlinks are skipped with a warning rather than restored.
2068 def ignore_symlink (member, *_args):
2069 self._deltatar.logger.warning("Ignoring symlink %s" % member.name)
2071 # finally, restore the file
2072 index_data['tarobj'].extract(member, symlink_cb=ignore_symlink,
# Queue a directory member for the final metadata pass
# (restore_directories_permissions).  Ownership fields are recorded only
# when this process can actually chown (self.canchown).
2075 def add_member_dir(self, member):
2077 Add member dir to be restored at the end
2079 if not self.canchown:
2080 self._directories.append(DirItem(name=member.name, mode=member.mode,
2081 mtime=member.mtime))
# Note issym is stored as the evaluated bool, not the bound method.
2083 self._directories.append(DirItem(name=member.name, mode=member.mode,
2084 mtime=member.mtime, gname=member.gname, uname=member.uname,
2085 uid=member.uid, gid=member.gid, issym=member.issym()))
2087 class DirItem(object):
2088 def __init__(self, **kwargs):
2089 for k, v in kwargs.items():