3 # Copyright (C) 2013, 2014 Intra2net AG
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU Lesser General Public License as published
7 # by the Free Software Foundation; either version 3 of the License, or
8 # (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU Lesser General Public License for more details.
15 # You should have received a copy of the GNU Lesser General Public License
16 # along with this program. If not, see
17 # <http://www.gnu.org/licenses/lgpl-3.0.html>
# Format versions stamped into backups when encrypting; decryption derives
# the effective settings from the PDTCRYPT header, so these affect writes only.
19 DELTATAR_HEADER_VERSION = 1
20 DELTATAR_PARAMETER_VERSION = 1
33 from functools import partial
# Do-nothing logging handler installed as the default for the
# "deltatar.DeltaTar" logger, so the library emits nothing unless the
# user attaches a real handler. (emit() body elided in this excerpt;
# presumably a no-op — confirm against the full file.)
38 class NullHandler(logging.Handler):
39 def emit(self, record):
43 logging.getLogger("deltatar.DeltaTar").addHandler(NullHandler())
51 # encryption direction
52 CRYPTO_MODE_ENCRYPT = 0
53 CRYPTO_MODE_DECRYPT = 1
55 # The canonical extension for encrypted backup files regardless of the actual
56 # encryption parameters is “.pdtcrypt”. This is analogous to the encryption
57 # header which starts with the eight ASCII bytes “PDTCRYPT”. Historical note:
58 # Since the introduction of the versioned header there is no longer any need
59 # for encoding encryption parameters in the file extensions (“.aes128” and
61 PDTCRYPT_EXTENSION = "pdtcrypt"
# Roles of the auxiliary files handled by open_auxiliary_file(); each kind is
# encrypted with its own fixed AES-GCM IV counter (see that method).
65 AUXILIARY_FILE_INDEX = 0
66 AUXILIARY_FILE_INFO = 1
68 class DeltaTar(object):
70 Backup class used to create backups
73 # list of files to exclude in the backup creation or restore operation. It
74 # can contain python regular expressions.
77 # list of files to include in the backup creation or restore operation. It
78 # can contain python regular expressions. If empty, all files in the source
79 # path will be backed up (when creating a backup) or all the files in the
80 # backup will be restored (when restoring a backup), but if included_files
81 # is set then only the files included in the list will be processed.
84 # custom filter of files to be backed up (or restored). Unused and unset
85 # by default. The function receives a file path and must return a boolean.
88 # mode in which the delta will be created (when creating a backup) or
89 # opened (when restoring). Accepts modes analog to the tarfile library.
92 # used together with aes modes to encrypt and decrypt backups.
97 # parameter version to use when encrypting; note that this has no effect
98 # on decryption since the required settings are determined from the headers
99 crypto_version = DELTATAR_HEADER_VERSION
100 crypto_paramversion = None
102 # when encrypting or decrypting, these hold crypto handlers; created before
103 # establishing the Tarfile stream iff a password is supplied.
107 # python logger object.
110 # specifies the index mode in the same format as @param mode, but without
111 # the ':', '|' or '#' at the beginning. It doesn't make sense to specify
112 # that the index is encrypted if no password is given in the constructor.
115 # current time for this backup. Used for file names and file creation checks
118 # extra data to be included in the header of the index file when creating a
122 # valid tarfile modes and their corresponding default file extension
# (most dict entries elided in this excerpt)
123 __file_extensions_dict = {
132 '#gz.pdtcrypt': '.gz',
137 # valid index modes and their corresponding default file extension
138 __index_extensions_dict = {
142 'gz.pdtcrypt': '.gz',
146 # valid path prefixes
# used by prefixed()/unprefixed(); presumably includes u'snapshot://',
# u'list://', u'delete://' — entries elided in this excerpt.
147 __path_prefix_list = [
153 def __init__(self, excluded_files=[], included_files=[],
154 filter_func=None, mode="", password=None,
155 crypto_key=None, nacl=None,
156 crypto_version=DELTATAR_HEADER_VERSION,
157 crypto_paramversion=DELTATAR_PARAMETER_VERSION,
158 logger=None, index_mode=None, index_name_func=None,
159 volume_name_func=None):
161 Constructor. Configures the diff engine.
164 - excluded_files: list of files to exclude in the backup creation or
165 restore operation. It can contain python regular expressions.
167 - included_files: list of files to include in the backup creation or
168 restore operation. It can contain python regular expressions. If
169 empty, all files in the source path will be backed up (when creating a
170 backup) or all the files in the backup will be restored (when
171 restoring a backup), but if included_files is set then only the files
172 included in the list will be processed.
174 - filter_func: custom filter of files to be backed up (or restored).
175 Unused and unset by default. The function receives a file path and
176 must return a boolean.
178 - mode: mode in which the delta will be created (when creating a backup)
179 or opened (when restoring). Accepts the same modes as the tarfile
180 library. Valid modes are:
183 ':' open uncompressed
184 ':gz' open with gzip compression
185 ':bz2' open with bzip2 compression
186 '|' open an uncompressed stream of tar blocks
187 '|gz' open a gzip compressed stream of tar blocks
188 '|bz2' open a bzip2 compressed stream of tar blocks
189 '#gz' open a stream of gzip compressed tar blocks
191 - crypto_key: used to encrypt and decrypt backups. Encryption will
192 be enabled automatically if a key is supplied. Requires a salt to be
195 - nacl: salt that was used to derive the encryption key for embedding
196 in the PDTCRYPT header. Not needed when decrypting and when
197 encrypting with password.
199 - password: used to encrypt and decrypt backups. Encryption will be
200 enabled automatically if a password is supplied.
202 - crypto_version: version of the format, determining the kind of PDT
205 - crypto_paramversion: optionally request encryption conforming to
206 a specific parameter version. Defaults to the standard PDT value
207 which as of 2017 is the only one available.
209 - logger: python logger object. Optional.
211 - index_mode: specifies the index mode in the same format as @param
212 mode, but without the ':', '|' or '#' at the beginning. If encryption
213 is requested it will extend to the auxiliary (index, info) files as
214 well. This is an optional parameter that will automatically mimic
215 @param mode by default if not provided. Valid modes are:
218 'gz' open with gzip compression
219 'bz2' open with bzip2 compression
221 - index_name_func: function that sets a custom name for the index file.
222 This function receives a flag to indicate whether the name will be
223 used for a full or diff backup. The backup path will be prepended to
226 - volume_name_func: function that defines the name of tar volumes. It
227 receives the backup_path, if it's a full backup and the volume number,
228 and must return the name for the corresponding volume name. Optional,
229 DeltaTar has default names for tar volumes.
# fail fast on an unsupported tarfile mode
232 if mode not in self.__file_extensions_dict:
233 raise Exception('Unrecognized extension mode=[%s] requested for files'
236 self.excluded_files = excluded_files
237 self.included_files = included_files
238 self.filter_func = filter_func
239 self.logger = logging.getLogger('deltatar.DeltaTar')
# NOTE(review): the guard on the preceding (elided) line determines when
# *logger* is attached; as written it must be a logging.Handler — confirm
# against the full file.
241 self.logger.addHandler(logger)
# crypto material: a raw key (plus nacl/salt, used for encryption only)
# and/or a password; either one enables encryption.
244 if crypto_key is not None:
245 self.crypto_key = crypto_key
246 self.nacl = nacl # encryption only
248 if password is not None:
249 self.password = password
251 if crypto_version is not None:
252 self.crypto_version = crypto_version
254 if crypto_paramversion is not None:
255 self.crypto_paramversion = crypto_paramversion
257 # generate index_mode
# (derivation of index_mode from mode elided in this excerpt)
258 if index_mode is None:
264 elif mode not in self.__index_extensions_dict:
265 raise Exception('Unrecognized extension mode=[%s] requested for index'
268 self.index_mode = index_mode
269 self.current_time = datetime.datetime.now()
# user-supplied callbacks shadow the default methods of the same name
271 if index_name_func is not None:
272 self.index_name_func = index_name_func
274 if volume_name_func is not None:
275 self.volume_name_func = volume_name_func
277 def pick_extension(self, kind, mode=None):
279 Choose the extension depending on a) the kind of file given, b) the
280 processing mode, and c) the current encryption settings.
# kind is PDT_TYPE_ARCHIVE or the index type (constant defined outside
# this excerpt); when mode is not given it is looked up from the
# configured mode dictionaries.
283 if kind == PDT_TYPE_ARCHIVE:
286 mode = self.__index_extensions_dict [self.index_mode]
# append the canonical ".pdtcrypt" suffix whenever encryption is active
288 if self.crypto_key is not None or self.password is not None:
289 ret += "." + PDTCRYPT_EXTENSION
# Default implementation; replaced per-instance when an index_name_func
# callback is passed to __init__ (hence the pylint method-hidden disable).
292 def index_name_func(self, is_full): # pylint: disable=method-hidden
294 Callback for setting a custom name for the index file. Depending on
295 whether *is_full* is set, it will create a suitable name for a full
# Produces e.g. "bfull-2017-01-01-1200.index.gz[.pdtcrypt]"; the timestamp
# comes from self.current_time, set at construction/backup time.
298 prefix = "bfull" if is_full else "bdiff"
299 date_str = self.current_time.strftime("%Y-%m-%d-%H%M")
300 extension = self.pick_extension \
302 self.__index_extensions_dict [self.index_mode])
304 return "%s-%s.index%s" % (prefix, date_str, extension)
# Default implementation; replaced per-instance when a volume_name_func
# callback is passed to __init__ (hence the pylint method-hidden disable).
306 def volume_name_func(self, backup_path, # pylint: disable=method-hidden
307 is_full, volume_number,
310 function that defines the name of tar volumes. It receives the
311 backup_path, if it's a full backup and the volume number, and must return
312 the name for the corresponding volume name. Optional, DeltaTar has default
313 names for tar volumes.
315 If guess_name is activated, the file is intended not to be created but
316 to be found, and thus the date will be guessed.
318 prefix = "bfull" if is_full else "bdiff"
319 extension = self.pick_extension \
321 self.__file_extensions_dict [self.mode])
# normal (non-guessing) path: name built from the current timestamp;
# volumes are numbered starting at 001.
324 date_str = self.current_time.strftime("%Y-%m-%d-%H%M")
325 return "%s-%s-%03d%s" % (prefix, date_str, volume_number + 1, extension)
# guess_name path: search backup_path for an existing volume whose name
# matches prefix/number/extension, since the embedded date is unknown.
327 prefix = prefix + "-"
328 postfix = "-%03d%s" % (volume_number + 1, extension)
329 for f in os.listdir(backup_path):
330 if f.startswith(prefix) and f.endswith(postfix):
332 raise Exception("volume not found")
# Returns one of the module-level match constants used by callers
# (NO_MATCH / MATCH / PARENT_MATCH — defined outside this excerpt).
335 def filter_path(self, path, source_path="", is_dir=None):
337 Filters a path, given the source_path, using the filtering properties
338 set in the constructor.
339 The filtering order is:
340 1. included_files (if any)
342 3. filter_func (which must return whether the file is accepted or not)
# strip the source prefix so include/exclude patterns match relative paths
345 if len(source_path) > 0:
346 # ensure that exactly one '/' at end of dir is also removed
347 source_path = source_path.rstrip(os.sep) + os.sep
348 path = path[len(source_path):]
350 # 1. filter included_files
352 if len(self.included_files) > 0:
354 for i in self.included_files:
355 # it can be either a regexp or a string
356 if isinstance(i, str):
357 # if the string matches, then continue
362 # if the string ends with / it's a directory, and if the
363 # path is contained in it, it is included
364 if i.endswith('/') and path.startswith(i):
368 # if the string doesn't end with /, add it and do the same
370 elif path.startswith(i + '/'):
374 # check for PARENT_MATCH
377 if not dir_path.endswith('/'):
380 if i.startswith(dir_path):
383 # if it's a reg exp, then we just check if it matches
# NOTE(review): re._pattern_type was removed in Python 3.7; modern code
# should use re.Pattern — confirm the supported interpreter versions.
384 elif isinstance(i, re._pattern_type):
389 self.logger.warning('Invalid pattern in included_files: %s' % str(i))
391 if match == NO_MATCH:
394 # when a directory is in PARENT_MATCH, it doesn't matter if it's
395 # excluded. Its subfiles will be excluded, but the directory itself
397 if match != PARENT_MATCH:
398 for e in self.excluded_files:
399 # it can be either a regexp or a string
400 if isinstance(e, str):
401 # if the string matches, then exclude
405 # if the string ends with / it's a directory, and if the
406 # path starts with the directory, then exclude
407 if e.endswith('/') and path.startswith(e):
410 # if the string doesn't end with /, do the same check with
412 elif path.startswith(e + '/'):
415 # if it's a reg exp, then we just check if it matches
416 elif isinstance(e, re._pattern_type):
420 self.logger.warning('Invalid pattern in excluded_files: %s' % str(e))
# 3. last word goes to the user-supplied filter_func, if any
423 return self.filter_func(path)
427 def _recursive_walk_dir(self, source_path, keep_base_dir=False):
429 Walk a directory recursively, yielding each file/directory
# Breadth-first traversal via an explicit FIFO queue; children are visited
# in sorted() order so the yield order is deterministic.
432 source_path = source_path.rstrip(os.sep)
437 beginning_size = len(source_path) + 1 # +1 for os.sep
439 queue = [source_path]
442 cur_path = queue.pop(0)
444 # it might have been removed in the mean time
445 if not os.path.exists(cur_path):
448 for filename in sorted(os.listdir(cur_path)):
449 child = os.path.join(cur_path, filename)
450 is_dir = os.path.isdir(child)
451 status = self.filter_path(child, source_path, is_dir)
452 if status == NO_MATCH:
# unreadable entries are logged and (presumably) skipped — the skip
# statement is elided in this excerpt.
454 if not os.access(child, os.R_OK):
455 self.logger.warning('Error accessing possibly locked file %s' % child)
# yield the path relative to source_path (unless keep_base_dir applies)
459 yield child[beginning_size:]
# descend into directories that matched, or whose children may match
461 if is_dir and (status == MATCH or status == PARENT_MATCH):
464 def _stat_dict(self, path):
466 Returns a dict with the stat data used to compare files
468 stinfo = os.stat(path)
469 mode = stinfo.st_mode
# classify the entry by its stat mode bits; the assignments for the
# 'type' value are elided in this excerpt.
472 if stat.S_ISDIR(mode):
474 elif stat.S_ISREG(mode):
476 elif stat.S_ISLNK(mode):
# timestamps are truncated to whole seconds so index comparisons are
# stable across filesystems.
483 u'mtime': int(stinfo.st_mtime),
484 u'ctime': int(stinfo.st_ctime),
485 u'uid': stinfo.st_uid,
486 u'gid': stinfo.st_gid,
487 u'inode': stinfo.st_ino,
488 u'size': stinfo.st_size
491 def _equal_stat_dicts(self, d1, d2, listsnapshot_equal=False):
493 Return if the dicts are equal in the stat keys
# baseline comparison keys; inode/ctime are deliberately excluded since
# they cannot be restored.
495 keys = [u'type', u'mode',u'size', u'mtime',
496 # not restored: u'inode', u'ctime'
499 # only if user is root, then also check gid/uid. otherwise do not check it,
500 # because tarfile can chown in case of being superuser only
502 # also, skip the check in rpmbuild since the sources end up with the
503 # uid:gid of the packager while the extracted files are 0:0.
504 if hasattr(os, "geteuid") and os.geteuid() == 0 \
505 and os.getenv ("RPMBUILD_OPTIONS") is None:
# exactly one side missing/falsy -> unequal
509 if (not d1 and d2 != None) or (d1 != None and not d2):
# compare paths normalized through prefixed(), so list:// and snapshot://
# entries can compare equal when listsnapshot_equal is set
512 if self.prefixed(d1.get('path', -1), listsnapshot_equal) != self.prefixed(d2.get('path', -2), listsnapshot_equal):
515 type = d1.get('type', '')
518 # size doesn't matter for directories
519 if type == 'directory' and key == 'size':
521 if d1.get(key, -1) != d2.get(key, -2):
525 def prefixed(self, path, listsnapshot_equal=False):
527 if a path is not prefixed, return it prefixed
# already-prefixed paths are returned as-is (return elided in this
# excerpt), except that with listsnapshot_equal a list:// prefix is
# rewritten to snapshot:// so the two compare equal.
529 for prefix in self.__path_prefix_list:
530 if path.startswith(prefix):
531 if listsnapshot_equal and prefix == u'list://':
532 return u'snapshot://' + path[len(prefix):]
# unprefixed paths default to the snapshot:// namespace
534 return u'snapshot://' + path
536 def unprefixed(self, path):
538 remove a path prefix if any
# strip the first known prefix (snapshot://, list://, ...); a path with
# no known prefix is returned unchanged (final return elided here).
540 for prefix in self.__path_prefix_list:
541 if path.startswith(prefix):
542 return path[len(prefix):]
# Build a crypto handler for the given direction (CRYPTO_MODE_ENCRYPT /
# CRYPTO_MODE_DECRYPT) from the configured key/password; when neither is
# set, encryption is disabled (the early-exit body is elided here, and
# callers treat the result as possibly None).
546 def initialize_encryption (self, mode):
547 password = self.password
548 key = self.crypto_key
551 if key is None and password is None:
553 if mode == CRYPTO_MODE_ENCRYPT:
554 return crypto.Encrypt (password=password,
557 version=self.crypto_version,
558 paramversion=self.crypto_paramversion)
559 if mode == CRYPTO_MODE_DECRYPT:
560 return crypto.Decrypt (password=password, key=key)
# any other mode value is a programming error
562 raise Exception ("invalid encryption mode [%r]" % mode)
565 def open_auxiliary_file(self, path, mode='r', kind=AUXILIARY_FILE_INDEX):
567 Given the specified configuration, opens a file for reading or writing,
568 inheriting the encryption and compression settings from the backup.
569 Returns a file object ready to use.
571 :param mode: IO mode (read or write, ``"r"`` and ``"w"``,
574 :param kind: Role of the file, see AUXILIARY_FILE_* constants.
575 Both the info and the auxiliary file have a globally
576 unique, constant counter value.
# pick the compression type from index_mode (assignments elided here)
579 if self.index_mode.startswith('gz'):
581 elif self.index_mode.startswith('bz2'):
# direction of the crypto context follows the IO mode (the mode checks
# on the elided lines select between these two branches)
589 crypto_ctx = self.initialize_encryption (CRYPTO_MODE_ENCRYPT)
591 crypto_ctx = self.initialize_encryption (CRYPTO_MODE_DECRYPT)
# each aux-file kind uses its own fixed AES-GCM IV counter so index and
# info files never share IVs
593 if crypto_ctx is not None:
594 if kind == AUXILIARY_FILE_INFO:
595 enccounter = crypto.AES_GCM_IV_CNT_INFOFILE
596 elif kind == AUXILIARY_FILE_INDEX:
597 enccounter = crypto.AES_GCM_IV_CNT_INDEX
599 raise Exception ("invalid kind of aux file %r" % kind)
# wrap everything in the project's tarfile._Stream, which handles
# compression and encryption transparently
601 sink = tarfile._Stream(name=path, mode=mode, comptype=comptype,
602 bufsize=tarfile.RECORDSIZE, fileobj=None,
603 encryption=crypto_ctx, enccounter=enccounter)
608 def create_full_backup(self, source_path, backup_path,
609 max_volume_size=None, extra_data=dict()):
611 Creates a full backup.
614 - source_path: source path to the directory to back up.
615 - backup_path: path where the back up will be stored. Backup path will
616 be created if not existent.
617 - max_volume_size: maximum volume size in megabytes. Used to split the
618 backup in volumes. Optional (won't split in volumes by default).
619 - extra_data: a json-serializable dictionary with information that you
620 want to be included in the header of the index file
# --- input validation -------------------------------------------------
623 if not isinstance(source_path, str):
624 raise Exception('Source path must be a string')
626 if not isinstance(backup_path, str):
627 raise Exception('Backup path must be a string')
629 if not os.path.exists(source_path) or not os.path.isdir(source_path):
630 raise Exception('Source path "%s" does not exist or is not a '\
631 'directory' % source_path)
633 if max_volume_size != None and (not isinstance(max_volume_size, int) or\
634 max_volume_size < 1):
635 raise Exception('max_volume_size must be a positive integer')
# convert MB to bytes for the tarfile layer
636 if max_volume_size != None:
637 max_volume_size = max_volume_size*1024*1024
639 if not isinstance(extra_data, dict):
640 raise Exception('extra_data must be a dictionary')
# serialize early so a non-serializable dict fails before any IO
643 extra_data_str = json.dumps(extra_data)
645 raise Exception('extra_data is not json-serializable')
647 if not os.access(source_path, os.R_OK):
648 raise Exception('Source path "%s" is not readable' % source_path)
650 # try to create backup path if needed
651 if not os.path.exists(backup_path):
652 os.makedirs(backup_path)
654 if not os.access(backup_path, os.W_OK):
655 raise Exception('Backup path "%s" is not writeable' % backup_path)
657 if source_path.endswith('/'):
658 source_path = source_path[:-1]
660 if backup_path.endswith('/'):
661 backup_path = backup_path[:-1]
663 # update current time
664 self.current_time = datetime.datetime.now()
666 if self.mode not in self.__file_extensions_dict:
667 raise Exception('Unrecognized extension')
669 # setup for encrypting payload
670 if self.encryptor is None:
671 self.encryptor = self.initialize_encryption (CRYPTO_MODE_ENCRYPT)
673 # some initialization
676 # generate the first volume name
677 vol_name = self.volume_name_func(backup_path, True, 0)
678 tarfile_path = os.path.join(backup_path, vol_name)
# open the index sink with the backup's compression/encryption settings
681 index_name = self.index_name_func(True)
682 index_path = os.path.join(backup_path, index_name)
683 index_sink = self.open_auxiliary_file(index_path, 'w')
687 def new_volume_handler(deltarobj, cwd, backup_path, encryption, tarobj, base_name, volume_number):
689 Handles the new volumes
691 volume_name = deltarobj.volume_name_func(backup_path, True, volume_number)
692 volume_path = os.path.join(backup_path, volume_name)
693 deltarobj.vol_no = volume_number
695 # we convert relative paths into absolute because CWD is changed
696 if not os.path.isabs(volume_path):
697 volume_path = os.path.join(cwd, volume_path)
699 if tarobj.fileobj is not None:
700 tarobj.fileobj.close()
702 deltarobj.logger.debug("opening volume %s" % volume_path)
704 tarobj.open_volume(volume_path, encryption=encryption)
706 # wraps some args from context into the handler
707 new_volume_handler = partial(new_volume_handler, self, cwd, backup_path, self.encryptor)
# index header line, then BEGIN-FILE-LIST; the CRC accumulates over every
# line between BEGIN and END and is written as the final checksum record
709 index_sink.write(bytes('{"type": "python-delta-tar-index", "version": 1, "backup-type": "full", "extra_data": %s}\n' % extra_data_str, 'UTF-8'))
711 s = bytes('{"type": "BEGIN-FILE-LIST"}\n', 'UTF-8')
712 # calculate checksum and write into the stream
713 crc = binascii.crc32(s) & 0xFFFFffff
716 # start creating the tarfile
717 tarobj = tarfile.TarFile.open(tarfile_path,
718 mode='w' + self.mode,
719 format=tarfile.GNU_FORMAT,
720 concat='#' in self.mode,
721 encryption=self.encryptor,
722 max_volume_size=max_volume_size,
723 new_volume_handler=new_volume_handler,
724 save_to_members=False,
# walk relative to the source so archived names are relative
726 os.chdir(source_path)
728 # for each file to be in the backup, do:
729 for path in self._recursive_walk_dir('.'):
730 # calculate stat dict for current file
731 statd = self._stat_dict(path)
732 statd['path'] = u'snapshot://' + statd['path']
733 statd['volume'] = self.vol_no
738 tarobj.add(path, arcname = statd['path'], recursive=False)
739 except FileNotFoundError as exn:
740 # file vanished since the call to access(3) above
741 self.logger.warning ("object [%s] no longer available in "
742 "file system (error: %s); skipping"
744 continue # prevent indexing
746 # retrieve file offset
747 statd['offset'] = tarobj.get_last_member_offset()
748 self.logger.debug("backup %s" % statd['path'])
750 # store the stat dict in the index
751 s = bytes(json.dumps(statd) + '\n', 'UTF-8')
752 crc = binascii.crc32(s, crc) & 0xffffffff
# finalize the index: END marker plus the accumulated checksum record
755 s = bytes('{"type": "END-FILE-LIST"}\n', 'UTF-8')
756 crc = binascii.crc32(s, crc) & 0xffffffff
758 s = bytes('{"type": "file-list-checksum", "checksum": %d}\n' % crc, 'UTF-8')
763 index_sink.close (close_fileobj=True)
765 def create_diff_backup(self, source_path, backup_path, previous_index_path,
766 max_volume_size=None, extra_data=dict()):
771 - source_path: source path to the directory to back up.
772 - backup_path: path where the back up will be stored. Backup path will
773 be created if not existent.
774 - previous_index_path: index of the previous backup, needed to know
775 which files changed since then.
776 - max_volume_size: maximum volume size in megabytes (MB). Used to split
777 the backup in volumes. Optional (won't split in volumes by default).
779 NOTE: previous index is assumed to follow exactly the same format as
780 the index_mode setup in the constructor.
782 # check/sanitize input
783 if not isinstance(source_path, str):
784 raise Exception('Source path must be a string')
786 if not isinstance(backup_path, str):
787 raise Exception('Backup path must be a string')
789 if not os.path.exists(source_path) or not os.path.isdir(source_path):
790 raise Exception('Source path "%s" does not exist or is not a '\
791 'directory' % source_path)
793 if not isinstance(extra_data, dict):
794 raise Exception('extra_data must be a dictionary')
# serialize early so a non-serializable dict fails before any IO
797 extra_data_str = json.dumps(extra_data)
799 raise Exception('extra_data is not json-serializable')
801 if not os.access(source_path, os.R_OK):
802 raise Exception('Source path "%s" is not readable' % source_path)
804 if max_volume_size != None and (not isinstance(max_volume_size, int) or\
805 max_volume_size < 1):
806 raise Exception('max_volume_size must be a positive integer')
# convert MB to bytes for the tarfile layer
807 if max_volume_size != None:
808 max_volume_size = max_volume_size*1024*1024
810 if not isinstance(previous_index_path, str):
811 raise Exception('previous_index_path must be A string')
813 if not os.path.exists(previous_index_path) or not os.path.isfile(previous_index_path):
814 raise Exception('Index path "%s" does not exist or is not a '\
815 'file' % previous_index_path)
817 if not os.access(previous_index_path, os.R_OK):
818 raise Exception('Index path "%s" is not readable' % previous_index_path)
820 # try to create backup path if needed
821 if not os.path.exists(backup_path):
822 os.makedirs(backup_path)
824 if not os.access(backup_path, os.W_OK):
825 raise Exception('Backup path "%s" is not writeable' % backup_path)
827 if source_path.endswith('/'):
828 source_path = source_path[:-1]
830 if backup_path.endswith('/'):
831 backup_path = backup_path[:-1]
833 # update current time
834 self.current_time = datetime.datetime.now()
836 if self.mode not in self.__file_extensions_dict:
837 raise Exception('Unrecognized extension')
839 # setup for encrypting payload
840 if self.encryptor is None:
841 self.encryptor = self.initialize_encryption (CRYPTO_MODE_ENCRYPT)
843 # some initialization
846 # generate the first volume name
847 vol_name = self.volume_name_func(backup_path, is_full=False,
849 tarfile_path = os.path.join(backup_path, vol_name)
# open the index sink with the backup's compression/encryption settings
854 index_name = self.index_name_func(is_full=False)
855 index_path = os.path.join(backup_path, index_name)
856 index_sink = self.open_auxiliary_file(index_path, 'w')
858 def new_volume_handler(deltarobj, cwd, backup_path, tarobj, base_name, volume_number):
860 Handles the new volumes
862 volume_name = deltarobj.volume_name_func(backup_path, is_full=False,
863 volume_number=volume_number)
864 volume_path = os.path.join(backup_path, volume_name)
865 deltarobj.vol_no = volume_number
867 # we convert relative paths into absolute because CWD is changed
868 if not os.path.isabs(volume_path):
869 volume_path = os.path.join(cwd, volume_path)
871 deltarobj.logger.debug("opening volume %s" % volume_path)
872 tarobj.open_volume(volume_path)
874 # wraps some args from context into the handler
875 new_volume_handler = partial(new_volume_handler, self, cwd, backup_path)
# index header line, then BEGIN-FILE-LIST; the CRC accumulates over every
# line between BEGIN and END and is written as the final checksum record
877 index_sink.write(bytes('{"type": "python-delta-tar-index", "version": 1, "backup-type": "diff", "extra_data": %s}\n' % extra_data_str, 'UTF-8'))
879 s = bytes('{"type": "BEGIN-FILE-LIST"}\n', 'UTF-8')
880 # calculate checksum and write into the stream
881 crc = binascii.crc32(s) & 0xFFFFffff
884 # start creating the tarfile
885 tarobj = tarfile.TarFile.open(tarfile_path,
886 mode='w' + self.mode,
887 format=tarfile.GNU_FORMAT,
888 concat='#' in self.mode,
889 encryption=self.encryptor,
890 max_volume_size=max_volume_size,
891 new_volume_handler=new_volume_handler,
892 save_to_members=False,
896 # create the iterators, first the previous index iterator, then the
897 # source path directory iterator and collate and iterate them
898 if not os.path.isabs(previous_index_path):
899 previous_index_path = os.path.join(cwd, previous_index_path)
900 index_it = self.iterate_index_path(previous_index_path)
902 os.chdir(source_path)
903 dir_it = self._recursive_walk_dir('.')
904 dir_path_it = self.jsonize_path_iterator(dir_it)
912 # for each file to be in the backup, do:
# The collation yields (index entry, disk entry, line no); which side is
# None determines the action (assignments elided in this excerpt).
913 for ipath, dpath, l_no in self.collate_iterators(index_it, dir_path_it):
915 # if file is not in the index, it means it's a new file, so we have
920 # if the file is not in the directory iterator, it means that it has
921 # been deleted, so we need to mark it as such
924 # if the file is in both iterators, it means it might have either
925 # not changed (in which case we will just list it in our index but
926 # it will not be included in the tar file), or it might have
927 # changed, in which case we will snapshot it.
928 elif ipath and dpath:
929 if self._equal_stat_dicts(ipath, dpath):
933 # TODO: when creating chained backups (i.e. diffing from another
934 # diff), we will need to detect the type of action in the previous
935 # index, because if it was delete and dpath is None, we should
938 if action == 'snapshot':
939 # calculate stat dict for current file
941 stat['path'] = "snapshot://" + dpath['path']
942 stat['volume'] = self.vol_no
944 self.logger.debug("[STORE] %s" % dpath['path'])
947 tarobj.add(dpath['path'], arcname=stat['path'], recursive=False)
948 # retrieve file offset
949 stat['offset'] = tarobj.get_last_member_offset()
950 except FileNotFoundError as exn:
951 # file vanished since the call to access(3) above
952 self.logger.warning ("object [%s] no longer available in "
953 "file system (error: %s); skipping"
954 % (dpath ["path"], str (exn)))
955 stat = None # prevent indexing
957 elif action == 'delete':
958 path = self.unprefixed(ipath['path'])
960 u'path': u'delete://' + path,
961 u'type': ipath['type']
963 self.logger.debug("[DELETE] %s" % path)
965 # mark it as deleted in the backup
# a /dev/null member under delete:// records the removal in the tar
966 tarobj.add("/dev/null", arcname=stat['path'])
967 elif action == 'list':
969 path = self.unprefixed(ipath['path'])
970 stat['path'] = u'list://' + path
971 # unchanged files do not enter in the backup, only in the index
972 self.logger.debug("[UNCHANGED] %s" % path)
975 self.logger.warning('unknown action in create_diff_backup: {0}'
980 # store the stat dict in the index
981 s = bytes(json.dumps(stat) + '\n', 'UTF-8')
982 crc = binascii.crc32(s, crc) & 0xffffffff
# finalize the index: END marker plus the accumulated checksum record
985 s = bytes('{"type": "END-FILE-LIST"}\n', 'UTF-8')
986 crc = binascii.crc32(s, crc) & 0xffffffff
988 s = bytes('{"type": "file-list-checksum", "checksum": %d}\n' % crc, 'UTF-8')
997 def iterate_index_path(self, index_path):
999 Returns an index iterator. Internally, it uses a classic iterator class.
1000 We do that instead of just yielding so that the iterator object can have
1001 an additional function to close the file descriptor that is opened in
1005 class IndexPathIterator(object):
1006 def __init__(self, delta_tar, index_path):
1007 self.delta_tar = delta_tar
1008 self.index_path = index_path
1010 self.extra_data = dict()
1020 def __enter__(self):
1022 Allows this iterator to be used with the "with" statement
# open the index with the backup's compression/encryption settings
1025 self.f = self.delta_tar.open_auxiliary_file(self.index_path, 'r')
1026 # check index header
1027 j, l_no = self.delta_tar._parse_json_line(self.f, 0)
1028 if j.get("type", '') != 'python-delta-tar-index' or\
1029 j.get('version', -1) != 1:
1030 raise Exception("invalid index file format: %s" % json.dumps(j))
# keep any caller-supplied header payload available to consumers
1032 self.extra_data = j.get('extra_data', dict())
1034 # find BEGIN-FILE-LIST, ignore other headers
1036 j, l_no = self.delta_tar._parse_json_line(self.f, l_no)
1037 if j.get('type', '') == 'BEGIN-FILE-LIST':
1041 def __exit__(self, type, value, tb):
1043 Allows this iterator to be used with the "with" statement
1050 # read each file in the index and process it to do the restore
1054 j, l_no = self.delta_tar._parse_json_line(self.f, l_no)
1055 except Exception as e:
1060 op_type = j.get('type', '')
1062 # when we detect the end of the list, break the loop
1063 if op_type == 'END-FILE-LIST':
# unknown entry types are logged and skipped by recursing to the
# next record
1069 if op_type not in ['directory', 'file', 'link']:
1070 self.delta_tar.logger.warning('unrecognized type to be '
1071 'restored: %s, line %d' % (op_type, l_no))
1073 return self.__next__()
1077 return IndexPathIterator(self, index_path)
1079 def iterate_tar_path(self, tar_path, new_volume_handler=None):
1081 Returns a tar iterator that iterates jsonized member items that contain
1082 an additional "member" field, used by RestoreHelper.
1084 class TarPathIterator(object):
1085 def __init__(self, delta_tar, tar_path, new_volume_handler=None):
1086 self.delta_tar = delta_tar
1087 self.tar_path = tar_path
# last_member enables de-duplication of multivolume members (see
# __next__ below)
1089 self.last_member = None
1090 self.new_volume_handler = new_volume_handler
1098 self.tar_obj.close()
1100 def __enter__(self):
1102 Allows this iterator to be used with the "with" statement
1104 if self.tar_obj is None:
# a decryptor is only created when the backup is password/key
# protected (the else branch is elided in this excerpt)
1106 if self.delta_tar.password is not None:
1107 decryptor = crypto.Decrypt \
1108 (password=self.delta_tar.password,
1109 key=self.delta_tar.crypto_key)
1110 self.tar_obj = tarfile.TarFile.open(self.tar_path,
1111 mode='r' + self.delta_tar.mode,
1112 format=tarfile.GNU_FORMAT,
1113 concat='#' in self.delta_tar.mode,
1114 encryption=decryptor,
1115 new_volume_handler=self.new_volume_handler,
1116 save_to_members=False,
1120 def __exit__(self, type, value, tb):
1122 Allows this iterator to be used with the "with" statement
1125 self.tar_obj.close()
1130 Read each member and return it as a stat dict
1132 tarinfo = self.tar_obj.__iter__().__next__()
1133 # NOTE: here we compare if tarinfo.path is the same as before
1134 # instead of comparing the tarinfo object itself because the
1135 # object itself might change for multivol tarinfos
1136 if tarinfo is None or (self.last_member is not None and\
1137 self.delta_tar.unprefixed(tarinfo.path) == self.delta_tar.unprefixed(self.last_member.path)):
1140 self.last_member = tarinfo
# map the member to the same 'type' vocabulary as _stat_dict
# (assignments elided in this excerpt)
1143 if tarinfo.isfile():
1145 elif tarinfo.isdir():
1147 elif tarinfo.islnk() or tarinfo.issym():
# ctime/inode cannot be recovered from a tar member, hence -1
1152 u'path': tarinfo.path,
1153 u'mode': tarinfo.mode,
1154 u'mtime': tarinfo.mtime,
1155 u'ctime': -1, # cannot restore
1156 u'uid': tarinfo.uid,
1157 u'gid': tarinfo.gid,
1158 u'inode': -1, # cannot restore
1159 u'size': tarinfo.size,
1163 return TarPathIterator(self, tar_path, new_volume_handler)
1165 def jsonize_path_iterator(self, iter, strip=0):
1167 converts the yielded items of an iterator into json path lines.
1169 strip: Strip the smallest prefix containing num leading slashes from
# Generator: for each path yielded by *iter*, emit (stat dict, 0).
# With strip > 0 the stored path drops that many leading components.
1174 path = iter.__next__()
1176 yield self._stat_dict(path), 0
1178 st = self._stat_dict(path)
1179 st['path'] = "/".join(path.split("/")[strip:])
# iteration ends cleanly when the wrapped iterator is exhausted
1181 except StopIteration:
1184 def iterate_disaster_index (self, index):
1186 Mimick the behavior of the other object iterators, just with the inputs
1187 supplied directly as *index*.
1190 class RawIndexIterator(object):
1191 def __init__(self, delta_tar, index):
1192 self.delta_tar = delta_tar
1202 def __enter__(self):
1204 Allows this iterator to be used with the "with" statement
1206 self.iter = self.index.__iter__ ()
1209 def __exit__(self, type, value, tb):
1211 Allows this iterator to be used with the "with" statement
1215 idxent = self.iter.__next__ ()
1218 return RawIndexIterator(self, index)
1220 def collate_iterators(self, it1, it2):
1222 Collate two iterators, so that it returns pairs of the items of each
1223 iterator (if the items are the same), or (None, elem2) or (elem1, None)
1224 when there's no match for the items in the other iterator.
1226 It assumes that the items in both lists are ordered in the same way.
1229 elem1, elem2 = None, None
1233 elem1, l_no = it1.__next__()
1234 except StopIteration:
1236 yield (None, elem2, l_no)
1238 if isinstance(elem2, tuple):
1240 yield (None, elem2, l_no)
1244 elem2 = it2.__next__()
1245 if isinstance(elem2, tuple):
1247 except StopIteration:
1249 yield (elem1, None, l_no)
1250 for elem1, l_no in it1:
1251 yield (elem1, None, l_no)
1254 index1 = self.unprefixed(elem1['path'])
1255 index2 = self.unprefixed(elem2['path'])
1256 i1, i2 = self.compare_indexes(index1, index2)
1258 yield1 = yield2 = None
1265 yield (yield1, yield2, l_no)
1267 def compare_indexes(self, index1, index2):
1269 Compare iterator indexes and return a tuple in the following form:
1270 if index1 < index2, returns (index1, None)
1271 if index1 == index2 returns (index1, index2)
1272 else: returns (None, index2)
1274 l1 = index1.split('/')
1275 l2 = index2.split('/')
1276 length = len(l2) - len(l1)
1279 return (index1, None)
1281 return (None, index2)
1283 for i1, i2 in zip(l1, l2):
1285 return (index1, None)
1287 return (None, index2)
1289 return (index1, index2)
1291 def list_backup(self, backup_tar_path, list_func=None):
1292 if not isinstance(backup_tar_path, str):
1293 raise Exception('Backup tar path must be a string')
1295 if not os.path.exists(backup_tar_path) or not os.path.isfile(backup_tar_path):
1296 raise Exception('Source path "%s" does not exist or is not a '\
1297 'file' % backup_tar_path)
1299 if not os.access(backup_tar_path, os.R_OK):
1300 raise Exception('Source path "%s" is not readable' % backup_tar_path)
1304 def new_volume_handler(deltarobj, cwd, backup_path, encryption, tarobj, base_name, volume_number):
1306 Handles the new volumes
1308 volume_name = deltarobj.volume_name_func(backup_path, True,
1309 volume_number, guess_name=True)
1310 volume_path = os.path.join(backup_path, volume_name)
1312 # we convert relative paths into absolute because CWD is changed
1313 if not os.path.isabs(volume_path):
1314 volume_path = os.path.join(cwd, volume_path)
1315 tarobj.open_volume(volume_path, encryption=encryption)
1317 if self.decryptor is None:
1318 self.decryptor = self.initialize_encryption (CRYPTO_MODE_DECRYPT)
1320 backup_path = os.path.dirname(backup_tar_path)
1321 if not os.path.isabs(backup_path):
1322 backup_path = os.path.join(cwd, backup_path)
1323 new_volume_handler = partial(new_volume_handler, self, cwd, backup_path, self.decryptor)
1325 tarobj = tarfile.TarFile.open(backup_tar_path,
1326 mode='r' + self.mode,
1327 format=tarfile.GNU_FORMAT,
1328 concat='#' in self.mode,
1329 encryption=self.decryptor,
1330 new_volume_handler=new_volume_handler,
1331 save_to_members=False,
1334 def filter(cls, list_func, tarinfo):
1335 if list_func is None:
1336 self.logger.info(tarinfo.path)
1340 filter = partial(filter, self, list_func)
1342 tarobj.extractall(filter=filter)
1345 def restore_backup(self, target_path, backup_indexes_paths=[],
1346 backup_tar_path=None, restore_callback=None,
1347 disaster=tarfile.TOLERANCE_STRICT, backup_index=None):
1352 - target_path: path to restore.
1353 - backup_indexes_paths: path to backup indexes, in descending date order.
1354 The indexes indicate the location of their respective backup volumes,
1355 and multiple indexes are needed to be able to restore diff backups.
1356 Note that this is an optional parameter: if not suplied, it will
1357 try to restore directly from backup_tar_path.
1358 - backup_tar_path: path to the backup tar file. Used as an alternative
1359 to backup_indexes_paths to restore directly from a tar file without
1360 using any file index. If it's a multivol tarfile, volume_name_func
1362 - restore_callback: callback function to be called during restore.
1363 This is passed to the helper and gets called for every file.
1365 NOTE: If you want to use an index to restore a backup, this function
1366 only supports to do so when the tarfile mode is either uncompressed or
1367 uses concat compress mode, because otherwise it would be very slow.
1369 NOTE: Indices are assumed to follow the same format as the index_mode
1370 specified in the constructor.
1372 Returns the list of files that could not be restored, if there were
1375 # check/sanitize input
1376 if not isinstance(target_path, str):
1377 raise Exception('Target path must be a string')
1379 if backup_indexes_paths is None and backup_tar_path == []:
1380 raise Exception("You have to either provide index paths or a tar path")
1382 if isinstance (backup_index, list) is True:
1384 elif len(backup_indexes_paths) == 0:
1390 if not isinstance(backup_tar_path, str):
1391 raise Exception('Backup tar path must be a string')
1393 if not os.path.exists(backup_tar_path) or not os.path.isfile(backup_tar_path):
1394 raise Exception('Source path "%s" does not exist or is not a '\
1395 'file' % backup_tar_path)
1397 if not os.access(backup_tar_path, os.R_OK):
1398 raise Exception('Source path "%s" is not readable' % backup_tar_path)
1400 if not isinstance(backup_indexes_paths, list):
1401 raise Exception('backup_indexes_paths must be a list')
1403 if self.mode.startswith(':') or self.mode.startswith('|'):
1404 raise Exception('Restore only supports either uncompressed tars'
1405 ' or concat compression when restoring from an index, and '
1406 ' the open mode you provided is "%s"' % self.mode)
1408 for index in backup_indexes_paths:
1409 if not isinstance(index, str):
1410 raise Exception('indices must be strings')
1412 if not os.path.exists(index) or not os.path.isfile(index):
1413 raise Exception('Index path "%s" does not exist or is not a '\
1416 if not os.access(index, os.R_OK):
1417 raise Exception('Index path "%s" is not readable' % index)
1419 # try to create backup path if needed
1420 if not os.path.exists(target_path):
1421 os.makedirs(target_path)
1423 # make backup_tar_path absolute so that iterate_tar_path works fine
1424 if backup_tar_path and not os.path.isabs(backup_tar_path):
1425 backup_tar_path = os.path.abspath(backup_tar_path)
1428 os.chdir(target_path)
1430 # setup for decrypting payload
1431 if self.decryptor is None:
1432 self.decryptor = self.initialize_encryption (CRYPTO_MODE_DECRYPT)
1435 index_it = self.iterate_tar_path(backup_tar_path)
1436 helper = RestoreHelper(self, cwd, backup_path=backup_tar_path,
1437 tarobj=index_it.tar_obj)
1438 elif mode == "diff":
1439 helper = RestoreHelper(self, cwd, backup_indexes_paths,
1442 # get iterator from newest index at _data[0]
1443 index1 = helper._data[0]["path"]
1444 index_it = self.iterate_index_path(index1)
1445 except tarfile.DecryptionError as exn:
1446 self.logger.error("failed to decrypt file [%s]: %s; is this an "
1447 "actual encrypted index file?"
1448 % (index1, str (exn)))
1449 return [(index1, exn)]
1450 except Exception as exn:
1452 self.logger.error("failed to read file [%s]: %s; is this an "
1453 "actual index file?" % (index1, str (exn)))
1454 return [(index1, exn)]
1455 elif mode == "disaster":
1456 index_it = self.iterate_disaster_index (backup_index)
1457 helper = RestoreHelper (self, cwd, backup_path=backup_tar_path,
1458 backup_index=backup_index,
1462 dir_it = self._recursive_walk_dir('.')
1463 dir_path_it = self.jsonize_path_iterator(dir_it)
1465 failed = [] # irrecoverable files
1467 # for each file to be restored, do:
1468 for ipath, dpath, l_no in self.collate_iterators(index_it, dir_path_it):
1470 upath = dpath['path']
1471 op_type = dpath['type']
1473 upath = self.unprefixed(ipath['path'])
1474 op_type = ipath['type']
1477 if self.filter_path(upath, '', op_type == 'directory') == NO_MATCH:
1480 # if types of the file mismatch, the file needs to be deleted
1482 if ipath is not None and dpath is not None and\
1483 dpath['type'] != ipath['type']:
1484 helper.delete(upath)
1486 # if file not found in dpath, we can directly restore from index
1488 # if the file doesn't exist and it needs to be deleted, it
1489 # means that work is already done
1490 if ipath['path'].startswith('delete://'):
1493 self.logger.debug("restore %s" % ipath['path'])
1494 helper.restore(ipath, l_no, restore_callback)
1495 except Exception as e:
1496 iipath = ipath.get ("path", "")
1497 self.logger.error("FAILED to restore: {} ({})"
1499 if disaster != tarfile.TOLERANCE_STRICT:
1500 failed.append ((iipath, e))
1503 # if both files are equal, we have nothing to restore
1504 if self._equal_stat_dicts(ipath, dpath, listsnapshot_equal=True):
1507 # we have to restore the file, but first we need to delete the
1508 # current existing file.
1509 # we don't delete the file if it's a directory, because it might
1510 # just have changed mtime, so it's quite inefficient to remove
1513 if ipath['type'] != 'directory' or ipath['path'].startswith('delete://'):
1514 helper.delete(upath)
1515 self.logger.debug("restore %s" % ipath['path'])
1517 helper.restore(ipath, l_no, restore_callback)
1518 except Exception as e:
1519 if disaster == tarfile.TOLERANCE_STRICT:
1521 failed.append ((ipath.get ("path", ""), e))
1524 # if the file is not in the index (so it comes from the target
1525 # directory) then we have to delete it
1527 self.logger.debug("delete %s" % upath)
1528 helper.delete(upath)
1530 helper.restore_directories_permissions()
1538 def recover_backup(self, target_path, backup_indexes_paths=[],
1539 restore_callback=None):
1541 Walk the index, extracting objects in disaster mode. Bad files are
1542 reported along with a reason.
1544 return self.restore_backup(target_path,
1545 backup_indexes_paths=backup_indexes_paths,
1546 disaster=tarfile.TOLERANCE_RECOVER)
1549 def rescue_backup(self, target_path, backup_tar_path,
1550 restore_callback=None):
1552 More aggressive “unfsck” mode: do not rely on the index data as the
1553 files may be corrupt; skim files for header-like information and
1554 attempt to retrieve the data.
1556 def gen_volume_name (nvol):
1557 return os.path.join (os.path.dirname (backup_tar_path),
1558 self.volume_name_func (backup_tar_path,
1562 backup_index = tarfile.gen_rescue_index (gen_volume_name,
1564 password=self.password,
1565 key=self.crypto_key)
1567 return self.restore_backup(target_path,
1568 backup_index=backup_index,
1569 backup_tar_path=backup_tar_path,
1570 disaster=tarfile.TOLERANCE_RESCUE)
1573 def _parse_json_line(self, f, l_no):
1575 Read line from file like object and process it as JSON.
1580 j = json.loads(l.decode('UTF-8'))
1581 except UnicodeDecodeError as e:
1582 if tuple (l [0:2]) == tarfile.GZ_MAGIC:
1584 ("error parsing line #%d as json: looks like a compressed file (%d B: [%s..])"
1585 % (l_no, len (l), binascii.hexlify (l [:16]).decode ())) \
1588 ("error parsing line #%d as json: not a text file (%d B: [%s..])"
1589 % (l_no, len (l), binascii.hexlify (l [:16]).decode ())) \
1591 except ValueError as e:
1592 raise Exception("error parsing this json line "
1593 "(line number %d): %s" % (l_no, l))
1597 class RestoreHelper(object):
1599 Class used to help to restore files from indices
1602 # holds the dicts of data
1609 # list of directories to be restored. This is done as a last step, see
1610 # tarfile.extractall for details.
1613 _disaster = tarfile.TOLERANCE_STRICT
1615 def __init__(self, deltatar, cwd, index_list=None, backup_path=False,
1616 backup_index=None, tarobj=None,
1617 disaster=tarfile.TOLERANCE_STRICT):
1619 Constructor opens the tars and init the data structures.
1623 - Index list must be provided in reverse order (newer first).
1624 - “newer first” apparently means that if there are n backups
1625 provided, the last full backup is at index n-1 and the most recent
1626 diff backup is at index 0.
1627 - Only the first, the second, and the last elements of
1628 ``index_list`` are relevant, others will not be accessed.
1629 - If no ``index_list`` is provided, both ``tarobj`` and
1630 ``backup_path`` must be passed.
1631 - If ``index_list`` is provided, the values of ``tarobj`` and
1632 ``backup_path`` are ignored.
1635 self._directories = []
1636 self._deltatar = deltatar
1638 self._password = deltatar.password
1639 self._crypto_key = deltatar.crypto_key
1640 self._decryptors = []
1641 self._disaster = disaster
1648 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
1649 self.canchown = True
1651 self.canchown = False
1653 if isinstance (backup_index, list) is True:
1654 decryptor = self._deltatar.decryptor
1656 [{ "curr_vol_no" : None
1660 , "path" : backup_path
1663 , "last_itelement" : None
1665 , "new_volume_handler" :
1666 partial(self.new_volume_handler,
1667 self._deltatar, self._cwd, True,
1668 os.path.dirname(backup_path), decryptor)
1669 , "decryptor" : decryptor
1671 elif index_list is not None:
1672 for index in index_list:
1673 is_full = index == index_list[-1]
1676 if self._password is not None:
1677 decryptor = crypto.Decrypt (password=self._password,
1678 key=self._crypto_key)
1680 # make paths absolute to avoid cwd problems
1681 if not os.path.isabs(index):
1682 index = os.path.normpath(os.path.join(cwd, index))
1692 last_itelement = None,
1694 new_volume_handler = partial(self.new_volume_handler,
1695 self._deltatar, self._cwd, is_full,
1696 os.path.dirname(index), decryptor),
1697 decryptor = decryptor
1699 self._data.append(s)
1701 # make paths absolute to avoid cwd problems
1702 if not os.path.isabs(backup_path):
1703 backup_path = os.path.normpath(os.path.join(cwd, backup_path))
1705 # update the new_volume_handler of tar_obj
1706 tarobj.new_volume_handler = partial(self.new_volume_handler,
1707 self._deltatar, self._cwd, True, os.path.dirname(backup_path),
1708 self._deltatar.decryptor)
1717 last_itelement = None,
1719 new_volume_handler = tarobj.new_volume_handler,
1720 decryptor = self._deltatar.decryptor
1722 self._data.append(s)
1727 Closes all open files
1729 for data in self._data:
1731 data['vol_fd'].close()
1732 data['vol_fd'] = None
1734 data['tarobj'].close()
1735 data['tarobj'] = None
1737 def delete(self, path):
1741 if not os.path.exists(path):
1744 # to preserve parent directory mtime, we save it
1745 parent_dir = os.path.dirname(path) or os.getcwd()
1746 parent_dir_mtime = int(os.stat(parent_dir).st_mtime)
1748 if os.path.isdir(path) and not os.path.islink(path):
1753 # now we restore parent_directory mtime
1754 os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime))
1756 def restore(self, itpath, l_no, callback=None):
1758 Restore the path from the appropriate backup. Receives the current path
1759 from the newest (=first) index iterator. itpath must be not null.
1760 callback is a custom function that gets called for every file.
1762 NB: This function takes the attribute ``_data`` as input but will only
1763 ever use its first and, if available, second element. Anything else in
1764 ``._data[]`` will be ignored.
1766 path = itpath['path']
1768 # Calls the callback function
1772 if path.startswith('delete://'):
1773 # the file has previously been deleted already in restore_backup in
1774 # all cases so we just need to finish
1777 # get data from newest index (_data[0])
1778 data = self._data[0]
1779 upath = self._deltatar.unprefixed(path)
1781 # to preserve parent directory mtime, we save it
1782 parent_dir = os.path.dirname(upath) or os.getcwd()
1783 if not os.path.exists(parent_dir):
1784 os.makedirs(parent_dir)
1785 parent_dir_mtime = int(os.stat(parent_dir).st_mtime)
1787 # if path is found in the newest index as to be snapshotted, deal with it
1789 if path.startswith('snapshot://'):
1790 self.restore_file(itpath, data, path, l_no, upath)
1792 # now we restore parent_directory mtime
1793 os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime))
1796 # we go from index to index, finding the path in the index, then finding
1797 # the index with the most recent snapshot of the file being restored
1799 # Right now we support diff backups, only. No incremental backups.
1800 # As a result _data[0] is always the diff backup index
1801 # and _data[1] the full backup index.
1802 if len(self._data) == 2:
1803 data = self._data[1]
1804 d, l_no, dpath = self.find_path_in_index(data, upath)
1806 self._deltatar.logger.warning('Error restoring file %s from '
1807 'index, not found in index %s' % (path, data['path']))
1810 cur_path = d.get('path', '')
1811 if cur_path.startswith('delete://'):
1812 self._deltatar.logger.warning(('Strange thing happened, file '
1813 '%s was listed in first index but deleted by another '
1814 'one. Path was ignored and untouched.') % path)
1816 elif cur_path.startswith('snapshot://'):
1817 # this code path is reached when the file is unchanged
1818 # in the newest index and therefore of type 'list://'
1819 self.restore_file(d, data, path, l_no, dpath)
1821 # now we restore parent_directory mtime
1822 os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime))
1825 # error code path is reached when:
1826 # a) we have more than two indexes (unsupported atm)
1827 # b) both indexes contain a list:// entry (logic error)
1828 # c) we have just one index and it also contains list://
1829 self._deltatar.logger.warning(('Error restoring file %s from index, '
1830 'snapshot not found in any index') % path)
1832 def find_path_in_index(self, data, upath):
1833 # NOTE: we restart the iterator sometimes because the iterator can be
1834 # walked over completely multiple times, for example if one path if not
1835 # found in one index and we have to go to the next index.
1836 it = data['iterator']
1838 it = data['iterator'] = self._deltatar.iterate_index_path(data["path"])
1839 d, l_no = it.__next__()
1841 d = data['last_itelement']
1842 l_no = data['last_lno']
1845 dpath = self._deltatar.unprefixed(d.get('path', ''))
1847 data['last_itelement'] = d
1848 data['last_lno'] = l_no
1849 return d, l_no, dpath
1851 up, dp = self._deltatar.compare_indexes(upath, dpath)
1852 # any time upath should have appeared before current dpath, it means
1853 # upath is just not in this index and we should stop
1855 data['last_itelement'] = d
1856 data['last_lno'] = l_no
1860 d, l_no = it.__next__()
1861 except StopIteration:
1862 data['last_itelement'] = d
1863 data['last_lno'] = l_no
1866 def restore_directories_permissions(self):
1868 Restore directory permissions when everything have been restored
1875 self._directories.sort(key=operator.attrgetter('name'))
1876 self._directories.reverse()
1878 # Set correct owner, mtime and filemode on directories.
1879 for member in self._directories:
1880 dirpath = member.name
1882 os.chmod(dirpath, member.mode)
1883 os.utime(dirpath, (member.mtime, member.mtime))
1885 # We have to be root to do so.
1887 g = grp.getgrnam(member.gname)[2]
1891 u = pwd.getpwnam(member.uname)[2]
1895 if member.issym and hasattr(os, "lchown"):
1896 os.lchown(dirpath, u, g)
1898 os.chown(dirpath, u, g)
1899 except EnvironmentError:
1900 raise tarfile.ExtractError("could not change owner")
1902 except tarfile.ExtractError as e:
1903 self._deltatar.logger.warning('tarfile: %s' % e)
1906 def new_volume_handler(deltarobj, cwd, is_full, backup_path, encryption, tarobj, base_name, volume_number):
1908 Handles the new volumes
1910 volume_name = deltarobj.volume_name_func(backup_path, is_full,
1911 volume_number, guess_name=True)
1912 volume_path = os.path.join(backup_path, volume_name)
1914 # we convert relative paths into absolute because CWD is changed
1915 if not os.path.isabs(volume_path):
1916 volume_path = os.path.join(cwd, volume_path)
1917 tarobj.open_volume(volume_path, encryption=encryption)
1919 def restore_file(self, file_data, index_data, path, l_no, unprefixed_path):
1921 Restores a snapshot of a file from a specific backup
1923 op_type = file_data.get('type', -1)
1924 member = file_data.get('member', None)
1925 ismember = bool(member)
1927 # when member is set, then we can assume everything is right and we
1928 # just have to restore the path
1930 vol_no = file_data.get('volume', -1)
1932 if not isinstance(vol_no, int) or vol_no < 0:
1933 self._deltatar.logger.warning('unrecognized type to be restored: '
1934 '%s, line %d' % (op_type, l_no))
1936 # setup the volume that needs to be read. only needed when member is
1938 if index_data['curr_vol_no'] != vol_no:
1939 index_data['curr_vol_no'] = vol_no
1940 backup_path = os.path.dirname(index_data['path'])
1941 vol_name = self._deltatar.volume_name_func(backup_path,
1942 index_data['is_full'], vol_no, guess_name=True)
1943 vol_path = os.path.join(backup_path, vol_name)
1944 if index_data['vol_fd']:
1945 index_data['vol_fd'].close()
1946 index_data['vol_fd'] = open(vol_path, 'rb')
1948 # force reopen of the tarobj because of new volume
1949 if index_data['tarobj']:
1950 index_data['tarobj'].close()
1951 index_data['tarobj'] = None
1953 # seek tarfile if needed
1954 offset = file_data.get('offset', -1)
1955 if index_data['tarobj']:
1956 if self._disaster == tarfile.TOLERANCE_RESCUE:
1957 # force a seek and reopen
1958 index_data['tarobj'].close()
1959 index_data['tarobj'] = None
1962 member = index_data['tarobj'].__iter__().__next__()
1963 except tarfile.DecryptionError:
1965 except tarfile.CompressionError:
1968 if not member or member.path != file_data['path']:
1969 # force a seek and reopen
1970 index_data['tarobj'].close()
1971 index_data['tarobj'] = None
1974 # open the tarfile if needed
1975 if not index_data['tarobj']:
1976 index_data['vol_fd'].seek(offset)
1977 index_data['tarobj'] = tarfile.open(mode="r" + self._deltatar.mode,
1978 fileobj=index_data['vol_fd'],
1979 format=tarfile.GNU_FORMAT,
1980 concat='#' in self._deltatar.mode,
1981 encryption=index_data["decryptor"],
1982 new_volume_handler=index_data['new_volume_handler'],
1983 save_to_members=False,
1984 tolerance=self._disaster)
1986 member = index_data['tarobj'].__iter__().__next__()
1988 member.path = unprefixed_path
1989 member.name = unprefixed_path
1991 if op_type == 'directory':
1992 self.add_member_dir(member)
1993 member = copy.copy(member)
1994 member.mode = 0o0700
1996 # if it's an existing directory, we then don't need to recreate it
1997 # just set the right permissions, mtime and that kind of stuff
1998 if os.path.exists(member.path):
2002 # set current volume number in tarobj, otherwise the extraction of the
2003 # file might fail when trying to extract a multivolume member
2004 index_data['tarobj'].volume_number = index_data['curr_vol_no']
2006 def ignore_symlink (member, *_args):
2007 self._deltatar.logger.warning("Ignoring symlink %s" % member.name)
2009 # finally, restore the file
2010 index_data['tarobj'].extract(member, symlink_cb=ignore_symlink)
2012 def add_member_dir(self, member):
2014 Add member dir to be restored at the end
2016 if not self.canchown:
2017 self._directories.append(DirItem(name=member.name, mode=member.mode,
2018 mtime=member.mtime))
2020 self._directories.append(DirItem(name=member.name, mode=member.mode,
2021 mtime=member.mtime, gname=member.gname, uname=member.uname,
2022 uid=member.uid, gid=member.gid, issym=member.issym()))
2024 class DirItem(object):
2025 def __init__(self, **kwargs):
2026 for k, v in kwargs.items():