#!/usr/bin/env python3

# Copyright (C) 2013, 2014 Intra2net AG
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see
# <http://www.gnu.org/licenses/lgpl-3.0.html>

DELTATAR_HEADER_VERSION = 1
DELTATAR_PARAMETER_VERSION = 1

import logging
import datetime
import binascii
import io
import operator
import os
import copy
import shutil
import re
import stat
import json
import typing
from functools import partial

from . import tarfile
from . import crypto


class NullHandler(logging.Handler):
    def emit(self, record):
        pass


logging.getLogger("deltatar.DeltaTar").addHandler(NullHandler())


# match mode
NO_MATCH = False
MATCH = True
PARENT_MATCH = 2

# encryption direction
CRYPTO_MODE_ENCRYPT = 0
CRYPTO_MODE_DECRYPT = 1

# The canonical extension for encrypted backup files regardless of the actual
# encryption parameters is “.pdtcrypt”. This is analogous to the encryption
# header which starts with the eight ASCII bytes “PDTCRYPT”. Historical note:
# Since the introduction of the versioned header there is no longer any need
# for encoding encryption parameters in the file extensions (“.aes128” and
# suchlike).
PDTCRYPT_EXTENSION = "pdtcrypt"
PDT_TYPE_ARCHIVE = 0
PDT_TYPE_AUX = 1

AUXILIARY_FILE_INDEX = 0
AUXILIARY_FILE_INFO = 1

class DeltaTar(object):
    '''
    Backup class used to create backups
    '''

    # list of files to exclude in the backup creation or restore operation. It
    # can contain python regular expressions.
    excluded_files = []

    # list of files to include in the backup creation or restore operation. It
    # can contain python regular expressions. If empty, all files in the source
    # path will be backed up (when creating a backup) or all the files in the
    # backup will be restored (when restoring a backup), but if included_files
    # is set then only the files included in the list will be processed.
    included_files = []

    # custom filter of files to be backed up (or restored). Unused and unset
    # by default. The function receives a file path and must return a boolean.
    filter_func = None

    # mode in which the delta will be created (when creating a backup) or
    # opened (when restoring). Accepts modes analogous to those of the tarfile
    # library.
    mode = ""

    # used together with aes modes to encrypt and decrypt backups.
    password = None
    crypto_key = None
    nacl = None

    # parameter version to use when encrypting; note that this has no effect
    # on decryption since the required settings are determined from the headers
    crypto_version = DELTATAR_HEADER_VERSION
    crypto_paramversion = None

    # when encrypting or decrypting, these hold crypto handlers; created before
    # establishing the Tarfile stream iff a password is supplied.
    encryptor = None
    decryptor = None

    # python logger object.
    logger = None

    # specifies the index mode in the same format as @param mode, but without
    # the ':', '|' or '#' at the beginning. It doesn't make sense to specify
    # that the index is encrypted if no password is given in the constructor.
    index_mode = None

    # current time for this backup. Used for file names and file creation checks
    current_time = None

    # extra data to be included in the header of the index file when creating a
    # backup
    extra_data = dict()

    # valid tarfile modes and their corresponding default file extension
    __file_extensions_dict = {
        '': '',
        ':': '',
        ':gz': '.gz',
        ':bz2': '.bz2',
        '|': '',
        '|gz': '.gz',
        '|bz2': '.bz2',
        '#gz': '.gz',
        '#gz.pdtcrypt': '.gz',
        '#pdtcrypt': '',
        '#': '',
    }

    # valid index modes and their corresponding default file extension
    __index_extensions_dict = {
        '': '',
        'gz': '.gz',
        'bz2': '.bz2',
        'gz.pdtcrypt': '.gz',
        'pdtcrypt': '',
    }

    # valid path prefixes
    __path_prefix_list = [
        u'snapshot://',
        u'list://',
        u'delete://'
    ]

    def __init__(self, excluded_files=[], included_files=[],
                 filter_func=None, mode="", password=None,
                 crypto_key=None, nacl=None,
                 crypto_version=DELTATAR_HEADER_VERSION,
                 crypto_paramversion=DELTATAR_PARAMETER_VERSION,
                 logger=None, index_mode=None, index_name_func=None,
                 volume_name_func=None):
        '''
        Constructor. Configures the diff engine.

        Parameters:
        - excluded_files: list of files to exclude in the backup creation or
          restore operation. It can contain python regular expressions.

        - included_files: list of files to include in the backup creation or
          restore operation. It can contain python regular expressions. If
          empty, all files in the source path will be backed up (when creating
          a backup) or all the files in the backup will be restored (when
          restoring a backup), but if included_files is set then only the
          files included in the list will be processed.

        - filter_func: custom filter of files to be backed up (or restored).
          Unused and unset by default. The function receives a file path and
          must return a boolean.

        - mode: mode in which the delta will be created (when creating a
          backup) or opened (when restoring). Accepts the same modes as the
          tarfile library. Valid modes are:

            ''      open uncompressed
            ':'     open uncompressed
            ':gz'   open with gzip compression
            ':bz2'  open with bzip2 compression
            '|'     open an uncompressed stream of tar blocks
            '|gz'   open a gzip compressed stream of tar blocks
            '|bz2'  open a bzip2 compressed stream of tar blocks
            '#gz'   open a stream of gzip compressed tar blocks

        - crypto_key: used to encrypt and decrypt backups. Encryption will
          be enabled automatically if a key is supplied. Requires a salt to be
          passed as well.

        - nacl: salt that was used to derive the encryption key for embedding
          in the PDTCRYPT header. Not needed when decrypting and when
          encrypting with a password.

        - password: used to encrypt and decrypt backups. Encryption will be
          enabled automatically if a password is supplied.

        - crypto_version: version of the format, determining the kind of PDT
          object header.

        - crypto_paramversion: optionally request encryption conforming to
          a specific parameter version. Defaults to the standard PDT value
          which as of 2017 is the only one available.

        - logger: python logger object. Optional.

        - index_mode: specifies the index mode in the same format as @param
          mode, but without the ':', '|' or '#' at the beginning. If
          encryption is requested it will extend to the auxiliary (index,
          info) files as well. This is an optional parameter that will
          automatically mimic @param mode by default if not provided. Valid
          modes are:

            ''     open uncompressed
            'gz'   open with gzip compression
            'bz2'  open with bzip2 compression

        - index_name_func: function that sets a custom name for the index
          file. This function receives a flag to indicate whether the name
          will be used for a full or diff backup. The backup path will be
          prepended to its return value.

        - volume_name_func: function that defines the name of tar volumes. It
          receives the backup_path, whether it's a full backup, and the
          volume number, and must return the name for the corresponding
          volume. Optional, DeltaTar has default names for tar volumes.
        '''

        if mode not in self.__file_extensions_dict:
            raise Exception('Unrecognized extension mode=[%s] requested for files'
                            % str(mode))

        self.excluded_files = excluded_files
        self.included_files = included_files
        self.filter_func = filter_func
        self.logger = logging.getLogger('deltatar.DeltaTar')
        if logger:
            self.logger.addHandler(logger)
        self.mode = mode

        if crypto_key is not None:
            self.crypto_key = crypto_key
            self.nacl = nacl # encryption only

        if password is not None:
            self.password = password

        if crypto_version is not None:
            self.crypto_version = crypto_version

        if crypto_paramversion is not None:
            self.crypto_paramversion = crypto_paramversion

        # generate index_mode
        if index_mode is None:
            index_mode = ''
            if 'gz' in mode:
                index_mode = "gz"
            elif 'bz2' in mode:
                index_mode = "bz2"
        elif index_mode not in self.__index_extensions_dict:
            raise Exception('Unrecognized extension mode=[%s] requested for index'
                            % str(index_mode))

        self.index_mode = index_mode
        self.current_time = datetime.datetime.now()

        if index_name_func is not None:
            self.index_name_func = index_name_func

        if volume_name_func is not None:
            self.volume_name_func = volume_name_func

    def pick_extension(self, kind, mode=None):
        """
        Choose the extension depending on a) the kind of file given, b) the
        processing mode, and c) the current encryption settings.
        """
        ret = ""
        if kind == PDT_TYPE_ARCHIVE:
            ret += ".tar"
        if mode is None:
            mode = self.__index_extensions_dict [self.index_mode]
        ret += mode
        if self.crypto_key is not None or self.password is not None:
            ret += "." + PDTCRYPT_EXTENSION
        return ret

    def index_name_func(self, is_full): # pylint: disable=method-hidden
        '''
        Callback for setting a custom name for the index file. Depending on
        whether *is_full* is set, it will create a suitable name for a full
        or a diff backup.
        '''
        prefix = "bfull" if is_full else "bdiff"
        date_str = self.current_time.strftime("%Y-%m-%d-%H%M")
        extension = self.pick_extension \
                        (PDT_TYPE_AUX,
                         self.__index_extensions_dict [self.index_mode])

        return "%s-%s.index%s" % (prefix, date_str, extension)

    def volume_name_func(self, backup_path, # pylint: disable=method-hidden
                         is_full, volume_number,
                         guess_name=False):
        '''
        Callback that defines the name of tar volumes. It receives the
        backup_path, whether it's a full backup, and the volume number, and
        must return the name for the corresponding volume. Optional, DeltaTar
        has default names for tar volumes.

        If guess_name is activated, the file is intended not to be created but
        to be found, and thus the date will be guessed.
        '''
        prefix = "bfull" if is_full else "bdiff"
        extension = self.pick_extension \
                        (PDT_TYPE_ARCHIVE,
                         self.__file_extensions_dict [self.mode])

        if not guess_name:
            date_str = self.current_time.strftime("%Y-%m-%d-%H%M")
            return "%s-%s-%03d%s" % (prefix, date_str, volume_number + 1, extension)
        else:
            prefix = prefix + "-"
            postfix = "-%03d%s" % (volume_number + 1, extension)
            for f in os.listdir(backup_path):
                if f.startswith(prefix) and f.endswith(postfix):
                    return f
            raise Exception("volume not found")


    def filter_path(self, path, source_path="", is_dir=None):
        '''
        Filters a path, given the source_path, using the filtering properties
        set in the constructor.
        The filtering order is:
        1. included_files (if any)
        2. excluded_files
        3. filter_func (which must return whether the file is accepted or not)
        '''

        if len(source_path) > 0:
            # ensure that the prefix ends with exactly one os.sep before
            # stripping it from the path
            source_path = source_path.rstrip(os.sep) + os.sep
            path = path[len(source_path):]

        # 1. filter included_files
        match = MATCH
        if len(self.included_files) > 0:
            match = NO_MATCH
            for i in self.included_files:
                # it can be either a regexp or a string
                if isinstance(i, str):
                    # if the string matches, then continue
                    if i == path:
                        match = MATCH
                        break

                    # if the string ends with / it's a directory, and if the
                    # path is contained in it, it is included
                    if i.endswith('/') and path.startswith(i):
                        match = MATCH
                        break

                    # if the string doesn't end with /, add it and do the same
                    # check
                    elif path.startswith(i + '/'):
                        match = MATCH
                        break

                    # check for PARENT_MATCH
                    if is_dir:
                        dir_path = path
                        if not dir_path.endswith('/'):
                            dir_path += '/'

                        if i.startswith(dir_path):
                            match = PARENT_MATCH

                # if it's a reg exp, then we just check if it matches
                elif isinstance(i, typing.Pattern):
                    if i.match(path):
                        match = MATCH
                        break
                else:
                    self.logger.warning('Invalid pattern in included_files: %s' % str(i))

        if match == NO_MATCH:
            return NO_MATCH

        # when a directory is in PARENT_MATCH, it doesn't matter if it's
        # excluded. Its subfiles will be excluded, but the directory itself
        # won't
        if match != PARENT_MATCH:
            for e in self.excluded_files:
                # it can be either a regexp or a string
                if isinstance(e, str):
                    # if the string matches, then exclude
                    if e == path:
                        return NO_MATCH

                    # if the string ends with / it's a directory, and if the
                    # path starts with the directory, then exclude
                    if e.endswith('/') and path.startswith(e):
                        return NO_MATCH

                    # if the string doesn't end with /, do the same check with
                    # the slash added
                    elif path.startswith(e + '/'):
                        return NO_MATCH

                # if it's a reg exp, then we just check if it matches
                elif isinstance(e, typing.Pattern):
                    if e.match(path):
                        return NO_MATCH
                else:
                    self.logger.warning('Invalid pattern in excluded_files: %s' % str(e))

        if self.filter_func:
            return self.filter_func(path)

        return match

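    # Illustrative sketch, not part of the module: how the three match modes
    # come out of filter_path() for a hypothetical instance configured with
    # included_files=['docs/'].
    #
    #     dtar = DeltaTar(mode='', included_files=['docs/'])
    #     dtar.filter_path('docs/a.txt')        # MATCH: contained in 'docs/'
    #     dtar.filter_path('src/b.c')           # NO_MATCH: not included
    #     dtar.filter_path('docs', is_dir=True) # PARENT_MATCH: parent of an inclusion
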
    def _recursive_walk_dir(self, source_path, keep_base_dir=False):
        '''
        Walk a directory recursively, yielding each file/directory.

        Returns the path of an entity. If ``keep_base_dir`` is set,
        the path returned contains the prefix ``source_path``; otherwise it is
        relative to the prefix.
        '''

        source_path = source_path.rstrip(os.sep)

        if keep_base_dir:
            beginning_size = 0
        else:
            beginning_size = len(source_path) + 1 # +1 for os.sep

        queue = [source_path]

        while queue:
            cur_path = queue.pop(0)

            try:
                dfd = os.open (cur_path, os.O_DIRECTORY)
            except FileNotFoundError as exn:
                self.logger.warning ("failed to open entity [%s] as directory; "
                                     "no longer available in file system "
                                     "(error: %s); skipping"
                                     % (cur_path, str (exn)))
                continue

            try:
                for filename in sorted(os.listdir(dfd)):
                    child = os.path.join(cur_path, filename)
                    is_dir = os.path.isdir(child)
                    status = self.filter_path(child, source_path, is_dir)
                    if status == NO_MATCH:
                        continue
                    if not os.access(child, os.R_OK):
                        self.logger.warning('Error accessing possibly locked file %s' % child)
                        continue

                    if status == MATCH:
                        yield child[beginning_size:]

                    if is_dir and (status == MATCH or status == PARENT_MATCH):
                        queue.append(child)
            finally:
                os.close (dfd)

    def _stat_dict(self, path):
        '''
        Returns a dict with the stat data used to compare files
        '''
        stinfo = os.stat(path)
        mode = stinfo.st_mode

        ptype = None
        if stat.S_ISDIR(mode):
            ptype = u'directory'
        elif stat.S_ISREG(mode):
            ptype = u'file'
        elif stat.S_ISLNK(mode):
            ptype = u'link'

        return {
            u'type': ptype,
            u'path': path,
            u'mode': mode,
            u'mtime': int(stinfo.st_mtime),
            u'ctime': int(stinfo.st_ctime),
            u'uid': stinfo.st_uid,
            u'gid': stinfo.st_gid,
            u'inode': stinfo.st_ino,
            u'size': stinfo.st_size
        }

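    # Illustrative sketch, not part of the module: shape of the dict returned
    # by _stat_dict() for a made-up regular file (mode 33188 == 0o100644).
    #
    #     {u'type': u'file', u'path': 'docs/a.txt', u'mode': 33188,
    #      u'mtime': 1397492486, u'ctime': 1397492486, u'uid': 1000,
    #      u'gid': 1000, u'inode': 1234567, u'size': 42}
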
    def _equal_stat_dicts(self, d1, d2, listsnapshot_equal=False):
        '''
        Return whether the dicts are equal in the stat keys
        '''
        keys = [u'type', u'mode', u'size', u'mtime',
                # not restored: u'inode', u'ctime'
               ]

        # only if the user is root do we also check gid/uid; otherwise skip
        # the check, because only the superuser can chown files extracted
        # from the tarfile
        #
        # also, skip the check in rpmbuild since the sources end up with the
        # uid:gid of the packager while the extracted files are 0:0.
        if hasattr(os, "geteuid") and os.geteuid() == 0 \
                and os.getenv ("RPMBUILD_OPTIONS") is None:
            keys.append('gid')
            keys.append('uid')

        if (not d1 and d2 is not None) or (d1 is not None and not d2):
            return False

        if self.prefixed(d1.get('path', -1), listsnapshot_equal) != self.prefixed(d2.get('path', -2), listsnapshot_equal):
            return False

        type = d1.get('type', '')

        for key in keys:
            # size doesn't matter for directories
            if type == 'directory' and key == 'size':
                continue
            if d1.get(key, -1) != d2.get(key, -2):
                return False
        return True

    def prefixed(self, path, listsnapshot_equal=False):
        '''
        if a path is not prefixed, return it prefixed
        '''
        for prefix in self.__path_prefix_list:
            if path.startswith(prefix):
                if listsnapshot_equal and prefix == u'list://':
                    return u'snapshot://' + path[len(prefix):]
                return path
        return u'snapshot://' + path

    def unprefixed(self, path):
        '''
        remove a path prefix if any
        '''
        for prefix in self.__path_prefix_list:
            if path.startswith(prefix):
                return path[len(prefix):]
        return path

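    # Illustrative sketch, not part of the module: prefix handling.
    #
    #     prefixed('etc/fstab')              == 'snapshot://etc/fstab'
    #     prefixed('list://etc/fstab', True) == 'snapshot://etc/fstab'
    #     prefixed('delete://etc/fstab')     == 'delete://etc/fstab'
    #     unprefixed('delete://etc/fstab')   == 'etc/fstab'
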

    def initialize_encryption (self, mode, strict_validation=True):
        """
        :type  mode: int
        :param mode: one of CRYPTO_MODE_ENCRYPT or CRYPTO_MODE_DECRYPT

        :type  strict_validation: bool
        :param strict_validation: Enable strict IV checking in the crypto
                                  layer. Should be disabled when dealing with
                                  potentially corrupted data.
        """
        password = self.password
        key = self.crypto_key
        nacl = self.nacl

        if key is None and password is None:
            return
        if mode == CRYPTO_MODE_ENCRYPT:
            return crypto.Encrypt (password=password,
                                   key=key,
                                   nacl=nacl,
                                   version=self.crypto_version,
                                   paramversion=self.crypto_paramversion)
        if mode == CRYPTO_MODE_DECRYPT:
            return crypto.Decrypt (password=password, key=key,
                                   strict_ivs=strict_validation)

        raise Exception ("invalid encryption mode [%r]" % mode)


    def open_auxiliary_file(self, path, mode='r', kind=AUXILIARY_FILE_INDEX,
                            strict_validation=True):
        '''
        Given the specified configuration, opens a file for reading or writing,
        inheriting the encryption and compression settings from the backup.
        Returns a file object ready to use.

        :param mode: IO mode (read or write, ``"r"`` and ``"w"``,
                     respectively).
        :type  mode: str
        :param kind: Role of the file, see AUXILIARY_FILE_* constants.
                     Both the info and the auxiliary file have a globally
                     unique, constant counter value.
        :type  kind: int
        '''
        if self.index_mode.startswith('gz'):
            comptype = 'gz'
        elif self.index_mode.startswith('bz2'):
            comptype = 'bz2'
        else:
            comptype = 'tar'

        crypto_ctx = None
        enccounter = None
        if mode == "w":
            crypto_ctx = self.initialize_encryption (CRYPTO_MODE_ENCRYPT)
        elif mode == "r":
            crypto_ctx = self.initialize_encryption (CRYPTO_MODE_DECRYPT,
                                                     strict_validation=strict_validation)

        if crypto_ctx is not None:
            if kind == AUXILIARY_FILE_INFO:
                enccounter = crypto.AES_GCM_IV_CNT_INFOFILE
            elif kind == AUXILIARY_FILE_INDEX:
                enccounter = crypto.AES_GCM_IV_CNT_INDEX
            else:
                raise Exception ("invalid kind of aux file %r" % kind)

        sink = tarfile._Stream(name=path, mode=mode, comptype=comptype,
                               bufsize=tarfile.RECORDSIZE, fileobj=None,
                               encryption=crypto_ctx, enccounter=enccounter)

        return sink

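    # Illustrative sketch, not part of the module: reading an index file back
    # through the compression/encryption settings configured above (the path
    # is made up).
    #
    #     f = dtar.open_auxiliary_file('/backup/bfull-2014-04-14-1030.index.gz',
    #                                  mode='r')
    #     header = f.readline()   # first line is the JSON index header
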

    def create_full_backup(self, source_path, backup_path,
                           max_volume_size=None, extra_data=dict()):
        '''
        Creates a full backup.

        Parameters:
        - source_path: source path to the directory to back up.
        - backup_path: path where the backup will be stored. The backup path
          will be created if it does not exist.
        - max_volume_size: maximum volume size in megabytes. Used to split the
          backup in volumes. Optional (won't split in volumes by default).
        - extra_data: a json-serializable dictionary with information that you
          want to be included in the header of the index file
        '''
        # check input
        if not isinstance(source_path, str):
            raise Exception('Source path must be a string')

        if not isinstance(backup_path, str):
            raise Exception('Backup path must be a string')

        if not os.path.exists(source_path) or not os.path.isdir(source_path):
            raise Exception('Source path "%s" does not exist or is not a '\
                            'directory' % source_path)

        if max_volume_size is not None and (not isinstance(max_volume_size, int) or\
                                            max_volume_size < 1):
            raise Exception('max_volume_size must be a positive integer')
        if max_volume_size is not None:
            max_volume_size = max_volume_size*1024*1024

        if not isinstance(extra_data, dict):
            raise Exception('extra_data must be a dictionary')

        try:
            extra_data_str = json.dumps(extra_data)
        except (TypeError, ValueError):
            raise Exception('extra_data is not json-serializable')

        if not os.access(source_path, os.R_OK):
            raise Exception('Source path "%s" is not readable' % source_path)

        # try to create backup path if needed
        os.makedirs(backup_path, exist_ok=True)

        if not os.access(backup_path, os.W_OK):
            raise Exception('Backup path "%s" is not writeable' % backup_path)

        if source_path.endswith('/'):
            source_path = source_path[:-1]

        if backup_path.endswith('/'):
            backup_path = backup_path[:-1]

        # update current time
        self.current_time = datetime.datetime.now()

        if self.mode not in self.__file_extensions_dict:
            raise Exception('Unrecognized extension')

        # setup for encrypting payload
        if self.encryptor is None:
            self.encryptor = self.initialize_encryption (CRYPTO_MODE_ENCRYPT)

        # some initialization
        self.vol_no = 0

        # generate the first volume name
        vol_name = self.volume_name_func(backup_path, True, 0)
        tarfile_path = os.path.join(backup_path, vol_name)

        # init index
        index_name = self.index_name_func(True)
        index_path = os.path.join(backup_path, index_name)
        index_sink = self.open_auxiliary_file(index_path, 'w')

        cwd = os.getcwd()

        def new_volume_handler(deltarobj, cwd, backup_path, encryption, tarobj, base_name, volume_number):
            '''
            Handles the new volumes
            '''
            volume_name = deltarobj.volume_name_func(backup_path, True, volume_number)
            volume_path = os.path.join(backup_path, volume_name)
            deltarobj.vol_no = volume_number

            # we convert relative paths into absolute because CWD is changed
            if not os.path.isabs(volume_path):
                volume_path = os.path.join(cwd, volume_path)

            if tarobj.fileobj is not None:
                tarobj.fileobj.close()

            deltarobj.logger.debug("opening volume %s" % volume_path)

            tarobj.open_volume(volume_path, encryption=encryption)

        # wraps some args from context into the handler
        new_volume_handler = partial(new_volume_handler, self, cwd, backup_path, self.encryptor)

        index_sink.write(bytes('{"type": "python-delta-tar-index", "version": 1, "backup-type": "full", "extra_data": %s}\n' % extra_data_str, 'UTF-8'))

        s = bytes('{"type": "BEGIN-FILE-LIST"}\n', 'UTF-8')
        # calculate checksum and write into the stream
        crc = binascii.crc32(s) & 0xFFFFffff
        index_sink.write(s)

        # start creating the tarfile
        tarobj = tarfile.TarFile.open(tarfile_path,
                                      mode='w' + self.mode,
                                      format=tarfile.GNU_FORMAT,
                                      concat='#' in self.mode,
                                      encryption=self.encryptor,
                                      max_volume_size=max_volume_size,
                                      new_volume_handler=new_volume_handler,
                                      save_to_members=False,
                                      dereference=True)
        os.chdir(source_path)

        # for each file to be in the backup, do:
        for path in self._recursive_walk_dir('.'):

            try: # backup file
                # calculate stat dict for current file
                statd = self._stat_dict(path)
                statd['path'] = u'snapshot://' + statd['path']
                statd['volume'] = self.vol_no

                # backup file
                tarobj.add(path, arcname = statd['path'], recursive=False)
            except FileNotFoundError as exn:
                # file vanished since the call to access(3) above
                self.logger.warning ("object [%s] no longer available in "
                                     "file system (error: %s); skipping"
                                     % (path, str (exn)))
                continue # prevent indexing

            # retrieve file offset
            statd['offset'] = tarobj.get_last_member_offset()
            self.logger.debug("backup %s" % statd['path'])

            # store the stat dict in the index
            s = bytes(json.dumps(statd) + '\n', 'UTF-8')
            crc = binascii.crc32(s, crc) & 0xffffffff
            index_sink.write(s)

        s = bytes('{"type": "END-FILE-LIST"}\n', 'UTF-8')
        crc = binascii.crc32(s, crc) & 0xffffffff
        index_sink.write(s)
        s = bytes('{"type": "file-list-checksum", "checksum": %d}\n' % crc, 'UTF-8')
        index_sink.write(s)

        os.chdir(cwd)
        tarobj.close()
        index_sink.close (close_fileobj=True)

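    # Illustrative usage sketch, not part of the module; paths and sizes are
    # made up. Creates a full backup with gzip-compressed concat blocks split
    # into 60 MB volumes:
    #
    #     dtar = DeltaTar(mode='#gz')
    #     dtar.create_full_backup(source_path='/srv/data',
    #                             backup_path='/backup/2014-04-full',
    #                             max_volume_size=60)
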
    def create_diff_backup(self, source_path, backup_path, previous_index_path,
                           max_volume_size=None, extra_data=dict()):
        '''
        Creates a diff backup.

        Parameters:
        - source_path: source path to the directory to back up.
        - backup_path: path where the backup will be stored. The backup path
          will be created if it does not exist.
        - previous_index_path: index of the previous backup, needed to know
          which files changed since then.
        - max_volume_size: maximum volume size in megabytes (MB). Used to split
          the backup in volumes. Optional (won't split in volumes by default).
        - extra_data: a json-serializable dictionary with information that you
          want to be included in the header of the index file

        NOTE: the previous index is assumed to follow exactly the same format
        as the index_mode set up in the constructor.
        '''
        # check/sanitize input
        if not isinstance(source_path, str):
            raise Exception('Source path must be a string')

        if not isinstance(backup_path, str):
            raise Exception('Backup path must be a string')

        if not os.path.exists(source_path) or not os.path.isdir(source_path):
            raise Exception('Source path "%s" does not exist or is not a '\
                            'directory' % source_path)

        if not isinstance(extra_data, dict):
            raise Exception('extra_data must be a dictionary')

        try:
            extra_data_str = json.dumps(extra_data)
        except (TypeError, ValueError):
            raise Exception('extra_data is not json-serializable')

        if not os.access(source_path, os.R_OK):
            raise Exception('Source path "%s" is not readable' % source_path)

        if max_volume_size is not None and (not isinstance(max_volume_size, int) or\
                                            max_volume_size < 1):
            raise Exception('max_volume_size must be a positive integer')
        if max_volume_size is not None:
            max_volume_size = max_volume_size*1024*1024

        if not isinstance(previous_index_path, str):
            raise Exception('previous_index_path must be a string')

        if not os.path.exists(previous_index_path) or not os.path.isfile(previous_index_path):
            raise Exception('Index path "%s" does not exist or is not a '\
                            'file' % previous_index_path)

        if not os.access(previous_index_path, os.R_OK):
            raise Exception('Index path "%s" is not readable' % previous_index_path)

        # try to create backup path if needed
        os.makedirs(backup_path, exist_ok=True)

        if not os.access(backup_path, os.W_OK):
            raise Exception('Backup path "%s" is not writeable' % backup_path)

        if source_path.endswith('/'):
            source_path = source_path[:-1]

        if backup_path.endswith('/'):
            backup_path = backup_path[:-1]

        # update current time
        self.current_time = datetime.datetime.now()

        if self.mode not in self.__file_extensions_dict:
            raise Exception('Unrecognized extension')

        # setup for encrypting payload
        if self.encryptor is None:
            self.encryptor = self.initialize_encryption (CRYPTO_MODE_ENCRYPT)

        # some initialization
        self.vol_no = 0

        # generate the first volume name
        vol_name = self.volume_name_func(backup_path, is_full=False,
                                         volume_number=0)
        tarfile_path = os.path.join(backup_path, vol_name)

        # init index
        cwd = os.getcwd()

        index_name = self.index_name_func(is_full=False)
        index_path = os.path.join(backup_path, index_name)
        index_sink = self.open_auxiliary_file(index_path, 'w')

        def new_volume_handler(deltarobj, cwd, backup_path, tarobj, base_name, volume_number):
            '''
            Handles the new volumes
            '''
            volume_name = deltarobj.volume_name_func(backup_path, is_full=False,
                                                     volume_number=volume_number)
            volume_path = os.path.join(backup_path, volume_name)
            deltarobj.vol_no = volume_number

            # we convert relative paths into absolute because CWD is changed
            if not os.path.isabs(volume_path):
                volume_path = os.path.join(cwd, volume_path)

            deltarobj.logger.debug("opening volume %s" % volume_path)
            tarobj.open_volume(volume_path)

        # wraps some args from context into the handler
        new_volume_handler = partial(new_volume_handler, self, cwd, backup_path)

        index_sink.write(bytes('{"type": "python-delta-tar-index", "version": 1, "backup-type": "diff", "extra_data": %s}\n' % extra_data_str, 'UTF-8'))

        s = bytes('{"type": "BEGIN-FILE-LIST"}\n', 'UTF-8')
        # calculate checksum and write into the stream
        crc = binascii.crc32(s) & 0xFFFFffff
        index_sink.write(s)

        # start creating the tarfile
        tarobj = tarfile.TarFile.open(tarfile_path,
                                      mode='w' + self.mode,
                                      format=tarfile.GNU_FORMAT,
                                      concat='#' in self.mode,
                                      encryption=self.encryptor,
                                      max_volume_size=max_volume_size,
                                      new_volume_handler=new_volume_handler,
                                      save_to_members=False,
                                      dereference=True)


        # create the iterators, first the previous index iterator, then the
        # source path directory iterator and collate and iterate them
        if not os.path.isabs(previous_index_path):
            previous_index_path = os.path.join(cwd, previous_index_path)
        index_it = self.iterate_index_path(previous_index_path)

        os.chdir(source_path)
        dir_it = self._recursive_walk_dir('.')
        dir_path_it = self.jsonize_path_iterator(dir_it)

        # small debugging helper: render a possibly-missing entry's path
        def pr(path):
            if not path:
                return "None"
            else:
                return path["path"]

        # for each file to be in the backup, do:
        for ipath, dpath, l_no in self.collate_iterators(index_it, dir_path_it):
            action = None
            # if the file is not in the index, it means it's a new file, so we
            # have to take a snapshot
            if not ipath:
                action = 'snapshot'
            # if the file is not in the directory iterator, it means that it has
            # been deleted, so we need to mark it as such
            elif not dpath:
                action = 'delete'
            # if the file is in both iterators, it means it might have either
            # not changed (in which case we will just list it in our index but
            # it will not be included in the tar file), or it might have
            # changed, in which case we will snapshot it.
            elif ipath and dpath:
                if self._equal_stat_dicts(ipath, dpath):
                    action = 'list'
                else:
                    action = 'snapshot'
            # TODO: when creating chained backups (i.e. diffing from another
            # diff), we will need to detect the type of action in the previous
            # index, because if it was delete and dpath is None, we should
            # discard the file

            if action == 'snapshot':
                # calculate stat dict for current file
                stat = dpath.copy()
                stat['path'] = "snapshot://" + dpath['path']
                stat['volume'] = self.vol_no

                self.logger.debug("[STORE] %s" % dpath['path'])

                try: # backup file
                    tarobj.add(dpath['path'], arcname=stat['path'], recursive=False)
                    # retrieve file offset
                    stat['offset'] = tarobj.get_last_member_offset()
                except FileNotFoundError as exn:
                    # file vanished since the call to access(3) above
                    self.logger.warning ("object [%s] no longer available in "
                                         "file system (error: %s); skipping"
                                         % (dpath ["path"], str (exn)))
                    stat = None # prevent indexing

            elif action == 'delete':
                path = self.unprefixed(ipath['path'])
                stat = {
                    u'path': u'delete://' + path,
                    u'type': ipath['type']
                }
                self.logger.debug("[DELETE] %s" % path)

                # mark it as deleted in the backup
                tarobj.add("/dev/null", arcname=stat['path'])
            elif action == 'list':
                stat = dpath.copy()
                path = self.unprefixed(ipath['path'])
                stat['path'] = u'list://' + path
                # unchanged files do not enter in the backup, only in the index
                self.logger.debug("[UNCHANGED] %s" % path)
            else:
                # should not happen
                self.logger.warning('unknown action in create_diff_backup: {0}'
                                    ''.format(action))
                stat = None

            if stat:
                # store the stat dict in the index
                s = bytes(json.dumps(stat) + '\n', 'UTF-8')
                crc = binascii.crc32(s, crc) & 0xffffffff
                index_sink.write(s)

        s = bytes('{"type": "END-FILE-LIST"}\n', 'UTF-8')
        crc = binascii.crc32(s, crc) & 0xffffffff
        index_sink.write(s)
        s = bytes('{"type": "file-list-checksum", "checksum": %d}\n' % crc, 'UTF-8')
        index_sink.write(s)

        index_it.release()
        os.chdir(cwd)
        tarobj.close()
        index_sink.close()

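    # Illustrative usage sketch, not part of the module; paths are made up.
    # A diff backup is taken against the index of the previous full backup:
    #
    #     dtar.create_diff_backup('/srv/data', '/backup/2014-05-diff',
    #         '/backup/2014-04-full/bfull-2014-04-14-1030.index.gz')
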
    def iterate_index_path(self, index_path, strict_validation=True):
        '''
        Returns an index iterator. Internally, it uses a classic iterator class.
        We do that instead of just yielding so that the iterator object can have
        an additional function to close the file descriptor that is opened in
        the constructor.
        '''

        class IndexPathIterator(object):
            def __init__(self, delta_tar, index_path):
                self.delta_tar = delta_tar
                self.index_path = index_path
                self.f = None
                self.extra_data = dict()
                self.__enter__()

            def __iter__(self):
                return self

            def release(self):
                if self.f:
                    self.f.close()

            def __enter__(self):
                '''
                Allows this iterator to be used with the "with" statement
                '''
                if self.f is None:
                    self.f = self.delta_tar.open_auxiliary_file \
                                 (self.index_path,
                                  'r',
                                  strict_validation=strict_validation)
                # check index header
                j, l_no = self.delta_tar._parse_json_line(self.f, 0)
                if j.get("type", '') != 'python-delta-tar-index' or\
                   j.get('version', -1) != 1:
                    raise Exception("invalid index file format: %s" % json.dumps(j))

                self.extra_data = j.get('extra_data', dict())

                # find BEGIN-FILE-LIST, ignore other headers
                while True:
                    j, l_no = self.delta_tar._parse_json_line(self.f, l_no)
                    if j.get('type', '') == 'BEGIN-FILE-LIST':
                        break
                return self

            def __exit__(self, type, value, tb):
                '''
                Allows this iterator to be used with the "with" statement
                '''
                if self.f:
                    self.f.close()
                self.f = None

            def __next__(self):
                # read each file in the index and process it to do the restore
                j = {}
                l_no = -1
                try:
                    j, l_no = self.delta_tar._parse_json_line(self.f, l_no)
                except Exception as e:
                    if self.f:
                        self.f.close()
                    raise e

                op_type = j.get('type', '')

                # when we detect the end of the list, break the loop
                if op_type == 'END-FILE-LIST':
                    if self.f:
                        self.f.close()
                    raise StopIteration

                # check input
                if op_type not in ['directory', 'file', 'link']:
                    self.delta_tar.logger.warning('unrecognized type to be '
                                                  'restored: %s, line %d' % (op_type, l_no))
                    # iterate again
                    return self.__next__()

                return j, l_no

        return IndexPathIterator(self, index_path)

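    # Illustrative sketch, not part of the module: walking an index (the path
    # is made up). Each iteration yields the parsed JSON entry plus its line
    # number.
    #
    #     it = dtar.iterate_index_path('/backup/2014-04-full/'
    #                                  'bfull-2014-04-14-1030.index.gz')
    #     for entry, l_no in it:
    #         print(entry['path'], entry['type'])
    #     it.release()
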
    def iterate_tar_path(self, tar_path, new_volume_handler=None):
        '''
        Returns a tar iterator that iterates jsonized member items that contain
        an additional "member" field, used by RestoreHelper.
        '''
        class TarPathIterator(object):
            def __init__(self, delta_tar, tar_path, new_volume_handler=None):
                self.delta_tar = delta_tar
                self.tar_path = tar_path
                self.tar_obj = None
                self.last_member = None
                self.new_volume_handler = new_volume_handler
                self.__enter__()

            def __iter__(self):
                return self

            def release(self):
                if self.tar_obj:
                    self.tar_obj.close()

            def __enter__(self):
                '''
                Allows this iterator to be used with the "with" statement
                '''
                if self.tar_obj is None:
                    decryptor = None
                    if self.delta_tar.password is not None:
                        decryptor = crypto.Decrypt \
                                        (password=self.delta_tar.password,
                                         key=self.delta_tar.crypto_key,
                                         strict_ivs=False)
                    self.tar_obj = tarfile.TarFile.open(self.tar_path,
                                        mode='r' + self.delta_tar.mode,
                                        format=tarfile.GNU_FORMAT,
                                        concat='#' in self.delta_tar.mode,
                                        encryption=decryptor,
                                        new_volume_handler=self.new_volume_handler,
                                        save_to_members=False,
                                        dereference=True)
                return self

            def __exit__(self, type, value, tb):
                '''
                Allows this iterator to be used with the "with" statement
                '''
                if self.tar_obj:
                    self.tar_obj.close()
                    self.tar_obj = None

            def __next__(self):
                '''
                Read each member and return it as a stat dict
                '''
                tarinfo = self.tar_obj.__iter__().__next__()
                # NOTE: here we compare if tarinfo.path is the same as before
                # instead of comparing the tarinfo object itself because the
                # object itself might change for multivol tarinfos
                if tarinfo is None or (self.last_member is not None and\
                    self.delta_tar.unprefixed(tarinfo.path) == self.delta_tar.unprefixed(self.last_member.path)):
                    raise StopIteration

                self.last_member = tarinfo

                ptype = 'unknown'
                if tarinfo.isfile():
                    ptype = 'file'
                elif tarinfo.isdir():
                    ptype = 'directory'
                elif tarinfo.islnk() or tarinfo.issym():
                    ptype = 'link'

                return {
                    u'type': ptype,
                    u'path': tarinfo.path,
                    u'mode': tarinfo.mode,
                    u'mtime': tarinfo.mtime,
                    u'ctime': -1, # cannot restore
                    u'uid': tarinfo.uid,
                    u'gid': tarinfo.gid,
                    u'inode': -1, # cannot restore
                    u'size': tarinfo.size,
                    u'member': tarinfo
                }, 0

        return TarPathIterator(self, tar_path, new_volume_handler)

    def jsonize_path_iterator(self, iter, strip=0):
        '''
        Converts the items yielded by an iterator into json path lines.

        strip: strip the smallest prefix containing ``strip`` leading slashes
        from each file path.
        '''
        while True:
            try:
                path = iter.__next__()
                if strip == 0:
                    yield self._stat_dict(path), 0
                else:
                    st = self._stat_dict(path)
                    st['path'] = "/".join(path.split("/")[strip:])
                    yield st, 0
            except StopIteration:
                break

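    # Illustrative sketch, not part of the module: with strip=1, an item
    # './docs/a.txt' yielded by the directory walker is emitted with its
    # first path component removed, i.e. as 'docs/a.txt'.
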
    def iterate_disaster_index (self, index):
        """
        Mimic the behavior of the other object iterators, just with the inputs
        supplied directly as *index*.
        """

        class RawIndexIterator(object):
            def __init__(self, delta_tar, index):
                self.delta_tar = delta_tar
                self.index = index
                self.__enter__()

            def __iter__(self):
                return self

            def release(self):
                pass

            def __enter__(self):
                '''
                Allows this iterator to be used with the "with" statement
                '''
                self.iter = self.index.__iter__ ()
                return self

            def __exit__(self, type, value, tb):
                '''
                Allows this iterator to be used with the "with" statement
                '''

            def __next__(self):
                idxent = self.iter.__next__ ()
                return idxent, 0

        return RawIndexIterator(self, index)

    def collate_iterators(self, it1, it2):
        '''
        Collate two iterators, so that it returns pairs of the items of each
        iterator (if the items are the same), or (None, elem2) or (elem1, None)
        when there's no match for the items in the other iterator.

        It assumes that the items in both lists are ordered in the same way.
        '''
        l_no = 0
        elem1, elem2 = None, None
        while True:
            if not elem1:
                try:
                    elem1, l_no = it1.__next__()
                except StopIteration:
                    if elem2:
                        yield (None, elem2, l_no)
                    for elem2 in it2:
                        if isinstance(elem2, tuple):
                            elem2 = elem2[0]
                        yield (None, elem2, l_no)
                    break
            if not elem2:
                try:
                    elem2 = it2.__next__()
                    if isinstance(elem2, tuple):
                        elem2 = elem2[0]
                except StopIteration:
                    if elem1:
                        yield (elem1, None, l_no)
                    for elem1, l_no in it1:
                        yield (elem1, None, l_no)
                    break

            index1 = self.unprefixed(elem1['path'])
            index2 = self.unprefixed(elem2['path'])
            i1, i2 = self.compare_indexes(index1, index2)

            yield1 = yield2 = None
            if i1 is not None:
                yield1 = elem1
                elem1 = None
            if i2 is not None:
                yield2 = elem2
                elem2 = None
            yield (yield1, yield2, l_no)

    def compare_indexes(self, index1, index2):
        '''
        Compare iterator indexes and return a tuple in the following form:
        if index1 < index2, returns (index1, None)
        if index1 == index2, returns (index1, index2)
        else, returns (None, index2)
        '''
        l1 = index1.split('/')
        l2 = index2.split('/')
        length = len(l2) - len(l1)

        if length > 0:
            return (index1, None)
        elif length < 0:
            return (None, index2)

        for i1, i2 in zip(l1, l2):
            if i1 < i2:
                return (index1, None)
            elif i1 > i2:
                return (None, index2)

        return (index1, index2)

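    # Illustrative sketch, not part of the module: ordering decisions made by
    # compare_indexes().
    #
    #     compare_indexes('a/b', 'a/b') == ('a/b', 'a/b')  # equal
    #     compare_indexes('a', 'a/b')   == ('a', None)     # 'a' sorts first
    #     compare_indexes('a/c', 'a/b') == (None, 'a/b')   # 'a/b' sorts first
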
    def list_backup(self, backup_tar_path, list_func=None):
        if not isinstance(backup_tar_path, str):
            raise Exception('Backup tar path must be a string')

        if not os.path.exists(backup_tar_path) or not os.path.isfile(backup_tar_path):
            raise Exception('Source path "%s" does not exist or is not a '\
                            'file' % backup_tar_path)

        if not os.access(backup_tar_path, os.R_OK):
            raise Exception('Source path "%s" is not readable' % backup_tar_path)

        cwd = os.getcwd()

        def new_volume_handler(deltarobj, cwd, backup_path, encryption, tarobj, base_name, volume_number):
            '''
            Handles the new volumes
            '''
            volume_name = deltarobj.volume_name_func(backup_path, True,
                                                     volume_number, guess_name=True)
            volume_path = os.path.join(backup_path, volume_name)

            # we convert relative paths into absolute because CWD is changed
            if not os.path.isabs(volume_path):
                volume_path = os.path.join(cwd, volume_path)
            tarobj.open_volume(volume_path, encryption=encryption)

        if self.decryptor is None:
            self.decryptor = \
                self.initialize_encryption (CRYPTO_MODE_DECRYPT,
                                            strict_validation=False)

        backup_path = os.path.dirname(backup_tar_path)
        if not os.path.isabs(backup_path):
            backup_path = os.path.join(cwd, backup_path)
        new_volume_handler = partial(new_volume_handler, self, cwd, backup_path, self.decryptor)

        tarobj = tarfile.TarFile.open(backup_tar_path,
                                      mode='r' + self.mode,
                                      format=tarfile.GNU_FORMAT,
                                      concat='#' in self.mode,
                                      encryption=self.decryptor,
                                      new_volume_handler=new_volume_handler,
                                      save_to_members=False,
                                      dereference=True)

        def filter(cls, list_func, tarinfo):
            if list_func is None:
                self.logger.info(tarinfo.path)
            else:
                list_func(tarinfo)
            return False
        filter = partial(filter, self, list_func)

        tarobj.extractall(filter=filter, unlink=True)
        tarobj.close()

    def restore_backup(self, target_path, backup_indexes_paths=[],
                       backup_tar_path=None, restore_callback=None,
                       disaster=tarfile.TOLERANCE_STRICT, backup_index=None,
                       strict_validation=True):
        '''
        Restores a backup.

        Parameters:
        - target_path: path to restore.
        - backup_indexes_paths: path to backup indexes, in descending date order.
          The indexes indicate the location of their respective backup volumes,
          and multiple indexes are needed to be able to restore diff backups.
          Note that this is an optional parameter: if not supplied, it will
          try to restore directly from backup_tar_path.
        - backup_tar_path: path to the backup tar file. Used as an alternative
          to backup_indexes_paths to restore directly from a tar file without
          using any file index. If it's a multivol tarfile, volume_name_func
          will be called.
        - restore_callback: callback function to be called during restore.
          This is passed to the helper and gets called for every file.
        - disaster: tolerance level for damaged input; one of the
          tarfile.TOLERANCE_* constants (strict by default).
        - backup_index: an already parsed index, as generated in rescue mode,
          used instead of reading the index from disk.
        - strict_validation: enable strict IV checking in the crypto layer;
          should be disabled when dealing with potentially corrupted data.

        NOTE: If you want to use an index to restore a backup, this function
        only supports to do so when the tarfile mode is either uncompressed or
        uses concat compress mode, because otherwise it would be very slow.

        NOTE: Indices are assumed to follow the same format as the index_mode
        specified in the constructor.

        Returns the list of files that could not be restored, if there were
        any.
        '''
        # check/sanitize input
        if not isinstance(target_path, str):
            raise Exception('Target path must be a string')

        if not backup_indexes_paths and backup_tar_path is None:
            raise Exception("You have to either provide index paths or a tar path")

        if isinstance (backup_index, list):
            mode = "disaster"
        elif len(backup_indexes_paths) == 0:
            mode = "tar"
        else:
            mode = "diff"

        if mode == "tar":
            if not isinstance(backup_tar_path, str):
                raise Exception('Backup tar path must be a string')

            if not os.path.exists(backup_tar_path) or not os.path.isfile(backup_tar_path):
                raise Exception('Source path "%s" does not exist or is not a '\
                                'file' % backup_tar_path)

            if not os.access(backup_tar_path, os.R_OK):
                raise Exception('Source path "%s" is not readable' % backup_tar_path)
        else:
            if not isinstance(backup_indexes_paths, list):
                raise Exception('backup_indexes_paths must be a list')

            if self.mode.startswith(':') or self.mode.startswith('|'):
                raise Exception('Restore only supports either uncompressed tars'
                                ' or concat compression when restoring from an index, and '
                                ' the open mode you provided is "%s"' % self.mode)

            for index in backup_indexes_paths:
                if not isinstance(index, str):
                    raise Exception('indices must be strings')

                if not os.path.exists(index) or not os.path.isfile(index):
                    raise Exception('Index path "%s" does not exist or is not a '\
                                    'file' % index)

                if not os.access(index, os.R_OK):
                    raise Exception('Index path "%s" is not readable' % index)

        # try to create the target path if needed
        os.makedirs(target_path, exist_ok=True)

        # make backup_tar_path absolute so that iterate_tar_path works fine
        if backup_tar_path and not os.path.isabs(backup_tar_path):
            backup_tar_path = os.path.abspath(backup_tar_path)

        cwd = os.getcwd()
        os.chdir(target_path)

        # setup for decrypting payload
        if self.decryptor is None:
            self.decryptor = \
                self.initialize_encryption (CRYPTO_MODE_DECRYPT,
                                            strict_validation=strict_validation)

        if mode == 'tar':
            index_it = self.iterate_tar_path(backup_tar_path)
            helper = RestoreHelper(self, cwd, backup_path=backup_tar_path,
                                   tarobj=index_it.tar_obj)
        elif mode == "diff":
            helper = RestoreHelper(self, cwd, backup_indexes_paths,
                                   disaster=disaster)
            try:
                # get iterator from newest index at _data[0]
                index1 = helper._data[0]["path"]
                index_it = \
                    self.iterate_index_path(index1,
                                            strict_validation=strict_validation)
            except tarfile.DecryptionError as exn:
                self.logger.error("failed to decrypt file [%s]: %s; is this an "
                                  "actual encrypted index file?"
                                  % (index1, str (exn)))
                return [(index1, exn)]
            except Exception as exn:
                # compressed files
                self.logger.error("failed to read file [%s]: %s; is this an "
                                  "actual index file?" % (index1, str (exn)))
                return [(index1, exn)]
        elif mode == "disaster":
            index_it = self.iterate_disaster_index (backup_index)
            helper = RestoreHelper (self, cwd, backup_path=backup_tar_path,
                                    backup_index=backup_index,
                                    disaster=disaster)

        index_decryptor = helper._data[0]["decryptor"]

        dir_it = self._recursive_walk_dir('.')
        dir_path_it = self.jsonize_path_iterator(dir_it)

        failed = [] # irrecoverable files

        # for each file to be restored, do:
        for ipath, dpath, l_no in self.collate_iterators(index_it, dir_path_it):
            if not ipath:
                upath = dpath['path']
                op_type = dpath['type']
            else:
                upath = self.unprefixed(ipath['path'])
                op_type = ipath['type']

            # filter paths
            if self.filter_path(upath, '', op_type == 'directory') == NO_MATCH:
                continue

            # if types of the file mismatch, the file needs to be deleted
            # and re-restored
            if ipath is not None and dpath is not None and\
               dpath['type'] != ipath['type']:
                helper.delete(upath)

            # if file not found in dpath, we can directly restore from index
            if not dpath:
                # if the file doesn't exist and it needs to be deleted, it
                # means that work is already done
                if ipath['path'].startswith('delete://'):
                    continue
                try:
                    self.logger.debug("restore %s" % ipath['path'])
                    helper.restore(ipath, l_no, restore_callback)
                except Exception as e:
                    iipath = ipath.get ("path", "")
                    self.logger.error("FAILED to restore: {} ({})"
                                      .format(iipath, e))
                    if disaster != tarfile.TOLERANCE_STRICT:
                        failed.append ((iipath, e))
                continue

            # if both files are equal, we have nothing to restore
            if self._equal_stat_dicts(ipath, dpath, listsnapshot_equal=True):
                continue

            # we have to restore the file, but first we need to delete the
            # current existing file.
            # we don't delete the file if it's a directory, because it might
            # just have changed mtime, so it's quite inefficient to remove
            # it
            if ipath:
                if ipath['type'] != 'directory' or ipath['path'].startswith('delete://'):
                    helper.delete(upath)
                self.logger.debug("restore %s" % ipath['path'])
                try:
                    helper.restore(ipath, l_no, restore_callback)
                except Exception as e:
                    if disaster == tarfile.TOLERANCE_STRICT:
                        raise
                    failed.append ((ipath.get ("path", ""), e))
                continue

            # if the file is not in the index (so it comes from the target
            # directory) then we have to delete it
            else:
                self.logger.debug("delete %s" % upath)
                helper.delete(upath)

        helper.restore_directories_permissions()
        index_it.release()
        os.chdir(cwd)
        helper.cleanup()

        return failed

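    # Illustrative usage sketch, not part of the module; paths are made up.
    # Indexes go newest first, ending with the last full backup:
    #
    #     failed = dtar.restore_backup('/srv/restore',
    #         backup_indexes_paths=[
    #             '/backup/2014-05-diff/bdiff-2014-05-14-1030.index.gz',
    #             '/backup/2014-04-full/bfull-2014-04-14-1030.index.gz',
    #         ])
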

    def recover_backup(self, target_path, backup_indexes_paths=[],
                       restore_callback=None):
        """
        Walk the index, extracting objects in disaster mode. Bad files are
        reported along with a reason.

        *Security considerations*: In *recovery mode* the headers of encrypted
        objects are assumed damaged and GCM tags are not validated so
        modification of cryptographically relevant parts of the header (more
        specifically, the initialization vectors) can no longer be detected. If
        an attacker can manipulate the encrypted backup set and has access to
        the plaintext of some of the contents, they may be able to obtain the
        plaintext of other encrypted objects by injecting initialization
        vectors. For this reason *recovery mode* should only be used in
        emergency situations and the contents of the resulting files should be
        validated manually if possible and not be disclosed to untrusted
        parties.
        """
        return self.restore_backup(target_path,
                                   backup_indexes_paths=backup_indexes_paths,
                                   restore_callback=restore_callback,
                                   disaster=tarfile.TOLERANCE_RECOVER,
                                   strict_validation=False)


    def rescue_backup(self, target_path, backup_tar_path,
                      restore_callback=None):
        """
        More aggressive “unfsck” mode: do not rely on the index data as the
        files may be corrupt; skim files for header-like information and
        attempt to retrieve the data.

        *Security considerations*: As with *recovery mode*, in *rescue mode*
        the headers of encrypted objects are assumed damaged and GCM tags are
        not validated so modification of cryptographically relevant parts of
        the header (more specifically, the initialization vectors) can no
        longer be detected. If an attacker can manipulate the encrypted backup
        set and has access to the plaintext of some of the contents, they may
        be able to obtain the plaintext of other encrypted objects by injecting
        initialization vectors. For this reason *rescue mode* should only be
        used in emergency situations and the contents of the resulting files
        should be validated manually if possible and not be disclosed to
        untrusted parties.
        """
        def gen_volume_name (nvol):
            return os.path.join (os.path.dirname (backup_tar_path),
                                 self.volume_name_func (backup_tar_path,
                                                        True,
                                                        nvol))

        backup_index = tarfile.gen_rescue_index (gen_volume_name,
                                                 self.mode,
                                                 password=self.password,
                                                 key=self.crypto_key)

        return self.restore_backup(target_path,
                                   backup_index=backup_index,
                                   backup_tar_path=backup_tar_path,
                                   restore_callback=restore_callback,
                                   disaster=tarfile.TOLERANCE_RESCUE,
                                   strict_validation=False)


    def _parse_json_line(self, f, l_no):
        '''
        Read a line from the file-like object and process it as JSON.
        '''
        l = f.readline()
        l_no += 1
        try:
            j = json.loads(l.decode('UTF-8'))
        except UnicodeDecodeError as e:
            if tuple (l [0:2]) == tarfile.GZ_MAGIC:
                raise Exception \
                    ("error parsing line #%d as json: looks like a compressed file (%d B: [%s..])"
                     % (l_no, len (l), binascii.hexlify (l [:16]).decode ())) \
                    from e
            raise Exception \
                ("error parsing line #%d as json: not a text file (%d B: [%s..])"
                 % (l_no, len (l), binascii.hexlify (l [:16]).decode ())) \
                from e
        except ValueError as e:
            raise Exception("error parsing this json line "
                            "(line number %d): %s" % (l_no, l)) from e
        return j, l_no

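    # Illustrative sketch, not part of the module: parsing the first header
    # line of an index file.
    #
    #     j, l_no = self._parse_json_line(f, 0)
    #     # j    == {'type': 'python-delta-tar-index', 'version': 1, ...}
    #     # l_no == 1
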


class RestoreHelper(object):
    '''
    Class used to help to restore files from indices
    '''

    # holds the dicts of data
    _data = []

    _deltatar = None

    _cwd = None

    # list of directories to be restored. This is done as a last step, see
    # tarfile.extractall for details.
    _directories = []

    _disaster = tarfile.TOLERANCE_STRICT

    def __init__(self, deltatar, cwd, index_list=None, backup_path=False,
                 backup_index=None, tarobj=None,
                 disaster=tarfile.TOLERANCE_STRICT):
        '''
        Constructor opens the tars and initializes the data structures.

        Assumptions:

        - Index list must be provided in reverse order (newer first).
        - “newer first” apparently means that if there are n backups
          provided, the last full backup is at index n-1 and the most recent
          diff backup is at index 0.
        - Only the first, the second, and the last elements of
          ``index_list`` are relevant, others will not be accessed.
        - If no ``index_list`` is provided, both ``tarobj`` and
          ``backup_path`` must be passed.
        - If ``index_list`` is provided, the values of ``tarobj`` and
          ``backup_path`` are ignored.
        '''
        self._data = []
        self._directories = []
        self._deltatar = deltatar
        self._cwd = cwd
        self._password = deltatar.password
        self._crypto_key = deltatar.crypto_key
        self._decryptors = []
        self._disaster = disaster

        # Disable strict checking for linearly increasing IVs when running
        # in rescue or recover mode.
        strict_validation = disaster == tarfile.TOLERANCE_STRICT

        try:
            import grp, pwd
        except ImportError:
            grp = pwd = None

        if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
            self.canchown = True
        else:
            self.canchown = False

        if isinstance (backup_index, list):
            decryptor = self._deltatar.decryptor
            self._data = \
                [{ "curr_vol_no" : None
                 , "vol_fd" : None
                 , "offset" : -1
                 , "tarobj" : None
                 , "path" : backup_path
                 , "is_full" : True
                 , "iterator" : None
                 , "last_itelement" : None
                 , "last_lno" : 0
                 , "new_volume_handler" :
                       partial(self.new_volume_handler,
                               self._deltatar, self._cwd, True,
                               os.path.dirname(backup_path), decryptor)
                 , "decryptor" : decryptor
                 }]
        elif index_list is not None:
            for index in index_list:
                is_full = index == index_list[-1]

                decryptor = None
                if self._password is not None:
                    decryptor = crypto.Decrypt (password=self._password,
                                                key=self._crypto_key,
                                                strict_ivs=strict_validation)

                # make paths absolute to avoid cwd problems
                if not os.path.isabs(index):
                    index = os.path.normpath(os.path.join(cwd, index))

                s = dict(
                    curr_vol_no = None,
                    vol_fd = None,
                    offset = -1,
                    tarobj = None,
                    path = index,
                    is_full = is_full,
                    iterator = None,
                    last_itelement = None,
                    last_lno = 0,
                    new_volume_handler = partial(self.new_volume_handler,
                                                 self._deltatar, self._cwd, is_full,
                                                 os.path.dirname(index), decryptor),
                    decryptor = decryptor
                )
                self._data.append(s)
        else:
            # make paths absolute to avoid cwd problems
            if not os.path.isabs(backup_path):
                backup_path = os.path.normpath(os.path.join(cwd, backup_path))

            # update the new_volume_handler of tar_obj
            tarobj.new_volume_handler = partial(self.new_volume_handler,
                self._deltatar, self._cwd, True, os.path.dirname(backup_path),
                self._deltatar.decryptor)
            s = dict(
                curr_vol_no = None,
                vol_fd = None,
                offset = -1,
                tarobj = tarobj,
                path = backup_path,
                is_full = True,
                iterator = None,
                last_itelement = None,
                last_lno = 0,
                new_volume_handler = tarobj.new_volume_handler,
                decryptor = self._deltatar.decryptor
            )
            self._data.append(s)

1785
1786 def cleanup(self):
1787 '''
1788 Closes all open files
1789 '''
1790 for data in self._data:
1791 if data['vol_fd']:
1792 data['vol_fd'].close()
1793 data['vol_fd'] = None
1794 if data['tarobj']:
1795 data['tarobj'].close()
1796 data['tarobj'] = None
1797
1798 def delete(self, path):
1799 '''
1800 Delete a file
1801 '''
1802 if not os.path.exists(path):
1803 return
1804
1805 # to preserve parent directory mtime, we save it
1806 parent_dir = os.path.dirname(path) or os.getcwd()
1807 parent_dir_mtime = int(os.stat(parent_dir).st_mtime)
1808
1809 if os.path.isdir(path) and not os.path.islink(path):
1810 shutil.rmtree(path)
1811 else:
1812 os.unlink(path)
1813
1814 # now we restore parent_directory mtime
1815 os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime))
1816
1817 def restore(self, itpath, l_no, callback=None):
1818 '''
1819 Restore the path from the appropriate backup. Receives the current path
1820 from the newest (=first) index iterator. itpath must not be None.
1821 callback is a custom function that gets called for every file.
1822
1823 NB: This function takes the attribute ``_data`` as input but will only
1824 ever use its first and, if available, second element. Anything else in
1825 ``._data[]`` will be ignored.
1826 '''
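# Sketch of the index path prefixes this method distinguishes (the
# entries themselves are hypothetical):
#
#     delete://./etc/old.conf    -> deleted; already handled, nothing to do
#     snapshot://./etc/app.conf  -> content stored in this backup
#     list://./etc/same.conf     -> unchanged here; search older indices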
1827 path = itpath['path']
1828
1829 # Calls the callback function
1830 if callback:
1831 callback()
1832
1833 if path.startswith('delete://'):
1834 # the file has already been deleted in restore_backup in all
1835 # cases, so there is nothing left to do here
1836 return
1837
1838 # get data from newest index (_data[0])
1839 data = self._data[0]
1840 upath = self._deltatar.unprefixed(path)
1841
1842 # to preserve parent directory mtime, we save it
1843 parent_dir = os.path.dirname(upath) or os.getcwd()
1844 os.makedirs(parent_dir, exist_ok=True)
1845 parent_dir_mtime = int(os.stat(parent_dir).st_mtime)
1846
1847 # if path is found in the newest index as to be snapshotted, deal with it
1848 # and finish
1849 if path.startswith('snapshot://'):
1850 self.restore_file(itpath, data, path, l_no, upath)
1851
1852 # now we restore parent_directory mtime
1853 os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime))
1854 return
1855
1856 # we go from index to index, finding the path in the index, then finding
1857 # the index with the most recent snapshot of the file being restored
1858 #
1859 # Right now we support diff backups only, not incremental backups.
1860 # As a result _data[0] is always the diff backup index
1861 # and _data[1] the full backup index.
1862 if len(self._data) == 2:
1863 data = self._data[1]
1864 d, l_no, dpath = self.find_path_in_index(data, upath)
1865 if not d:
1866 self._deltatar.logger.warning('Error restoring file %s from '
1867 'index, not found in index %s' % (path, data['path']))
1868 return
1869
1870 cur_path = d.get('path', '')
1871 if cur_path.startswith('delete://'):
1872 self._deltatar.logger.warning(('Inconsistent indices: file '
1873 '%s was listed in the first index but deleted by another '
1874 'one. Path was ignored and left untouched.') % path)
1875 return
1876 elif cur_path.startswith('snapshot://'):
1877 # this code path is reached when the file is unchanged
1878 # in the newest index and therefore of type 'list://'
1879 self.restore_file(d, data, path, l_no, dpath)
1880
1881 # now we restore parent_directory mtime
1882 os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime))
1883 return
1884
1885 # error code path is reached when:
1886 # a) we have more than two indexes (unsupported atm)
1887 # b) both indexes contain a list:// entry (logic error)
1888 # c) we have just one index and it also contains list://
1889 self._deltatar.logger.warning(('Error restoring file %s from index, '
1890 'snapshot not found in any index') % path)
1891
1892 def find_path_in_index(self, data, upath):
1893 # NOTE: the iterator state is cached because the index may have to be
1894 # walked over completely multiple times, for example if one path is not
1895 # found in one index and we have to go to the next index.
1896 it = data['iterator']
1897 if it is None:
1898 it = data['iterator'] = self._deltatar.iterate_index_path(data["path"])
1899 d, l_no = it.__next__()
1900 else:
1901 d = data['last_itelement']
1902 l_no = data['last_lno']
1903
1904 while True:
1905 dpath = self._deltatar.unprefixed(d.get('path', ''))
1906 if upath == dpath:
1907 data['last_itelement'] = d
1908 data['last_lno'] = l_no
1909 return d, l_no, dpath
1910
1911 up, dp = self._deltatar.compare_indexes(upath, dpath)
1912 # if upath should have appeared before the current dpath, upath is
1913 # just not in this index and we should stop (see sketch below)
1914 if dp is None:
1915 data['last_itelement'] = d
1916 data['last_lno'] = l_no
1917 return None, 0, ''
1918
1919 try:
1920 d, l_no = it.__next__()
1921 except StopIteration:
1922 data['last_itelement'] = d
1923 data['last_lno'] = l_no
1924 return None, 0, ''
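# Assumed contract of compare_indexes, as relied upon above (the paths
# are hypothetical): entries are visited in index order, and a None
# second return value signals that upath sorts before dpath, i.e. upath
# cannot appear later in this index:
#
#     up, dp = self._deltatar.compare_indexes("./a/b", "./a/c")
#     if dp is None:
#         ...  # "./a/b" would already have been seen; give up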
1925
1926 def restore_directories_permissions(self):
1927 '''
1928 Restore directory permissions when everything has been restored
1929 '''
1930 try:
1931 import grp, pwd
1932 except ImportError:
1933 grp = pwd = None
1934
1935 self._directories.sort(key=operator.attrgetter('name'))
1936 self._directories.reverse()
1937
1938 # Set correct owner, mtime and filemode on directories.
1939 for member in self._directories:
1940 dirpath = member.name
1941 try:
1942 os.chmod(dirpath, member.mode)
1943 os.utime(dirpath, (member.mtime, member.mtime))
1944 if self.canchown:
1945 # We have to be root to do so.
1946 try:
1947 g = grp.getgrnam(member.gname)[2]
1948 except KeyError:
1949 g = member.gid
1950 try:
1951 u = pwd.getpwnam(member.uname)[2]
1952 except KeyError:
1953 u = member.uid
1954 try:
1955 if member.issym and hasattr(os, "lchown"):
1956 os.lchown(dirpath, u, g)
1957 else:
1958 os.chown(dirpath, u, g)
1959 except EnvironmentError:
1960 raise tarfile.ExtractError("could not change owner")
1961
1962 except tarfile.ExtractError as e:
1963 self._deltatar.logger.warning('tarfile: %s' % e)
1964
1965 @staticmethod
1966 def new_volume_handler(deltarobj, cwd, is_full, backup_path, decryptor, tarobj, base_name, volume_number):
1967 '''
1968 Set up a new volume and perform the tasks necessary for transitioning
1969 to the next one.
1970 '''
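# The five leading parameters are pre-bound with functools.partial in
# __init__; tarfile then supplies the remaining (tarobj, base_name,
# volume_number) whenever it crosses a volume boundary.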
1971 volume_name = deltarobj.volume_name_func(backup_path, is_full,
1972 volume_number, guess_name=True)
1973 volume_path = os.path.join(backup_path, volume_name)
1974
1975 # we convert relative paths into absolute because CWD is changed
1976 if not os.path.isabs(volume_path):
1977 volume_path = os.path.join(cwd, volume_path)
1978
1979 tarobj.open_volume(volume_path, encryption=decryptor)
1980
1981 def restore_file(self, file_data, index_data, path, l_no, unprefixed_path):
1982 '''
1983 Restores a snapshot of a file from a specific backup
1984 '''
1985 op_type = file_data.get('type', -1)
1986 member = file_data.get('member', None)
1987 ismember = bool(member)
1988
1989 # when member is set, then we can assume everything is right and we
1990 # just have to restore the path
1991 if member is None:
1992 vol_no = file_data.get('volume', -1)
1993 # sanity check
1994 if not isinstance(vol_no, int) or vol_no < 0:
1995 self._deltatar.logger.warning('unrecognized type to be restored: '
1996 '%s, line %d' % (op_type, l_no))
1997
1998 # setup the volume that needs to be read. only needed when member is
1999 # not set
2000 if index_data['curr_vol_no'] != vol_no:
2001 index_data['curr_vol_no'] = vol_no
2002 backup_path = os.path.dirname(index_data['path'])
2003 vol_name = self._deltatar.volume_name_func(backup_path,
2004 index_data['is_full'], vol_no, guess_name=True)
2005 vol_path = os.path.join(backup_path, vol_name)
2006 if index_data['vol_fd']:
2007 index_data['vol_fd'].close()
2008 index_data['vol_fd'] = open(vol_path, 'rb')
2009
2010 # force reopen of the tarobj because of new volume
2011 if index_data['tarobj']:
2012 index_data['tarobj'].close()
2013 index_data['tarobj'] = None
2014
2015 # seek tarfile if needed
2016 offset = file_data.get('offset', -1)
2017 if index_data['tarobj']:
2018 if self._disaster == tarfile.TOLERANCE_RESCUE:
2019 # force a seek and reopen
2020 index_data['tarobj'].close()
2021 index_data['tarobj'] = None
2022 else:
2023 try:
2024 member = index_data['tarobj'].__iter__().__next__()
2025 except tarfile.DecryptionError:
2026 pass
2027 except tarfile.CompressionError:
2028 pass
2029
2030 if not member or member.path != file_data['path']:
2031 # force a seek and reopen
2032 index_data['tarobj'].close()
2033 index_data['tarobj'] = None
2034
2035
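# Each index entry carries the byte offset of its member within the
# volume, so a single file can be restored by seeking to that offset
# and opening a fresh tar stream there (in concatenated-stream mode
# when the mode string contains '#').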
2036 # open the tarfile if needed
2037 if not index_data['tarobj']:
2038 index_data['vol_fd'].seek(offset)
2039 index_data['tarobj'] = tarfile.open(mode="r" + self._deltatar.mode,
2040 fileobj=index_data['vol_fd'],
2041 format=tarfile.GNU_FORMAT,
2042 concat='#' in self._deltatar.mode,
2043 encryption=index_data["decryptor"],
2044 new_volume_handler=index_data['new_volume_handler'],
2045 save_to_members=False,
2046 tolerance=self._disaster)
2047
2048 member = index_data['tarobj'].__iter__().__next__()
2049
2050 member.path = unprefixed_path
2051 member.name = unprefixed_path
2052
2053 if op_type == 'directory':
2054 self.add_member_dir(member)
2055 member = copy.copy(member)
2056 member.mode = 0o0700
2057
2058 # if it's an existing directory, we don't need to recreate it;
2059 # just set the right permissions, mtime and the like
2060 if os.path.exists(member.path):
2061 return
2062
2063 if not ismember:
2064 # set current volume number in tarobj, otherwise the extraction of the
2065 # file might fail when trying to extract a multivolume member
2066 index_data['tarobj'].volume_number = index_data['curr_vol_no']
2067
2068 def ignore_symlink (member, *_args):
2069 self._deltatar.logger.warning("Ignoring symlink %s" % member.name)
2070
2071 # finally, restore the file
2072 index_data['tarobj'].extract(member, symlink_cb=ignore_symlink,
2073 unlink=True)
2074
2075 def add_member_dir(self, member):
2076 '''
2077 Add member dir to be restored at the end
2078 '''
2079 if not self.canchown:
2080 self._directories.append(DirItem(name=member.name, mode=member.mode,
2081 mtime=member.mtime))
2082 else:
2083 self._directories.append(DirItem(name=member.name, mode=member.mode,
2084 mtime=member.mtime, gname=member.gname, uname=member.uname,
2085 uid=member.uid, gid=member.gid, issym=member.issym()))
2086
2087class DirItem(object):
2088 def __init__(self, **kwargs):
2089 for k, v in kwargs.items():
2090 setattr(self, k, v)
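# Minimal usage sketch (values are hypothetical): DirItem is a plain
# attribute bag consumed by RestoreHelper.add_member_dir and
# restore_directories_permissions:
#
#     item = DirItem(name="var/log", mode=0o755, mtime=1500000000)
#     # item.name == "var/log", item.mode == 0o755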