python 3.7 support: Track cpython removal of re._pattern_type
[python-delta-tar] / deltatar / deltatar.py
6b2fa38f 1#!/usr/bin/env python3
0708a374 2
51797cd6 3# Copyright (C) 2013, 2014 Intra2net AG
0708a374
ERE
4#
5# This program is free software; you can redistribute it and/or modify
6# it under the terms of the GNU Lesser General Public License as published
7# by the Free Software Foundation; either version 3 of the License, or
8# (at your option) any later version.
9#
10# This program is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13# GNU Lesser General Public License for more details.
14#
15# You should have received a copy of the GNU General Public License
16# along with this program. If not, see
17# <http://www.gnu.org/licenses/lgpl-3.0.html>
18
938c2d54
PG
19DELTATAR_HEADER_VERSION = 1
20DELTATAR_PARAMETER_VERSION = 1
3fdea6d4 21
0708a374
ERE
22import logging
23import datetime
6c678f3a 24import binascii
938c2d54 25import io
0501fe0a 26import operator
0708a374 27import os
0501fe0a 28import copy
82de3376 29import shutil
8a8fadda 30import re
e82f14f5
ERE
31import stat
32import json
c9ee0159 33import typing
0708a374
ERE
34from functools import partial
35
36from . import tarfile
2ae46844 37from . import crypto
0708a374 38
0708a374
ERE
39class NullHandler(logging.Handler):
40 def emit(self, record):
41 pass
24ddf0a2
ERE
42
43
0708a374
ERE
44logging.getLogger("deltatar.DeltaTar").addHandler(NullHandler())
45
974408b5
ERE
46
47# match mode
48NO_MATCH = False
49MATCH = True
50PARENT_MATCH = 2
51
133d30da
PG
52# encryption direction
53CRYPTO_MODE_ENCRYPT = 0
54CRYPTO_MODE_DECRYPT = 1
55
13cc7dfc
PG
56# The canonical extension for encrypted backup files regardless of the actual
57# encryption parameters is “.pdtcrypt”. This is analogous to the encryption
58# header which starts with the eight ASCII bytes “PDTCRYPT”. Historical note:
59# Since the introduction of the versioned header there is no longer any need
60# for encoding encryption parameters in the file extensions (“.aes128” and
61# suchlike).
62PDTCRYPT_EXTENSION = "pdtcrypt"
2cdd9faf
PG
63PDT_TYPE_ARCHIVE = 0
64PDT_TYPE_AUX = 1
13cc7dfc 65
9eccb1c2
PG
66AUXILIARY_FILE_INDEX = 0
67AUXILIARY_FILE_INFO = 1
68
0708a374
ERE
69class DeltaTar(object):
70 '''
71 Backup class used to create backups
72 '''
73
74 # list of files to exclude in the backup creation or restore operation. It
75 # can contain python regular expressions.
76 excluded_files = []
77
78 # list of files to include in the backup creation or restore operation. It
79 # can contain python regular expressions. If empty, all files in the source
80 # path will be backed up (when creating a backup) or all the files in the
a83fa4ed 81 # backup will be restored (when restoring a backup), but if included_files
0708a374
ERE
82 # is set then only the files included in the list will be processed.
83 included_files = []
84
85 # custom filter of files to be backed up (or restored). Unused and unset
86 # by default. The function receives a file path and must return a boolean.
87 filter_func = None
88
da26094a
ERE
89 # mode in which the delta will be created (when creating a backup) or
90 # opened (when restoring). Accepts modes analogous to those of the tarfile library.
91 mode = ""
0708a374
ERE
92
93 # used together with aes modes to encrypt and decrypt backups.
94 password = None
1f3fd7b0
PG
95 crypto_key = None
96 nacl = None
0708a374 97
dbee011c
PG
98 # parameter version to use when encrypting; note that this has no effect
99 # on decryption since the required settings are determined from the headers
54f909ca 100 crypto_version = DELTATAR_HEADER_VERSION
dbee011c
PG
101 crypto_paramversion = None
102
133d30da 103 # when encrypting or decrypting, these hold crypto handlers; created before
2ae46844 104 # establishing the Tarfile stream iff a password is supplied.
133d30da
PG
105 encryptor = None
106 decryptor = None
2ae46844 107
0708a374
ERE
108 # python logger object.
109 logger = None
110
3a7e1a50
ERE
111 # specifies the index mode in the same format as @param mode, but without
112 # the ':', '|' or '#' at the beginning. It doesn't make sense to specify
2ae46844 113 # that the index is encrypted if no password is given in the constructor.
3a7e1a50 114 index_mode = None
0708a374
ERE
115
116 # current time for this backup. Used for file names and file creation checks
117 current_time = None
118
9eae9a1f
ERE
119 # extra data to be included in the header of the index file when creating a
120 # backup
121 extra_data = dict()
122
0708a374
ERE
123 # valid tarfile modes and their corresponding default file extension
124 __file_extensions_dict = {
da26094a
ERE
125 '': '',
126 ':': '',
127 ':gz': '.gz',
128 ':bz2': '.bz2',
129 '|': '',
130 '|gz': '.gz',
131 '|bz2': '.bz2',
132 '#gz': '.gz',
6e99d23a
PG
133 '#gz.pdtcrypt': '.gz',
134 '#pdtcrypt': '',
d1c38f40 135 '#': '',
0708a374
ERE
136 }
137
3a7e1a50
ERE
138 # valid index modes and their corresponding default file extension
139 __index_extensions_dict = {
140 '': '',
141 'gz': '.gz',
142 'bz2': '.bz2',
6e99d23a
PG
143 'gz.pdtcrypt': '.gz',
144 'pdtcrypt': '',
3a7e1a50
ERE
145 }
146
8adbe50d
ERE
147 # valid path prefixes
148 __path_prefix_list = [
149 u'snapshot://',
150 u'list://',
151 u'delete://'
152 ]
153
0708a374 154 def __init__(self, excluded_files=[], included_files=[],
da26094a 155 filter_func=None, mode="", password=None,
1f3fd7b0 156 crypto_key=None, nacl=None,
54f909ca 157 crypto_version=DELTATAR_HEADER_VERSION,
dbee011c 158 crypto_paramversion=DELTATAR_PARAMETER_VERSION,
3a7e1a50 159 logger=None, index_mode=None, index_name_func=None,
0708a374
ERE
160 volume_name_func=None):
161 '''
162 Constructor. Configures the diff engine.
163
164 Parameters:
165 - excluded_files: list of files to exclude in the backup creation or
166 restore operation. It can contain python regular expressions.
167
168 - included_files: list of files to include in the backup creation or
169 restore operation. It can contain python regular expressions. If
170 empty, all files in the source path will be backed up (when creating a
171 backup) or all the files in the backup will be restored (when
a83fa4ed 172 restoring a backup), but if included_files is set then only the files
0708a374
ERE
173 included in the list will be processed.
174
175 - filter_func: custom filter of files to be backed up (or restored).
176 Unused and unset by default. The function receives a file path and
177 must return a boolean.
178
179 - mode: mode in which the delta will be created (when creating a backup)
180 or opened (when restoring). Accepts the same modes as the tarfile
181 library. Valid modes are:
182
da26094a
ERE
183 '' open uncompressed
184 ':' open uncompressed
185 ':gz' open with gzip compression
186 ':bz2' open with bzip2 compression
187 '|' open an uncompressed stream of tar blocks
188 '|gz' open a gzip compressed stream of tar blocks
189 '|bz2' open a bzip2 compressed stream of tar blocks
190 '#gz' open a stream of gzip compressed tar blocks
0708a374 191
1f3fd7b0
PG
192 - crypto_key: used to encrypt and decrypt backups. Encryption will
193 be enabled automatically if a key is supplied. Requires a salt to be
194 passed as well.
195
196 - nacl: salt that was used to derive the encryption key for embedding
197 in the PDTCRYPT header. Not needed when decrypting and when
198 encrypting with password.
199
6e99d23a
PG
200 - password: used to encrypt and decrypt backups. Encryption will be
201 enabled automatically if a password is supplied.
0708a374 202
54f909ca
PG
203 - crypto_version: version of the format, determining the kind of PDT
204 object header.
205
dbee011c
PG
206 - crypto_paramversion: optionally request encryption conforming to
207 a specific parameter version. Defaults to the standard PDT value
208 which as of 2017 is the only one available.
209
0708a374
ERE
210 - logger: python logger object. Optional.
211
3a7e1a50 212 - index_mode: specifies the index mode in the same format as @param
6e99d23a
PG
213 mode, but without the ':', '|' or '#' at the beginning. If encryption
214 is requested it will extend to the auxiliary (index, info) files as
215 well. This is an optional parameter that will automatically mimic
216 @param mode by default if not provided. Valid modes are:
3a7e1a50
ERE
217
218 '' open uncompressed
219 'gz' open with gzip compression
220 'bz2' open with bzip2 compression
0708a374
ERE
221
222 - index_name_func: function that sets a custom name for the index file.
2cc6e32b
PG
223 This function receives a flag to indicate whether the name will be
224 used for a full or diff backup. The backup path will be prepended to
225 its return value.
0708a374
ERE
226
227 - volume_name_func: function that defines the name of tar volumes. It
228 receives the backup_path, whether the backup is a full one, and the
229 volume number, and must return the name for the corresponding volume.
230 Optional; DeltaTar has default names for tar volumes.
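
        A minimal usage sketch (illustrative only; the mode, password and
        paths below are hypothetical, not defaults of this module):

            import re
            dtar = DeltaTar(mode='#gz', password='example-password',
                            excluded_files=[re.compile(r'.*\.tmp$')],
                            logger=None)
            dtar.create_full_backup('/srv/data', '/var/backups/full')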
231 '''
232
da26094a 233 if mode not in self.__file_extensions_dict:
8a54d5dd
PG
234 raise Exception('Unrecognized extension mode=[%s] requested for files'
235 % str(mode))
0708a374
ERE
236
237 self.excluded_files = excluded_files
238 self.included_files = included_files
239 self.filter_func = filter_func
240 self.logger = logging.getLogger('deltatar.DeltaTar')
241 if logger:
242 self.logger.addHandler(logger)
243 self.mode = mode
2ae46844 244
1f3fd7b0
PG
245 if crypto_key is not None:
246 self.crypto_key = crypto_key
247 self.nacl = nacl # encryption only
248
2ae46844
PG
249 if password is not None:
250 self.password = password
3a7e1a50 251
54f909ca
PG
252 if crypto_version is not None:
253 self.crypto_version = crypto_version
254
dbee011c
PG
255 if crypto_paramversion is not None:
256 self.crypto_paramversion = crypto_paramversion
257
3a7e1a50
ERE
258 # generate index_mode
259 if index_mode is None:
260 index_mode = ''
6e99d23a 261 if 'gz' in mode:
3a7e1a50
ERE
262 index_mode = "gz"
263 elif 'bz2' in mode:
264 index_mode = "bz2"
265 elif mode not in self.__index_extensions_dict:
8a54d5dd
PG
266 raise Exception('Unrecognized extension mode=[%s] requested for index'
267 % str(mode))
3a7e1a50
ERE
268
269 self.index_mode = index_mode
0708a374
ERE
270 self.current_time = datetime.datetime.now()
271
272 if index_name_func is not None:
273 self.index_name_func = index_name_func
274
275 if volume_name_func is not None:
276 self.volume_name_func = volume_name_func
277
e54cfec5 278 def pick_extension(self, kind, mode=None):
2cdd9faf
PG
279 """
280 Choose the extension depending on a) the kind of file given, b) the
281 processing mode, and c) the current encryption settings.
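
        For instance (illustrative), with mode '#gz' and a password set,
        pick_extension(PDT_TYPE_ARCHIVE, '.gz') yields '.tar.gz.pdtcrypt'
        and pick_extension(PDT_TYPE_AUX, '.gz') yields '.gz.pdtcrypt'.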
282 """
283 ret = ""
284 if kind == PDT_TYPE_ARCHIVE:
285 ret += ".tar"
e54cfec5
PG
286 if mode is None:
287 mode = self.__index_extensions_dict [self.index_mode]
2cdd9faf 288 ret += mode
a83fa4ed 289 if self.crypto_key is not None or self.password is not None:
2cdd9faf
PG
290 ret += "." + PDTCRYPT_EXTENSION
291 return ret
292
f0287fb7 293 def index_name_func(self, is_full): # pylint: disable=method-hidden
0708a374 294 '''
2cc6e32b
PG
295 Callback for setting a custom name for the index file. Depending on
296 whether *is_full* is set, it will create a suitable name for a full
297 or a diff backup.
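
        For instance (illustrative timestamp), with index_mode 'gz' a full
        backup index is named like 'bfull-2017-07-01-1030.index.gz', plus a
        '.pdtcrypt' suffix when encryption is enabled.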
0708a374
ERE
298 '''
299 prefix = "bfull" if is_full else "bdiff"
f7940c31 300 date_str = self.current_time.strftime("%Y-%m-%d-%H%M")
2cdd9faf
PG
301 extension = self.pick_extension \
302 (PDT_TYPE_AUX,
303 self.__index_extensions_dict [self.index_mode])
0708a374 304
da26094a 305 return "%s-%s.index%s" % (prefix, date_str, extension)
0708a374 306
f0287fb7
CH
307 def volume_name_func(self, backup_path, # pylint: disable=method-hidden
308 is_full, volume_number,
309 guess_name=False):
0708a374
ERE
310 '''
311 Function that defines the name of tar volumes. It receives the
312 backup_path, whether the backup is a full one, and the volume number,
313 and must return the name for the corresponding volume. Optional;
314 DeltaTar has default names for tar volumes.
df86af81
ERE
315
316 If guess_name is activated, the file is intended not to be created but
317 to be found, and thus the date will be guessed.
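
        For instance (illustrative timestamp), with mode '#gz' the first
        volume of a full backup is named like
        'bfull-2017-07-01-1030-001.tar.gz' (plus '.pdtcrypt' when encryption
        is enabled); with guess_name=True only the 'bfull-' prefix and the
        '-001.tar.gz' postfix are matched against the files in backup_path.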
0708a374
ERE
318 '''
319 prefix = "bfull" if is_full else "bdiff"
2cdd9faf
PG
320 extension = self.pick_extension \
321 (PDT_TYPE_ARCHIVE,
322 self.__file_extensions_dict [self.mode])
0708a374 323
df86af81 324 if not guess_name:
f7940c31 325 date_str = self.current_time.strftime("%Y-%m-%d-%H%M")
2cdd9faf 326 return "%s-%s-%03d%s" % (prefix, date_str, volume_number + 1, extension)
df86af81
ERE
327 else:
328 prefix = prefix + "-"
90b75470 329 postfix = "-%03d%s" % (volume_number + 1, extension)
86a6e741
ERE
330 for f in os.listdir(backup_path):
331 if f.startswith(prefix) and f.endswith(postfix):
332 return f
df86af81
ERE
333 raise Exception("volume not found")
334
0708a374 335
974408b5 336 def filter_path(self, path, source_path="", is_dir=None):
8a8fadda
ERE
337 '''
338 Filters a path, given the source_path, using the filtering properties
339 set in the constructor.
340 The filtering order is:
341 1. included_files (if any)
342 2. excluded_files
343 3. filter_func (which must return whether the file is accepted or not)
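
        A sketch of the resulting values (illustrative, hypothetical paths):

            d = DeltaTar(included_files=['home/user'],
                         excluded_files=['home/user/.cache'])
            d.filter_path('/src/home/user/a.txt', '/src')       # MATCH
            d.filter_path('/src/home/user/.cache/x', '/src')    # NO_MATCH
            d.filter_path('/src/home', '/src', is_dir=True)     # PARENT_MATCH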
344 '''
75059f3c 345
c1af2184 346 if len(source_path) > 0:
75059f3c
CH
347 # ensure that exactly one '/' at end of dir is also removed
348 source_path = source_path.rstrip(os.sep) + os.sep
8a8fadda
ERE
349 path = path[len(source_path):]
350
351 # 1. filter included_files
974408b5 352 match = MATCH
8a8fadda 353 if len(self.included_files) > 0:
974408b5 354 match = NO_MATCH
8a8fadda
ERE
355 for i in self.included_files:
356 # it can be either a regexp or a string
be60ffd0 357 if isinstance(i, str):
8a8fadda
ERE
358 # if the string matches, then continue
359 if i == path:
974408b5 360 match = MATCH
c1af2184 361 break
8a8fadda
ERE
362
363 # if the string ends with / it's a directory, and if the
7b07645e 364 # path is contained in it, it is included
c1af2184 365 if i.endswith('/') and path.startswith(i):
974408b5 366 match = MATCH
c1af2184 367 break
8a8fadda
ERE
368
369 # if the string doesn't end with /, add it and do the same
370 # check
c1af2184 371 elif path.startswith(i + '/'):
974408b5 372 match = MATCH
c1af2184 373 break
8a8fadda 374
974408b5
ERE
375 # check for PARENT_MATCH
376 if is_dir:
377 dir_path = path
378 if not dir_path.endswith('/'):
379 dir_path += '/'
380
381 if i.startswith(dir_path):
382 match = PARENT_MATCH
383
8a8fadda 384 # if it's a reg exp, then we just check if it matches
c9ee0159 385 elif isinstance(i, typing.Pattern):
c1af2184 386 if i.match(path):
974408b5 387 match = MATCH
c1af2184 388 break
8a8fadda 389 else:
4bda6f45 390 self.logger.warning('Invalid pattern in included_files: %s' % str(i))
8a8fadda 391
974408b5
ERE
392 if match == NO_MATCH:
393 return NO_MATCH
c1af2184 394
974408b5
ERE
395 # when a directory is in PARENT_MATCH, it doesn't matter if it's
396 # excluded. Its subfiles will be excluded, but the directory itself
397 # won't
398 if match != PARENT_MATCH:
8a8fadda
ERE
399 for e in self.excluded_files:
400 # it can be either a regexp or a string
be60ffd0 401 if isinstance(e, str):
8a8fadda 402 # if the string matches, then exclude
c1af2184 403 if e == path:
974408b5 404 return NO_MATCH
8a8fadda
ERE
405
406 # if the string ends with / it's a directory, and if the
407 # path starts with the directory, then exclude
c1af2184 408 if e.endswith('/') and path.startswith(e):
974408b5 409 return NO_MATCH
8a8fadda
ERE
410
411 # if the string doesn't end with /, do the same check with
412 # the slash added
c1af2184 413 elif path.startswith(e + '/'):
974408b5 414 return NO_MATCH
8a8fadda
ERE
415
416 # if it's a reg exp, then we just check if it matches
c9ee0159 417 elif isinstance(e, typing.Pattern):
c1af2184 418 if e.match(path):
974408b5 419 return NO_MATCH
8a8fadda 420 else:
4bda6f45 421 self.logger.warning('Invalid pattern in excluded_files: %s' % str(e))
8a8fadda
ERE
422
423 if self.filter_func:
424 return self.filter_func(path)
425
974408b5 426 return match
8a8fadda 427
283fbd5e 428 def _recursive_walk_dir(self, source_path, keep_base_dir=False):
0708a374
ERE
429 '''
430 Walk a directory recursively, yielding each file/directory
c059a221
PG
431
432 Returns the path of an entity. If ``keep_base_dir`` is set,
433 the path returned contains the prefix ``source_path``; otherwise it is
434 relative to the prefix.
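
        Sketch (hypothetical directory):

            for relpath in self._recursive_walk_dir('/srv/data'):
                ...   # e.g. 'etc', 'etc/hosts', ... sorted per directory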
0708a374
ERE
435 '''
436
283fbd5e 437 source_path = source_path.rstrip(os.sep)
0708a374 438
283fbd5e 439 if keep_base_dir:
adf7dac4 440 beginning_size = 0
283fbd5e
CH
441 else:
442 beginning_size = len(source_path) + 1 # +1 for os.sep
443
444 queue = [source_path]
445
d07c8065 446 while queue:
df86af81 447 cur_path = queue.pop(0)
0708a374 448
e76ca7e0
PG
449 try:
450 dfd = os.open (cur_path, os.O_DIRECTORY)
451 except FileNotFoundError as exn:
452 self.logger.warning ("failed to open entity [%s] as directory "
453 "(error: %s); skipping"
454 % (cur_path, str (exn)))
d86735e4
ERE
455 continue
456
c059a221
PG
457 try:
458 for filename in sorted(os.listdir(dfd)):
459 child = os.path.join(cur_path, filename)
460 is_dir = os.path.isdir(child)
461 status = self.filter_path(child, source_path, is_dir)
462 if status == NO_MATCH:
463 continue
464 if not os.access(child, os.R_OK):
465 self.logger.warning('Error accessing possibly locked file %s' % child)
466 continue
467
468 if status == MATCH:
469 yield child[beginning_size:]
470
471 if is_dir and (status == MATCH or status == PARENT_MATCH):
472 queue.append(child)
473 finally:
474 os.close (dfd)
0708a374 475
e82f14f5
ERE
476 def _stat_dict(self, path):
477 '''
478 Returns a dict with the stat data used to compare files
479 '''
480 stinfo = os.stat(path)
481 mode = stinfo.st_mode
482
483 ptype = None
484 if stat.S_ISDIR(mode):
d07c8065 485 ptype = u'directory'
e82f14f5 486 elif stat.S_ISREG(mode):
d07c8065 487 ptype = u'file'
e82f14f5 488 elif stat.S_ISLNK(mode):
d07c8065 489 ptype = u'link'
e82f14f5
ERE
490
491 return {
d07c8065 492 u'type': ptype,
be60ffd0 493 u'path': path,
d07c8065 494 u'mode': mode,
0501fe0a
ERE
495 u'mtime': int(stinfo.st_mtime),
496 u'ctime': int(stinfo.st_ctime),
d07c8065
ERE
497 u'uid': stinfo.st_uid,
498 u'gid': stinfo.st_gid,
499 u'inode': stinfo.st_ino,
500 u'size': stinfo.st_size
e82f14f5
ERE
501 }
502
df99a044 503 def _equal_stat_dicts(self, d1, d2, listsnapshot_equal=False):
d07c8065
ERE
504 '''
505 Return whether the dicts are equal in the stat keys
506 '''
fc8fdcbc 507 keys = [u'type', u'mode',u'size', u'mtime',
d041935c 508 # not restored: u'inode', u'ctime'
df99a044 509 ]
8adbe50d 510
fc8fdcbc 511 # only check gid/uid when running as root; otherwise skip the check,
d041935c 512 # because tarfile can only chown when running as superuser
50d70ca9
PG
513 #
514 # also, skip the check in rpmbuild since the sources end up with the
515 # uid:gid of the packager while the extracted files are 0:0.
516 if hasattr(os, "geteuid") and os.geteuid() == 0 \
517 and os.getenv ("RPMBUILD_OPTIONS") is None:
fc8fdcbc
ERE
518 keys.append('gid')
519 keys.append('uid')
520
ea6d3c3e 521 if (not d1 and d2 != None) or (d1 != None and not d2):
8adbe50d
ERE
522 return False
523
cbac9f0b
ERE
524 if self.prefixed(d1.get('path', -1), listsnapshot_equal) != self.prefixed(d2.get('path', -2), listsnapshot_equal):
525 return False
8adbe50d 526
fc8fdcbc
ERE
527 type = d1.get('type', '')
528
d07c8065 529 for key in keys:
fc8fdcbc
ERE
530 # size doesn't matter for directories
531 if type == 'directory' and key == 'size':
532 continue
d07c8065
ERE
533 if d1.get(key, -1) != d2.get(key, -2):
534 return False
535 return True
536
df99a044 537 def prefixed(self, path, listsnapshot_equal=False):
8adbe50d
ERE
538 '''
539 if a path is not prefixed, return it prefixed
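
        For example (illustrative): prefixed('etc/hosts') returns
        'snapshot://etc/hosts'; an already prefixed 'delete://etc/hosts' is
        returned unchanged; with listsnapshot_equal=True a 'list://' prefix
        is rewritten to 'snapshot://'.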
540 '''
541 for prefix in self.__path_prefix_list:
542 if path.startswith(prefix):
df99a044
ERE
543 if listsnapshot_equal and prefix == u'list://':
544 return u'snapshot://' + path[len(prefix):]
8adbe50d
ERE
545 return path
546 return u'snapshot://' + path
547
548 def unprefixed(self, path):
549 '''
550 remove a path prefix if any
551 '''
552 for prefix in self.__path_prefix_list:
553 if path.startswith(prefix):
554 return path[len(prefix):]
555 return path
556
133d30da
PG
557
558 def initialize_encryption (self, mode):
559 password = self.password
1f3fd7b0
PG
560 key = self.crypto_key
561 nacl = self.nacl
133d30da 562
1f3fd7b0 563 if key is None and password is None:
133d30da
PG
564 return
565 if mode == CRYPTO_MODE_ENCRYPT:
1f3fd7b0
PG
566 return crypto.Encrypt (password=password,
567 key=key,
568 nacl=nacl,
54f909ca 569 version=self.crypto_version,
774ca538 570 paramversion=self.crypto_paramversion)
133d30da 571 if mode == CRYPTO_MODE_DECRYPT:
1f3fd7b0 572 return crypto.Decrypt (password=password, key=key)
133d30da
PG
573
574 raise Exception ("invalid encryption mode [%r]" % mode)
575
576
9eccb1c2 577 def open_auxiliary_file(self, path, mode='r', kind=AUXILIARY_FILE_INDEX):
3a7e1a50 578 '''
9eccb1c2
PG
579 Given the specified configuration, opens a file for reading or writing,
580 inheriting the encryption and compression settings from the backup.
581 Returns a file object ready to use.
3fdea6d4 582
c8c72fe1
PG
583 :param mode: IO mode (read or write, ``"r"`` and ``"w"``,
584 respectively).
585 :type mode: str
774ca538
PG
586 :param kind: Role of the file, see AUXILIARY_FILE_* constants.
587 Both the info and the auxiliary file have a globally
588 unique, constant counter value.
3fdea6d4 589 :type kind: int
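
        Sketch of the write path (illustrative file name):

            sink = self.open_auxiliary_file('/var/backups/bfull.index.gz', 'w')
            sink.write(b'{"type": "BEGIN-FILE-LIST"}\n')
            sink.close(close_fileobj=True)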
3a7e1a50 590 '''
3a7e1a50
ERE
591 if self.index_mode.startswith('gz'):
592 comptype = 'gz'
593 elif self.index_mode.startswith('bz2'):
594 comptype = 'bz2'
595 else:
596 comptype = 'tar'
597
133d30da 598 crypto_ctx = None
6de9444a 599 enccounter = None
133d30da 600 if mode == "w":
774ca538 601 crypto_ctx = self.initialize_encryption (CRYPTO_MODE_ENCRYPT)
133d30da 602 elif mode == "r":
774ca538 603 crypto_ctx = self.initialize_encryption (CRYPTO_MODE_DECRYPT)
133d30da 604
3031b7ae
PG
605 if crypto_ctx is not None:
606 if kind == AUXILIARY_FILE_INFO:
607 enccounter = crypto.AES_GCM_IV_CNT_INFOFILE
608 elif kind == AUXILIARY_FILE_INDEX:
609 enccounter = crypto.AES_GCM_IV_CNT_INDEX
610 else:
611 raise Exception ("invalid kind of aux file %r" % kind)
612
c8c72fe1 613 sink = tarfile._Stream(name=path, mode=mode, comptype=comptype,
3fdea6d4 614 bufsize=tarfile.RECORDSIZE, fileobj=None,
6de9444a 615 encryption=crypto_ctx, enccounter=enccounter)
c8c72fe1
PG
616
617 return sink
618
3a7e1a50 619
0708a374 620 def create_full_backup(self, source_path, backup_path,
d4a05db6 621 max_volume_size=None, extra_data=dict()):
0708a374
ERE
622 '''
623 Creates a full backup.
624
625 Parameters:
626 - source_path: source path to the directory to back up.
627 - backup_path: path where the back up will be stored. Backup path will
628 be created if not existent.
d5361dac
ERE
629 - max_volume_size: maximum volume size in megabytes. Used to split the
630 backup in volumes. Optional (won't split in volumes by default).
9eae9a1f
ERE
631 - extra_data: a json-serializable dictionary with information that you
632 want to be included in the header of the index file
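
        Example (sketch; paths and sizes are hypothetical):

            dtar.create_full_backup('/srv/data', '/var/backups/full',
                                    max_volume_size=100,
                                    extra_data={'origin': 'host1'})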
0708a374
ERE
633 '''
634 # check input
be60ffd0 635 if not isinstance(source_path, str):
0708a374
ERE
636 raise Exception('Source path must be a string')
637
be60ffd0 638 if not isinstance(backup_path, str):
0708a374
ERE
639 raise Exception('Backup path must be a string')
640
641 if not os.path.exists(source_path) or not os.path.isdir(source_path):
642 raise Exception('Source path "%s" does not exist or is not a '\
643 'directory' % source_path)
644
d07c8065
ERE
645 if max_volume_size != None and (not isinstance(max_volume_size, int) or\
646 max_volume_size < 1):
647 raise Exception('max_volume_size must be a positive integer')
d5361dac
ERE
648 if max_volume_size != None:
649 max_volume_size = max_volume_size*1024*1024
650
9eae9a1f
ERE
651 if not isinstance(extra_data, dict):
652 raise Exception('extra_data must be a dictionary')
653
654 try:
655 extra_data_str = json.dumps(extra_data)
656 except:
657 raise Exception('extra_data is not json-serializable')
658
0708a374
ERE
659 if not os.access(source_path, os.R_OK):
660 raise Exception('Source path "%s" is not readable' % source_path)
661
662 # try to create backup path if needed
37ab0f57 663 os.makedirs(backup_path, exist_ok=True)
0708a374
ERE
664
665 if not os.access(backup_path, os.W_OK):
666 raise Exception('Backup path "%s" is not writeable' % backup_path)
667
668 if source_path.endswith('/'):
669 source_path = source_path[:-1]
670
671 if backup_path.endswith('/'):
672 backup_path = backup_path[:-1]
673
674 # update current time
675 self.current_time = datetime.datetime.now()
676
677 if self.mode not in self.__file_extensions_dict:
678 raise Exception('Unrecognized extension')
679
2ae46844 680 # setup for encrypting payload
774ca538
PG
681 if self.encryptor is None:
682 self.encryptor = self.initialize_encryption (CRYPTO_MODE_ENCRYPT)
2ae46844 683
0708a374 684 # some initialization
11684b1d 685 self.vol_no = 0
0708a374
ERE
686
687 # generate the first volume name
688 vol_name = self.volume_name_func(backup_path, True, 0)
689 tarfile_path = os.path.join(backup_path, vol_name)
690
774ca538
PG
691 # init index
692 index_name = self.index_name_func(True)
693 index_path = os.path.join(backup_path, index_name)
694 index_sink = self.open_auxiliary_file(index_path, 'w')
e82f14f5 695
d5361dac
ERE
696 cwd = os.getcwd()
697
b7c47f38 698 def new_volume_handler(deltarobj, cwd, backup_path, encryption, tarobj, base_name, volume_number):
0708a374
ERE
699 '''
700 Handles the new volumes
701 '''
d5361dac
ERE
702 volume_name = deltarobj.volume_name_func(backup_path, True, volume_number)
703 volume_path = os.path.join(backup_path, volume_name)
11684b1d 704 deltarobj.vol_no = volume_number
d5361dac
ERE
705
706 # we convert relative paths into absolute because CWD is changed
707 if not os.path.isabs(volume_path):
708 volume_path = os.path.join(cwd, volume_path)
11684b1d 709
8e019196
ERE
710 if tarobj.fileobj is not None:
711 tarobj.fileobj.close()
712
b008f989
ERE
713 deltarobj.logger.debug("opening volume %s" % volume_path)
714
b7c47f38 715 tarobj.open_volume(volume_path, encryption=encryption)
d5361dac
ERE
716
717 # wraps some args from context into the handler
133d30da 718 new_volume_handler = partial(new_volume_handler, self, cwd, backup_path, self.encryptor)
0708a374 719
774ca538 720 index_sink.write(bytes('{"type": "python-delta-tar-index", "version": 1, "backup-type": "full", "extra_data": %s}\n' % extra_data_str, 'UTF-8'))
6c678f3a 721
be60ffd0 722 s = bytes('{"type": "BEGIN-FILE-LIST"}\n', 'UTF-8')
6c678f3a 723 # calculate checksum and write into the stream
c2ffe2ec 724 crc = binascii.crc32(s) & 0xFFFFffff
774ca538 725 index_sink.write(s)
e82f14f5 726
0708a374
ERE
727 # start creating the tarfile
728 tarobj = tarfile.TarFile.open(tarfile_path,
da26094a 729 mode='w' + self.mode,
0708a374 730 format=tarfile.GNU_FORMAT,
d1c38f40 731 concat='#' in self.mode,
133d30da 732 encryption=self.encryptor,
0708a374 733 max_volume_size=max_volume_size,
ea625b04 734 new_volume_handler=new_volume_handler,
e2b59b34
ERE
735 save_to_members=False,
736 dereference=True)
e5c6ca04 737 os.chdir(source_path)
55b8686d
ERE
738
739 # for each file to be in the backup, do:
e82f14f5 740 for path in self._recursive_walk_dir('.'):
3e9b81bb
PG
741
742 try: # backup file
fd743c26
PG
743 # calculate stat dict for current file
744 statd = self._stat_dict(path)
745 statd['path'] = u'snapshot://' + statd['path']
746 statd['volume'] = self.vol_no
747
748 # backup file
3e9b81bb
PG
749 tarobj.add(path, arcname = statd['path'], recursive=False)
750 except FileNotFoundError as exn:
751 # file vanished since the call to access(3) above
752 self.logger.warning ("object [%s] no longer available in "
753 "file system (error: %s); skipping"
754 % (path, str (exn)))
755 continue # prevent indexing
11684b1d 756
55b8686d 757 # retrieve file offset
253d4cdd 758 statd['offset'] = tarobj.get_last_member_offset()
b008f989 759 self.logger.debug("backup %s" % statd['path'])
6c678f3a 760
d041935c 761 # store the stat dict in the index
be60ffd0 762 s = bytes(json.dumps(statd) + '\n', 'UTF-8')
6c678f3a 763 crc = binascii.crc32(s, crc) & 0xffffffff
774ca538 764 index_sink.write(s)
e82f14f5 765
be60ffd0 766 s = bytes('{"type": "END-FILE-LIST"}\n', 'UTF-8')
6c678f3a 767 crc = binascii.crc32(s, crc) & 0xffffffff
774ca538 768 index_sink.write(s)
be60ffd0 769 s = bytes('{"type": "file-list-checksum", "checksum": %d}\n' % crc, 'UTF-8')
774ca538
PG
770 index_sink.write(s)
771
e5c6ca04 772 os.chdir(cwd)
0708a374 773 tarobj.close()
c8c72fe1 774 index_sink.close (close_fileobj=True)
938c2d54 775
0708a374 776 def create_diff_backup(self, source_path, backup_path, previous_index_path,
d4a05db6 777 max_volume_size=None, extra_data=dict()):
0708a374
ERE
778 '''
779 Creates a differential backup.
780
781 Parameters:
782 - source_path: source path to the directory to back up.
783 - backup_path: path where the back up will be stored. Backup path will
784 be created if not existent.
785 - previous_index_path: index of the previous backup, needed to know
786 which files changed since then.
787 - max_volume_size: maximum volume size in megabytes (MB). Used to split
788 the backup in volumes. Optional (won't split in volumes by default).
3a7e1a50
ERE
789
790 NOTE: previous index is assumed to follow exactly the same format as
791 the index_mode setup in the constructor.
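
        Example (sketch; paths are hypothetical):

            dtar.create_diff_backup('/srv/data', '/var/backups/diff-1',
                                    '/var/backups/full/bfull-2017-07-01-1030.index.gz')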
0708a374 792 '''
d07c8065 793 # check/sanitize input
be60ffd0 794 if not isinstance(source_path, str):
d07c8065
ERE
795 raise Exception('Source path must be a string')
796
be60ffd0 797 if not isinstance(backup_path, str):
d07c8065
ERE
798 raise Exception('Backup path must be a string')
799
800 if not os.path.exists(source_path) or not os.path.isdir(source_path):
801 raise Exception('Source path "%s" does not exist or is not a '\
802 'directory' % source_path)
803
9eae9a1f
ERE
804 if not isinstance(extra_data, dict):
805 raise Exception('extra_data must be a dictionary')
806
807 try:
808 extra_data_str = json.dumps(extra_data)
809 except:
810 raise Exception('extra_data is not json-serializable')
811
d07c8065
ERE
812 if not os.access(source_path, os.R_OK):
813 raise Exception('Source path "%s" is not readable' % source_path)
814
815 if max_volume_size != None and (not isinstance(max_volume_size, int) or\
816 max_volume_size < 1):
817 raise Exception('max_volume_size must be a positive integer')
818 if max_volume_size != None:
819 max_volume_size = max_volume_size*1024*1024
820
be60ffd0 821 if not isinstance(previous_index_path, str):
d07c8065
ERE
822 raise Exception('previous_index_path must be a string')
823
824 if not os.path.exists(previous_index_path) or not os.path.isfile(previous_index_path):
825 raise Exception('Index path "%s" does not exist or is not a '\
826 'file' % previous_index_path)
827
828 if not os.access(previous_index_path, os.R_OK):
829 raise Exception('Index path "%s" is not readable' % previous_index_path)
830
831 # try to create backup path if needed
37ab0f57 832 os.makedirs(backup_path, exist_ok=True)
d07c8065
ERE
833
834 if not os.access(backup_path, os.W_OK):
835 raise Exception('Backup path "%s" is not writeable' % backup_path)
836
837 if source_path.endswith('/'):
838 source_path = source_path[:-1]
839
840 if backup_path.endswith('/'):
841 backup_path = backup_path[:-1]
842
843 # update current time
844 self.current_time = datetime.datetime.now()
845
846 if self.mode not in self.__file_extensions_dict:
847 raise Exception('Unrecognized extension')
848
2ae46844 849 # setup for encrypting payload
774ca538
PG
850 if self.encryptor is None:
851 self.encryptor = self.initialize_encryption (CRYPTO_MODE_ENCRYPT)
133d30da 852
d07c8065
ERE
853 # some initialization
854 self.vol_no = 0
855
856 # generate the first volume name
df86af81
ERE
857 vol_name = self.volume_name_func(backup_path, is_full=False,
858 volume_number=0)
d07c8065
ERE
859 tarfile_path = os.path.join(backup_path, vol_name)
860
938c2d54 861 # init index
d07c8065
ERE
862 cwd = os.getcwd()
863
3031b7ae
PG
864 index_name = self.index_name_func(is_full=False)
865 index_path = os.path.join(backup_path, index_name)
866 index_sink = self.open_auxiliary_file(index_path, 'w')
867
d07c8065
ERE
868 def new_volume_handler(deltarobj, cwd, backup_path, tarobj, base_name, volume_number):
869 '''
870 Handles the new volumes
871 '''
df86af81
ERE
872 volume_name = deltarobj.volume_name_func(backup_path, is_full=False,
873 volume_number=volume_number)
d07c8065
ERE
874 volume_path = os.path.join(backup_path, volume_name)
875 deltarobj.vol_no = volume_number
876
877 # we convert relative paths into absolute because CWD is changed
878 if not os.path.isabs(volume_path):
879 volume_path = os.path.join(cwd, volume_path)
880
f624ff3d 881 deltarobj.logger.debug("opening volume %s" % volume_path)
d07c8065
ERE
882 tarobj.open_volume(volume_path)
883
884 # wraps some args from context into the handler
885 new_volume_handler = partial(new_volume_handler, self, cwd, backup_path)
886
3031b7ae 887 index_sink.write(bytes('{"type": "python-delta-tar-index", "version": 1, "backup-type": "diff", "extra_data": %s}\n' % extra_data_str, 'UTF-8'))
d07c8065 888
be60ffd0 889 s = bytes('{"type": "BEGIN-FILE-LIST"}\n', 'UTF-8')
d07c8065 890 # calculate checksum and write into the stream
c2ffe2ec 891 crc = binascii.crc32(s) & 0xFFFFffff
3031b7ae 892 index_sink.write(s)
d07c8065
ERE
893
894 # start creating the tarfile
895 tarobj = tarfile.TarFile.open(tarfile_path,
896 mode='w' + self.mode,
897 format=tarfile.GNU_FORMAT,
d1c38f40 898 concat='#' in self.mode,
133d30da 899 encryption=self.encryptor,
d07c8065 900 max_volume_size=max_volume_size,
ea625b04 901 new_volume_handler=new_volume_handler,
e2b59b34
ERE
902 save_to_members=False,
903 dereference=True)
d07c8065 904
aae127d0
ERE
905
906 # create the iterators, first the previous index iterator, then the
907 # source path directory iterator and collate and iterate them
908 if not os.path.isabs(previous_index_path):
909 previous_index_path = os.path.join(cwd, previous_index_path)
910 index_it = self.iterate_index_path(previous_index_path)
911
d07c8065 912 os.chdir(source_path)
aae127d0
ERE
913 dir_it = self._recursive_walk_dir('.')
914 dir_path_it = self.jsonize_path_iterator(dir_it)
d07c8065 915
df86af81
ERE
916 def pr(path):
917 if not path:
918 return "None"
919 else:
920 return path["path"]
8edb2e3c 921
d07c8065 922 # for each file to be in the backup, do:
df86af81 923 for ipath, dpath, l_no in self.collate_iterators(index_it, dir_path_it):
aae127d0
ERE
924 action = None
925 # if file is not in the index, it means it's a new file, so we have
926 # to take a snapshot
df86af81 927
aae127d0
ERE
928 if not ipath:
929 action = 'snapshot'
930 # if the file is not in the directory iterator, it means that it has
d041935c 931 # been deleted, so we need to mark it as such
aae127d0
ERE
932 elif not dpath:
933 action = 'delete'
934 # if the file is in both iterators, it means it might have either
935 # not changed (in which case we will just list it in our index but
936 # it will not be included in the tar file), or it might have
e8d95fe5 937 # changed, in which case we will snapshot it.
aae127d0
ERE
938 elif ipath and dpath:
939 if self._equal_stat_dicts(ipath, dpath):
940 action = 'list'
941 else:
942 action = 'snapshot'
943 # TODO: when creating chained backups (i.e. diffing from another
944 # diff), we will need to detect the type of action in the previous
945 # index, because if it was delete and dpath is None, we should
946 # discard the file
947
948 if action == 'snapshot':
949 # calculate stat dict for current file
950 stat = dpath.copy()
be60ffd0 951 stat['path'] = "snapshot://" + dpath['path']
aae127d0
ERE
952 stat['volume'] = self.vol_no
953
50f43227
ERE
954 self.logger.debug("[STORE] %s" % dpath['path'])
955
3e9b81bb
PG
956 try: # backup file
957 tarobj.add(dpath['path'], arcname=stat['path'], recursive=False)
958 # retrieve file offset
959 stat['offset'] = tarobj.get_last_member_offset()
960 except FileNotFoundError as exn:
961 # file vanished since the call to access(3) above
962 self.logger.warning ("object [%s] no longer available in "
963 "file system (error: %s); skipping"
964 % (dpath ["path"], str (exn)))
965 stat = None # prevent indexing
aae127d0 966
aae127d0 967 elif action == 'delete':
50f43227 968 path = self.unprefixed(ipath['path'])
aae127d0 969 stat = {
50f43227 970 u'path': u'delete://' + path,
aae127d0
ERE
971 u'type': ipath['type']
972 }
50f43227 973 self.logger.debug("[DELETE] %s" % path)
aae127d0
ERE
974
975 # mark it as deleted in the backup
42d39ca7 976 tarobj.add("/dev/null", arcname=stat['path'])
aae127d0
ERE
977 elif action == 'list':
978 stat = dpath.copy()
50f43227
ERE
979 path = self.unprefixed(ipath['path'])
980 stat['path'] = u'list://' + path
aae127d0 981 # unchanged files do not enter in the backup, only in the index
50f43227 982 self.logger.debug("[UNCHANGED] %s" % path)
80910564
TJ
983 else:
984 # should not happen
4bda6f45 985 self.logger.warning('unknown action in create_diff_backup: {0}'
80910564
TJ
986 ''.format(action))
987 stat = None
aae127d0 988
80910564
TJ
989 if stat:
990 # store the stat dict in the index
be60ffd0 991 s = bytes(json.dumps(stat) + '\n', 'UTF-8')
aae127d0 992 crc = binascii.crc32(s, crc) & 0xffffffff
3031b7ae 993 index_sink.write(s)
aae127d0 994
be60ffd0 995 s = bytes('{"type": "END-FILE-LIST"}\n', 'UTF-8')
aae127d0 996 crc = binascii.crc32(s, crc) & 0xffffffff
3031b7ae 997 index_sink.write(s)
be60ffd0 998 s = bytes('{"type": "file-list-checksum", "checksum": %d}\n' % crc, 'UTF-8')
3031b7ae 999 index_sink.write(s)
938c2d54 1000
df86af81 1001 index_it.release()
aae127d0
ERE
1002 os.chdir(cwd)
1003 tarobj.close()
938c2d54
PG
1004 index_sink.close()
1005
1006
d07c8065 1007 def iterate_index_path(self, index_path):
df86af81
ERE
1008 '''
1009 Returns an index iterator. Internally, it uses a classic iterator class.
1010 We do that instead of just yielding so that the iterator object can have
1011 an additional function to close the file descriptor that is opened in
1012 the constructor.
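
        Sketch of its use (hypothetical index file):

            it = self.iterate_index_path('/var/backups/full/bfull-2017-07-01-1030.index.gz')
            for stat, l_no in it:
                ...   # stat is a dict with 'path', 'type', 'mode', ...
            it.release()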
1013 '''
d07c8065 1014
df86af81
ERE
1015 class IndexPathIterator(object):
1016 def __init__(self, delta_tar, index_path):
1017 self.delta_tar = delta_tar
1018 self.index_path = index_path
1019 self.f = None
9eae9a1f 1020 self.extra_data = dict()
df86af81 1021 self.__enter__()
d07c8065 1022
df86af81
ERE
1023 def __iter__(self):
1024 return self
d07c8065 1025
df86af81
ERE
1026 def release(self):
1027 if self.f:
1028 self.f.close()
1029
1030 def __enter__(self):
1031 '''
1032 Allows this iterator to be used with the "with" statement
1033 '''
1034 if self.f is None:
9eccb1c2 1035 self.f = self.delta_tar.open_auxiliary_file(self.index_path, 'r')
df86af81
ERE
1036 # check index header
1037 j, l_no = self.delta_tar._parse_json_line(self.f, 0)
1038 if j.get("type", '') != 'python-delta-tar-index' or\
1039 j.get('version', -1) != 1:
1040 raise Exception("invalid index file format: %s" % json.dumps(j))
1041
9eae9a1f
ERE
1042 self.extra_data = j.get('extra_data', dict())
1043
df86af81
ERE
1044 # find BEGIN-FILE-LIST, ignore other headers
1045 while True:
1046 j, l_no = self.delta_tar._parse_json_line(self.f, l_no)
1047 if j.get('type', '') == 'BEGIN-FILE-LIST':
1048 break
1049 return self
1050
1051 def __exit__(self, type, value, tb):
1052 '''
1053 Allows this iterator to be used with the "with" statement
1054 '''
ec57ce53
ERE
1055 if self.f:
1056 self.f.close()
df86af81 1057 self.f = None
d07c8065 1058
be60ffd0 1059 def __next__(self):
0349168a 1060 # read each file in the index and process it to do the restore
df86af81
ERE
1061 j = {}
1062 l_no = -1
1063 try:
1064 j, l_no = self.delta_tar._parse_json_line(self.f, l_no)
be60ffd0 1065 except Exception as e:
df86af81
ERE
1066 if self.f:
1067 self.f.close()
1068 raise e
d07c8065 1069
df86af81 1070 op_type = j.get('type', '')
d07c8065 1071
df86af81
ERE
1072 # when we detect the end of the list, break the loop
1073 if op_type == 'END-FILE-LIST':
1074 if self.f:
1075 self.f.close()
1076 raise StopIteration
1077
1078 # check input
1079 if op_type not in ['directory', 'file', 'link']:
4bda6f45 1080 self.delta_tar.logger.warning('unrecognized type to be '
df86af81
ERE
1081 'restored: %s, line %d' % (op_type, l_no))
1082 # iterate again
be60ffd0 1083 return self.__next__()
df86af81
ERE
1084
1085 return j, l_no
d07c8065 1086
df86af81 1087 return IndexPathIterator(self, index_path)
d07c8065 1088
26fdd428 1089 def iterate_tar_path(self, tar_path, new_volume_handler=None):
24ddf0a2
ERE
1090 '''
1091 Returns a tar iterator that iterates jsonized member items that contain
1092 an additional "member" field, used by RestoreHelper.
1093 '''
ec57ce53 1094 class TarPathIterator(object):
83a81852 1095 def __init__(self, delta_tar, tar_path, new_volume_handler=None):
24ddf0a2 1096 self.delta_tar = delta_tar
ec57ce53 1097 self.tar_path = tar_path
24ddf0a2 1098 self.tar_obj = None
6bca471c 1099 self.last_member = None
26fdd428 1100 self.new_volume_handler = new_volume_handler
24ddf0a2
ERE
1101 self.__enter__()
1102
1103 def __iter__(self):
1104 return self
1105
1106 def release(self):
1107 if self.tar_obj:
1108 self.tar_obj.close()
1109
1110 def __enter__(self):
1111 '''
1112 Allows this iterator to be used with the "with" statement
1113 '''
1114 if self.tar_obj is None:
d5e1d60f
PG
1115 decryptor = None
1116 if self.delta_tar.password is not None:
1f3fd7b0
PG
1117 decryptor = crypto.Decrypt \
1118 (password=self.delta_tar.password,
1119 key=self.delta_tar.crypto_key)
ec57ce53
ERE
1120 self.tar_obj = tarfile.TarFile.open(self.tar_path,
1121 mode='r' + self.delta_tar.mode,
1122 format=tarfile.GNU_FORMAT,
d1c38f40 1123 concat='#' in self.delta_tar.mode,
d5e1d60f 1124 encryption=decryptor,
83a81852 1125 new_volume_handler=self.new_volume_handler,
e2b59b34
ERE
1126 save_to_members=False,
1127 dereference=True)
24ddf0a2
ERE
1128 return self
1129
1130 def __exit__(self, type, value, tb):
1131 '''
1132 Allows this iterator to be used with the "with" statement
1133 '''
ec57ce53
ERE
1134 if self.tar_obj:
1135 self.tar_obj.close()
24ddf0a2
ERE
1136 self.tar_obj = None
1137
be60ffd0 1138 def __next__(self):
24ddf0a2
ERE
1139 '''
1140 Read each member and return it as a stat dict
1141 '''
be60ffd0 1142 tarinfo = self.tar_obj.__iter__().__next__()
8e019196
ERE
1143 # NOTE: here we compare if tarinfo.path is the same as before
1144 # instead of comparing the tarinfo object itself because the
1145 # object itself might change for multivol tarinfos
1146 if tarinfo is None or (self.last_member is not None and\
1147 self.delta_tar.unprefixed(tarinfo.path) == self.delta_tar.unprefixed(self.last_member.path)):
ec57ce53
ERE
1148 raise StopIteration
1149
6bca471c
ERE
1150 self.last_member = tarinfo
1151
24ddf0a2
ERE
1152 ptype = 'unknown'
1153 if tarinfo.isfile():
1154 ptype = 'file'
1155 elif tarinfo.isdir():
ab7e7465 1156 ptype = 'directory'
24ddf0a2
ERE
1157 elif tarinfo.islnk() or tarinfo.issym():
1158 ptype = 'link'
1159
1160 return {
1161 u'type': ptype,
1162 u'path': tarinfo.path,
1163 u'mode': tarinfo.mode,
1164 u'mtime': tarinfo.mtime,
1165 u'ctime': -1, # cannot restore
1166 u'uid': tarinfo.uid,
1167 u'gid': tarinfo.gid,
1168 u'inode': -1, # cannot restore
1169 u'size': tarinfo.size,
1170 u'member': tarinfo
ec57ce53
ERE
1171 }, 0
1172
26fdd428 1173 return TarPathIterator(self, tar_path, new_volume_handler)
24ddf0a2 1174
df99a044 1175 def jsonize_path_iterator(self, iter, strip=0):
d07c8065
ERE
1176 '''
1177 converts the yielded items of an iterator into json path lines.
df99a044
ERE
1178
1179 strip: Strip the smallest prefix containing *strip* leading slashes from
1180 the file path.
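
        For instance (illustrative), with strip=1 an entry for './etc/hosts'
        is yielded with its 'path' shortened to 'etc/hosts'; each item is a
        (stat_dict, 0) tuple.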
d07c8065
ERE
1181 '''
1182 while True:
1183 try:
be60ffd0 1184 path = iter.__next__()
df99a044 1185 if strip == 0:
4ac6d333 1186 yield self._stat_dict(path), 0
df99a044
ERE
1187 else:
1188 st = self._stat_dict(path)
1189 st['path'] = "/".join(path.split("/")[strip:])
4ac6d333 1190 yield st, 0
d07c8065
ERE
1191 except StopIteration:
1192 break
1193
b84beea7
PG
1194 def iterate_disaster_index (self, index):
1195 """
1196 Mimic the behavior of the other object iterators, just with the inputs
1197 supplied directly as *index*.
1198 """
1199
1200 class RawIndexIterator(object):
65b35c42 1201 def __init__(self, delta_tar, index):
b84beea7
PG
1202 self.delta_tar = delta_tar
1203 self.index = index
1204 self.__enter__()
1205
1206 def __iter__(self):
1207 return self
1208
1209 def release(self):
65b35c42 1210 pass
b84beea7
PG
1211
1212 def __enter__(self):
1213 '''
1214 Allows this iterator to be used with the "with" statement
1215 '''
1216 self.iter = self.index.__iter__ ()
1217 return self
1218
1219 def __exit__(self, type, value, tb):
1220 '''
1221 Allows this iterator to be used with the "with" statement
1222 '''
1223
1224 def __next__(self):
1225 idxent = self.iter.__next__ ()
65b35c42 1226 return idxent, 0
b84beea7
PG
1227
1228 return RawIndexIterator(self, index)
1229
d07c8065
ERE
1230 def collate_iterators(self, it1, it2):
1231 '''
1232 Collate two iterators, so that it returns pairs of the items of each
1233 iterator (if the items are the same), or (None, elem2) or (elem1, None)
1234 when there's no match for the items in the other iterator.
1235
1236 It assumes that the items in both lists are ordered in the same way.
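
        For example (illustrative), if it1 yields entries for 'a' and 'c'
        while it2 yields entries for 'a' and 'b', the collated output is
        ('a', 'a', ...), (None, 'b', ...), ('c', None, ...).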
1237 '''
ea6d3c3e 1238 l_no = 0
d07c8065
ERE
1239 elem1, elem2 = None, None
1240 while True:
1241 if not elem1:
1242 try:
be60ffd0 1243 elem1, l_no = it1.__next__()
d07c8065
ERE
1244 except StopIteration:
1245 if elem2:
ea6d3c3e 1246 yield (None, elem2, l_no)
d07c8065 1247 for elem2 in it2:
ea6d3c3e
ERE
1248 if isinstance(elem2, tuple):
1249 elem2 = elem2[0]
1250 yield (None, elem2, l_no)
d07c8065 1251 break
d07c8065
ERE
1252 if not elem2:
1253 try:
be60ffd0 1254 elem2 = it2.__next__()
d07c8065
ERE
1255 if isinstance(elem2, tuple):
1256 elem2 = elem2[0]
1257 except StopIteration:
1258 if elem1:
ea6d3c3e 1259 yield (elem1, None, l_no)
df99a044 1260 for elem1, l_no in it1:
ea6d3c3e 1261 yield (elem1, None, l_no)
d07c8065 1262 break
670f9934
ERE
1263
1264 index1 = self.unprefixed(elem1['path'])
1265 index2 = self.unprefixed(elem2['path'])
1266 i1, i2 = self.compare_indexes(index1, index2)
1267
1268 yield1 = yield2 = None
1269 if i1 is not None:
1270 yield1 = elem1
1271 elem1 = None
1272 if i2 is not None:
1273 yield2 = elem2
1274 elem2 = None
1275 yield (yield1, yield2, l_no)
1276
1277 def compare_indexes(self, index1, index2):
1278 '''
1279 Compare iterator indexes and return a tuple in the following form:
1280 if index1 < index2, returns (index1, None)
1281 if index1 == index2 returns (index1, index2)
1282 else: returns (None, index2)
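
        For instance (illustrative): compare_indexes('a/b', 'a/b') returns
        ('a/b', 'a/b'); compare_indexes('a/b/c', 'a/z') returns (None, 'a/z')
        because shallower paths sort first; compare_indexes('a/b', 'a/c')
        returns ('a/b', None).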
1283 '''
1284 l1 = index1.split('/')
1285 l2 = index2.split('/')
1286 length = len(l2) - len(l1)
1287
1288 if length > 0:
1289 return (index1, None)
1290 elif length < 0:
1291 return (None, index2)
1292
1293 for i1, i2 in zip(l1, l2):
1294 if i1 < i2:
1295 return (index1, None)
1296 elif i1 > i2:
1297 return (None, index2)
1298
1299 return (index1, index2)
0708a374 1300
8c65a2b1 1301 def list_backup(self, backup_tar_path, list_func=None):
be60ffd0 1302 if not isinstance(backup_tar_path, str):
8c65a2b1
ERE
1303 raise Exception('Backup tar path must be a string')
1304
1305 if not os.path.exists(backup_tar_path) or not os.path.isfile(backup_tar_path):
1306 raise Exception('Source path "%s" does not exist or is not a '\
1307 'file' % backup_tar_path)
1308
1309 if not os.access(backup_tar_path, os.R_OK):
1310 raise Exception('Source path "%s" is not readable' % backup_tar_path)
1311
1312 cwd = os.getcwd()
1313
b7c47f38 1314 def new_volume_handler(deltarobj, cwd, backup_path, encryption, tarobj, base_name, volume_number):
8c65a2b1
ERE
1315 '''
1316 Handles the new volumes
1317 '''
1318 volume_name = deltarobj.volume_name_func(backup_path, True,
1319 volume_number, guess_name=True)
1320 volume_path = os.path.join(backup_path, volume_name)
1321
1322 # we convert relative paths into absolute because CWD is changed
1323 if not os.path.isabs(volume_path):
1324 volume_path = os.path.join(cwd, volume_path)
b7c47f38
PG
1325 tarobj.open_volume(volume_path, encryption=encryption)
1326
774ca538
PG
1327 if self.decryptor is None:
1328 self.decryptor = self.initialize_encryption (CRYPTO_MODE_DECRYPT)
8c65a2b1
ERE
1329
1330 backup_path = os.path.dirname(backup_tar_path)
1331 if not os.path.isabs(backup_path):
1332 backup_path = os.path.join(cwd, backup_path)
133d30da 1333 new_volume_handler = partial(new_volume_handler, self, cwd, backup_path, self.decryptor)
b7a6566b 1334
8c65a2b1
ERE
1335 tarobj = tarfile.TarFile.open(backup_tar_path,
1336 mode='r' + self.mode,
1337 format=tarfile.GNU_FORMAT,
d1c38f40 1338 concat='#' in self.mode,
133d30da 1339 encryption=self.decryptor,
ea625b04 1340 new_volume_handler=new_volume_handler,
e2b59b34
ERE
1341 save_to_members=False,
1342 dereference=True)
8c65a2b1
ERE
1343
1344 def filter(cls, list_func, tarinfo):
1345 if list_func is None:
b008f989 1346 self.logger.info(tarinfo.path)
8c65a2b1
ERE
1347 else:
1348 list_func(tarinfo)
1349 return False
1350 filter = partial(filter, self, list_func)
1351
c650acfa 1352 tarobj.extractall(filter=filter, unlink=True)
8c65a2b1
ERE
1353 tarobj.close()
1354
0708a374 1355 def restore_backup(self, target_path, backup_indexes_paths=[],
e93f83f1 1356 backup_tar_path=None, restore_callback=None,
b84beea7 1357 disaster=tarfile.TOLERANCE_STRICT, backup_index=None):
0708a374
ERE
1358 '''
1359 Restores a backup.
1360
1361 Parameters:
0708a374
ERE
1362 - target_path: path to restore.
1363 - backup_indexes_paths: path to backup indexes, in descending date order.
1364 The indexes indicate the location of their respective backup volumes,
1365 and multiple indexes are needed to be able to restore diff backups.
1366 Note that this is an optional parameter: if not supplied, it will
1367 try to restore directly from backup_tar_path.
1368 - backup_tar_path: path to the backup tar file. Used as an alternative
1369 to backup_indexes_paths to restore directly from a tar file without
1370 using any file index. If it's a multivol tarfile, volume_name_func
1371 will be called.
4da27cfe 1372 - restore_callback: callback function to be called during restore.
b0aef801 1373 This is passed to the helper and gets called for every file.
11684b1d 1374
3a7e1a50 1375 NOTE: If you want to use an index to restore a backup, this function
11684b1d
ERE
1376 only supports doing so when the tarfile mode is either uncompressed or
1377 uses concat compression mode, because otherwise it would be very slow.
3a7e1a50
ERE
1378
1379 NOTE: Indices are assumed to follow the same format as the index_mode
1380 specified in the constructor.
e93f83f1
PG
1381
1382 Returns the list of files that could not be restored, if there were
1383 any.
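
        Example (sketch; index paths are hypothetical, newest first):

            failed = dtar.restore_backup('/srv/restore',
                backup_indexes_paths=[
                    '/var/backups/diff-1/bdiff-2017-07-02-1030.index.gz',
                    '/var/backups/full/bfull-2017-07-01-1030.index.gz'])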
0708a374 1384 '''
11684b1d 1385 # check/sanitize input
be60ffd0 1386 if not isinstance(target_path, str):
e5c6ca04
ERE
1387 raise Exception('Target path must be a string')
1388
11684b1d
ERE
1389 if not backup_indexes_paths and backup_tar_path is None:
1390 raise Exception("You have to either provide index paths or a tar path")
e5c6ca04 1391
b84beea7
PG
1392 if isinstance (backup_index, list) is True:
1393 mode = "disaster"
1394 elif len(backup_indexes_paths) == 0:
ea6d3c3e
ERE
1395 mode = "tar"
1396 else:
1397 mode = "diff"
1398
1399 if mode == "tar":
be60ffd0 1400 if not isinstance(backup_tar_path, str):
11684b1d
ERE
1401 raise Exception('Backup tar path must be a string')
1402
1403 if not os.path.exists(backup_tar_path) or not os.path.isfile(backup_tar_path):
1404 raise Exception('Source path "%s" does not exist or is not a '\
1405 'file' % backup_tar_path)
1406
1407 if not os.access(backup_tar_path, os.R_OK):
1408 raise Exception('Source path "%s" is not readable' % backup_tar_path)
1409 else:
1410 if not isinstance(backup_indexes_paths, list):
1411 raise Exception('backup_indexes_paths must be a list')
1412
1413 if self.mode.startswith(':') or self.mode.startswith('|'):
1414 raise Exception('Restore only supports either uncompressed tars'
1415 ' or concat compression when restoring from an index, and '
1416 ' the open mode you provided is "%s"' % self.mode)
1417
1418 for index in backup_indexes_paths:
be60ffd0 1419 if not isinstance(index, str):
11684b1d 1420 raise Exception('indices must be strings')
e5c6ca04 1421
11684b1d
ERE
1422 if not os.path.exists(index) or not os.path.isfile(index):
1423 raise Exception('Index path "%s" does not exist or is not a '\
1424 'file' % index)
1425
1426 if not os.access(index, os.R_OK):
1427 raise Exception('Index path "%s" is not readable' % index)
e5c6ca04
ERE
1428
1429 # try to create backup path if needed
37ab0f57 1430 os.makedirs(target_path, exist_ok=True)
e5c6ca04 1431
ec57ce53
ERE
1432 # make backup_tar_path absolute so that iterate_tar_path works fine
1433 if backup_tar_path and not os.path.isabs(backup_tar_path):
1434 backup_tar_path = os.path.abspath(backup_tar_path)
1435
d5361dac 1436 cwd = os.getcwd()
ec57ce53 1437 os.chdir(target_path)
d5361dac 1438
2ae46844 1439 # setup for decrypting payload
774ca538
PG
1440 if self.decryptor is None:
1441 self.decryptor = self.initialize_encryption (CRYPTO_MODE_DECRYPT)
2ae46844 1442
ea6d3c3e 1443 if mode == 'tar':
24ddf0a2
ERE
1444 index_it = self.iterate_tar_path(backup_tar_path)
1445 helper = RestoreHelper(self, cwd, backup_path=backup_tar_path,
ec57ce53 1446 tarobj=index_it.tar_obj)
ea6d3c3e 1447 elif mode == "diff":
04f4c7ab
PG
1448 helper = RestoreHelper(self, cwd, backup_indexes_paths,
1449 disaster=disaster)
f3d10816
PG
1450 try:
1451 # get iterator from newest index at _data[0]
1452 index1 = helper._data[0]["path"]
1453 index_it = self.iterate_index_path(index1)
1454 except tarfile.DecryptionError as exn:
1455 self.logger.error("failed to decrypt file [%s]: %s; is this an "
afc87ebc
PG
1456 "actual encrypted index file?"
1457 % (index1, str (exn)))
1458 return [(index1, exn)]
1459 except Exception as exn:
1460 # compressed files
1461 self.logger.error("failed to read file [%s]: %s; is this an "
1462 "actual index file?" % (index1, str (exn)))
f3d10816 1463 return [(index1, exn)]
b84beea7
PG
1464 elif mode == "disaster":
1465 index_it = self.iterate_disaster_index (backup_index)
65b35c42
PG
1466 helper = RestoreHelper (self, cwd, backup_path=backup_tar_path,
1467 backup_index=backup_index,
1468 disaster=disaster)
b84beea7 1469
d07c8065 1470
24ddf0a2
ERE
1471 dir_it = self._recursive_walk_dir('.')
1472 dir_path_it = self.jsonize_path_iterator(dir_it)
11684b1d 1473
e93f83f1
PG
1474 failed = [] # irrecoverable files
1475
a395759e 1476 # for each file to be restored, do:
24ddf0a2
ERE
1477 for ipath, dpath, l_no in self.collate_iterators(index_it, dir_path_it):
1478 if not ipath:
1479 upath = dpath['path']
1480 op_type = dpath['type']
1481 else:
1482 upath = self.unprefixed(ipath['path'])
1483 op_type = ipath['type']
42c04ead 1484
24ddf0a2 1485 # filter paths
75059f3c 1486 if self.filter_path(upath, '', op_type == 'directory') == NO_MATCH:
24ddf0a2 1487 continue
ea6d3c3e 1488
24ddf0a2
ERE
1489 # if types of the file mismatch, the file needs to be deleted
1490 # and re-restored
1491 if ipath is not None and dpath is not None and\
1492 dpath['type'] != ipath['type']:
1493 helper.delete(upath)
1494
1495 # if file not found in dpath, we can directly restore from index
1496 if not dpath:
1497 # if the file doesn't exist and it needs to be deleted, it
1498 # means that work is already done
1499 if ipath['path'].startswith('delete://'):
ea6d3c3e 1500 continue
24ddf0a2 1501 try:
b008f989 1502 self.logger.debug("restore %s" % ipath['path'])
4da27cfe 1503 helper.restore(ipath, l_no, restore_callback)
be60ffd0 1504 except Exception as e:
e93f83f1 1505 iipath = ipath.get ("path", "")
7b07645e 1506 self.logger.error("FAILED to restore: {} ({})"
e93f83f1 1507 .format(iipath, e))
04f4c7ab 1508 if disaster != tarfile.TOLERANCE_STRICT:
e93f83f1 1509 failed.append ((iipath, e))
24ddf0a2 1510 continue
11684b1d 1511
24ddf0a2
ERE
1512 # if both files are equal, we have nothing to restore
1513 if self._equal_stat_dicts(ipath, dpath, listsnapshot_equal=True):
1514 continue
1515
1516 # we have to restore the file, but first we need to delete the
1517 # current existing file.
1518 # we don't delete the file if it's a directory, because it might
1519 # just have changed mtime, so it's quite inefficient to remove
1520 # it
1521 if ipath:
1522 if ipath['type'] != 'directory' or ipath['path'].startswith('delete://'):
42c04ead 1523 helper.delete(upath)
b008f989 1524 self.logger.debug("restore %s" % ipath['path'])
e93f83f1
PG
1525 try:
1526 helper.restore(ipath, l_no, restore_callback)
1527 except Exception as e:
04f4c7ab 1528 if disaster == tarfile.TOLERANCE_STRICT:
e93f83f1
PG
1529 raise
1530 failed.append ((ipath.get ("path", ""), e))
1531 continue
24ddf0a2
ERE
1532
1533 # if the file is not in the index (so it comes from the target
1534 # directory) then we have to delete it
1535 else:
c9d47a03 1536 self.logger.debug("delete %s" % upath)
24ddf0a2 1537 helper.delete(upath)
42c04ead 1538
ec57ce53
ERE
1539 helper.restore_directories_permissions()
1540 index_it.release()
1541 os.chdir(cwd)
1542 helper.cleanup()
ea6d3c3e 1543
e93f83f1
PG
1544 return failed
1545
1546
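    # Decision sketch for the collation loop in restore_backup() above
    # (derived from the code, after included/excluded path filtering):
    #
    #   entry only in the index        -> restore from backup (delete:// is a no-op)
    #   entry only on the filesystem   -> delete it from the target directory
    #   in both, types differ          -> delete, then restore from backup
    #   in both, stat dicts equal      -> nothing to do
    #   in both, stat dicts differ     -> delete (directories excepted), restore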
1547 def recover_backup(self, target_path, backup_indexes_paths=[],
1548 restore_callback=None):
1549 """
1550 Walk the index, extracting objects in disaster mode. Bad files are
1551 reported along with a reason.
1552 """
1553 return self.restore_backup(target_path,
1554 backup_indexes_paths=backup_indexes_paths,
04f4c7ab
PG
1555 disaster=tarfile.TOLERANCE_RECOVER)
1556
1557
6690f5e0 1558 def rescue_backup(self, target_path, backup_tar_path,
04f4c7ab
PG
1559 restore_callback=None):
1560 """
1561 More aggressive “unfsck” mode: do not rely on the index data as the
1562 files may be corrupt; skim files for header-like information and
1563 attempt to retrieve the data.
1564 """
27ee4dd4
PG
1565 def gen_volume_name (nvol):
1566 return os.path.join (os.path.dirname (backup_tar_path),
1567 self.volume_name_func (backup_tar_path,
1568 True,
1569 nvol))
1570
1571 backup_index = tarfile.gen_rescue_index (gen_volume_name,
1572 self.mode,
1573 password=self.password,
1574 key=self.crypto_key)
6690f5e0 1575
04f4c7ab 1576 return self.restore_backup(target_path,
b84beea7 1577 backup_index=backup_index,
65b35c42 1578 backup_tar_path=backup_tar_path,
04f4c7ab 1579 disaster=tarfile.TOLERANCE_RESCUE)
e93f83f1
PG
1580
1581
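    # Illustrative usage sketch (paths and the DeltaTar setup are
    # assumptions, not part of this module): recover_backup() restores with
    # TOLERANCE_RECOVER, collecting unreadable files instead of aborting,
    # while rescue_backup() additionally rebuilds the index from the volume
    # files themselves before restoring with TOLERANCE_RESCUE.
    #
    #     failed = dtar.recover_backup("/srv/restore",
    #                                  backup_indexes_paths=["bak/index.gz"])
    #     if failed:
    #         failed = dtar.rescue_backup("/srv/restore",
    #                                     "bak/backup-full.tar.gz")
    #     for path, exn in failed:
    #         print("could not restore %s: %s" % (path, exn))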
11684b1d
ERE
1582 def _parse_json_line(self, f, l_no):
1583 '''
ee0e095f 1584         Read a line from a file-like object and parse it as JSON.
11684b1d
ERE
1585 '''
1586 l = f.readline()
1587 l_no += 1
1588 try:
be60ffd0 1589 j = json.loads(l.decode('UTF-8'))
ee0e095f
PG
1590 except UnicodeDecodeError as e:
1591 if tuple (l [0:2]) == tarfile.GZ_MAGIC:
1592 raise Exception \
1593 ("error parsing line #%d as json: looks like a compressed file (%d B: [%s..])"
1594 % (l_no, len (l), binascii.hexlify (l [:16]).decode ())) \
1595 from e
1596 raise Exception \
1597 ("error parsing line #%d as json: not a text file (%d B: [%s..])"
1598 % (l_no, len (l), binascii.hexlify (l [:16]).decode ())) \
1599 from e
be60ffd0 1600 except ValueError as e:
11684b1d
ERE
1601 raise Exception("error parsing this json line "
1602                             "(line number %d): %s" % (l_no, l)) from e
1603 return j, l_no
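    # Illustrative note (an assumption about typical content, not a format
    # specification): each index line is expected to be one standalone JSON
    # object, roughly along the lines of
    #
    #     {"type": "file", "path": "snapshot://./some/file",
    #      "volume": 0, "offset": 1536}
    #
    # and callers thread the running line counter through repeated calls:
    #
    #     j, l_no = self._parse_json_line(f, l_no)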
ea6d3c3e 1604
24ddf0a2 1605
ea6d3c3e
ERE
1606class RestoreHelper(object):
1607 '''
1608     Class used to help restore files from indices
1609 '''
1610
1611 # holds the dicts of data
1612 _data = []
1613
1614 _deltatar = None
1615
1616 _cwd = None
1617
0501fe0a
ERE
1618 # list of directories to be restored. This is done as a last step, see
1619 # tarfile.extractall for details.
1620 _directories = []
1621
04f4c7ab 1622 _disaster = tarfile.TOLERANCE_STRICT
e93f83f1 1623
037994ca 1624 def __init__(self, deltatar, cwd, index_list=None, backup_path=False,
65b35c42
PG
1625 backup_index=None, tarobj=None,
1626 disaster=tarfile.TOLERANCE_STRICT):
ea6d3c3e
ERE
1627 '''
1628         Constructor opens the tars and initializes the data structures.
1629
037994ca
PG
1630 Assumptions:
1631
1632 - Index list must be provided in reverse order (newer first).
1633 - “newer first” apparently means that if there are n backups
1634 provided, the last full backup is at index n-1 and the most recent
1635 diff backup is at index 0.
1636 - Only the first, the second, and the last elements of
1637 ``index_list`` are relevant, others will not be accessed.
1638 - If no ``index_list`` is provided, both ``tarobj`` and
1639 ``backup_path`` must be passed.
1640 - If ``index_list`` is provided, the values of ``tarobj`` and
1641 ``backup_path`` are ignored.
ea6d3c3e
ERE
1642 '''
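        # Ordering example (hypothetical paths, following the assumptions
        # documented above): with one full backup and two later diff backups
        # the caller would pass
        #
        #     index_list = [ "diff-2/index.gz",   # newest diff -> _data[0]
        #                    "diff-1/index.gz",
        #                    "full/index.gz" ]    # last full   -> _data[-1]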
1643 self._data = []
0501fe0a 1644 self._directories = []
ea6d3c3e
ERE
1645 self._deltatar = deltatar
1646 self._cwd = cwd
3031b7ae 1647 self._password = deltatar.password
1f3fd7b0 1648 self._crypto_key = deltatar.crypto_key
3031b7ae 1649 self._decryptors = []
e93f83f1 1650 self._disaster = disaster
ea6d3c3e 1651
253d4cdd
ERE
1652 try:
1653 import grp, pwd
1654 except ImportError:
1655 grp = pwd = None
1656
1657 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
1658 self.canchown = True
1659 else:
1660 self.canchown = False
1661
65b35c42 1662 if isinstance (backup_index, list) is True:
001bd488 1663 decryptor = self._deltatar.decryptor
65b35c42
PG
1664 self._data = \
1665 [{ "curr_vol_no" : None
1666 , "vol_fd" : None
1667 , "offset" : -1
1668 , "tarobj" : None
1669 , "path" : backup_path
1670 , "is_full" : True
1671 , "iterator" : None
1672 , "last_itelement" : None
1673 , "last_lno" : 0
001bd488
PG
1674 , "new_volume_handler" :
1675 partial(self.new_volume_handler,
1676 self._deltatar, self._cwd, True,
1677 os.path.dirname(backup_path), decryptor)
1678 , "decryptor" : decryptor
65b35c42
PG
1679 }]
1680 elif index_list is not None:
24ddf0a2 1681 for index in index_list:
037994ca 1682 is_full = index == index_list[-1]
24ddf0a2 1683
d5e1d60f 1684 decryptor = None
3031b7ae 1685 if self._password is not None:
1f3fd7b0
PG
1686 decryptor = crypto.Decrypt (password=self._password,
1687 key=self._crypto_key)
d5e1d60f 1688
24ddf0a2
ERE
1689 # make paths absolute to avoid cwd problems
1690 if not os.path.isabs(index):
1691 index = os.path.normpath(os.path.join(cwd, index))
1692
1693 s = dict(
1694 curr_vol_no = None,
1695 vol_fd = None,
1696 offset = -1,
1697 tarobj = None,
1698 path = index,
1699 is_full = is_full,
1700 iterator = None,
1701 last_itelement = None,
1702 last_lno = 0,
1703 new_volume_handler = partial(self.new_volume_handler,
1704 self._deltatar, self._cwd, is_full,
d5e1d60f
PG
1705 os.path.dirname(index), decryptor),
1706 decryptor = decryptor
24ddf0a2
ERE
1707 )
1708 self._data.append(s)
1709 else:
ea6d3c3e 1710 # make paths absolute to avoid cwd problems
24ddf0a2
ERE
1711 if not os.path.isabs(backup_path):
1712 backup_path = os.path.normpath(os.path.join(cwd, backup_path))
ea6d3c3e 1713
ec57ce53
ERE
1714 # update the new_volume_handler of tar_obj
1715 tarobj.new_volume_handler = partial(self.new_volume_handler,
b7c47f38 1716 self._deltatar, self._cwd, True, os.path.dirname(backup_path),
133d30da 1717 self._deltatar.decryptor)
ea6d3c3e
ERE
1718 s = dict(
1719 curr_vol_no = None,
1720 vol_fd = None,
1721 offset = -1,
24ddf0a2
ERE
1722 tarobj = tarobj,
1723 path = backup_path,
1724 is_full = True,
670f9934
ERE
1725 iterator = None,
1726 last_itelement = None,
1727 last_lno = 0,
d5e1d60f
PG
1728 new_volume_handler = tarobj.new_volume_handler,
1729 decryptor = self._deltatar.decryptor
ea6d3c3e
ERE
1730 )
1731 self._data.append(s)
1732
3031b7ae 1733
ea6d3c3e
ERE
1734 def cleanup(self):
1735 '''
1736 Closes all open files
1737 '''
1738 for data in self._data:
55b2ffd0
ERE
1739 if data['vol_fd']:
1740 data['vol_fd'].close()
1741 data['vol_fd'] = None
ea6d3c3e
ERE
1742 if data['tarobj']:
1743 data['tarobj'].close()
1744 data['tarobj'] = None
ea6d3c3e
ERE
1745
1746 def delete(self, path):
1747 '''
1748 Delete a file
1749 '''
df99a044
ERE
1750 if not os.path.exists(path):
1751 return
1752
24ddf0a2 1753 # to preserve parent directory mtime, we save it
283fbd5e 1754 parent_dir = os.path.dirname(path) or os.getcwd()
24ddf0a2
ERE
1755 parent_dir_mtime = int(os.stat(parent_dir).st_mtime)
1756
561bc39f 1757 if os.path.isdir(path) and not os.path.islink(path):
ea6d3c3e
ERE
1758 shutil.rmtree(path)
1759 else:
1760 os.unlink(path)
1761
24ddf0a2
ERE
1762 # now we restore parent_directory mtime
1763 os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime))
1764
4da27cfe 1765 def restore(self, itpath, l_no, callback=None):
ea6d3c3e 1766 '''
8a54d5dd 1767 Restore the path from the appropriate backup. Receives the current path
e8d95fe5 1768         from the newest (=first) index iterator. itpath must not be None.
b0aef801 1769 callback is a custom function that gets called for every file.
037994ca
PG
1770
1771 NB: This function takes the attribute ``_data`` as input but will only
1772 ever use its first and, if available, second element. Anything else in
1773 ``._data[]`` will be ignored.
ea6d3c3e 1774 '''
ea6d3c3e
ERE
1775 path = itpath['path']
1776
4da27cfe
SA
1777 # Calls the callback function
1778 if callback:
1779 callback()
1780
ea6d3c3e 1781 if path.startswith('delete://'):
df86af81
ERE
1782             # the file has already been deleted in restore_backup in all cases,
1783             # so there is nothing left to do here
ea6d3c3e 1784 return
df86af81 1785
e8d95fe5 1786 # get data from newest index (_data[0])
df86af81
ERE
1787 data = self._data[0]
1788 upath = self._deltatar.unprefixed(path)
1789
24ddf0a2 1790 # to preserve parent directory mtime, we save it
283fbd5e 1791 parent_dir = os.path.dirname(upath) or os.getcwd()
37ab0f57 1792 os.makedirs(parent_dir, exist_ok=True)
24ddf0a2
ERE
1793 parent_dir_mtime = int(os.stat(parent_dir).st_mtime)
1794
e8d95fe5 1795 # if path is found in the newest index as to be snapshotted, deal with it
df86af81
ERE
1796 # and finish
1797 if path.startswith('snapshot://'):
65b35c42 1798 self.restore_file(itpath, data, path, l_no, upath)
24ddf0a2
ERE
1799
1800 # now we restore parent_directory mtime
1801 os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime))
ea6d3c3e
ERE
1802 return
1803
1804 # we go from index to index, finding the path in the index, then finding
1805 # the index with the most recent snapshot of the file being restored
e8d95fe5
TJ
1806 #
1807         # Right now we only support diff backups, not incremental backups.
1808 # As a result _data[0] is always the diff backup index
1809 # and _data[1] the full backup index.
527670c4 1810 if len(self._data) == 2:
7273719c 1811 data = self._data[1]
527670c4
TJ
1812 d, l_no, dpath = self.find_path_in_index(data, upath)
1813 if not d:
1814 self._deltatar.logger.warning('Error restoring file %s from '
1815 'index, not found in index %s' % (path, data['path']))
1816 return
1817
1818 cur_path = d.get('path', '')
1819 if cur_path.startswith('delete://'):
1820 self._deltatar.logger.warning(('Strange thing happened, file '
1821 '%s was listed in first index but deleted by another '
1822 'one. Path was ignored and untouched.') % path)
1823 return
1824 elif cur_path.startswith('snapshot://'):
1825 # this code path is reached when the file is unchanged
1826 # in the newest index and therefore of type 'list://'
1827 self.restore_file(d, data, path, l_no, dpath)
1828
1829 # now we restore parent_directory mtime
1830 os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime))
1831 return
1832
1833 # error code path is reached when:
1834 # a) we have more than two indexes (unsupported atm)
1835 # b) both indexes contain a list:// entry (logic error)
1836 # c) we have just one index and it also contains list://
4bda6f45 1837 self._deltatar.logger.warning(('Error restoring file %s from index, '
ea6d3c3e
ERE
1838 'snapshot not found in any index') % path)
1839
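    # Resolution example (paths hypothetical, logic as implemented above): a
    # file that is unchanged in the newest diff appears there as
    # "list://./etc/fstab"; restore() then consults _data[1], where the same
    # path is found as "snapshot://./etc/fstab" in the full index and gets
    # extracted from the full backup's volumes.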
670f9934
ERE
1840 def find_path_in_index(self, data, upath):
1841 # NOTE: we restart the iterator sometimes because the iterator can be
1842         # walked over completely multiple times, for example if one path is not
1843 # found in one index and we have to go to the next index.
7273719c
PG
1844 it = data['iterator']
1845 if it is None:
670f9934 1846 it = data['iterator'] = self._deltatar.iterate_index_path(data["path"])
be60ffd0 1847 d, l_no = it.__next__()
670f9934 1848 else:
670f9934
ERE
1849 d = data['last_itelement']
1850 l_no = data['last_lno']
1851
670f9934 1852 while True:
7273719c 1853 dpath = self._deltatar.unprefixed(d.get('path', ''))
670f9934
ERE
1854 if upath == dpath:
1855 data['last_itelement'] = d
1856 data['last_lno'] = l_no
1857 return d, l_no, dpath
1858
1859 up, dp = self._deltatar.compare_indexes(upath, dpath)
1860             # if upath should have appeared before the current dpath, it means
1861 # upath is just not in this index and we should stop
1862 if dp is None:
1863 data['last_itelement'] = d
1864 data['last_lno'] = l_no
1865 return None, 0, ''
1866
1867 try:
be60ffd0 1868 d, l_no = it.__next__()
670f9934
ERE
1869 except StopIteration:
1870 data['last_itelement'] = d
1871 data['last_lno'] = l_no
1872 return None, 0, ''
670f9934 1873
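    # Illustrative note: the last element returned by the index iterator and
    # its line number are cached in data['last_itelement'] / data['last_lno'],
    # so a subsequent call to find_path_in_index() can resume from that point
    # instead of re-reading the whole index from the beginning.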
0501fe0a
ERE
1874 def restore_directories_permissions(self):
1875 '''
1876         Restore directory permissions when everything has been restored
1877 '''
42c04ead
ERE
1878 try:
1879 import grp, pwd
1880 except ImportError:
1881 grp = pwd = None
1882
0501fe0a
ERE
1883 self._directories.sort(key=operator.attrgetter('name'))
1884 self._directories.reverse()
0501fe0a
ERE
1885
1886 # Set correct owner, mtime and filemode on directories.
1887 for member in self._directories:
1888 dirpath = member.name
1889 try:
42c04ead
ERE
1890 os.chmod(dirpath, member.mode)
1891 os.utime(dirpath, (member.mtime, member.mtime))
253d4cdd 1892 if self.canchown:
42c04ead
ERE
1893 # We have to be root to do so.
1894 try:
1895 g = grp.getgrnam(member.gname)[2]
1896 except KeyError:
1897 g = member.gid
1898 try:
1899 u = pwd.getpwnam(member.uname)[2]
1900 except KeyError:
1901 u = member.uid
1902 try:
4e433e00 1903 if member.issym and hasattr(os, "lchown"):
42c04ead
ERE
1904 os.lchown(dirpath, u, g)
1905 else:
1906 os.chown(dirpath, u, g)
1907 except EnvironmentError:
1908 raise tarfile.ExtractError("could not change owner")
1909
be60ffd0 1910 except tarfile.ExtractError as e:
4bda6f45 1911 self._deltatar.logger.warning('tarfile: %s' % e)
0501fe0a 1912
df86af81 1913 @staticmethod
b7c47f38 1914 def new_volume_handler(deltarobj, cwd, is_full, backup_path, encryption, tarobj, base_name, volume_number):
ea6d3c3e
ERE
1915 '''
1916 Handles the new volumes
1917 '''
df86af81
ERE
1918 volume_name = deltarobj.volume_name_func(backup_path, is_full,
1919 volume_number, guess_name=True)
ea6d3c3e
ERE
1920 volume_path = os.path.join(backup_path, volume_name)
1921
1922 # we convert relative paths into absolute because CWD is changed
1923 if not os.path.isabs(volume_path):
1924 volume_path = os.path.join(cwd, volume_path)
b7c47f38 1925 tarobj.open_volume(volume_path, encryption=encryption)
ea6d3c3e 1926
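    # Illustrative note: tarfile calls this handler whenever it runs off the
    # end of a volume; the handler maps (backup_path, is_full, volume_number)
    # to the on-disk name of the next volume via volume_name_func (the naming
    # scheme is defined elsewhere in this module) and re-opens the stream on
    # the same tarobj, so extraction of multivolume members continues
    # transparently.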
253d4cdd 1927 def restore_file(self, file_data, index_data, path, l_no, unprefixed_path):
ea6d3c3e
ERE
1928 '''
1929 Restores a snapshot of a file from a specific backup
1930 '''
ea6d3c3e 1931 op_type = file_data.get('type', -1)
24ddf0a2 1932 member = file_data.get('member', None)
9f9ae874 1933 ismember = bool(member)
24ddf0a2
ERE
1934
1935         # when member is set, we can assume everything is right and we
1936 # just have to restore the path
a2a37de7 1937 if member is None:
24ddf0a2
ERE
1938 vol_no = file_data.get('volume', -1)
1939 # sanity check
1940 if not isinstance(vol_no, int) or vol_no < 0:
4bda6f45 1941 self._deltatar.logger.warning('unrecognized type to be restored: '
24ddf0a2
ERE
1942 '%s, line %d' % (op_type, l_no))
1943
1944         # set up the volume that needs to be read. only needed when member is
1945 # not set
a2a37de7 1946 if index_data['curr_vol_no'] != vol_no:
24ddf0a2
ERE
1947 index_data['curr_vol_no'] = vol_no
1948 backup_path = os.path.dirname(index_data['path'])
1949 vol_name = self._deltatar.volume_name_func(backup_path,
1950 index_data['is_full'], vol_no, guess_name=True)
1951 vol_path = os.path.join(backup_path, vol_name)
1952 if index_data['vol_fd']:
1953 index_data['vol_fd'].close()
be60ffd0 1954 index_data['vol_fd'] = open(vol_path, 'rb')
24ddf0a2
ERE
1955
1956 # force reopen of the tarobj because of new volume
1957 if index_data['tarobj']:
1958 index_data['tarobj'].close()
1959 index_data['tarobj'] = None
1960
1961 # seek tarfile if needed
1962 offset = file_data.get('offset', -1)
ea6d3c3e 1963 if index_data['tarobj']:
c52fd26b 1964 if self._disaster == tarfile.TOLERANCE_RESCUE:
24ddf0a2
ERE
1965 # force a seek and reopen
1966 index_data['tarobj'].close()
1967 index_data['tarobj'] = None
c52fd26b
PG
1968 else:
1969 try:
1970 member = index_data['tarobj'].__iter__().__next__()
1971 except tarfile.DecryptionError:
1972 pass
1973 except tarfile.CompressionError:
1974 pass
1975
1976 if not member or member.path != file_data['path']:
1977 # force a seek and reopen
1978 index_data['tarobj'].close()
1979 index_data['tarobj'] = None
1980
24ddf0a2
ERE
1981
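        # Illustrative note on the mode string (derived from the open() call
        # below): with self._deltatar.mode == "#gz" the volume is reopened as
        # tarfile.open(mode="r#gz", concat=True, ...), i.e. a stream of
        # individually compressed members, which is what makes seeking
        # straight to the saved per-file offset possible.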
1982 # open the tarfile if needed
1983 if not index_data['tarobj']:
1984 index_data['vol_fd'].seek(offset)
1985 index_data['tarobj'] = tarfile.open(mode="r" + self._deltatar.mode,
1986 fileobj=index_data['vol_fd'],
1987 format=tarfile.GNU_FORMAT,
d1c38f40 1988 concat='#' in self._deltatar.mode,
d5e1d60f 1989 encryption=index_data["decryptor"],
253d4cdd 1990 new_volume_handler=index_data['new_volume_handler'],
044585c6 1991 save_to_members=False,
04f4c7ab 1992 tolerance=self._disaster)
24ddf0a2 1993
be60ffd0 1994 member = index_data['tarobj'].__iter__().__next__()
ea6d3c3e 1995
253d4cdd
ERE
1996 member.path = unprefixed_path
1997 member.name = unprefixed_path
0501fe0a
ERE
1998
1999 if op_type == 'directory':
253d4cdd 2000 self.add_member_dir(member)
0501fe0a 2001 member = copy.copy(member)
be60ffd0 2002 member.mode = 0o0700
0501fe0a 2003
df86af81
ERE
2004         # if it's an existing directory, we don't need to recreate it;
2005             # just set the right permissions, mtime and the like
2006 if os.path.exists(member.path):
2007 return
2008
9f9ae874 2009 if not ismember:
24ddf0a2
ERE
2010 # set current volume number in tarobj, otherwise the extraction of the
2011 # file might fail when trying to extract a multivolume member
2012 index_data['tarobj'].volume_number = index_data['curr_vol_no']
86a6e741 2013
9b13f5c4
PG
2014 def ignore_symlink (member, *_args):
2015 self._deltatar.logger.warning("Ignoring symlink %s" % member.name)
786addd6 2016
ea6d3c3e 2017 # finally, restore the file
c650acfa
PG
2018 index_data['tarobj'].extract(member, symlink_cb=ignore_symlink,
2019 unlink=True)
253d4cdd
ERE
2020
2021 def add_member_dir(self, member):
2022 '''
2023         Add a directory member to be restored at the end
2024 '''
4e433e00 2025 if not self.canchown:
253d4cdd
ERE
2026 self._directories.append(DirItem(name=member.name, mode=member.mode,
2027 mtime=member.mtime))
2028 else:
2029 self._directories.append(DirItem(name=member.name, mode=member.mode,
2030 mtime=member.mtime, gname=member.gname, uname=member.uname,
4e433e00 2031 uid=member.uid, gid=member.gid, issym=member.issym()))
253d4cdd
ERE
2032
2033class DirItem(object):
2034 def __init__(self, **kwargs):
be60ffd0 2035 for k, v in kwargs.items():
9f9ae874 2036 setattr(self, k, v)
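# Usage sketch (as exercised by RestoreHelper.add_member_dir above; the
# values are hypothetical): DirItem simply mirrors its keyword arguments as
# attributes.
#
#     d = DirItem(name="some/dir", mode=0o755, mtime=1500000000)
#     assert (d.name, d.mode, d.mtime) == ("some/dir", 0o755, 1500000000)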