| 1 | #!/usr/bin/env python3 |
| 2 | |
| 3 | # Copyright (C) 2013, 2014 Intra2net AG |
| 4 | # |
| 5 | # This program is free software; you can redistribute it and/or modify |
| 6 | # it under the terms of the GNU Lesser General Public License as published |
| 7 | # by the Free Software Foundation; either version 3 of the License, or |
| 8 | # (at your option) any later version. |
| 9 | # |
| 10 | # This program is distributed in the hope that it will be useful, |
| 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 13 | # GNU Lesser General Public License for more details. |
| 14 | # |
# You should have received a copy of the GNU Lesser General Public License
| 16 | # along with this program. If not, see |
| 17 | # <http://www.gnu.org/licenses/lgpl-3.0.html> |
| 18 | |
| 19 | DELTATAR_HEADER_VERSION = 1 |
| 20 | DELTATAR_PARAMETER_VERSION = 1 |
| 21 | |
| 22 | import logging |
| 23 | import datetime |
| 24 | import binascii |
| 25 | import io |
| 26 | import operator |
| 27 | import os |
| 28 | import copy |
| 29 | import shutil |
| 30 | import re |
| 31 | import stat |
| 32 | import json |
| 33 | import typing |
| 34 | from functools import partial |
| 35 | |
| 36 | from . import tarfile |
| 37 | from . import crypto |
| 38 | |
| 39 | class NullHandler(logging.Handler): |
| 40 | def emit(self, record): |
| 41 | pass |
| 42 | |
| 43 | |
| 44 | logging.getLogger("deltatar.DeltaTar").addHandler(NullHandler()) |
| 45 | |
| 46 | |
| 47 | # match mode |
| 48 | NO_MATCH = False |
| 49 | MATCH = True |
| 50 | PARENT_MATCH = 2 |
| 51 | |
| 52 | # encryption direction |
| 53 | CRYPTO_MODE_ENCRYPT = 0 |
| 54 | CRYPTO_MODE_DECRYPT = 1 |
| 55 | |
| 56 | # The canonical extension for encrypted backup files regardless of the actual |
| 57 | # encryption parameters is “.pdtcrypt”. This is analogous to the encryption |
| 58 | # header which starts with the eight ASCII bytes “PDTCRYPT”. Historical note: |
# Since the introduction of the versioned header there is no longer any need
| 60 | # for encoding encryption parameters in the file extensions (“.aes128” and |
| 61 | # suchlike). |
| 62 | PDTCRYPT_EXTENSION = "pdtcrypt" |
| 63 | PDT_TYPE_ARCHIVE = 0 |
| 64 | PDT_TYPE_AUX = 1 |
| 65 | |
| 66 | AUXILIARY_FILE_INDEX = 0 |
| 67 | AUXILIARY_FILE_INFO = 1 |
| 68 | |
| 69 | class DeltaTar(object): |
| 70 | ''' |
| 71 | Backup class used to create backups |
| 72 | ''' |
| 73 | |
    # list of files to exclude from the backup creation or restore operation.
    # It can contain python regular expressions.
| 76 | excluded_files = [] |
| 77 | |
| 78 | # list of files to include in the backup creation or restore operation. It |
| 79 | # can contain python regular expressions. If empty, all files in the source |
| 80 | # path will be backed up (when creating a backup) or all the files in the |
| 81 | # backup will be restored (when restoring a backup), but if included_files |
    # is set then only the files included in the list will be processed.
| 83 | included_files = [] |
| 84 | |
| 85 | # custom filter of files to be backed up (or restored). Unused and unset |
| 86 | # by default. The function receives a file path and must return a boolean. |
| 87 | filter_func = None |
| 88 | |
    # mode in which the delta will be created (when creating a backup) or
    # opened (when restoring). Accepts the same modes as the tarfile library.
| 91 | mode = "" |
| 92 | |
    # used to encrypt and decrypt backups.
| 94 | password = None |
| 95 | crypto_key = None |
| 96 | nacl = None |
| 97 | |
| 98 | # parameter version to use when encrypting; note that this has no effect |
| 99 | # on decryption since the required settings are determined from the headers |
| 100 | crypto_version = DELTATAR_HEADER_VERSION |
| 101 | crypto_paramversion = None |
| 102 | |
    # when encrypting or decrypting, these hold crypto handlers; created before
    # establishing the Tarfile stream iff a password or key is supplied.
| 105 | encryptor = None |
| 106 | decryptor = None |
| 107 | |
| 108 | # python logger object. |
| 109 | logger = None |
| 110 | |
| 111 | # specifies the index mode in the same format as @param mode, but without |
    # the ':', '|' or '#' at the beginning. It doesn't make sense to specify
| 113 | # that the index is encrypted if no password is given in the constructor. |
| 114 | index_mode = None |
| 115 | |
| 116 | # current time for this backup. Used for file names and file creation checks |
| 117 | current_time = None |
| 118 | |
    # extra data to be included in the header of the index file when creating a
| 120 | # backup |
| 121 | extra_data = dict() |
| 122 | |
| 123 | # valid tarfile modes and their corresponding default file extension |
| 124 | __file_extensions_dict = { |
| 125 | '': '', |
| 126 | ':': '', |
| 127 | ':gz': '.gz', |
| 128 | ':bz2': '.bz2', |
| 129 | '|': '', |
| 130 | '|gz': '.gz', |
| 131 | '|bz2': '.bz2', |
| 132 | '#gz': '.gz', |
| 133 | '#gz.pdtcrypt': '.gz', |
| 134 | '#pdtcrypt': '', |
| 135 | '#': '', |
| 136 | } |
| 137 | |
| 138 | # valid index modes and their corresponding default file extension |
| 139 | __index_extensions_dict = { |
| 140 | '': '', |
| 141 | 'gz': '.gz', |
| 142 | 'bz2': '.bz2', |
| 143 | 'gz.pdtcrypt': '.gz', |
| 144 | 'pdtcrypt': '', |
| 145 | } |
| 146 | |
| 147 | # valid path prefixes |
| 148 | __path_prefix_list = [ |
| 149 | u'snapshot://', |
| 150 | u'list://', |
| 151 | u'delete://' |
| 152 | ] |
| 153 | |
| 154 | def __init__(self, excluded_files=[], included_files=[], |
| 155 | filter_func=None, mode="", password=None, |
| 156 | crypto_key=None, nacl=None, |
| 157 | crypto_version=DELTATAR_HEADER_VERSION, |
| 158 | crypto_paramversion=DELTATAR_PARAMETER_VERSION, |
| 159 | logger=None, index_mode=None, index_name_func=None, |
| 160 | volume_name_func=None): |
| 161 | ''' |
| 162 | Constructor. Configures the diff engine. |
| 163 | |
| 164 | Parameters: |
        - excluded_files: list of files to exclude from the backup creation or
          restore operation. It can contain python regular expressions.
| 167 | |
| 168 | - included_files: list of files to include in the backup creation or |
| 169 | restore operation. It can contain python regular expressions. If |
| 170 | empty, all files in the source path will be backed up (when creating a |
| 171 | backup) or all the files in the backup will be restored (when |
          restoring a backup), but if included_files is set then only the files
          included in the list will be processed.
| 174 | |
| 175 | - filter_func: custom filter of files to be backed up (or restored). |
| 176 | Unused and unset by default. The function receives a file path and |
| 177 | must return a boolean. |
| 178 | |
| 179 | - mode: mode in which the delta will be created (when creating a backup) |
| 180 | or opened (when restoring). Accepts the same modes as the tarfile |
| 181 | library. Valid modes are: |
| 182 | |
| 183 | '' open uncompressed |
| 184 | ':' open uncompressed |
| 185 | ':gz' open with gzip compression |
| 186 | ':bz2' open with bzip2 compression |
| 187 | '|' open an uncompressed stream of tar blocks |
| 188 | '|gz' open a gzip compressed stream of tar blocks |
| 189 | '|bz2' open a bzip2 compressed stream of tar blocks |
| 190 | '#gz' open a stream of gzip compressed tar blocks |
| 191 | |
| 192 | - crypto_key: used to encrypt and decrypt backups. Encryption will |
| 193 | be enabled automatically if a key is supplied. Requires a salt to be |
| 194 | passed as well. |
| 195 | |
| 196 | - nacl: salt that was used to derive the encryption key for embedding |
| 197 | in the PDTCRYPT header. Not needed when decrypting and when |
| 198 | encrypting with password. |
| 199 | |
| 200 | - password: used to encrypt and decrypt backups. Encryption will be |
| 201 | enabled automatically if a password is supplied. |
| 202 | |
| 203 | - crypto_version: version of the format, determining the kind of PDT |
| 204 | object header. |
| 205 | |
| 206 | - crypto_paramversion: optionally request encryption conforming to |
| 207 | a specific parameter version. Defaults to the standard PDT value |
| 208 | which as of 2017 is the only one available. |
| 209 | |
| 210 | - logger: python logger object. Optional. |
| 211 | |
| 212 | - index_mode: specifies the index mode in the same format as @param |
          mode, but without the ':', '|' or '#' at the beginning. If encryption
| 214 | is requested it will extend to the auxiliary (index, info) files as |
| 215 | well. This is an optional parameter that will automatically mimic |
| 216 | @param mode by default if not provided. Valid modes are: |
| 217 | |
| 218 | '' open uncompressed |
| 219 | 'gz' open with gzip compression |
| 220 | 'bz2' open with bzip2 compression |
| 221 | |
| 222 | - index_name_func: function that sets a custom name for the index file. |
| 223 | This function receives a flag to indicate whether the name will be |
| 224 | used for a full or diff backup. The backup path will be prepended to |
| 225 | its return value. |
| 226 | |
        - volume_name_func: function that defines the name of tar volumes. It
          receives the backup_path, whether it's a full backup, and the volume
          number, and must return the name for the corresponding volume.
          Optional, DeltaTar has default names for tar volumes.
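
        Example (an illustrative sketch; the import path and the values
        shown are assumptions, not requirements of this API):

            from deltatar.deltatar import DeltaTar

            dtar = DeltaTar(mode=':gz', password='secret',
                            excluded_files=['tmp/'])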
| 231 | ''' |
| 232 | |
| 233 | if mode not in self.__file_extensions_dict: |
| 234 | raise Exception('Unrecognized extension mode=[%s] requested for files' |
| 235 | % str(mode)) |
| 236 | |
| 237 | self.excluded_files = excluded_files |
| 238 | self.included_files = included_files |
| 239 | self.filter_func = filter_func |
| 240 | self.logger = logging.getLogger('deltatar.DeltaTar') |
| 241 | if logger: |
| 242 | self.logger.addHandler(logger) |
| 243 | self.mode = mode |
| 244 | |
| 245 | if crypto_key is not None: |
| 246 | self.crypto_key = crypto_key |
| 247 | self.nacl = nacl # encryption only |
| 248 | |
| 249 | if password is not None: |
| 250 | self.password = password |
| 251 | |
| 252 | if crypto_version is not None: |
| 253 | self.crypto_version = crypto_version |
| 254 | |
| 255 | if crypto_paramversion is not None: |
| 256 | self.crypto_paramversion = crypto_paramversion |
| 257 | |
| 258 | # generate index_mode |
| 259 | if index_mode is None: |
| 260 | index_mode = '' |
| 261 | if 'gz' in mode: |
| 262 | index_mode = "gz" |
| 263 | elif 'bz2' in mode: |
| 264 | index_mode = "bz2" |
| 265 | elif mode not in self.__index_extensions_dict: |
| 266 | raise Exception('Unrecognized extension mode=[%s] requested for index' |
| 267 | % str(mode)) |
| 268 | |
| 269 | self.index_mode = index_mode |
| 270 | self.current_time = datetime.datetime.now() |
| 271 | |
| 272 | if index_name_func is not None: |
| 273 | self.index_name_func = index_name_func |
| 274 | |
| 275 | if volume_name_func is not None: |
| 276 | self.volume_name_func = volume_name_func |
| 277 | |
| 278 | def pick_extension(self, kind, mode=None): |
| 279 | """ |
| 280 | Choose the extension depending on a) the kind of file given, b) the |
| 281 | processing mode, and c) the current encryption settings. |
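
        Illustrative results, derived from the logic below: for
        kind=PDT_TYPE_ARCHIVE with mode ".gz" and a password set, this
        yields ".tar.gz.pdtcrypt"; for kind=PDT_TYPE_AUX without
        compression or encryption, it yields the empty string.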
| 282 | """ |
| 283 | ret = "" |
| 284 | if kind == PDT_TYPE_ARCHIVE: |
| 285 | ret += ".tar" |
| 286 | if mode is None: |
| 287 | mode = self.__index_extensions_dict [self.index_mode] |
| 288 | ret += mode |
| 289 | if self.crypto_key is not None or self.password is not None: |
| 290 | ret += "." + PDTCRYPT_EXTENSION |
| 291 | return ret |
| 292 | |
| 293 | def index_name_func(self, is_full): # pylint: disable=method-hidden |
| 294 | ''' |
| 295 | Callback for setting a custom name for the index file. Depending on |
| 296 | whether *is_full* is set, it will create a suitable name for a full |
| 297 | or a diff backup. |
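
        Example (the date shown is illustrative): with index_mode 'gz'
        and no encryption, a full backup index is named like

            bfull-2013-03-30-1102.index.gz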
| 298 | ''' |
| 299 | prefix = "bfull" if is_full else "bdiff" |
| 300 | date_str = self.current_time.strftime("%Y-%m-%d-%H%M") |
| 301 | extension = self.pick_extension \ |
| 302 | (PDT_TYPE_AUX, |
| 303 | self.__index_extensions_dict [self.index_mode]) |
| 304 | |
| 305 | return "%s-%s.index%s" % (prefix, date_str, extension) |
| 306 | |
| 307 | def volume_name_func(self, backup_path, # pylint: disable=method-hidden |
| 308 | is_full, volume_number, |
| 309 | guess_name=False): |
| 310 | ''' |
        Defines the name of tar volumes. It receives the backup_path, whether
        it's a full backup, and the volume number, and must return the name of
        the corresponding volume. This default implementation is used unless
        a custom volume_name_func was supplied to the constructor.
| 315 | |
| 316 | If guess_name is activated, the file is intended not to be created but |
| 317 | to be found, and thus the date will be guessed. |
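
        Example (the date shown is illustrative): with mode '#gz' and no
        encryption, the first volume of a full backup is named like

            bfull-2013-03-30-1102-001.tar.gz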
| 318 | ''' |
| 319 | prefix = "bfull" if is_full else "bdiff" |
| 320 | extension = self.pick_extension \ |
| 321 | (PDT_TYPE_ARCHIVE, |
| 322 | self.__file_extensions_dict [self.mode]) |
| 323 | |
| 324 | if not guess_name: |
| 325 | date_str = self.current_time.strftime("%Y-%m-%d-%H%M") |
| 326 | return "%s-%s-%03d%s" % (prefix, date_str, volume_number + 1, extension) |
| 327 | else: |
| 328 | prefix = prefix + "-" |
| 329 | postfix = "-%03d%s" % (volume_number + 1, extension) |
| 330 | for f in os.listdir(backup_path): |
| 331 | if f.startswith(prefix) and f.endswith(postfix): |
| 332 | return f |
| 333 | raise Exception("volume not found") |
| 334 | |
| 335 | |
| 336 | def filter_path(self, path, source_path="", is_dir=None): |
| 337 | ''' |
| 338 | Filters a path, given the source_path, using the filtering properties |
| 339 | set in the constructor. |
| 340 | The filtering order is: |
| 341 | 1. included_files (if any) |
| 342 | 2. excluded_files |
| 343 | 3. filter_func (which must return whether the file is accepted or not) |
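
        Example (a hypothetical configuration with
        included_files=['docs/'] and nothing else set):

            filter_path('docs/a.txt')          # MATCH
            filter_path('src/a.txt')           # NO_MATCH
            filter_path('docs', is_dir=True)   # PARENT_MATCH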
| 344 | ''' |
| 345 | |
| 346 | if len(source_path) > 0: |
            # normalize source_path to exactly one trailing os.sep before
            # stripping it as a prefix from path
| 348 | source_path = source_path.rstrip(os.sep) + os.sep |
| 349 | path = path[len(source_path):] |
| 350 | |
| 351 | # 1. filter included_files |
| 352 | match = MATCH |
| 353 | if len(self.included_files) > 0: |
| 354 | match = NO_MATCH |
| 355 | for i in self.included_files: |
| 356 | # it can be either a regexp or a string |
| 357 | if isinstance(i, str): |
                    # if the string matches exactly, accept it
| 359 | if i == path: |
| 360 | match = MATCH |
| 361 | break |
| 362 | |
| 363 | # if the string ends with / it's a directory, and if the |
| 364 | # path is contained in it, it is included |
| 365 | if i.endswith('/') and path.startswith(i): |
| 366 | match = MATCH |
| 367 | break |
| 368 | |
| 369 | # if the string doesn't end with /, add it and do the same |
| 370 | # check |
| 371 | elif path.startswith(i + '/'): |
| 372 | match = MATCH |
| 373 | break |
| 374 | |
| 375 | # check for PARENT_MATCH |
| 376 | if is_dir: |
| 377 | dir_path = path |
| 378 | if not dir_path.endswith('/'): |
| 379 | dir_path += '/' |
| 380 | |
| 381 | if i.startswith(dir_path): |
| 382 | match = PARENT_MATCH |
| 383 | |
| 384 | # if it's a reg exp, then we just check if it matches |
| 385 | elif isinstance(i, typing.Pattern): |
| 386 | if i.match(path): |
| 387 | match = MATCH |
| 388 | break |
| 389 | else: |
| 390 | self.logger.warning('Invalid pattern in included_files: %s' % str(i)) |
| 391 | |
| 392 | if match == NO_MATCH: |
| 393 | return NO_MATCH |
| 394 | |
| 395 | # when a directory is in PARENT_MATCH, it doesn't matter if it's |
        # excluded. Its subfiles will be excluded, but the directory itself
| 397 | # won't |
| 398 | if match != PARENT_MATCH: |
| 399 | for e in self.excluded_files: |
| 400 | # it can be either a regexp or a string |
| 401 | if isinstance(e, str): |
| 402 | # if the string matches, then exclude |
| 403 | if e == path: |
| 404 | return NO_MATCH |
| 405 | |
| 406 | # if the string ends with / it's a directory, and if the |
| 407 | # path starts with the directory, then exclude |
| 408 | if e.endswith('/') and path.startswith(e): |
| 409 | return NO_MATCH |
| 410 | |
| 411 | # if the string doesn't end with /, do the same check with |
| 412 | # the slash added |
| 413 | elif path.startswith(e + '/'): |
| 414 | return NO_MATCH |
| 415 | |
| 416 | # if it's a reg exp, then we just check if it matches |
| 417 | elif isinstance(e, typing.Pattern): |
| 418 | if e.match(path): |
| 419 | return NO_MATCH |
| 420 | else: |
| 421 | self.logger.warning('Invalid pattern in excluded_files: %s' % str(e)) |
| 422 | |
| 423 | if self.filter_func: |
| 424 | return self.filter_func(path) |
| 425 | |
| 426 | return match |
| 427 | |
| 428 | def _recursive_walk_dir(self, source_path, keep_base_dir=False): |
| 429 | ''' |
| 430 | Walk a directory recursively, yielding each file/directory |
| 431 | |
        Yields the path of each entity. If ``keep_base_dir`` is set,
        the yielded path contains the prefix ``source_path``; otherwise it is
        relative to the prefix.
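
        Example (a hypothetical tree 'src/' containing 'a.txt' and
        'sub/b.txt', with no filters set):

            list(self._recursive_walk_dir('src'))
            # -> ['a.txt', 'sub', 'sub/b.txt']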
| 435 | ''' |
| 436 | |
| 437 | source_path = source_path.rstrip(os.sep) |
| 438 | |
| 439 | if keep_base_dir: |
| 440 | beginning_size = 0 |
| 441 | else: |
| 442 | beginning_size = len(source_path) + 1 # +1 for os.sep |
| 443 | |
| 444 | queue = [source_path] |
| 445 | |
| 446 | while queue: |
| 447 | cur_path = queue.pop(0) |
| 448 | |
| 449 | try: |
| 450 | dfd = os.open (cur_path, os.O_DIRECTORY) |
| 451 | except FileNotFoundError as exn: |
| 452 | self.logger.warning ("failed to open entity [%s] as directory; " |
| 453 | "file system (error: %s); skipping" |
| 454 | % (cur_path, str (exn))) |
| 455 | continue |
| 456 | |
| 457 | try: |
| 458 | for filename in sorted(os.listdir(dfd)): |
| 459 | child = os.path.join(cur_path, filename) |
| 460 | is_dir = os.path.isdir(child) |
| 461 | status = self.filter_path(child, source_path, is_dir) |
| 462 | if status == NO_MATCH: |
| 463 | continue |
| 464 | if not os.access(child, os.R_OK): |
| 465 | self.logger.warning('Error accessing possibly locked file %s' % child) |
| 466 | continue |
| 467 | |
| 468 | if status == MATCH: |
| 469 | yield child[beginning_size:] |
| 470 | |
| 471 | if is_dir and (status == MATCH or status == PARENT_MATCH): |
| 472 | queue.append(child) |
| 473 | finally: |
| 474 | os.close (dfd) |
| 475 | |
| 476 | def _stat_dict(self, path): |
| 477 | ''' |
| 478 | Returns a dict with the stat data used to compare files |
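
        Example of the returned structure (all values illustrative):

            {'type': 'file', 'path': 'docs/a.txt', 'mode': 33188,
             'mtime': 1364637600, 'ctime': 1364637600, 'uid': 1000,
             'gid': 1000, 'inode': 131077, 'size': 42}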
| 479 | ''' |
| 480 | stinfo = os.stat(path) |
| 481 | mode = stinfo.st_mode |
| 482 | |
| 483 | ptype = None |
| 484 | if stat.S_ISDIR(mode): |
| 485 | ptype = u'directory' |
| 486 | elif stat.S_ISREG(mode): |
| 487 | ptype = u'file' |
| 488 | elif stat.S_ISLNK(mode): |
| 489 | ptype = u'link' |
| 490 | |
| 491 | return { |
| 492 | u'type': ptype, |
| 493 | u'path': path, |
| 494 | u'mode': mode, |
| 495 | u'mtime': int(stinfo.st_mtime), |
| 496 | u'ctime': int(stinfo.st_ctime), |
| 497 | u'uid': stinfo.st_uid, |
| 498 | u'gid': stinfo.st_gid, |
| 499 | u'inode': stinfo.st_ino, |
| 500 | u'size': stinfo.st_size |
| 501 | } |
| 502 | |
| 503 | def _equal_stat_dicts(self, d1, d2, listsnapshot_equal=False): |
| 504 | ''' |
        Return whether the dicts are equal in the stat keys
| 506 | ''' |
        keys = [u'type', u'mode', u'size', u'mtime',
| 508 | # not restored: u'inode', u'ctime' |
| 509 | ] |
| 510 | |
| 511 | # only if user is root, then also check gid/uid. otherwise do not check it, |
| 512 | # because tarfile can chown in case of being superuser only |
| 513 | # |
| 514 | # also, skip the check in rpmbuild since the sources end up with the |
| 515 | # uid:gid of the packager while the extracted files are 0:0. |
| 516 | if hasattr(os, "geteuid") and os.geteuid() == 0 \ |
| 517 | and os.getenv ("RPMBUILD_OPTIONS") is None: |
| 518 | keys.append('gid') |
| 519 | keys.append('uid') |
| 520 | |
        if (not d1 and d2 is not None) or (d1 is not None and not d2):
| 522 | return False |
| 523 | |
| 524 | if self.prefixed(d1.get('path', -1), listsnapshot_equal) != self.prefixed(d2.get('path', -2), listsnapshot_equal): |
| 525 | return False |
| 526 | |
| 527 | type = d1.get('type', '') |
| 528 | |
| 529 | for key in keys: |
| 530 | # size doesn't matter for directories |
| 531 | if type == 'directory' and key == 'size': |
| 532 | continue |
| 533 | if d1.get(key, -1) != d2.get(key, -2): |
| 534 | return False |
| 535 | return True |
| 536 | |
| 537 | def prefixed(self, path, listsnapshot_equal=False): |
| 538 | ''' |
| 539 | if a path is not prefixed, return it prefixed |
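
        Example: prefixed('etc/passwd') returns 'snapshot://etc/passwd';
        with listsnapshot_equal=True, prefixed('list://etc/passwd') also
        returns 'snapshot://etc/passwd'.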
| 540 | ''' |
| 541 | for prefix in self.__path_prefix_list: |
| 542 | if path.startswith(prefix): |
| 543 | if listsnapshot_equal and prefix == u'list://': |
| 544 | return u'snapshot://' + path[len(prefix):] |
| 545 | return path |
| 546 | return u'snapshot://' + path |
| 547 | |
| 548 | def unprefixed(self, path): |
| 549 | ''' |
| 550 | remove a path prefix if any |
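
        Example: unprefixed('delete://etc/passwd') returns 'etc/passwd';
        a path without a known prefix is returned unchanged.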
| 551 | ''' |
| 552 | for prefix in self.__path_prefix_list: |
| 553 | if path.startswith(prefix): |
| 554 | return path[len(prefix):] |
| 555 | return path |
| 556 | |
| 557 | |
| 558 | def initialize_encryption (self, mode, strict_validation=True): |
| 559 | """ |
        :type mode: int
        :param mode: Direction of the operation; either CRYPTO_MODE_ENCRYPT
                     or CRYPTO_MODE_DECRYPT.
        :type strict_validation: bool
        :param strict_validation: Enable strict IV checking in the crypto
                                  layer. Should be disabled when dealing with
                                  potentially corrupted data.
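
        Illustrative use, mirroring the call sites in this class:

            self.encryptor = self.initialize_encryption (CRYPTO_MODE_ENCRYPT)
            self.decryptor = self.initialize_encryption (CRYPTO_MODE_DECRYPT,
                                                         strict_validation=False)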
| 564 | """ |
| 565 | password = self.password |
| 566 | key = self.crypto_key |
| 567 | nacl = self.nacl |
| 568 | |
| 569 | if key is None and password is None: |
| 570 | return |
| 571 | if mode == CRYPTO_MODE_ENCRYPT: |
| 572 | return crypto.Encrypt (password=password, |
| 573 | key=key, |
| 574 | nacl=nacl, |
| 575 | version=self.crypto_version, |
| 576 | paramversion=self.crypto_paramversion) |
| 577 | if mode == CRYPTO_MODE_DECRYPT: |
| 578 | return crypto.Decrypt (password=password, key=key, |
| 579 | strict_ivs=strict_validation) |
| 580 | |
| 581 | raise Exception ("invalid encryption mode [%r]" % mode) |
| 582 | |
| 583 | |
| 584 | def open_auxiliary_file(self, path, mode='r', kind=AUXILIARY_FILE_INDEX, |
| 585 | strict_validation=True): |
| 586 | ''' |
| 587 | Given the specified configuration, opens a file for reading or writing, |
| 588 | inheriting the encryption and compression settings from the backup. |
| 589 | Returns a file object ready to use. |
| 590 | |
| 591 | :param mode: IO mode (read or write, ``"r"`` and ``"w"``, |
| 592 | respectively). |
| 593 | :type mode: str |
        :param kind: Role of the file, see AUXILIARY_FILE_* constants.
                     Both the index and the info file have a globally
                     unique, constant counter value.
        :type kind: int
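
        Illustrative use, mirroring the index handling in the backup
        methods below:

            index_sink = self.open_auxiliary_file(index_path, 'w')
            index_sink.write(s)    # s: a JSON line encoded as bytes
            index_sink.close (close_fileobj=True)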
| 598 | ''' |
| 599 | if self.index_mode.startswith('gz'): |
| 600 | comptype = 'gz' |
| 601 | elif self.index_mode.startswith('bz2'): |
| 602 | comptype = 'bz2' |
| 603 | else: |
| 604 | comptype = 'tar' |
| 605 | |
| 606 | crypto_ctx = None |
| 607 | enccounter = None |
| 608 | if mode == "w": |
| 609 | crypto_ctx = self.initialize_encryption (CRYPTO_MODE_ENCRYPT) |
| 610 | elif mode == "r": |
| 611 | crypto_ctx = self.initialize_encryption (CRYPTO_MODE_DECRYPT, |
| 612 | strict_validation=strict_validation) |
| 613 | |
| 614 | if crypto_ctx is not None: |
| 615 | if kind == AUXILIARY_FILE_INFO: |
| 616 | enccounter = crypto.AES_GCM_IV_CNT_INFOFILE |
| 617 | elif kind == AUXILIARY_FILE_INDEX: |
| 618 | enccounter = crypto.AES_GCM_IV_CNT_INDEX |
| 619 | else: |
| 620 | raise Exception ("invalid kind of aux file %r" % kind) |
| 621 | |
| 622 | sink = tarfile._Stream(name=path, mode=mode, comptype=comptype, |
| 623 | bufsize=tarfile.RECORDSIZE, fileobj=None, |
| 624 | encryption=crypto_ctx, enccounter=enccounter) |
| 625 | |
| 626 | return sink |
| 627 | |
| 628 | |
| 629 | def create_full_backup(self, source_path, backup_path, |
| 630 | max_volume_size=None, extra_data=dict()): |
| 631 | ''' |
| 632 | Creates a full backup. |
| 633 | |
| 634 | Parameters: |
| 635 | - source_path: source path to the directory to back up. |
| 636 | - backup_path: path where the back up will be stored. Backup path will |
| 637 | be created if not existent. |
| 638 | - max_volume_size: maximum volume size in megabytes. Used to split the |
| 639 | backup in volumes. Optional (won't split in volumes by default). |
| 640 | - extra_data: a json-serializable dictionary with information that you |
| 641 | want to be included in the header of the index file |
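
        Example (an illustrative sketch; paths are hypothetical):

            dtar = DeltaTar(mode='#gz', password='secret')
            dtar.create_full_backup(source_path='/srv/data',
                                    backup_path='/srv/backups/full',
                                    max_volume_size=50)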
| 642 | ''' |
| 643 | # check input |
| 644 | if not isinstance(source_path, str): |
| 645 | raise Exception('Source path must be a string') |
| 646 | |
| 647 | if not isinstance(backup_path, str): |
| 648 | raise Exception('Backup path must be a string') |
| 649 | |
| 650 | if not os.path.exists(source_path) or not os.path.isdir(source_path): |
| 651 | raise Exception('Source path "%s" does not exist or is not a '\ |
| 652 | 'directory' % source_path) |
| 653 | |
        if max_volume_size is not None and (not isinstance(max_volume_size, int)
                                            or max_volume_size < 1):
            raise Exception('max_volume_size must be a positive integer')
        if max_volume_size is not None:
            max_volume_size = max_volume_size * 1024 * 1024
| 659 | |
| 660 | if not isinstance(extra_data, dict): |
| 661 | raise Exception('extra_data must be a dictionary') |
| 662 | |
        try:
            extra_data_str = json.dumps(extra_data)
        except (TypeError, ValueError):
            raise Exception('extra_data is not json-serializable')
| 667 | |
| 668 | if not os.access(source_path, os.R_OK): |
| 669 | raise Exception('Source path "%s" is not readable' % source_path) |
| 670 | |
| 671 | # try to create backup path if needed |
| 672 | os.makedirs(backup_path, exist_ok=True) |
| 673 | |
| 674 | if not os.access(backup_path, os.W_OK): |
| 675 | raise Exception('Backup path "%s" is not writeable' % backup_path) |
| 676 | |
| 677 | if source_path.endswith('/'): |
| 678 | source_path = source_path[:-1] |
| 679 | |
| 680 | if backup_path.endswith('/'): |
| 681 | backup_path = backup_path[:-1] |
| 682 | |
| 683 | # update current time |
| 684 | self.current_time = datetime.datetime.now() |
| 685 | |
| 686 | if self.mode not in self.__file_extensions_dict: |
| 687 | raise Exception('Unrecognized extension') |
| 688 | |
| 689 | # setup for encrypting payload |
| 690 | if self.encryptor is None: |
| 691 | self.encryptor = self.initialize_encryption (CRYPTO_MODE_ENCRYPT) |
| 692 | |
| 693 | # some initialization |
| 694 | self.vol_no = 0 |
| 695 | |
| 696 | # generate the first volume name |
| 697 | vol_name = self.volume_name_func(backup_path, True, 0) |
| 698 | tarfile_path = os.path.join(backup_path, vol_name) |
| 699 | |
| 700 | # init index |
| 701 | index_name = self.index_name_func(True) |
| 702 | index_path = os.path.join(backup_path, index_name) |
| 703 | index_sink = self.open_auxiliary_file(index_path, 'w') |
| 704 | |
| 705 | cwd = os.getcwd() |
| 706 | |
| 707 | def new_volume_handler(deltarobj, cwd, backup_path, encryption, tarobj, base_name, volume_number): |
| 708 | ''' |
| 709 | Handles the new volumes |
| 710 | ''' |
| 711 | volume_name = deltarobj.volume_name_func(backup_path, True, volume_number) |
| 712 | volume_path = os.path.join(backup_path, volume_name) |
| 713 | deltarobj.vol_no = volume_number |
| 714 | |
| 715 | # we convert relative paths into absolute because CWD is changed |
| 716 | if not os.path.isabs(volume_path): |
| 717 | volume_path = os.path.join(cwd, volume_path) |
| 718 | |
| 719 | if tarobj.fileobj is not None: |
| 720 | tarobj.fileobj.close() |
| 721 | |
| 722 | deltarobj.logger.debug("opening volume %s" % volume_path) |
| 723 | |
| 724 | tarobj.open_volume(volume_path, encryption=encryption) |
| 725 | |
| 726 | # wraps some args from context into the handler |
| 727 | new_volume_handler = partial(new_volume_handler, self, cwd, backup_path, self.encryptor) |
| 728 | |
| 729 | index_sink.write(bytes('{"type": "python-delta-tar-index", "version": 1, "backup-type": "full", "extra_data": %s}\n' % extra_data_str, 'UTF-8')) |
| 730 | |
| 731 | s = bytes('{"type": "BEGIN-FILE-LIST"}\n', 'UTF-8') |
| 732 | # calculate checksum and write into the stream |
        crc = binascii.crc32(s) & 0xffffffff
| 734 | index_sink.write(s) |
| 735 | |
| 736 | # start creating the tarfile |
| 737 | tarobj = tarfile.TarFile.open(tarfile_path, |
| 738 | mode='w' + self.mode, |
| 739 | format=tarfile.GNU_FORMAT, |
| 740 | concat='#' in self.mode, |
| 741 | encryption=self.encryptor, |
| 742 | max_volume_size=max_volume_size, |
| 743 | new_volume_handler=new_volume_handler, |
| 744 | save_to_members=False, |
| 745 | dereference=True) |
| 746 | os.chdir(source_path) |
| 747 | |
| 748 | # for each file to be in the backup, do: |
| 749 | for path in self._recursive_walk_dir('.'): |
| 750 | |
| 751 | try: # backup file |
| 752 | # calculate stat dict for current file |
| 753 | statd = self._stat_dict(path) |
| 754 | statd['path'] = u'snapshot://' + statd['path'] |
| 755 | statd['volume'] = self.vol_no |
| 756 | |
| 757 | # backup file |
| 758 | tarobj.add(path, arcname = statd['path'], recursive=False) |
| 759 | except FileNotFoundError as exn: |
| 760 | # file vanished since the call to access(3) above |
| 761 | self.logger.warning ("object [%s] no longer available in " |
| 762 | "file system (error: %s); skipping" |
| 763 | % (path, str (exn))) |
| 764 | continue # prevent indexing |
| 765 | |
| 766 | # retrieve file offset |
| 767 | statd['offset'] = tarobj.get_last_member_offset() |
| 768 | self.logger.debug("backup %s" % statd['path']) |
| 769 | |
| 770 | # store the stat dict in the index |
| 771 | s = bytes(json.dumps(statd) + '\n', 'UTF-8') |
| 772 | crc = binascii.crc32(s, crc) & 0xffffffff |
| 773 | index_sink.write(s) |
| 774 | |
| 775 | s = bytes('{"type": "END-FILE-LIST"}\n', 'UTF-8') |
| 776 | crc = binascii.crc32(s, crc) & 0xffffffff |
| 777 | index_sink.write(s) |
| 778 | s = bytes('{"type": "file-list-checksum", "checksum": %d}\n' % crc, 'UTF-8') |
| 779 | index_sink.write(s) |
| 780 | |
| 781 | os.chdir(cwd) |
| 782 | tarobj.close() |
| 783 | index_sink.close (close_fileobj=True) |
| 784 | |
| 785 | def create_diff_backup(self, source_path, backup_path, previous_index_path, |
| 786 | max_volume_size=None, extra_data=dict()): |
| 787 | ''' |
        Creates a differential backup.
| 789 | |
| 790 | Parameters: |
| 791 | - source_path: source path to the directory to back up. |
| 792 | - backup_path: path where the back up will be stored. Backup path will |
| 793 | be created if not existent. |
| 794 | - previous_index_path: index of the previous backup, needed to know |
| 795 | which files changed since then. |
| 796 | - max_volume_size: maximum volume size in megabytes (MB). Used to split |
| 797 | the backup in volumes. Optional (won't split in volumes by default). |
| 798 | |
| 799 | NOTE: previous index is assumed to follow exactly the same format as |
| 800 | the index_mode setup in the constructor. |
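
        Example (an illustrative sketch; paths are hypothetical):

            dtar.create_diff_backup(
                source_path='/srv/data',
                backup_path='/srv/backups/diff',
                previous_index_path='/srv/backups/full/bfull-2013-03-30-1102.index.gz')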
| 801 | ''' |
| 802 | # check/sanitize input |
| 803 | if not isinstance(source_path, str): |
| 804 | raise Exception('Source path must be a string') |
| 805 | |
| 806 | if not isinstance(backup_path, str): |
| 807 | raise Exception('Backup path must be a string') |
| 808 | |
| 809 | if not os.path.exists(source_path) or not os.path.isdir(source_path): |
| 810 | raise Exception('Source path "%s" does not exist or is not a '\ |
| 811 | 'directory' % source_path) |
| 812 | |
| 813 | if not isinstance(extra_data, dict): |
| 814 | raise Exception('extra_data must be a dictionary') |
| 815 | |
        try:
            extra_data_str = json.dumps(extra_data)
        except (TypeError, ValueError):
            raise Exception('extra_data is not json-serializable')
| 820 | |
| 821 | if not os.access(source_path, os.R_OK): |
| 822 | raise Exception('Source path "%s" is not readable' % source_path) |
| 823 | |
        if max_volume_size is not None and (not isinstance(max_volume_size, int)
                                            or max_volume_size < 1):
            raise Exception('max_volume_size must be a positive integer')
        if max_volume_size is not None:
            max_volume_size = max_volume_size * 1024 * 1024
| 829 | |
| 830 | if not isinstance(previous_index_path, str): |
            raise Exception('previous_index_path must be a string')
| 832 | |
| 833 | if not os.path.exists(previous_index_path) or not os.path.isfile(previous_index_path): |
| 834 | raise Exception('Index path "%s" does not exist or is not a '\ |
| 835 | 'file' % previous_index_path) |
| 836 | |
| 837 | if not os.access(previous_index_path, os.R_OK): |
| 838 | raise Exception('Index path "%s" is not readable' % previous_index_path) |
| 839 | |
| 840 | # try to create backup path if needed |
| 841 | os.makedirs(backup_path, exist_ok=True) |
| 842 | |
| 843 | if not os.access(backup_path, os.W_OK): |
| 844 | raise Exception('Backup path "%s" is not writeable' % backup_path) |
| 845 | |
| 846 | if source_path.endswith('/'): |
| 847 | source_path = source_path[:-1] |
| 848 | |
| 849 | if backup_path.endswith('/'): |
| 850 | backup_path = backup_path[:-1] |
| 851 | |
| 852 | # update current time |
| 853 | self.current_time = datetime.datetime.now() |
| 854 | |
| 855 | if self.mode not in self.__file_extensions_dict: |
| 856 | raise Exception('Unrecognized extension') |
| 857 | |
| 858 | # setup for encrypting payload |
| 859 | if self.encryptor is None: |
| 860 | self.encryptor = self.initialize_encryption (CRYPTO_MODE_ENCRYPT) |
| 861 | |
| 862 | # some initialization |
| 863 | self.vol_no = 0 |
| 864 | |
| 865 | # generate the first volume name |
| 866 | vol_name = self.volume_name_func(backup_path, is_full=False, |
| 867 | volume_number=0) |
| 868 | tarfile_path = os.path.join(backup_path, vol_name) |
| 869 | |
| 870 | # init index |
| 871 | cwd = os.getcwd() |
| 872 | |
| 873 | index_name = self.index_name_func(is_full=False) |
| 874 | index_path = os.path.join(backup_path, index_name) |
| 875 | index_sink = self.open_auxiliary_file(index_path, 'w') |
| 876 | |
| 877 | def new_volume_handler(deltarobj, cwd, backup_path, tarobj, base_name, volume_number): |
| 878 | ''' |
| 879 | Handles the new volumes |
| 880 | ''' |
| 881 | volume_name = deltarobj.volume_name_func(backup_path, is_full=False, |
| 882 | volume_number=volume_number) |
| 883 | volume_path = os.path.join(backup_path, volume_name) |
| 884 | deltarobj.vol_no = volume_number |
| 885 | |
| 886 | # we convert relative paths into absolute because CWD is changed |
| 887 | if not os.path.isabs(volume_path): |
| 888 | volume_path = os.path.join(cwd, volume_path) |
| 889 | |
| 890 | deltarobj.logger.debug("opening volume %s" % volume_path) |
| 891 | tarobj.open_volume(volume_path) |
| 892 | |
| 893 | # wraps some args from context into the handler |
| 894 | new_volume_handler = partial(new_volume_handler, self, cwd, backup_path) |
| 895 | |
| 896 | index_sink.write(bytes('{"type": "python-delta-tar-index", "version": 1, "backup-type": "diff", "extra_data": %s}\n' % extra_data_str, 'UTF-8')) |
| 897 | |
| 898 | s = bytes('{"type": "BEGIN-FILE-LIST"}\n', 'UTF-8') |
| 899 | # calculate checksum and write into the stream |
        crc = binascii.crc32(s) & 0xffffffff
| 901 | index_sink.write(s) |
| 902 | |
| 903 | # start creating the tarfile |
| 904 | tarobj = tarfile.TarFile.open(tarfile_path, |
| 905 | mode='w' + self.mode, |
| 906 | format=tarfile.GNU_FORMAT, |
| 907 | concat='#' in self.mode, |
| 908 | encryption=self.encryptor, |
| 909 | max_volume_size=max_volume_size, |
| 910 | new_volume_handler=new_volume_handler, |
| 911 | save_to_members=False, |
| 912 | dereference=True) |
| 913 | |
| 914 | |
| 915 | # create the iterators, first the previous index iterator, then the |
| 916 | # source path directory iterator and collate and iterate them |
| 917 | if not os.path.isabs(previous_index_path): |
| 918 | previous_index_path = os.path.join(cwd, previous_index_path) |
| 919 | index_it = self.iterate_index_path(previous_index_path) |
| 920 | |
| 921 | os.chdir(source_path) |
| 922 | dir_it = self._recursive_walk_dir('.') |
| 923 | dir_path_it = self.jsonize_path_iterator(dir_it) |
| 924 | |
        def pr(path):
            # debugging helper: render an index entry (or None) printably
            if not path:
                return "None"
            else:
                return path["path"]
| 930 | |
| 931 | # for each file to be in the backup, do: |
| 932 | for ipath, dpath, l_no in self.collate_iterators(index_it, dir_path_it): |
| 933 | action = None |
| 934 | # if file is not in the index, it means it's a new file, so we have |
| 935 | # to take a snapshot |
| 936 | |
| 937 | if not ipath: |
| 938 | action = 'snapshot' |
| 939 | # if the file is not in the directory iterator, it means that it has |
| 940 | # been deleted, so we need to mark it as such |
| 941 | elif not dpath: |
| 942 | action = 'delete' |
| 943 | # if the file is in both iterators, it means it might have either |
| 944 | # not changed (in which case we will just list it in our index but |
| 945 | # it will not be included in the tar file), or it might have |
| 946 | # changed, in which case we will snapshot it. |
| 947 | elif ipath and dpath: |
| 948 | if self._equal_stat_dicts(ipath, dpath): |
| 949 | action = 'list' |
| 950 | else: |
| 951 | action = 'snapshot' |
| 952 | # TODO: when creating chained backups (i.e. diffing from another |
| 953 | # diff), we will need to detect the type of action in the previous |
| 954 | # index, because if it was delete and dpath is None, we should |
| 955 | # discard the file |
| 956 | |
| 957 | if action == 'snapshot': |
| 958 | # calculate stat dict for current file |
| 959 | stat = dpath.copy() |
| 960 | stat['path'] = "snapshot://" + dpath['path'] |
| 961 | stat['volume'] = self.vol_no |
| 962 | |
| 963 | self.logger.debug("[STORE] %s" % dpath['path']) |
| 964 | |
| 965 | try: # backup file |
| 966 | tarobj.add(dpath['path'], arcname=stat['path'], recursive=False) |
| 967 | # retrieve file offset |
| 968 | stat['offset'] = tarobj.get_last_member_offset() |
| 969 | except FileNotFoundError as exn: |
| 970 | # file vanished since the call to access(3) above |
| 971 | self.logger.warning ("object [%s] no longer available in " |
| 972 | "file system (error: %s); skipping" |
| 973 | % (dpath ["path"], str (exn))) |
| 974 | stat = None # prevent indexing |
| 975 | |
| 976 | elif action == 'delete': |
| 977 | path = self.unprefixed(ipath['path']) |
| 978 | stat = { |
| 979 | u'path': u'delete://' + path, |
| 980 | u'type': ipath['type'] |
| 981 | } |
| 982 | self.logger.debug("[DELETE] %s" % path) |
| 983 | |
| 984 | # mark it as deleted in the backup |
| 985 | tarobj.add("/dev/null", arcname=stat['path']) |
| 986 | elif action == 'list': |
| 987 | stat = dpath.copy() |
| 988 | path = self.unprefixed(ipath['path']) |
| 989 | stat['path'] = u'list://' + path |
| 990 | # unchanged files do not enter in the backup, only in the index |
| 991 | self.logger.debug("[UNCHANGED] %s" % path) |
| 992 | else: |
| 993 | # should not happen |
| 994 | self.logger.warning('unknown action in create_diff_backup: {0}' |
| 995 | ''.format(action)) |
| 996 | stat = None |
| 997 | |
| 998 | if stat: |
| 999 | # store the stat dict in the index |
| 1000 | s = bytes(json.dumps(stat) + '\n', 'UTF-8') |
| 1001 | crc = binascii.crc32(s, crc) & 0xffffffff |
| 1002 | index_sink.write(s) |
| 1003 | |
| 1004 | s = bytes('{"type": "END-FILE-LIST"}\n', 'UTF-8') |
| 1005 | crc = binascii.crc32(s, crc) & 0xffffffff |
| 1006 | index_sink.write(s) |
| 1007 | s = bytes('{"type": "file-list-checksum", "checksum": %d}\n' % crc, 'UTF-8') |
| 1008 | index_sink.write(s) |
| 1009 | |
| 1010 | index_it.release() |
| 1011 | os.chdir(cwd) |
| 1012 | tarobj.close() |
| 1013 | index_sink.close() |
| 1014 | |
| 1015 | |
| 1016 | def iterate_index_path(self, index_path, strict_validation=True): |
| 1017 | ''' |
| 1018 | Returns an index iterator. Internally, it uses a classic iterator class. |
| 1019 | We do that instead of just yielding so that the iterator object can have |
| 1020 | an additional function to close the file descriptor that is opened in |
| 1021 | the constructor. |
| 1022 | ''' |
| 1023 | |
| 1024 | class IndexPathIterator(object): |
| 1025 | def __init__(self, delta_tar, index_path): |
| 1026 | self.delta_tar = delta_tar |
| 1027 | self.index_path = index_path |
| 1028 | self.f = None |
| 1029 | self.extra_data = dict() |
| 1030 | self.__enter__() |
| 1031 | |
| 1032 | def __iter__(self): |
| 1033 | return self |
| 1034 | |
| 1035 | def release(self): |
| 1036 | if self.f: |
| 1037 | self.f.close() |
| 1038 | |
| 1039 | def __enter__(self): |
| 1040 | ''' |
| 1041 | Allows this iterator to be used with the "with" statement |
| 1042 | ''' |
| 1043 | if self.f is None: |
| 1044 | self.f = self.delta_tar.open_auxiliary_file \ |
| 1045 | (self.index_path, |
| 1046 | 'r', |
| 1047 | strict_validation=strict_validation) |
| 1048 | # check index header |
| 1049 | j, l_no = self.delta_tar._parse_json_line(self.f, 0) |
| 1050 | if j.get("type", '') != 'python-delta-tar-index' or\ |
| 1051 | j.get('version', -1) != 1: |
| 1052 | raise Exception("invalid index file format: %s" % json.dumps(j)) |
| 1053 | |
| 1054 | self.extra_data = j.get('extra_data', dict()) |
| 1055 | |
| 1056 | # find BEGIN-FILE-LIST, ignore other headers |
| 1057 | while True: |
| 1058 | j, l_no = self.delta_tar._parse_json_line(self.f, l_no) |
| 1059 | if j.get('type', '') == 'BEGIN-FILE-LIST': |
| 1060 | break |
| 1061 | return self |
| 1062 | |
| 1063 | def __exit__(self, type, value, tb): |
| 1064 | ''' |
| 1065 | Allows this iterator to be used with the "with" statement |
| 1066 | ''' |
| 1067 | if self.f: |
| 1068 | self.f.close() |
| 1069 | self.f = None |
| 1070 | |
| 1071 | def __next__(self): |
| 1072 | # read each file in the index and process it to do the restore |
| 1073 | j = {} |
| 1074 | l_no = -1 |
| 1075 | try: |
| 1076 | j, l_no = self.delta_tar._parse_json_line(self.f, l_no) |
| 1077 | except Exception as e: |
| 1078 | if self.f: |
| 1079 | self.f.close() |
| 1080 | raise e |
| 1081 | |
| 1082 | op_type = j.get('type', '') |
| 1083 | |
| 1084 | # when we detect the end of the list, break the loop |
| 1085 | if op_type == 'END-FILE-LIST': |
| 1086 | if self.f: |
| 1087 | self.f.close() |
| 1088 | raise StopIteration |
| 1089 | |
| 1090 | # check input |
| 1091 | if op_type not in ['directory', 'file', 'link']: |
| 1092 | self.delta_tar.logger.warning('unrecognized type to be ' |
| 1093 | 'restored: %s, line %d' % (op_type, l_no)) |
| 1094 | # iterate again |
| 1095 | return self.__next__() |
| 1096 | |
| 1097 | return j, l_no |
| 1098 | |
| 1099 | return IndexPathIterator(self, index_path) |
| 1100 | |
| 1101 | def iterate_tar_path(self, tar_path, new_volume_handler=None): |
| 1102 | ''' |
| 1103 | Returns a tar iterator that iterates jsonized member items that contain |
| 1104 | an additional "member" field, used by RestoreHelper. |
| 1105 | ''' |
| 1106 | class TarPathIterator(object): |
| 1107 | def __init__(self, delta_tar, tar_path, new_volume_handler=None): |
| 1108 | self.delta_tar = delta_tar |
| 1109 | self.tar_path = tar_path |
| 1110 | self.tar_obj = None |
| 1111 | self.last_member = None |
| 1112 | self.new_volume_handler = new_volume_handler |
| 1113 | self.__enter__() |
| 1114 | |
| 1115 | def __iter__(self): |
| 1116 | return self |
| 1117 | |
| 1118 | def release(self): |
| 1119 | if self.tar_obj: |
| 1120 | self.tar_obj.close() |
| 1121 | |
| 1122 | def __enter__(self): |
| 1123 | ''' |
| 1124 | Allows this iterator to be used with the "with" statement |
| 1125 | ''' |
| 1126 | if self.tar_obj is None: |
| 1127 | decryptor = None |
                    if self.delta_tar.password is not None or \
                       self.delta_tar.crypto_key is not None:
| 1129 | decryptor = crypto.Decrypt \ |
| 1130 | (password=self.delta_tar.password, |
| 1131 | key=self.delta_tar.crypto_key, |
| 1132 | strict_ivs=False) |
| 1133 | self.tar_obj = tarfile.TarFile.open(self.tar_path, |
| 1134 | mode='r' + self.delta_tar.mode, |
| 1135 | format=tarfile.GNU_FORMAT, |
| 1136 | concat='#' in self.delta_tar.mode, |
| 1137 | encryption=decryptor, |
| 1138 | new_volume_handler=self.new_volume_handler, |
| 1139 | save_to_members=False, |
| 1140 | dereference=True) |
| 1141 | return self |
| 1142 | |
| 1143 | def __exit__(self, type, value, tb): |
| 1144 | ''' |
| 1145 | Allows this iterator to be used with the "with" statement |
| 1146 | ''' |
| 1147 | if self.tar_obj: |
| 1148 | self.tar_obj.close() |
| 1149 | self.tar_obj = None |
| 1150 | |
| 1151 | def __next__(self): |
| 1152 | ''' |
| 1153 | Read each member and return it as a stat dict |
| 1154 | ''' |
| 1155 | tarinfo = self.tar_obj.__iter__().__next__() |
| 1156 | # NOTE: here we compare if tarinfo.path is the same as before |
| 1157 | # instead of comparing the tarinfo object itself because the |
| 1158 | # object itself might change for multivol tarinfos |
| 1159 | if tarinfo is None or (self.last_member is not None and\ |
| 1160 | self.delta_tar.unprefixed(tarinfo.path) == self.delta_tar.unprefixed(self.last_member.path)): |
| 1161 | raise StopIteration |
| 1162 | |
| 1163 | self.last_member = tarinfo |
| 1164 | |
| 1165 | ptype = 'unknown' |
| 1166 | if tarinfo.isfile(): |
| 1167 | ptype = 'file' |
| 1168 | elif tarinfo.isdir(): |
| 1169 | ptype = 'directory' |
| 1170 | elif tarinfo.islnk() or tarinfo.issym(): |
| 1171 | ptype = 'link' |
| 1172 | |
| 1173 | return { |
| 1174 | u'type': ptype, |
| 1175 | u'path': tarinfo.path, |
| 1176 | u'mode': tarinfo.mode, |
| 1177 | u'mtime': tarinfo.mtime, |
| 1178 | u'ctime': -1, # cannot restore |
| 1179 | u'uid': tarinfo.uid, |
| 1180 | u'gid': tarinfo.gid, |
| 1181 | u'inode': -1, # cannot restore |
| 1182 | u'size': tarinfo.size, |
| 1183 | u'member': tarinfo |
| 1184 | }, 0 |
| 1185 | |
| 1186 | return TarPathIterator(self, tar_path, new_volume_handler) |
| 1187 | |
| 1188 | def jsonize_path_iterator(self, iter, strip=0): |
| 1189 | ''' |
| 1190 | converts the yielded items of an iterator into json path lines. |
| 1191 | |
| 1192 | strip: Strip the smallest prefix containing num leading slashes from |
| 1193 | the file path. |
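
        Example (illustrative): each yielded item is a pair of a stat
        dict and a constant line number of 0, e.g.

            ({'type': 'file', 'path': 'a/b.txt', 'size': 42, ...}, 0)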
| 1194 | ''' |
| 1195 | while True: |
| 1196 | try: |
| 1197 | path = iter.__next__() |
| 1198 | if strip == 0: |
| 1199 | yield self._stat_dict(path), 0 |
| 1200 | else: |
| 1201 | st = self._stat_dict(path) |
| 1202 | st['path'] = "/".join(path.split("/")[strip:]) |
| 1203 | yield st, 0 |
| 1204 | except StopIteration: |
| 1205 | break |
| 1206 | |
| 1207 | def iterate_disaster_index (self, index): |
| 1208 | """ |
        Mimic the behavior of the other object iterators, just with the inputs
| 1210 | supplied directly as *index*. |
| 1211 | """ |
| 1212 | |
| 1213 | class RawIndexIterator(object): |
| 1214 | def __init__(self, delta_tar, index): |
| 1215 | self.delta_tar = delta_tar |
| 1216 | self.index = index |
| 1217 | self.__enter__() |
| 1218 | |
| 1219 | def __iter__(self): |
| 1220 | return self |
| 1221 | |
| 1222 | def release(self): |
| 1223 | pass |
| 1224 | |
| 1225 | def __enter__(self): |
| 1226 | ''' |
| 1227 | Allows this iterator to be used with the "with" statement |
| 1228 | ''' |
| 1229 | self.iter = self.index.__iter__ () |
| 1230 | return self |
| 1231 | |
| 1232 | def __exit__(self, type, value, tb): |
| 1233 | ''' |
| 1234 | Allows this iterator to be used with the "with" statement |
| 1235 | ''' |
| 1236 | |
| 1237 | def __next__(self): |
| 1238 | idxent = self.iter.__next__ () |
| 1239 | return idxent, 0 |
| 1240 | |
| 1241 | return RawIndexIterator(self, index) |
| 1242 | |
| 1243 | def collate_iterators(self, it1, it2): |
| 1244 | ''' |
| 1245 | Collate two iterators, so that it returns pairs of the items of each |
| 1246 | iterator (if the items are the same), or (None, elem2) or (elem1, None) |
| 1247 | when there's no match for the items in the other iterator. |
| 1248 | |
| 1249 | It assumes that the items in both lists are ordered in the same way. |
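
        Example (hypothetical entries identified by their paths): if it1
        produces 'a' and 'c' while it2 produces 'b' and 'c', the
        collation yields (a, None), (None, b) and (c, c), each with the
        current line number as third element.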
| 1250 | ''' |
| 1251 | l_no = 0 |
| 1252 | elem1, elem2 = None, None |
| 1253 | while True: |
| 1254 | if not elem1: |
| 1255 | try: |
| 1256 | elem1, l_no = it1.__next__() |
| 1257 | except StopIteration: |
| 1258 | if elem2: |
| 1259 | yield (None, elem2, l_no) |
| 1260 | for elem2 in it2: |
| 1261 | if isinstance(elem2, tuple): |
| 1262 | elem2 = elem2[0] |
| 1263 | yield (None, elem2, l_no) |
| 1264 | break |
| 1265 | if not elem2: |
| 1266 | try: |
| 1267 | elem2 = it2.__next__() |
| 1268 | if isinstance(elem2, tuple): |
| 1269 | elem2 = elem2[0] |
| 1270 | except StopIteration: |
| 1271 | if elem1: |
| 1272 | yield (elem1, None, l_no) |
| 1273 | for elem1, l_no in it1: |
| 1274 | yield (elem1, None, l_no) |
| 1275 | break |
| 1276 | |
| 1277 | index1 = self.unprefixed(elem1['path']) |
| 1278 | index2 = self.unprefixed(elem2['path']) |
| 1279 | i1, i2 = self.compare_indexes(index1, index2) |
| 1280 | |
| 1281 | yield1 = yield2 = None |
| 1282 | if i1 is not None: |
| 1283 | yield1 = elem1 |
| 1284 | elem1 = None |
| 1285 | if i2 is not None: |
| 1286 | yield2 = elem2 |
| 1287 | elem2 = None |
| 1288 | yield (yield1, yield2, l_no) |
| 1289 | |
| 1290 | def compare_indexes(self, index1, index2): |
| 1291 | ''' |
| 1292 | Compare iterator indexes and return a tuple in the following form: |
| 1293 | if index1 < index2, returns (index1, None) |
| 1294 | if index1 == index2 returns (index1, index2) |
| 1295 | else: returns (None, index2) |
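
        Examples: compare_indexes('a/b', 'a/c') returns ('a/b', None);
        compare_indexes('a', 'a/b') returns ('a', None), since paths
        with fewer components sort first.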
| 1296 | ''' |
| 1297 | l1 = index1.split('/') |
| 1298 | l2 = index2.split('/') |
| 1299 | length = len(l2) - len(l1) |
| 1300 | |
| 1301 | if length > 0: |
| 1302 | return (index1, None) |
| 1303 | elif length < 0: |
| 1304 | return (None, index2) |
| 1305 | |
| 1306 | for i1, i2 in zip(l1, l2): |
| 1307 | if i1 < i2: |
| 1308 | return (index1, None) |
| 1309 | elif i1 > i2: |
| 1310 | return (None, index2) |
| 1311 | |
| 1312 | return (index1, index2) |
| 1313 | |
| 1314 | def list_backup(self, backup_tar_path, list_func=None): |
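        '''
        Lists the contents of a backup tar file, calling *list_func* (or
        logging the path if no function is given) once per member.

        Parameters:
        - backup_tar_path: path to the backup tar file (the first volume,
          in case of a multivolume backup).
        - list_func: optional callback that receives each tarinfo object.
        '''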
| 1315 | if not isinstance(backup_tar_path, str): |
| 1316 | raise Exception('Backup tar path must be a string') |
| 1317 | |
| 1318 | if not os.path.exists(backup_tar_path) or not os.path.isfile(backup_tar_path): |
| 1319 | raise Exception('Source path "%s" does not exist or is not a '\ |
| 1320 | 'file' % backup_tar_path) |
| 1321 | |
| 1322 | if not os.access(backup_tar_path, os.R_OK): |
| 1323 | raise Exception('Source path "%s" is not readable' % backup_tar_path) |
| 1324 | |
| 1325 | cwd = os.getcwd() |
| 1326 | |
| 1327 | def new_volume_handler(deltarobj, cwd, backup_path, encryption, tarobj, base_name, volume_number): |
| 1328 | ''' |
| 1329 | Handles the new volumes |
| 1330 | ''' |
| 1331 | volume_name = deltarobj.volume_name_func(backup_path, True, |
| 1332 | volume_number, guess_name=True) |
| 1333 | volume_path = os.path.join(backup_path, volume_name) |
| 1334 | |
| 1335 | # we convert relative paths into absolute because CWD is changed |
| 1336 | if not os.path.isabs(volume_path): |
| 1337 | volume_path = os.path.join(cwd, volume_path) |
| 1338 | tarobj.open_volume(volume_path, encryption=encryption) |
| 1339 | |
| 1340 | if self.decryptor is None: |
| 1341 | self.decryptor = \ |
| 1342 | self.initialize_encryption (CRYPTO_MODE_DECRYPT, |
| 1343 | strict_validation=False) |
| 1344 | |
| 1345 | backup_path = os.path.dirname(backup_tar_path) |
| 1346 | if not os.path.isabs(backup_path): |
| 1347 | backup_path = os.path.join(cwd, backup_path) |
| 1348 | new_volume_handler = partial(new_volume_handler, self, cwd, backup_path, self.decryptor) |
| 1349 | |
| 1350 | tarobj = tarfile.TarFile.open(backup_tar_path, |
| 1351 | mode='r' + self.mode, |
| 1352 | format=tarfile.GNU_FORMAT, |
| 1353 | concat='#' in self.mode, |
| 1354 | encryption=self.decryptor, |
| 1355 | new_volume_handler=new_volume_handler, |
| 1356 | save_to_members=False, |
| 1357 | dereference=True) |
| 1358 | |
| 1359 | def filter(cls, list_func, tarinfo): |
| 1360 | if list_func is None: |
| 1361 | self.logger.info(tarinfo.path) |
| 1362 | else: |
| 1363 | list_func(tarinfo) |
| 1364 | return False |
| 1365 | filter = partial(filter, self, list_func) |
| 1366 | |
| 1367 | tarobj.extractall(filter=filter, unlink=True) |
| 1368 | tarobj.close() |
| 1369 | |
| 1370 | def restore_backup(self, target_path, backup_indexes_paths=[], |
| 1371 | backup_tar_path=None, restore_callback=None, |
| 1372 | disaster=tarfile.TOLERANCE_STRICT, backup_index=None, |
| 1373 | strict_validation=True): |
| 1374 | ''' |
| 1375 | Restores a backup. |
| 1376 | |
| 1377 | Parameters: |
| 1378 | - target_path: path to restore. |
| 1379 | - backup_indexes_paths: path to backup indexes, in descending date order. |
| 1380 | The indexes indicate the location of their respective backup volumes, |
| 1381 | and multiple indexes are needed to be able to restore diff backups. |
          Note that this is an optional parameter: if not supplied, it will
| 1383 | try to restore directly from backup_tar_path. |
| 1384 | - backup_tar_path: path to the backup tar file. Used as an alternative |
| 1385 | to backup_indexes_paths to restore directly from a tar file without |
| 1386 | using any file index. If it's a multivol tarfile, volume_name_func |
| 1387 | will be called. |
| 1388 | - restore_callback: callback function to be called during restore. |
| 1389 | This is passed to the helper and gets called for every file. |
| 1390 | |
| 1391 | NOTE: If you want to use an index to restore a backup, this function |
| 1392 | only supports to do so when the tarfile mode is either uncompressed or |
| 1393 | uses concat compress mode, because otherwise it would be very slow. |
| 1394 | |
| 1395 | NOTE: Indices are assumed to follow the same format as the index_mode |
| 1396 | specified in the constructor. |
| 1397 | |
| 1398 | Returns the list of files that could not be restored, if there were |
| 1399 | any. |
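
        Example (an illustrative sketch; paths are hypothetical, indexes
        ordered newest first):

            failed = dtar.restore_backup(
                target_path='/srv/restore',
                backup_indexes_paths=[
                    '/srv/backups/diff/bdiff-2013-04-01-0900.index.gz',
                    '/srv/backups/full/bfull-2013-03-30-1102.index.gz'])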
| 1400 | ''' |
| 1401 | # check/sanitize input |
| 1402 | if not isinstance(target_path, str): |
| 1403 | raise Exception('Target path must be a string') |
| 1404 | |
        if not backup_indexes_paths and backup_tar_path is None:
            raise Exception("You have to either provide index paths or a tar path")
| 1407 | |
        if isinstance(backup_index, list):
| 1409 | mode = "disaster" |
| 1410 | elif len(backup_indexes_paths) == 0: |
| 1411 | mode = "tar" |
| 1412 | else: |
| 1413 | mode = "diff" |
| 1414 | |
| 1415 | if mode == "tar": |
| 1416 | if not isinstance(backup_tar_path, str): |
| 1417 | raise Exception('Backup tar path must be a string') |
| 1418 | |
| 1419 | if not os.path.exists(backup_tar_path) or not os.path.isfile(backup_tar_path): |
| 1420 | raise Exception('Source path "%s" does not exist or is not a '\ |
| 1421 | 'file' % backup_tar_path) |
| 1422 | |
| 1423 | if not os.access(backup_tar_path, os.R_OK): |
| 1424 | raise Exception('Source path "%s" is not readable' % backup_tar_path) |
| 1425 | else: |
| 1426 | if not isinstance(backup_indexes_paths, list): |
| 1427 | raise Exception('backup_indexes_paths must be a list') |
| 1428 | |
| 1429 | if self.mode.startswith(':') or self.mode.startswith('|'): |
                raise Exception('Restore only supports either uncompressed tars'
                    ' or concat compression when restoring from an index, and'
                    ' the open mode you provided is "%s"' % self.mode)
| 1433 | |
| 1434 | for index in backup_indexes_paths: |
| 1435 | if not isinstance(index, str): |
| 1436 | raise Exception('indices must be strings') |
| 1437 | |
| 1438 | if not os.path.exists(index) or not os.path.isfile(index): |
| 1439 | raise Exception('Index path "%s" does not exist or is not a '\ |
| 1440 | 'file' % index) |
| 1441 | |
| 1442 | if not os.access(index, os.R_OK): |
| 1443 | raise Exception('Index path "%s" is not readable' % index) |
| 1444 | |
        # try to create target path if needed
| 1446 | os.makedirs(target_path, exist_ok=True) |
| 1447 | |
| 1448 | # make backup_tar_path absolute so that iterate_tar_path works fine |
| 1449 | if backup_tar_path and not os.path.isabs(backup_tar_path): |
| 1450 | backup_tar_path = os.path.abspath(backup_tar_path) |
| 1451 | |
| 1452 | cwd = os.getcwd() |
| 1453 | os.chdir(target_path) |
| 1454 | |
| 1455 | # setup for decrypting payload |
| 1456 | if self.decryptor is None: |
| 1457 | self.decryptor = \ |
| 1458 | self.initialize_encryption (CRYPTO_MODE_DECRYPT, |
| 1459 | strict_validation=strict_validation) |
| 1460 | |
| 1461 | if mode == 'tar': |
| 1462 | index_it = self.iterate_tar_path(backup_tar_path) |
| 1463 | helper = RestoreHelper(self, cwd, backup_path=backup_tar_path, |
| 1464 | tarobj=index_it.tar_obj) |
| 1465 | elif mode == "diff": |
| 1466 | helper = RestoreHelper(self, cwd, backup_indexes_paths, |
| 1467 | disaster=disaster) |
| 1468 | try: |
| 1469 | # get iterator from newest index at _data[0] |
| 1470 | index1 = helper._data[0]["path"] |
| 1471 | index_it = \ |
| 1472 | self.iterate_index_path(index1, |
| 1473 | strict_validation=strict_validation) |
| 1474 | except tarfile.DecryptionError as exn: |
| 1475 | self.logger.error("failed to decrypt file [%s]: %s; is this an " |
| 1476 | "actual encrypted index file?" |
| 1477 | % (index1, str (exn))) |
| 1478 | return [(index1, exn)] |
| 1479 | except Exception as exn: |
| 1480 | # e.g. a compressed file that cannot be parsed as a JSON index
| 1481 | self.logger.error("failed to read file [%s]: %s; is this an " |
| 1482 | "actual index file?" % (index1, str (exn))) |
| 1483 | return [(index1, exn)] |
| 1484 | elif mode == "disaster": |
| 1485 | index_it = self.iterate_disaster_index (backup_index) |
| 1486 | helper = RestoreHelper (self, cwd, backup_path=backup_tar_path, |
| 1487 | backup_index=backup_index, |
| 1488 | disaster=disaster) |
| 1489 | |
| 1490 | index_decryptor = helper._data[0]["decryptor"] |
| 1491 | |
| 1492 | dir_it = self._recursive_walk_dir('.') |
| 1493 | dir_path_it = self.jsonize_path_iterator(dir_it) |
| 1494 | |
| 1495 | failed = [] # irrecoverable files |
| 1496 | |
| 1497 | # for each file to be restored, do: |
| 1498 | for ipath, dpath, l_no in self.collate_iterators(index_it, dir_path_it): |
| 1499 | if not ipath: |
| 1500 | upath = dpath['path'] |
| 1501 | op_type = dpath['type'] |
| 1502 | else: |
| 1503 | upath = self.unprefixed(ipath['path']) |
| 1504 | op_type = ipath['type'] |
| 1505 | |
| 1506 | # filter paths |
| 1507 | if self.filter_path(upath, '', op_type == 'directory') == NO_MATCH: |
| 1508 | continue |
| 1509 | |
| 1510 | # if types of the file mismatch, the file needs to be deleted |
| 1511 | # and re-restored |
| 1512 | if ipath is not None and dpath is not None and\ |
| 1513 | dpath['type'] != ipath['type']: |
| 1514 | helper.delete(upath) |
| 1515 | |
| 1516 | # if file not found in dpath, we can directly restore from index |
| 1517 | if not dpath: |
| 1518 | # if the file doesn't exist and it needs to be deleted, it |
| 1519 | # means that work is already done |
| 1520 | if ipath['path'].startswith('delete://'): |
| 1521 | continue |
| 1522 | try: |
| 1523 | self.logger.debug("restore %s" % ipath['path']) |
| 1524 | helper.restore(ipath, l_no, restore_callback) |
| 1525 | except Exception as e: |
| 1526 | iipath = ipath.get ("path", "") |
| 1527 | self.logger.error("FAILED to restore: {} ({})" |
| 1528 | .format(iipath, e)) |
| 1529 | if disaster != tarfile.TOLERANCE_STRICT: |
| 1530 | failed.append ((iipath, e)) |
| 1531 | continue |
| 1532 | |
| 1533 | # if both files are equal, we have nothing to restore |
| 1534 | if self._equal_stat_dicts(ipath, dpath, listsnapshot_equal=True): |
| 1535 | continue |
| 1536 | |
| 1537 | # we have to restore the file, but first we need to delete the |
| 1538 | # current existing file. |
| 1539 | # we don't delete the file if it's a directory, because it might |
| 1540 | # just have changed mtime, so it's quite inefficient to remove |
| 1541 | # it |
| 1542 | if ipath: |
| 1543 | if ipath['type'] != 'directory' or ipath['path'].startswith('delete://'): |
| 1544 | helper.delete(upath) |
| 1545 | self.logger.debug("restore %s" % ipath['path']) |
| 1546 | try: |
| 1547 | helper.restore(ipath, l_no, restore_callback) |
| 1548 | except Exception as e: |
| 1549 | if disaster == tarfile.TOLERANCE_STRICT: |
| 1550 | raise |
| 1551 | failed.append ((ipath.get ("path", ""), e)) |
| 1552 | continue |
| 1553 | |
| 1554 | # if the file is not in the index (so it comes from the target |
| 1555 | # directory) then we have to delete it |
| 1556 | else: |
| 1557 | self.logger.debug("delete %s" % upath) |
| 1558 | helper.delete(upath) |
| 1559 | |
| 1560 | helper.restore_directories_permissions() |
| 1561 | index_it.release() |
| 1562 | os.chdir(cwd) |
| 1563 | helper.cleanup() |
| 1564 | |
| 1565 | return failed |
| 1566 | |
| 1567 | |
| 1568 | def recover_backup(self, target_path, backup_indexes_paths=[], |
| 1569 | restore_callback=None): |
| 1570 | """ |
| 1571 | Walk the index, extracting objects in disaster mode. Bad files are |
| 1572 | reported along with a reason. |
| 1573 | |
| 1574 | *Security considerations*: In *recovery mode* the headers of encrypted |
| 1575 | objects are assumed damaged and GCM tags are not validated so |
| 1576 | modification of cryptographically relevant parts of the header (more |
| 1577 | specifically, the initialization vectors) can no longer be detected. If
| 1578 | an attacker can manipulate the encrypted backup set and has access to |
| 1579 | the plaintext of some of the contents, they may be able to obtain the |
| 1580 | plaintext of other encrypted objects by injecting initialization |
| 1581 | vectors. For this reason *recovery mode* should only be used in
| 1582 | emergency situations, and the contents of the resulting files should be
| 1583 | validated manually if possible and not be disclosed to untrusted |
| 1584 | parties. |
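|      | 
|      | A minimal usage sketch (hypothetical paths):
|      | 
|      |     failed = dtar.recover_backup('/srv/restore',
|      |         backup_indexes_paths=['/backups/diff/index.gz',
|      |                               '/backups/full/index.gz'])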
| 1585 | """ |
| 1586 | return self.restore_backup(target_path, |
| 1587 | backup_indexes_paths=backup_indexes_paths, |
| 1588 | disaster=tarfile.TOLERANCE_RECOVER, |
| 1589 | strict_validation=False) |
| 1590 | |
| 1591 | |
| 1592 | def rescue_backup(self, target_path, backup_tar_path, |
| 1593 | restore_callback=None): |
| 1594 | """ |
| 1595 | More aggressive “unfsck” mode: do not rely on the index data as the |
| 1596 | files may be corrupt; skim files for header-like information and |
| 1597 | attempt to retrieve the data. |
| 1598 | |
| 1599 | *Security considerations*: As with *recovery mode*, in *rescue mode* |
| 1600 | the headers of encrypted objects are assumed damaged and GCM tags are |
| 1601 | not validated so modification of cryptographically relevant parts of |
| 1602 | the header (more specifically, the initialization vectors) can no longer
| 1603 | be detected. If an attacker can manipulate the encrypted backup set and |
| 1604 | has access to the plaintext of some of the contents, they may be able |
| 1605 | to obtain the plaintext of other encrypted objects by injecting |
| 1606 | initialization vectors. For this reason *rescue mode* should only be |
| 1607 | used in emergency situations, and the contents of the resulting files
| 1608 | should be validated manually if possible and not be disclosed to |
| 1609 | untrusted parties. |
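|      | 
|      | A minimal usage sketch (hypothetical paths; further volumes are
|      | derived from the given tar path via volume_name_func):
|      | 
|      |     failed = dtar.rescue_backup('/srv/restore',
|      |                                 '/backups/full/backup.tar.gz')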
| 1610 | """ |
| 1611 | def gen_volume_name (nvol): |
| 1612 | return os.path.join (os.path.dirname (backup_tar_path), |
| 1613 | self.volume_name_func (backup_tar_path, |
| 1614 | True, |
| 1615 | nvol)) |
| 1616 | |
| 1617 | backup_index = tarfile.gen_rescue_index (gen_volume_name, |
| 1618 | self.mode, |
| 1619 | password=self.password, |
| 1620 | key=self.crypto_key) |
| 1621 | |
| 1622 | return self.restore_backup(target_path, |
| 1623 | backup_index=backup_index, |
| 1624 | backup_tar_path=backup_tar_path, |
| 1625 | disaster=tarfile.TOLERANCE_RESCUE, |
| 1626 | strict_validation=False) |
| 1627 | |
| 1628 | |
| 1629 | def _parse_json_line(self, f, l_no): |
| 1630 | ''' |
| 1631 | Read a line from a file-like object and parse it as JSON.
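|      | 
|      | Each line is expected to hold one UTF-8 encoded JSON object, e.g.
|      | (field names as used by the index format, values made up):
|      | 
|      |     {"type": "file", "path": "snapshot://./etc/hosts", "volume": 0}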
| 1632 | ''' |
| 1633 | l = f.readline() |
| 1634 | l_no += 1 |
| 1635 | try: |
| 1636 | j = json.loads(l.decode('UTF-8')) |
| 1637 | except UnicodeDecodeError as e: |
| 1638 | if tuple (l [0:2]) == tarfile.GZ_MAGIC: |
| 1639 | raise Exception \ |
| 1640 | ("error parsing line #%d as json: looks like a compressed file (%d B: [%s..])" |
| 1641 | % (l_no, len (l), binascii.hexlify (l [:16]).decode ())) \ |
| 1642 | from e |
| 1643 | raise Exception \ |
| 1644 | ("error parsing line #%d as json: not a text file (%d B: [%s..])" |
| 1645 | % (l_no, len (l), binascii.hexlify (l [:16]).decode ())) \ |
| 1646 | from e |
| 1647 | except ValueError as e:
| 1648 | raise Exception("error parsing this json line "
| 1649 | "(line number %d): %s" % (l_no, l)) from e
| 1650 | return j, l_no |
| 1651 | |
| 1652 | |
| 1653 | class RestoreHelper(object): |
| 1654 | ''' |
| 1655 | Class used to help to restore files from indices |
| 1656 | ''' |
| 1657 | |
| 1658 | # holds the dicts of data |
| 1659 | _data = [] |
| 1660 | |
| 1661 | _deltatar = None |
| 1662 | |
| 1663 | _cwd = None |
| 1664 | |
| 1665 | # list of directories to be restored. This is done as a last step, see |
| 1666 | # tarfile.extractall for details. |
| 1667 | _directories = [] |
| 1668 | |
| 1669 | _disaster = tarfile.TOLERANCE_STRICT |
| 1670 | |
| 1671 | def __init__(self, deltatar, cwd, index_list=None, backup_path=False, |
| 1672 | backup_index=None, tarobj=None, |
| 1673 | disaster=tarfile.TOLERANCE_STRICT): |
| 1674 | ''' |
| 1675 | The constructor opens the tars and initializes the data structures.
| 1676 | |
| 1677 | Assumptions: |
| 1678 | |
| 1679 | - Index list must be provided in reverse order (newer first). |
| 1680 | - “newer first” apparently means that if there are n backups |
| 1681 | provided, the last full backup is at index n-1 and the most recent |
| 1682 | diff backup is at index 0. |
| 1683 | - Only the first, the second, and the last elements of |
| 1684 | ``index_list`` are relevant, others will not be accessed. |
| 1685 | - If no ``index_list`` is provided, both ``tarobj`` and |
| 1686 | ``backup_path`` must be passed. |
| 1687 | - If ``index_list`` is provided, the values of ``tarobj`` and |
| 1688 | ``backup_path`` are ignored. |
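|      | 
|      | Example ``index_list`` ordering under these assumptions
|      | (illustrative paths):
|      | 
|      |     index_list = ['/backups/diff/index.gz',  # newest diff first
|      |                   '/backups/full/index.gz']  # last full backup last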
| 1689 | ''' |
| 1690 | self._data = [] |
| 1691 | self._directories = [] |
| 1692 | self._deltatar = deltatar |
| 1693 | self._cwd = cwd |
| 1694 | self._password = deltatar.password |
| 1695 | self._crypto_key = deltatar.crypto_key |
| 1696 | self._decryptors = [] |
| 1697 | self._disaster = disaster |
| 1698 | |
| 1699 | # Disable strict checking for linearly increasing IVs when running |
| 1700 | # in rescue or recover mode. |
| 1701 | strict_validation = disaster == tarfile.TOLERANCE_STRICT |
| 1702 | |
| 1703 | try: |
| 1704 | import grp, pwd |
| 1705 | except ImportError: |
| 1706 | grp = pwd = None |
| 1707 | |
| 1708 | if pwd and hasattr(os, "geteuid") and os.geteuid() == 0: |
| 1709 | self.canchown = True |
| 1710 | else: |
| 1711 | self.canchown = False |
| 1712 | |
| 1713 | if isinstance (backup_index, list):
| 1714 | decryptor = self._deltatar.decryptor |
| 1715 | self._data = \ |
| 1716 | [{ "curr_vol_no" : None |
| 1717 | , "vol_fd" : None |
| 1718 | , "offset" : -1 |
| 1719 | , "tarobj" : None |
| 1720 | , "path" : backup_path |
| 1721 | , "is_full" : True |
| 1722 | , "iterator" : None |
| 1723 | , "last_itelement" : None |
| 1724 | , "last_lno" : 0 |
| 1725 | , "new_volume_handler" : |
| 1726 | partial(self.new_volume_handler, |
| 1727 | self._deltatar, self._cwd, True, |
| 1728 | os.path.dirname(backup_path), decryptor) |
| 1729 | , "decryptor" : decryptor |
| 1730 | }] |
| 1731 | elif index_list is not None: |
| 1732 | for index in index_list: |
| 1733 | is_full = index == index_list[-1] |
| 1734 | |
| 1735 | decryptor = None |
| 1736 | if self._password is not None: |
| 1737 | decryptor = crypto.Decrypt (password=self._password, |
| 1738 | key=self._crypto_key, |
| 1739 | strict_ivs=strict_validation) |
| 1740 | |
| 1741 | # make paths absolute to avoid cwd problems |
| 1742 | if not os.path.isabs(index): |
| 1743 | index = os.path.normpath(os.path.join(cwd, index)) |
| 1744 | |
| 1745 | s = dict( |
| 1746 | curr_vol_no = None, |
| 1747 | vol_fd = None, |
| 1748 | offset = -1, |
| 1749 | tarobj = None, |
| 1750 | path = index, |
| 1751 | is_full = is_full, |
| 1752 | iterator = None, |
| 1753 | last_itelement = None, |
| 1754 | last_lno = 0, |
| 1755 | new_volume_handler = partial(self.new_volume_handler, |
| 1756 | self._deltatar, self._cwd, is_full, |
| 1757 | os.path.dirname(index), decryptor), |
| 1758 | decryptor = decryptor |
| 1759 | ) |
| 1760 | self._data.append(s) |
| 1761 | else: |
| 1762 | # make paths absolute to avoid cwd problems |
| 1763 | if not os.path.isabs(backup_path): |
| 1764 | backup_path = os.path.normpath(os.path.join(cwd, backup_path)) |
| 1765 | |
| 1766 | # update the new_volume_handler of tar_obj |
| 1767 | tarobj.new_volume_handler = partial(self.new_volume_handler, |
| 1768 | self._deltatar, self._cwd, True, os.path.dirname(backup_path), |
| 1769 | self._deltatar.decryptor) |
| 1770 | s = dict( |
| 1771 | curr_vol_no = None, |
| 1772 | vol_fd = None, |
| 1773 | offset = -1, |
| 1774 | tarobj = tarobj, |
| 1775 | path = backup_path, |
| 1776 | is_full = True, |
| 1777 | iterator = None, |
| 1778 | last_itelement = None, |
| 1779 | last_lno = 0, |
| 1780 | new_volume_handler = tarobj.new_volume_handler, |
| 1781 | decryptor = self._deltatar.decryptor |
| 1782 | ) |
| 1783 | self._data.append(s) |
| 1784 | |
| 1785 | |
| 1786 | def cleanup(self): |
| 1787 | ''' |
| 1788 | Closes all open files |
| 1789 | ''' |
| 1790 | for data in self._data: |
| 1791 | if data['vol_fd']: |
| 1792 | data['vol_fd'].close() |
| 1793 | data['vol_fd'] = None |
| 1794 | if data['tarobj']: |
| 1795 | data['tarobj'].close() |
| 1796 | data['tarobj'] = None |
| 1797 | |
| 1798 | def delete(self, path): |
| 1799 | ''' |
| 1800 | Delete a file |
| 1801 | ''' |
| 1802 | if not os.path.exists(path): |
| 1803 | return |
| 1804 | |
| 1805 | # to preserve parent directory mtime, we save it |
| 1806 | parent_dir = os.path.dirname(path) or os.getcwd() |
| 1807 | parent_dir_mtime = int(os.stat(parent_dir).st_mtime) |
| 1808 | |
| 1809 | if os.path.isdir(path) and not os.path.islink(path): |
| 1810 | shutil.rmtree(path) |
| 1811 | else: |
| 1812 | os.unlink(path) |
| 1813 | |
| 1814 | # now we restore parent_directory mtime |
| 1815 | os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime)) |
| 1816 | |
| 1817 | def restore(self, itpath, l_no, callback=None): |
| 1818 | ''' |
| 1819 | Restore the path from the appropriate backup. Receives the current path |
| 1820 | from the newest (=first) index iterator. itpath must not be None.
| 1821 | callback is a custom function that gets called for every file. |
| 1822 | |
| 1823 | NB: This function takes the attribute ``_data`` as input but will only |
| 1824 | ever use its first and, if available, second element. Anything else in |
| 1825 | ``._data[]`` will be ignored. |
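|      | 
|      | ``itpath`` is a decoded index entry, e.g. (values illustrative only):
|      | 
|      |     {'type': 'file', 'path': 'snapshot://./etc/hosts',
|      |      'volume': 0, 'offset': 1536}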
| 1826 | ''' |
| 1827 | path = itpath['path'] |
| 1828 | |
| 1829 | # Calls the callback function |
| 1830 | if callback: |
| 1831 | callback() |
| 1832 | |
| 1833 | if path.startswith('delete://'): |
| 1834 | # the file has already been deleted by restore_backup in all
| 1835 | # cases, so there is nothing left to do here
| 1836 | return |
| 1837 | |
| 1838 | # get data from newest index (_data[0]) |
| 1839 | data = self._data[0] |
| 1840 | upath = self._deltatar.unprefixed(path) |
| 1841 | |
| 1842 | # to preserve parent directory mtime, we save it |
| 1843 | parent_dir = os.path.dirname(upath) or os.getcwd() |
| 1844 | os.makedirs(parent_dir, exist_ok=True) |
| 1845 | parent_dir_mtime = int(os.stat(parent_dir).st_mtime) |
| 1846 | |
| 1847 | # if the path is recorded in the newest index as a snapshot, deal with
| 1848 | # it and finish
| 1849 | if path.startswith('snapshot://'): |
| 1850 | self.restore_file(itpath, data, path, l_no, upath) |
| 1851 | |
| 1852 | # now we restore parent_directory mtime |
| 1853 | os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime)) |
| 1854 | return |
| 1855 | |
| 1856 | # we go from index to index, finding the path in the index, then finding |
| 1857 | # the index with the most recent snapshot of the file being restored |
| 1858 | # |
| 1859 | # Right now we only support diff backups, not incremental backups.
| 1860 | # As a result, _data[0] is always the diff backup index
| 1861 | # and _data[1] the full backup index.
| 1862 | if len(self._data) == 2: |
| 1863 | data = self._data[1] |
| 1864 | d, l_no, dpath = self.find_path_in_index(data, upath) |
| 1865 | if not d: |
| 1866 | self._deltatar.logger.warning('Error restoring file %s from ' |
| 1867 | 'index, not found in index %s' % (path, data['path'])) |
| 1868 | return |
| 1869 | |
| 1870 | cur_path = d.get('path', '') |
| 1871 | if cur_path.startswith('delete://'): |
| 1872 | self._deltatar.logger.warning(('Strange thing happened: file '
| 1873 | '%s was listed in the first index but deleted by another '
| 1874 | 'one. The path was ignored and left untouched.') % path)
| 1875 | return |
| 1876 | elif cur_path.startswith('snapshot://'): |
| 1877 | # this code path is reached when the file is unchanged |
| 1878 | # in the newest index and therefore of type 'list://' |
| 1879 | self.restore_file(d, data, path, l_no, dpath) |
| 1880 | |
| 1881 | # now we restore parent_directory mtime |
| 1882 | os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime)) |
| 1883 | return |
| 1884 | |
| 1885 | # error code path is reached when: |
| 1886 | # a) we have more than two indexes (unsupported atm) |
| 1887 | # b) both indexes contain a list:// entry (logic error) |
| 1888 | # c) we have just one index and it also contains list:// |
| 1889 | self._deltatar.logger.warning(('Error restoring file %s from index, ' |
| 1890 | 'snapshot not found in any index') % path) |
| 1891 | |
| 1892 | def find_path_in_index(self, data, upath):
| 1893 | '''
| 1894 | Search the index described by ``data`` for the unprefixed ``upath``;
| 1895 | return (entry, line_no, index_path), or (None, 0, '') if not found.
|      | 
|      | NOTE: we sometimes restart the iterator because it can be walked over
|      | completely multiple times, for example when a path is not found in
|      | one index and we have to go on to the next index.
|      | '''
| 1896 | it = data['iterator'] |
| 1897 | if it is None: |
| 1898 | it = data['iterator'] = self._deltatar.iterate_index_path(data["path"]) |
| 1899 | d, l_no = it.__next__() |
| 1900 | else: |
| 1901 | d = data['last_itelement'] |
| 1902 | l_no = data['last_lno'] |
| 1903 | |
| 1904 | while True: |
| 1905 | dpath = self._deltatar.unprefixed(d.get('path', '')) |
| 1906 | if upath == dpath: |
| 1907 | data['last_itelement'] = d |
| 1908 | data['last_lno'] = l_no |
| 1909 | return d, l_no, dpath |
| 1910 | |
| 1911 | up, dp = self._deltatar.compare_indexes(upath, dpath) |
| 1912 | # any time upath should have appeared before current dpath, it means |
| 1913 | # upath is just not in this index and we should stop |
| 1914 | if dp is None: |
| 1915 | data['last_itelement'] = d |
| 1916 | data['last_lno'] = l_no |
| 1917 | return None, 0, '' |
| 1918 | |
| 1919 | try: |
| 1920 | d, l_no = it.__next__() |
| 1921 | except StopIteration: |
| 1922 | data['last_itelement'] = d |
| 1923 | data['last_lno'] = l_no |
| 1924 | return None, 0, '' |
| 1925 | |
| 1926 | def restore_directories_permissions(self): |
| 1927 | ''' |
| 1928 | Restore directory permissions once everything else has been restored
| 1929 | ''' |
| 1930 | try: |
| 1931 | import grp, pwd |
| 1932 | except ImportError: |
| 1933 | grp = pwd = None |
| 1934 | |
| 1935 | self._directories.sort(key=operator.attrgetter('name')) |
| 1936 | self._directories.reverse() |
| 1937 | |
| 1938 | # Set correct owner, mtime and filemode on directories. |
| 1939 | for member in self._directories: |
| 1940 | dirpath = member.name |
| 1941 | try: |
| 1942 | os.chmod(dirpath, member.mode) |
| 1943 | os.utime(dirpath, (member.mtime, member.mtime)) |
| 1944 | if self.canchown: |
| 1945 | # We have to be root to do so. |
| 1946 | try: |
| 1947 | g = grp.getgrnam(member.gname)[2] |
| 1948 | except KeyError: |
| 1949 | g = member.gid |
| 1950 | try: |
| 1951 | u = pwd.getpwnam(member.uname)[2] |
| 1952 | except KeyError: |
| 1953 | u = member.uid |
| 1954 | try: |
| 1955 | if member.issym and hasattr(os, "lchown"): |
| 1956 | os.lchown(dirpath, u, g) |
| 1957 | else: |
| 1958 | os.chown(dirpath, u, g) |
| 1959 | except EnvironmentError: |
| 1960 | raise tarfile.ExtractError("could not change owner") |
| 1961 | |
| 1962 | except tarfile.ExtractError as e: |
| 1963 | self._deltatar.logger.warning('tarfile: %s' % e) |
| 1964 | |
| 1965 | @staticmethod |
| 1966 | def new_volume_handler(deltarobj, cwd, is_full, backup_path, decryptor, tarobj, base_name, volume_number): |
| 1967 | ''' |
| 1968 | Set up a new volume and perform the tasks necessary for transitioning |
| 1969 | to the next one. |
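|      | 
|      | A version with the first five parameters bound is created in
|      | ``RestoreHelper.__init__``, for instance:
|      | 
|      |     handler = partial(RestoreHelper.new_volume_handler,
|      |                       deltatar, cwd, is_full,
|      |                       os.path.dirname(index), decryptor)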
| 1970 | ''' |
| 1971 | volume_name = deltarobj.volume_name_func(backup_path, is_full, |
| 1972 | volume_number, guess_name=True) |
| 1973 | volume_path = os.path.join(backup_path, volume_name) |
| 1974 | |
| 1975 | # we convert relative paths into absolute because CWD is changed |
| 1976 | if not os.path.isabs(volume_path): |
| 1977 | volume_path = os.path.join(cwd, volume_path) |
| 1978 | |
| 1979 | tarobj.open_volume(volume_path, encryption=decryptor) |
| 1980 | |
| 1981 | def restore_file(self, file_data, index_data, path, l_no, unprefixed_path): |
| 1982 | ''' |
| 1983 | Restores a snapshot of a file from a specific backup |
| 1984 | ''' |
| 1985 | op_type = file_data.get('type', -1) |
| 1986 | member = file_data.get('member', None) |
| 1987 | ismember = bool(member) |
| 1988 | |
| 1989 | # when member is set, we can assume everything is right and we just
| 1990 | # have to restore the path
| 1991 | if member is None: |
| 1992 | vol_no = file_data.get('volume', -1) |
| 1993 | # sanity check |
| 1994 | if not isinstance(vol_no, int) or vol_no < 0: |
| 1995 | self._deltatar.logger.warning('unrecognized type to be restored: ' |
| 1996 | '%s, line %d' % (op_type, l_no)) |
| 1997 | |
| 1998 | # setup the volume that needs to be read. only needed when member is |
| 1999 | # not set |
| 2000 | if index_data['curr_vol_no'] != vol_no: |
| 2001 | index_data['curr_vol_no'] = vol_no |
| 2002 | backup_path = os.path.dirname(index_data['path']) |
| 2003 | vol_name = self._deltatar.volume_name_func(backup_path, |
| 2004 | index_data['is_full'], vol_no, guess_name=True) |
| 2005 | vol_path = os.path.join(backup_path, vol_name) |
| 2006 | if index_data['vol_fd']: |
| 2007 | index_data['vol_fd'].close() |
| 2008 | index_data['vol_fd'] = open(vol_path, 'rb') |
| 2009 | |
| 2010 | # force reopen of the tarobj because of new volume |
| 2011 | if index_data['tarobj']: |
| 2012 | index_data['tarobj'].close() |
| 2013 | index_data['tarobj'] = None |
| 2014 | |
| 2015 | # seek tarfile if needed |
| 2016 | offset = file_data.get('offset', -1) |
| 2017 | if index_data['tarobj']: |
| 2018 | if self._disaster == tarfile.TOLERANCE_RESCUE: |
| 2019 | # force a seek and reopen |
| 2020 | index_data['tarobj'].close() |
| 2021 | index_data['tarobj'] = None |
| 2022 | else: |
| 2023 | try: |
| 2024 | member = index_data['tarobj'].__iter__().__next__() |
| 2025 | except tarfile.DecryptionError: |
| 2026 | pass |
| 2027 | except tarfile.CompressionError: |
| 2028 | pass |
| 2029 | |
| 2030 | if not member or member.path != file_data['path']: |
| 2031 | # force a seek and reopen |
| 2032 | index_data['tarobj'].close() |
| 2033 | index_data['tarobj'] = None |
| 2034 | |
| 2035 | |
| 2036 | # open the tarfile if needed |
| 2037 | if not index_data['tarobj']: |
| 2038 | index_data['vol_fd'].seek(offset) |
| 2039 | index_data['tarobj'] = tarfile.open(mode="r" + self._deltatar.mode, |
| 2040 | fileobj=index_data['vol_fd'], |
| 2041 | format=tarfile.GNU_FORMAT, |
| 2042 | concat='#' in self._deltatar.mode, |
| 2043 | encryption=index_data["decryptor"], |
| 2044 | new_volume_handler=index_data['new_volume_handler'], |
| 2045 | save_to_members=False, |
| 2046 | tolerance=self._disaster) |
| 2047 | |
| 2048 | member = index_data['tarobj'].__iter__().__next__() |
| 2049 | |
| 2050 | member.path = unprefixed_path |
| 2051 | member.name = unprefixed_path |
| 2052 | |
| 2053 | if op_type == 'directory': |
| 2054 | self.add_member_dir(member) |
| 2055 | member = copy.copy(member) |
| 2056 | member.mode = 0o0700 |
| 2057 | |
| 2058 | # if it's an existing directory we don't need to recreate it; the
| 2059 | # right permissions, mtime etc. are restored later (see add_member_dir)
| 2060 | if os.path.exists(member.path): |
| 2061 | return |
| 2062 | |
| 2063 | if not ismember: |
| 2064 | # set current volume number in tarobj, otherwise the extraction of the |
| 2065 | # file might fail when trying to extract a multivolume member |
| 2066 | index_data['tarobj'].volume_number = index_data['curr_vol_no'] |
| 2067 | |
| 2068 | def ignore_symlink (member, *_args): |
| 2069 | self._deltatar.logger.warning("Ignoring symlink %s" % member.name) |
| 2070 | |
| 2071 | # finally, restore the file |
| 2072 | index_data['tarobj'].extract(member, symlink_cb=ignore_symlink, |
| 2073 | unlink=True) |
| 2074 | |
| 2075 | def add_member_dir(self, member): |
| 2076 | ''' |
| 2077 | Add member dir to be restored at the end |
| 2078 | ''' |
| 2079 | if not self.canchown: |
| 2080 | self._directories.append(DirItem(name=member.name, mode=member.mode, |
| 2081 | mtime=member.mtime)) |
| 2082 | else: |
| 2083 | self._directories.append(DirItem(name=member.name, mode=member.mode, |
| 2084 | mtime=member.mtime, gname=member.gname, uname=member.uname, |
| 2085 | uid=member.uid, gid=member.gid, issym=member.issym())) |
| 2086 | |
| 2087 | class DirItem(object):
|      | '''
|      | Simple attribute container for a directory whose metadata (mode,
|      | mtime, ownership) is restored after everything else.
|      | '''
| 2088 | def __init__(self, **kwargs):
| 2089 | for k, v in kwargs.items():
| 2090 | setattr(self, k, v)