#!/usr/bin/env python3 # Copyright (C) 2013, 2014 Intra2net AG # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as published # by the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see # DELTATAR_HEADER_VERSION = 1 DELTATAR_PARAMETER_VERSION = 1 import logging import datetime import binascii import io import operator import os import copy import shutil import re import stat import json import typing from functools import partial from . import tarfile from . import crypto class NullHandler(logging.Handler): def emit(self, record): pass logging.getLogger("deltatar.DeltaTar").addHandler(NullHandler()) # match mode NO_MATCH = False MATCH = True PARENT_MATCH = 2 # encryption direction CRYPTO_MODE_ENCRYPT = 0 CRYPTO_MODE_DECRYPT = 1 # The canonical extension for encrypted backup files regardless of the actual # encryption parameters is “.pdtcrypt”. This is analogous to the encryption # header which starts with the eight ASCII bytes “PDTCRYPT”. Historical note: # Since the introduction of the versioned header there is no longer any need # for encoding encryption parameters in the file extensions (“.aes128” and # suchlike). PDTCRYPT_EXTENSION = "pdtcrypt" PDT_TYPE_ARCHIVE = 0 PDT_TYPE_AUX = 1 AUXILIARY_FILE_INDEX = 0 AUXILIARY_FILE_INFO = 1 class DeltaTar(object): ''' Backup class used to create backups ''' # list of files to exclude in the backup creation or restore operation. It # can contain python regular expressions. excluded_files = [] # list of files to include in the backup creation or restore operation. It # can contain python regular expressions. If empty, all files in the source # path will be backed up (when creating a backup) or all the files in the # backup will be restored (when restoring a backup), but if included_files # is set then only the files included in the list will be processed. included_files = [] # custom filter of files to be backed up (or restored). Unused and unset # by default. The function receives a file path and must return a boolean. filter_func = None # mode in which the delta will be created (when creating a backup) or # opened (when restoring). Accepts modes analogous to the tarfile library. mode = "" # used together with aes modes to encrypt and decrypt backups. password = None crypto_key = None nacl = None # parameter version to use when encrypting; note that this has no effect # on decryption since the required settings are determined from the headers crypto_version = DELTATAR_HEADER_VERSION crypto_paramversion = None # when encrypting or decrypting, these hold crypto handlers; created before # establishing the Tarfile stream iff a password is supplied. encryptor = None decryptor = None # python logger object. logger = None # specifies the index mode in the same format as @param mode, but without # the ':', '|' or '#' at the beginning. It doesn't make sense to specify # that the index is encrypted if no password is given in the constructor. index_mode = None # current time for this backup.
Used for file names and file creation checks current_time = None # extra data to be included in the header of the index file when creating a # backup extra_data = dict() # valid tarfile modes and their corresponding default file extension __file_extensions_dict = { '': '', ':': '', ':gz': '.gz', ':bz2': '.bz2', '|': '', '|gz': '.gz', '|bz2': '.bz2', '#gz': '.gz', '#gz.pdtcrypt': '.gz', '#pdtcrypt': '', '#': '', } # valid index modes and their corresponding default file extension __index_extensions_dict = { '': '', 'gz': '.gz', 'bz2': '.bz2', 'gz.pdtcrypt': '.gz', 'pdtcrypt': '', } # valid path prefixes __path_prefix_list = [ u'snapshot://', u'list://', u'delete://' ] def __init__(self, excluded_files=[], included_files=[], filter_func=None, mode="", password=None, crypto_key=None, nacl=None, crypto_version=DELTATAR_HEADER_VERSION, crypto_paramversion=DELTATAR_PARAMETER_VERSION, logger=None, index_mode=None, index_name_func=None, volume_name_func=None): ''' Constructor. Configures the diff engine. Parameters: - excluded_files: list of files to exclude in the backup creation or restore operation. It can contain python regular expressions. - included_files: list of files to include in the backup creation or restore operation. It can contain python regular expressions. If empty, all files in the source path will be backed up (when creating a backup) or all the files in the backup will be restored (when restoring a backup), but if included_files is set then only the files included in the list will be processed. - filter_func: custom filter of files to be backed up (or restored). Unused and unset by default. The function receives a file path and must return a boolean. - mode: mode in which the delta will be created (when creating a backup) or opened (when restoring). Accepts the same modes as the tarfile library. Valid modes are: '' open uncompressed ':' open uncompressed ':gz' open with gzip compression ':bz2' open with bzip2 compression '|' open an uncompressed stream of tar blocks '|gz' open a gzip compressed stream of tar blocks '|bz2' open a bzip2 compressed stream of tar blocks '#gz' open a stream of gzip compressed tar blocks - crypto_key: used to encrypt and decrypt backups. Encryption will be enabled automatically if a key is supplied. Requires a salt to be passed as well. - nacl: salt that was used to derive the encryption key for embedding in the PDTCRYPT header. Not needed when decrypting and when encrypting with password. - password: used to encrypt and decrypt backups. Encryption will be enabled automatically if a password is supplied. - crypto_version: version of the format, determining the kind of PDT object header. - crypto_paramversion: optionally request encryption conforming to a specific parameter version. Defaults to the standard PDT value which as of 2017 is the only one available. - logger: python logger object. Optional. - index_mode: specifies the index mode in the same format as @param mode, but without the ':', '|' or '#' at the beginning. If encryption is requested it will extend to the auxiliary (index, info) files as well. This is an optional parameter that will automatically mimic @param mode by default if not provided. Valid modes are: '' open uncompressed 'gz' open with gzip compression 'bz2' open with bzip2 compression - index_name_func: function that sets a custom name for the index file. This function receives a flag to indicate whether the name will be used for a full or diff backup. The backup path will be prepended to its return value.
- volume_name_func: function that defines the name of tar volumes. It receives the backup_path, if it's a full backup and the volume number, and must return the name for the corresponding volume name. Optional, DeltaTar has default names for tar volumes. ''' if mode not in self.__file_extensions_dict: raise Exception('Unrecognized extension mode=[%s] requested for files' % str(mode)) self.excluded_files = excluded_files self.included_files = included_files self.filter_func = filter_func self.logger = logging.getLogger('deltatar.DeltaTar') if logger: self.logger.addHandler(logger) self.mode = mode if crypto_key is not None: self.crypto_key = crypto_key self.nacl = nacl # encryption only if password is not None: self.password = password if crypto_version is not None: self.crypto_version = crypto_version if crypto_paramversion is not None: self.crypto_paramversion = crypto_paramversion # generate index_mode if index_mode is None: index_mode = '' if 'gz' in mode: index_mode = "gz" elif 'bz2' in mode: index_mode = "bz2" elif mode not in self.__index_extensions_dict: raise Exception('Unrecognized extension mode=[%s] requested for index' % str(mode)) self.index_mode = index_mode self.current_time = datetime.datetime.now() if index_name_func is not None: self.index_name_func = index_name_func if volume_name_func is not None: self.volume_name_func = volume_name_func def pick_extension(self, kind, mode=None): """ Choose the extension depending on a) the kind of file given, b) the processing mode, and c) the current encryption settings. """ ret = "" if kind == PDT_TYPE_ARCHIVE: ret += ".tar" if mode is None: mode = self.__index_extensions_dict [self.index_mode] ret += mode if self.crypto_key is not None or self.password is not None: ret += "." + PDTCRYPT_EXTENSION return ret def index_name_func(self, is_full): # pylint: disable=method-hidden ''' Callback for setting a custom name for the index file. Depending on whether *is_full* is set, it will create a suitable name for a full or a diff backup. ''' prefix = "bfull" if is_full else "bdiff" date_str = self.current_time.strftime("%Y-%m-%d-%H%M") extension = self.pick_extension \ (PDT_TYPE_AUX, self.__index_extensions_dict [self.index_mode]) return "%s-%s.index%s" % (prefix, date_str, extension) def volume_name_func(self, backup_path, # pylint: disable=method-hidden is_full, volume_number, guess_name=False): ''' function that defines the name of tar volumes. It receives the backup_path, if it's a full backup and the volume number, and must return the name for the corresponding volume name. Optional, DeltaTar has default names for tar volumes. If guess_name is activated, the file is intended not to be created but to be found, and thus the date will be guessed. ''' prefix = "bfull" if is_full else "bdiff" extension = self.pick_extension \ (PDT_TYPE_ARCHIVE, self.__file_extensions_dict [self.mode]) if not guess_name: date_str = self.current_time.strftime("%Y-%m-%d-%H%M") return "%s-%s-%03d%s" % (prefix, date_str, volume_number + 1, extension) else: prefix = prefix + "-" postfix = "-%03d%s" % (volume_number + 1, extension) for f in os.listdir(backup_path): if f.startswith(prefix) and f.endswith(postfix): return f raise Exception("volume not found") def filter_path(self, path, source_path="", is_dir=None): ''' Filters a path, given the source_path, using the filtering properties set in the constructor. The filtering order is: 1. included_files (if any) 2. excluded_files 3. 
filter_func (which must return whether the file is accepted or not) ''' if len(source_path) > 0: # ensure that exactly one '/' at end of dir is also removed source_path = source_path.rstrip(os.sep) + os.sep path = path[len(source_path):] # 1. filter included_files match = MATCH if len(self.included_files) > 0: match = NO_MATCH for i in self.included_files: # it can be either a regexp or a string if isinstance(i, str): # if the string matches, then continue if i == path: match = MATCH break # if the string ends with / it's a directory, and if the # path is contained in it, it is included if i.endswith('/') and path.startswith(i): match = MATCH break # if the string doesn't end with /, add it and do the same # check elif path.startswith(i + '/'): match = MATCH break # check for PARENT_MATCH if is_dir: dir_path = path if not dir_path.endswith('/'): dir_path += '/' if i.startswith(dir_path): match = PARENT_MATCH # if it's a reg exp, then we just check if it matches elif isinstance(i, typing.Pattern): if i.match(path): match = MATCH break else: self.logger.warning('Invalid pattern in included_files: %s' % str(i)) if match == NO_MATCH: return NO_MATCH # when a directory is in PARENT_MATCH, it doesn't matter if it's # excluded. It's subfiles will be excluded, but the directory itself # won't if match != PARENT_MATCH: for e in self.excluded_files: # it can be either a regexp or a string if isinstance(e, str): # if the string matches, then exclude if e == path: return NO_MATCH # if the string ends with / it's a directory, and if the # path starts with the directory, then exclude if e.endswith('/') and path.startswith(e): return NO_MATCH # if the string doesn't end with /, do the same check with # the slash added elif path.startswith(e + '/'): return NO_MATCH # if it's a reg exp, then we just check if it matches elif isinstance(e, typing.Pattern): if e.match(path): return NO_MATCH else: self.logger.warning('Invalid pattern in excluded_files: %s' % str(e)) if self.filter_func: return self.filter_func(path) return match def _recursive_walk_dir(self, source_path, keep_base_dir=False): ''' Walk a directory recursively, yielding each file/directory Returns the path of an entity. If ``keep_base_dir`` is set, the path returned contains the prefix ``source_path``; otherwise it is relative to the prefix. 
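A minimal usage sketch (illustrative only; the directory shown is a hypothetical example):

    for relpath in self._recursive_walk_dir("/srv/data"):
        print(relpath)  # e.g. "etc/example.conf", relative to /srv/data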
''' source_path = source_path.rstrip(os.sep) if keep_base_dir: beginning_size = 0 else: beginning_size = len(source_path) + 1 # +1 for os.sep queue = [source_path] while queue: cur_path = queue.pop(0) try: dfd = os.open (cur_path, os.O_DIRECTORY) except FileNotFoundError as exn: self.logger.warning ("failed to open entity [%s] as directory; " "file system (error: %s); skipping" % (cur_path, str (exn))) continue try: for filename in sorted(os.listdir(dfd)): child = os.path.join(cur_path, filename) is_dir = os.path.isdir(child) status = self.filter_path(child, source_path, is_dir) if status == NO_MATCH: continue if not os.access(child, os.R_OK): self.logger.warning('Error accessing possibly locked file %s' % child) continue if status == MATCH: yield child[beginning_size:] if is_dir and (status == MATCH or status == PARENT_MATCH): queue.append(child) finally: os.close (dfd) def _stat_dict(self, path): ''' Returns a dict with the stat data used to compare files ''' stinfo = os.stat(path) mode = stinfo.st_mode ptype = None if stat.S_ISDIR(mode): ptype = u'directory' elif stat.S_ISREG(mode): ptype = u'file' elif stat.S_ISLNK(mode): ptype = u'link' return { u'type': ptype, u'path': path, u'mode': mode, u'mtime': int(stinfo.st_mtime), u'ctime': int(stinfo.st_ctime), u'uid': stinfo.st_uid, u'gid': stinfo.st_gid, u'inode': stinfo.st_ino, u'size': stinfo.st_size } def _equal_stat_dicts(self, d1, d2, listsnapshot_equal=False): ''' Return if the dicts are equal in the stat keys ''' keys = [u'type', u'mode',u'size', u'mtime', # not restored: u'inode', u'ctime' ] # only if user is root, then also check gid/uid. otherwise do not check it, # because tarfile can chown in case of being superuser only # # also, skip the check in rpmbuild since the sources end up with the # uid:gid of the packager while the extracted files are 0:0. if hasattr(os, "geteuid") and os.geteuid() == 0 \ and os.getenv ("RPMBUILD_OPTIONS") is None: keys.append('gid') keys.append('uid') if (not d1 and d2 != None) or (d1 != None and not d2): return False if self.prefixed(d1.get('path', -1), listsnapshot_equal) != self.prefixed(d2.get('path', -2), listsnapshot_equal): return False type = d1.get('type', '') for key in keys: # size doesn't matter for directories if type == 'directory' and key == 'size': continue if d1.get(key, -1) != d2.get(key, -2): return False return True def prefixed(self, path, listsnapshot_equal=False): ''' if a path is not prefixed, return it prefixed ''' for prefix in self.__path_prefix_list: if path.startswith(prefix): if listsnapshot_equal and prefix == u'list://': return u'snapshot://' + path[len(prefix):] return path return u'snapshot://' + path def unprefixed(self, path): ''' remove a path prefix if any ''' for prefix in self.__path_prefix_list: if path.startswith(prefix): return path[len(prefix):] return path def initialize_encryption (self, mode, strict_validation=True): """ :type strict_validation: bool :param strict_validation: Enable strict IV checking in the crypto layer. Should be disabled when dealing with potentially corrupted data. 
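Returns ``None`` when neither a crypto key nor a password is configured; otherwise a ``crypto.Encrypt`` or ``crypto.Decrypt`` context is returned depending on *mode*. A minimal sketch of both directions:

    encryptor = self.initialize_encryption (CRYPTO_MODE_ENCRYPT)
    decryptor = self.initialize_encryption (CRYPTO_MODE_DECRYPT, strict_validation=False)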
""" password = self.password key = self.crypto_key nacl = self.nacl if key is None and password is None: return if mode == CRYPTO_MODE_ENCRYPT: return crypto.Encrypt (password=password, key=key, nacl=nacl, version=self.crypto_version, paramversion=self.crypto_paramversion) if mode == CRYPTO_MODE_DECRYPT: return crypto.Decrypt (password=password, key=key, strict_ivs=strict_validation) raise Exception ("invalid encryption mode [%r]" % mode) def open_auxiliary_file(self, path, mode='r', kind=AUXILIARY_FILE_INDEX, strict_validation=True): ''' Given the specified configuration, opens a file for reading or writing, inheriting the encryption and compression settings from the backup. Returns a file object ready to use. :param mode: IO mode (read or write, ``"r"`` and ``"w"``, respectively). :type mode: str :param kind: Role of the file, see AUXILIARY_FILE_* constants. Both the info and the auxiliary file have a globally unique, constant counter value. :type kind: str ''' if self.index_mode.startswith('gz'): comptype = 'gz' elif self.index_mode.startswith('bz2'): comptype = 'bz2' else: comptype = 'tar' crypto_ctx = None enccounter = None if mode == "w": crypto_ctx = self.initialize_encryption (CRYPTO_MODE_ENCRYPT) elif mode == "r": crypto_ctx = self.initialize_encryption (CRYPTO_MODE_DECRYPT, strict_validation=strict_validation) if crypto_ctx is not None: if kind == AUXILIARY_FILE_INFO: enccounter = crypto.AES_GCM_IV_CNT_INFOFILE elif kind == AUXILIARY_FILE_INDEX: enccounter = crypto.AES_GCM_IV_CNT_INDEX else: raise Exception ("invalid kind of aux file %r" % kind) sink = tarfile._Stream(name=path, mode=mode, comptype=comptype, bufsize=tarfile.RECORDSIZE, fileobj=None, encryption=crypto_ctx, enccounter=enccounter) return sink def create_full_backup(self, source_path, backup_path, max_volume_size=None, extra_data=dict()): ''' Creates a full backup. Parameters: - source_path: source path to the directory to back up. - backup_path: path where the back up will be stored. Backup path will be created if not existent. - max_volume_size: maximum volume size in megabytes. Used to split the backup in volumes. Optional (won't split in volumes by default). 
- extra_data: a json-serializable dictionary with information that you want to be included in the header of the index file ''' # check input if not isinstance(source_path, str): raise Exception('Source path must be a string') if not isinstance(backup_path, str): raise Exception('Backup path must be a string') if not os.path.exists(source_path) or not os.path.isdir(source_path): raise Exception('Source path "%s" does not exist or is not a '\ 'directory' % source_path) if max_volume_size != None and (not isinstance(max_volume_size, int) or\ max_volume_size < 1): raise Exception('max_volume_size must be a positive integer') if max_volume_size != None: max_volume_size = max_volume_size*1024*1024 if not isinstance(extra_data, dict): raise Exception('extra_data must be a dictionary') try: extra_data_str = json.dumps(extra_data) except: raise Exception('extra_data is not json-serializable') if not os.access(source_path, os.R_OK): raise Exception('Source path "%s" is not readable' % source_path) # try to create backup path if needed os.makedirs(backup_path, exist_ok=True) if not os.access(backup_path, os.W_OK): raise Exception('Backup path "%s" is not writeable' % backup_path) if source_path.endswith('/'): source_path = source_path[:-1] if backup_path.endswith('/'): backup_path = backup_path[:-1] # update current time self.current_time = datetime.datetime.now() if self.mode not in self.__file_extensions_dict: raise Exception('Unrecognized extension') # setup for encrypting payload if self.encryptor is None: self.encryptor = self.initialize_encryption (CRYPTO_MODE_ENCRYPT) # some initialization self.vol_no = 0 # generate the first volume name vol_name = self.volume_name_func(backup_path, True, 0) tarfile_path = os.path.join(backup_path, vol_name) # init index index_name = self.index_name_func(True) index_path = os.path.join(backup_path, index_name) index_sink = self.open_auxiliary_file(index_path, 'w') cwd = os.getcwd() def new_volume_handler(deltarobj, cwd, backup_path, encryption, tarobj, base_name, volume_number): ''' Handles the new volumes ''' volume_name = deltarobj.volume_name_func(backup_path, True, volume_number) volume_path = os.path.join(backup_path, volume_name) deltarobj.vol_no = volume_number # we convert relative paths into absolute because CWD is changed if not os.path.isabs(volume_path): volume_path = os.path.join(cwd, volume_path) if tarobj.fileobj is not None: tarobj.fileobj.close() deltarobj.logger.debug("opening volume %s" % volume_path) tarobj.open_volume(volume_path, encryption=encryption) # wraps some args from context into the handler new_volume_handler = partial(new_volume_handler, self, cwd, backup_path, self.encryptor) index_sink.write(bytes('{"type": "python-delta-tar-index", "version": 1, "backup-type": "full", "extra_data": %s}\n' % extra_data_str, 'UTF-8')) s = bytes('{"type": "BEGIN-FILE-LIST"}\n', 'UTF-8') # calculate checksum and write into the stream crc = binascii.crc32(s) & 0xFFFFffff index_sink.write(s) # start creating the tarfile tarobj = tarfile.TarFile.open(tarfile_path, mode='w' + self.mode, format=tarfile.GNU_FORMAT, concat='#' in self.mode, encryption=self.encryptor, max_volume_size=max_volume_size, new_volume_handler=new_volume_handler, save_to_members=False, dereference=True) os.chdir(source_path) # for each file to be in the backup, do: for path in self._recursive_walk_dir('.'): try: # backup file # calculate stat dict for current file statd = self._stat_dict(path) statd['path'] = u'snapshot://' + statd['path'] statd['volume'] = self.vol_no # backup 
file tarobj.add(path, arcname = statd['path'], recursive=False) except FileNotFoundError as exn: # file vanished since the call to access(3) above self.logger.warning ("object [%s] no longer available in " "file system (error: %s); skipping" % (path, str (exn))) continue # prevent indexing # retrieve file offset statd['offset'] = tarobj.get_last_member_offset() self.logger.debug("backup %s" % statd['path']) # store the stat dict in the index s = bytes(json.dumps(statd) + '\n', 'UTF-8') crc = binascii.crc32(s, crc) & 0xffffffff index_sink.write(s) s = bytes('{"type": "END-FILE-LIST"}\n', 'UTF-8') crc = binascii.crc32(s, crc) & 0xffffffff index_sink.write(s) s = bytes('{"type": "file-list-checksum", "checksum": %d}\n' % crc, 'UTF-8') index_sink.write(s) os.chdir(cwd) tarobj.close() index_sink.close (close_fileobj=True) def create_diff_backup(self, source_path, backup_path, previous_index_path, max_volume_size=None, extra_data=dict()): ''' Creates a backup. Parameters: - source_path: source path to the directory to back up. - backup_path: path where the back up will be stored. Backup path will be created if not existent. - previous_index_path: index of the previous backup, needed to know which files changed since then. - max_volume_size: maximum volume size in megabytes (MB). Used to split the backup in volumes. Optional (won't split in volumes by default). NOTE: previous index is assumed to follow exactly the same format as the index_mode setup in the constructor. ''' # check/sanitize input if not isinstance(source_path, str): raise Exception('Source path must be a string') if not isinstance(backup_path, str): raise Exception('Backup path must be a string') if not os.path.exists(source_path) or not os.path.isdir(source_path): raise Exception('Source path "%s" does not exist or is not a '\ 'directory' % source_path) if not isinstance(extra_data, dict): raise Exception('extra_data must be a dictionary') try: extra_data_str = json.dumps(extra_data) except: raise Exception('extra_data is not json-serializable') if not os.access(source_path, os.R_OK): raise Exception('Source path "%s" is not readable' % source_path) if max_volume_size != None and (not isinstance(max_volume_size, int) or\ max_volume_size < 1): raise Exception('max_volume_size must be a positive integer') if max_volume_size != None: max_volume_size = max_volume_size*1024*1024 if not isinstance(previous_index_path, str): raise Exception('previous_index_path must be A string') if not os.path.exists(previous_index_path) or not os.path.isfile(previous_index_path): raise Exception('Index path "%s" does not exist or is not a '\ 'file' % previous_index_path) if not os.access(previous_index_path, os.R_OK): raise Exception('Index path "%s" is not readable' % previous_index_path) # try to create backup path if needed os.makedirs(backup_path, exist_ok=True) if not os.access(backup_path, os.W_OK): raise Exception('Backup path "%s" is not writeable' % backup_path) if source_path.endswith('/'): source_path = source_path[:-1] if backup_path.endswith('/'): backup_path = backup_path[:-1] # update current time self.current_time = datetime.datetime.now() if self.mode not in self.__file_extensions_dict: raise Exception('Unrecognized extension') # setup for encrypting payload if self.encryptor is None: self.encryptor = self.initialize_encryption (CRYPTO_MODE_ENCRYPT) # some initialization self.vol_no = 0 # generate the first volume name vol_name = self.volume_name_func(backup_path, is_full=False, volume_number=0) tarfile_path = 
os.path.join(backup_path, vol_name) # init index cwd = os.getcwd() index_name = self.index_name_func(is_full=False) index_path = os.path.join(backup_path, index_name) index_sink = self.open_auxiliary_file(index_path, 'w') def new_volume_handler(deltarobj, cwd, backup_path, tarobj, base_name, volume_number): ''' Handles the new volumes ''' volume_name = deltarobj.volume_name_func(backup_path, is_full=False, volume_number=volume_number) volume_path = os.path.join(backup_path, volume_name) deltarobj.vol_no = volume_number # we convert relative paths into absolute because CWD is changed if not os.path.isabs(volume_path): volume_path = os.path.join(cwd, volume_path) deltarobj.logger.debug("opening volume %s" % volume_path) tarobj.open_volume(volume_path) # wraps some args from context into the handler new_volume_handler = partial(new_volume_handler, self, cwd, backup_path) index_sink.write(bytes('{"type": "python-delta-tar-index", "version": 1, "backup-type": "diff", "extra_data": %s}\n' % extra_data_str, 'UTF-8')) s = bytes('{"type": "BEGIN-FILE-LIST"}\n', 'UTF-8') # calculate checksum and write into the stream crc = binascii.crc32(s) & 0xFFFFffff index_sink.write(s) # start creating the tarfile tarobj = tarfile.TarFile.open(tarfile_path, mode='w' + self.mode, format=tarfile.GNU_FORMAT, concat='#' in self.mode, encryption=self.encryptor, max_volume_size=max_volume_size, new_volume_handler=new_volume_handler, save_to_members=False, dereference=True) # create the iterators, first the previous index iterator, then the # source path directory iterator and collate and iterate them if not os.path.isabs(previous_index_path): previous_index_path = os.path.join(cwd, previous_index_path) index_it = self.iterate_index_path(previous_index_path) os.chdir(source_path) dir_it = self._recursive_walk_dir('.') dir_path_it = self.jsonize_path_iterator(dir_it) def pr(path): if not path: return "None" else: return path["path"] # for each file to be in the backup, do: for ipath, dpath, l_no in self.collate_iterators(index_it, dir_path_it): action = None # if file is not in the index, it means it's a new file, so we have # to take a snapshot if not ipath: action = 'snapshot' # if the file is not in the directory iterator, it means that it has # been deleted, so we need to mark it as such elif not dpath: action = 'delete' # if the file is in both iterators, it means it might have either # not changed (in which case we will just list it in our index but # it will not be included in the tar file), or it might have # changed, in which case we will snapshot it. elif ipath and dpath: if self._equal_stat_dicts(ipath, dpath): action = 'list' else: action = 'snapshot' # TODO: when creating chained backups (i.e. 
diffing from another # diff), we will need to detect the type of action in the previous # index, because if it was delete and dpath is None, we should # discard the file if action == 'snapshot': # calculate stat dict for current file stat = dpath.copy() stat['path'] = "snapshot://" + dpath['path'] stat['volume'] = self.vol_no self.logger.debug("[STORE] %s" % dpath['path']) try: # backup file tarobj.add(dpath['path'], arcname=stat['path'], recursive=False) # retrieve file offset stat['offset'] = tarobj.get_last_member_offset() except FileNotFoundError as exn: # file vanished since the call to access(3) above self.logger.warning ("object [%s] no longer available in " "file system (error: %s); skipping" % (dpath ["path"], str (exn))) stat = None # prevent indexing elif action == 'delete': path = self.unprefixed(ipath['path']) stat = { u'path': u'delete://' + path, u'type': ipath['type'] } self.logger.debug("[DELETE] %s" % path) # mark it as deleted in the backup tarobj.add("/dev/null", arcname=stat['path']) elif action == 'list': stat = dpath.copy() path = self.unprefixed(ipath['path']) stat['path'] = u'list://' + path # unchanged files do not enter in the backup, only in the index self.logger.debug("[UNCHANGED] %s" % path) else: # should not happen self.logger.warning('unknown action in create_diff_backup: {0}' ''.format(action)) stat = None if stat: # store the stat dict in the index s = bytes(json.dumps(stat) + '\n', 'UTF-8') crc = binascii.crc32(s, crc) & 0xffffffff index_sink.write(s) s = bytes('{"type": "END-FILE-LIST"}\n', 'UTF-8') crc = binascii.crc32(s, crc) & 0xffffffff index_sink.write(s) s = bytes('{"type": "file-list-checksum", "checksum": %d}\n' % crc, 'UTF-8') index_sink.write(s) index_it.release() os.chdir(cwd) tarobj.close() index_sink.close() def iterate_index_path(self, index_path, strict_validation=True): ''' Returns an index iterator. Internally, it uses a classic iterator class. We do that instead of just yielding so that the iterator object can have an additional function to close the file descriptor that is opened in the constructor. 
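A usage sketch (the index file name is a hypothetical example):

    index_it = self.iterate_index_path("backup/bfull-2014-01-01-0000.index")
    try:
        for stat_dict, l_no in index_it:
            print(stat_dict["path"], stat_dict["type"])
    finally:
        index_it.release()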
''' class IndexPathIterator(object): def __init__(self, delta_tar, index_path): self.delta_tar = delta_tar self.index_path = index_path self.f = None self.extra_data = dict() self.__enter__() def __iter__(self): return self def release(self): if self.f: self.f.close() def __enter__(self): ''' Allows this iterator to be used with the "with" statement ''' if self.f is None: self.f = self.delta_tar.open_auxiliary_file \ (self.index_path, 'r', strict_validation=strict_validation) # check index header j, l_no = self.delta_tar._parse_json_line(self.f, 0) if j.get("type", '') != 'python-delta-tar-index' or\ j.get('version', -1) != 1: raise Exception("invalid index file format: %s" % json.dumps(j)) self.extra_data = j.get('extra_data', dict()) # find BEGIN-FILE-LIST, ignore other headers while True: j, l_no = self.delta_tar._parse_json_line(self.f, l_no) if j.get('type', '') == 'BEGIN-FILE-LIST': break return self def __exit__(self, type, value, tb): ''' Allows this iterator to be used with the "with" statement ''' if self.f: self.f.close() self.f = None def __next__(self): # read each file in the index and process it to do the restore j = {} l_no = -1 try: j, l_no = self.delta_tar._parse_json_line(self.f, l_no) except Exception as e: if self.f: self.f.close() raise e op_type = j.get('type', '') # when we detect the end of the list, break the loop if op_type == 'END-FILE-LIST': if self.f: self.f.close() raise StopIteration # check input if op_type not in ['directory', 'file', 'link']: self.delta_tar.logger.warning('unrecognized type to be ' 'restored: %s, line %d' % (op_type, l_no)) # iterate again return self.__next__() return j, l_no return IndexPathIterator(self, index_path) def iterate_tar_path(self, tar_path, new_volume_handler=None): ''' Returns a tar iterator that iterates jsonized member items that contain an additional "member" field, used by RestoreHelper. 
''' class TarPathIterator(object): def __init__(self, delta_tar, tar_path, new_volume_handler=None): self.delta_tar = delta_tar self.tar_path = tar_path self.tar_obj = None self.last_member = None self.new_volume_handler = new_volume_handler self.__enter__() def __iter__(self): return self def release(self): if self.tar_obj: self.tar_obj.close() def __enter__(self): ''' Allows this iterator to be used with the "with" statement ''' if self.tar_obj is None: decryptor = None if self.delta_tar.password is not None: decryptor = crypto.Decrypt \ (password=self.delta_tar.password, key=self.delta_tar.crypto_key, strict_ivs=False) self.tar_obj = tarfile.TarFile.open(self.tar_path, mode='r' + self.delta_tar.mode, format=tarfile.GNU_FORMAT, concat='#' in self.delta_tar.mode, encryption=decryptor, new_volume_handler=self.new_volume_handler, save_to_members=False, dereference=True) return self def __exit__(self, type, value, tb): ''' Allows this iterator to be used with the "with" statement ''' if self.tar_obj: self.tar_obj.close() self.tar_obj = None def __next__(self): ''' Read each member and return it as a stat dict ''' tarinfo = self.tar_obj.__iter__().__next__() # NOTE: here we compare if tarinfo.path is the same as before # instead of comparing the tarinfo object itself because the # object itself might change for multivol tarinfos if tarinfo is None or (self.last_member is not None and\ self.delta_tar.unprefixed(tarinfo.path) == self.delta_tar.unprefixed(self.last_member.path)): raise StopIteration self.last_member = tarinfo ptype = 'unknown' if tarinfo.isfile(): ptype = 'file' elif tarinfo.isdir(): ptype = 'directory' elif tarinfo.islnk() or tarinfo.issym(): ptype = 'link' return { u'type': ptype, u'path': tarinfo.path, u'mode': tarinfo.mode, u'mtime': tarinfo.mtime, u'ctime': -1, # cannot restore u'uid': tarinfo.uid, u'gid': tarinfo.gid, u'inode': -1, # cannot restore u'size': tarinfo.size, u'member': tarinfo }, 0 return TarPathIterator(self, tar_path, new_volume_handler) def jsonize_path_iterator(self, iter, strip=0): ''' converts the yielded items of an iterator into json path lines. strip: Strip the smallest prefix containing num leading slashes from the file path. ''' while True: try: path = iter.__next__() if strip == 0: yield self._stat_dict(path), 0 else: st = self._stat_dict(path) st['path'] = "/".join(path.split("/")[strip:]) yield st, 0 except StopIteration: break def iterate_disaster_index (self, index): """ Mimick the behavior of the other object iterators, just with the inputs supplied directly as *index*. """ class RawIndexIterator(object): def __init__(self, delta_tar, index): self.delta_tar = delta_tar self.index = index self.__enter__() def __iter__(self): return self def release(self): pass def __enter__(self): ''' Allows this iterator to be used with the "with" statement ''' self.iter = self.index.__iter__ () return self def __exit__(self, type, value, tb): ''' Allows this iterator to be used with the "with" statement ''' def __next__(self): idxent = self.iter.__next__ () return idxent, 0 return RawIndexIterator(self, index) def collate_iterators(self, it1, it2): ''' Collate two iterators, so that it returns pairs of the items of each iterator (if the items are the same), or (None, elem2) or (elem1, None) when there's no match for the items in the other iterator. It assumes that the items in both lists are ordered in the same way. 
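For instance, assuming both iterators yield entries for flat, lexicographically ordered paths, with it1 covering a, b, d and it2 covering a, c, d, the collation yields (a, a), (b, None), (None, c) and (d, d), each accompanied by the current line number.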
''' l_no = 0 elem1, elem2 = None, None while True: if not elem1: try: elem1, l_no = it1.__next__() except StopIteration: if elem2: yield (None, elem2, l_no) for elem2 in it2: if isinstance(elem2, tuple): elem2 = elem2[0] yield (None, elem2, l_no) break if not elem2: try: elem2 = it2.__next__() if isinstance(elem2, tuple): elem2 = elem2[0] except StopIteration: if elem1: yield (elem1, None, l_no) for elem1, l_no in it1: yield (elem1, None, l_no) break index1 = self.unprefixed(elem1['path']) index2 = self.unprefixed(elem2['path']) i1, i2 = self.compare_indexes(index1, index2) yield1 = yield2 = None if i1 is not None: yield1 = elem1 elem1 = None if i2 is not None: yield2 = elem2 elem2 = None yield (yield1, yield2, l_no) def compare_indexes(self, index1, index2): ''' Compare iterator indexes and return a tuple in the following form: if index1 < index2, returns (index1, None) if index1 == index2 returns (index1, index2) else: returns (None, index2) ''' l1 = index1.split('/') l2 = index2.split('/') length = len(l2) - len(l1) if length > 0: return (index1, None) elif length < 0: return (None, index2) for i1, i2 in zip(l1, l2): if i1 < i2: return (index1, None) elif i1 > i2: return (None, index2) return (index1, index2) def list_backup(self, backup_tar_path, list_func=None): if not isinstance(backup_tar_path, str): raise Exception('Backup tar path must be a string') if not os.path.exists(backup_tar_path) or not os.path.isfile(backup_tar_path): raise Exception('Source path "%s" does not exist or is not a '\ 'file' % backup_tar_path) if not os.access(backup_tar_path, os.R_OK): raise Exception('Source path "%s" is not readable' % backup_tar_path) cwd = os.getcwd() def new_volume_handler(deltarobj, cwd, backup_path, encryption, tarobj, base_name, volume_number): ''' Handles the new volumes ''' volume_name = deltarobj.volume_name_func(backup_path, True, volume_number, guess_name=True) volume_path = os.path.join(backup_path, volume_name) # we convert relative paths into absolute because CWD is changed if not os.path.isabs(volume_path): volume_path = os.path.join(cwd, volume_path) tarobj.open_volume(volume_path, encryption=encryption) if self.decryptor is None: self.decryptor = \ self.initialize_encryption (CRYPTO_MODE_DECRYPT, strict_validation=False) backup_path = os.path.dirname(backup_tar_path) if not os.path.isabs(backup_path): backup_path = os.path.join(cwd, backup_path) new_volume_handler = partial(new_volume_handler, self, cwd, backup_path, self.decryptor) tarobj = tarfile.TarFile.open(backup_tar_path, mode='r' + self.mode, format=tarfile.GNU_FORMAT, concat='#' in self.mode, encryption=self.decryptor, new_volume_handler=new_volume_handler, save_to_members=False, dereference=True) def filter(cls, list_func, tarinfo): if list_func is None: self.logger.info(tarinfo.path) else: list_func(tarinfo) return False filter = partial(filter, self, list_func) tarobj.extractall(filter=filter, unlink=True) tarobj.close() def restore_backup(self, target_path, backup_indexes_paths=[], backup_tar_path=None, restore_callback=None, disaster=tarfile.TOLERANCE_STRICT, backup_index=None, strict_validation=True): ''' Restores a backup. Parameters: - target_path: path to restore. - backup_indexes_paths: path to backup indexes, in descending date order. The indexes indicate the location of their respective backup volumes, and multiple indexes are needed to be able to restore diff backups. Note that this is an optional parameter: if not suplied, it will try to restore directly from backup_tar_path. 
- backup_tar_path: path to the backup tar file. Used as an alternative to backup_indexes_paths to restore directly from a tar file without using any file index. If it's a multivol tarfile, volume_name_func will be called. - restore_callback: callback function to be called during restore. This is passed to the helper and gets called for every file. NOTE: If you want to use an index to restore a backup, this function only supports doing so when the tarfile mode is either uncompressed or uses concat compression mode, because otherwise it would be very slow. NOTE: Indices are assumed to follow the same format as the index_mode specified in the constructor. Returns the list of files that could not be restored, if there were any. ''' # check/sanitize input if not isinstance(target_path, str): raise Exception('Target path must be a string') if backup_indexes_paths == [] and backup_tar_path is None: raise Exception("You have to either provide index paths or a tar path") if isinstance (backup_index, list) is True: mode = "disaster" elif len(backup_indexes_paths) == 0: mode = "tar" else: mode = "diff" if mode == "tar": if not isinstance(backup_tar_path, str): raise Exception('Backup tar path must be a string') if not os.path.exists(backup_tar_path) or not os.path.isfile(backup_tar_path): raise Exception('Source path "%s" does not exist or is not a '\ 'file' % backup_tar_path) if not os.access(backup_tar_path, os.R_OK): raise Exception('Source path "%s" is not readable' % backup_tar_path) else: if not isinstance(backup_indexes_paths, list): raise Exception('backup_indexes_paths must be a list') if self.mode.startswith(':') or self.mode.startswith('|'): raise Exception('Restore only supports either uncompressed tars' ' or concat compression when restoring from an index, and ' ' the open mode you provided is "%s"' % self.mode) for index in backup_indexes_paths: if not isinstance(index, str): raise Exception('indices must be strings') if not os.path.exists(index) or not os.path.isfile(index): raise Exception('Index path "%s" does not exist or is not a '\ 'file' % index) if not os.access(index, os.R_OK): raise Exception('Index path "%s" is not readable' % index) # try to create backup path if needed os.makedirs(target_path, exist_ok=True) # make backup_tar_path absolute so that iterate_tar_path works fine if backup_tar_path and not os.path.isabs(backup_tar_path): backup_tar_path = os.path.abspath(backup_tar_path) cwd = os.getcwd() os.chdir(target_path) # setup for decrypting payload if self.decryptor is None: self.decryptor = \ self.initialize_encryption (CRYPTO_MODE_DECRYPT, strict_validation=strict_validation) if mode == 'tar': index_it = self.iterate_tar_path(backup_tar_path) helper = RestoreHelper(self, cwd, backup_path=backup_tar_path, tarobj=index_it.tar_obj) elif mode == "diff": helper = RestoreHelper(self, cwd, backup_indexes_paths, disaster=disaster) try: # get iterator from newest index at _data[0] index1 = helper._data[0]["path"] index_it = \ self.iterate_index_path(index1, strict_validation=strict_validation) except tarfile.DecryptionError as exn: self.logger.error("failed to decrypt file [%s]: %s; is this an " "actual encrypted index file?" % (index1, str (exn))) return [(index1, exn)] except Exception as exn: # compressed files self.logger.error("failed to read file [%s]: %s; is this an " "actual index file?"
% (index1, str (exn))) return [(index1, exn)] elif mode == "disaster": index_it = self.iterate_disaster_index (backup_index) helper = RestoreHelper (self, cwd, backup_path=backup_tar_path, backup_index=backup_index, disaster=disaster) index_decryptor = helper._data[0]["decryptor"] dir_it = self._recursive_walk_dir('.') dir_path_it = self.jsonize_path_iterator(dir_it) failed = [] # irrecoverable files # for each file to be restored, do: for ipath, dpath, l_no in self.collate_iterators(index_it, dir_path_it): if not ipath: upath = dpath['path'] op_type = dpath['type'] else: upath = self.unprefixed(ipath['path']) op_type = ipath['type'] # filter paths if self.filter_path(upath, '', op_type == 'directory') == NO_MATCH: continue # if types of the file mismatch, the file needs to be deleted # and re-restored if ipath is not None and dpath is not None and\ dpath['type'] != ipath['type']: helper.delete(upath) # if file not found in dpath, we can directly restore from index if not dpath: # if the file doesn't exist and it needs to be deleted, it # means that work is already done if ipath['path'].startswith('delete://'): continue try: self.logger.debug("restore %s" % ipath['path']) helper.restore(ipath, l_no, restore_callback) except Exception as e: iipath = ipath.get ("path", "") self.logger.error("FAILED to restore: {} ({})" .format(iipath, e)) if disaster != tarfile.TOLERANCE_STRICT: failed.append ((iipath, e)) continue # if both files are equal, we have nothing to restore if self._equal_stat_dicts(ipath, dpath, listsnapshot_equal=True): continue # we have to restore the file, but first we need to delete the # current existing file. # we don't delete the file if it's a directory, because it might # just have changed mtime, so it's quite inefficient to remove # it if ipath: if ipath['type'] != 'directory' or ipath['path'].startswith('delete://'): helper.delete(upath) self.logger.debug("restore %s" % ipath['path']) try: helper.restore(ipath, l_no, restore_callback) except Exception as e: if disaster == tarfile.TOLERANCE_STRICT: raise failed.append ((ipath.get ("path", ""), e)) continue # if the file is not in the index (so it comes from the target # directory) then we have to delete it else: self.logger.debug("delete %s" % upath) helper.delete(upath) helper.restore_directories_permissions() index_it.release() os.chdir(cwd) helper.cleanup() return failed def recover_backup(self, target_path, backup_indexes_paths=[], restore_callback=None): """ Walk the index, extracting objects in disaster mode. Bad files are reported along with a reason. *Security considerations*: In *recovery mode* the headers of encrypted objects are assumed damaged and GCM tags are not validated so modification of cryptographically relevant parts of the header (more specifically, the initialization vectors) can no longer be detected. If an attacker can manipulate the encrypted backup set and has access to the plaintext of some of the contents, they may be able to obtain the plaintext of other encrypted objects by injecting initialization vectors. For this reason *recovery mode* should only be used in emergency situations and the contents of the resulting files should be validated manually if possible and not be disclosed to untrusted parties.
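A usage sketch (the target path and index file name are hypothetical examples):

    failed = deltatar.recover_backup("/tmp/restore",
                                     backup_indexes_paths=["backup/bdiff-2014-02-01-0000.index"])
    for path, error in failed:
        print("unrecoverable:", path, error)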
""" return self.restore_backup(target_path, backup_indexes_paths=backup_indexes_paths, disaster=tarfile.TOLERANCE_RECOVER, strict_validation=False) def rescue_backup(self, target_path, backup_tar_path, restore_callback=None): """ More aggressive “unfsck” mode: do not rely on the index data as the files may be corrupt; skim files for header-like information and attempt to retrieve the data. *Security considerations*: As with *recovery mode*, in *rescue mode* the headers of encrypted objects are assumed damaged and GCM tags are not validated so modification of cryptographically relevant parts of the header (more specifically, the initalization vectors) can no longer be detected. If an attacker can manipulate the encrypted backup set and has access to the plaintext of some of the contents, they may be able to obtain the plaintext of other encrypted objects by injecting initialization vectors. For this reason *rescue mode* should only be used to emergency situations and the contents of the resulting files should be validated manually if possible and not be disclosed to untrusted parties. """ def gen_volume_name (nvol): return os.path.join (os.path.dirname (backup_tar_path), self.volume_name_func (backup_tar_path, True, nvol)) backup_index = tarfile.gen_rescue_index (gen_volume_name, self.mode, password=self.password, key=self.crypto_key) return self.restore_backup(target_path, backup_index=backup_index, backup_tar_path=backup_tar_path, disaster=tarfile.TOLERANCE_RESCUE, strict_validation=False) def _parse_json_line(self, f, l_no): ''' Read line from file like object and process it as JSON. ''' l = f.readline() l_no += 1 try: j = json.loads(l.decode('UTF-8')) except UnicodeDecodeError as e: if tuple (l [0:2]) == tarfile.GZ_MAGIC: raise Exception \ ("error parsing line #%d as json: looks like a compressed file (%d B: [%s..])" % (l_no, len (l), binascii.hexlify (l [:16]).decode ())) \ from e raise Exception \ ("error parsing line #%d as json: not a text file (%d B: [%s..])" % (l_no, len (l), binascii.hexlify (l [:16]).decode ())) \ from e except ValueError as e: raise Exception("error parsing this json line " "(line number %d): %s" % (l_no, l)) return j, l_no class RestoreHelper(object): ''' Class used to help to restore files from indices ''' # holds the dicts of data _data = [] _deltatar = None _cwd = None # list of directories to be restored. This is done as a last step, see # tarfile.extractall for details. _directories = [] _disaster = tarfile.TOLERANCE_STRICT def __init__(self, deltatar, cwd, index_list=None, backup_path=False, backup_index=None, tarobj=None, disaster=tarfile.TOLERANCE_STRICT): ''' Constructor opens the tars and init the data structures. Assumptions: - Index list must be provided in reverse order (newer first). - “newer first” apparently means that if there are n backups provided, the last full backup is at index n-1 and the most recent diff backup is at index 0. - Only the first, the second, and the last elements of ``index_list`` are relevant, others will not be accessed. - If no ``index_list`` is provided, both ``tarobj`` and ``backup_path`` must be passed. - If ``index_list`` is provided, the values of ``tarobj`` and ``backup_path`` are ignored. ''' self._data = [] self._directories = [] self._deltatar = deltatar self._cwd = cwd self._password = deltatar.password self._crypto_key = deltatar.crypto_key self._decryptors = [] self._disaster = disaster # Disable strict checking for linearly increasing IVs when running # in rescue or recover mode. 
strict_validation = disaster == tarfile.TOLERANCE_STRICT try: import grp, pwd except ImportError: grp = pwd = None if pwd and hasattr(os, "geteuid") and os.geteuid() == 0: self.canchown = True else: self.canchown = False if isinstance (backup_index, list) is True: decryptor = self._deltatar.decryptor self._data = \ [{ "curr_vol_no" : None , "vol_fd" : None , "offset" : -1 , "tarobj" : None , "path" : backup_path , "is_full" : True , "iterator" : None , "last_itelement" : None , "last_lno" : 0 , "new_volume_handler" : partial(self.new_volume_handler, self._deltatar, self._cwd, True, os.path.dirname(backup_path), decryptor) , "decryptor" : decryptor }] elif index_list is not None: for index in index_list: is_full = index == index_list[-1] decryptor = None if self._password is not None: decryptor = crypto.Decrypt (password=self._password, key=self._crypto_key, strict_ivs=strict_validation) # make paths absolute to avoid cwd problems if not os.path.isabs(index): index = os.path.normpath(os.path.join(cwd, index)) s = dict( curr_vol_no = None, vol_fd = None, offset = -1, tarobj = None, path = index, is_full = is_full, iterator = None, last_itelement = None, last_lno = 0, new_volume_handler = partial(self.new_volume_handler, self._deltatar, self._cwd, is_full, os.path.dirname(index), decryptor), decryptor = decryptor ) self._data.append(s) else: # make paths absolute to avoid cwd problems if not os.path.isabs(backup_path): backup_path = os.path.normpath(os.path.join(cwd, backup_path)) # update the new_volume_handler of tar_obj tarobj.new_volume_handler = partial(self.new_volume_handler, self._deltatar, self._cwd, True, os.path.dirname(backup_path), self._deltatar.decryptor) s = dict( curr_vol_no = None, vol_fd = None, offset = -1, tarobj = tarobj, path = backup_path, is_full = True, iterator = None, last_itelement = None, last_lno = 0, new_volume_handler = tarobj.new_volume_handler, decryptor = self._deltatar.decryptor ) self._data.append(s) def cleanup(self): ''' Closes all open files ''' for data in self._data: if data['vol_fd']: data['vol_fd'].close() data['vol_fd'] = None if data['tarobj']: data['tarobj'].close() data['tarobj'] = None def delete(self, path): ''' Delete a file ''' if not os.path.exists(path): return # to preserve parent directory mtime, we save it parent_dir = os.path.dirname(path) or os.getcwd() parent_dir_mtime = int(os.stat(parent_dir).st_mtime) if os.path.isdir(path) and not os.path.islink(path): shutil.rmtree(path) else: os.unlink(path) # now we restore parent_directory mtime os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime)) def restore(self, itpath, l_no, callback=None): ''' Restore the path from the appropriate backup. Receives the current path from the newest (=first) index iterator. itpath must be not null. callback is a custom function that gets called for every file. NB: This function takes the attribute ``_data`` as input but will only ever use its first and, if available, second element. Anything else in ``._data[]`` will be ignored. 
''' path = itpath['path'] # Calls the callback function if callback: callback() if path.startswith('delete://'): # the file has previously been deleted already in restore_backup in # all cases so we just need to finish return # get data from newest index (_data[0]) data = self._data[0] upath = self._deltatar.unprefixed(path) # to preserve parent directory mtime, we save it parent_dir = os.path.dirname(upath) or os.getcwd() os.makedirs(parent_dir, exist_ok=True) parent_dir_mtime = int(os.stat(parent_dir).st_mtime) # if path is found in the newest index as to be snapshotted, deal with it # and finish if path.startswith('snapshot://'): self.restore_file(itpath, data, path, l_no, upath) # now we restore parent_directory mtime os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime)) return # we go from index to index, finding the path in the index, then finding # the index with the most recent snapshot of the file being restored # # Right now we support diff backups, only. No incremental backups. # As a result _data[0] is always the diff backup index # and _data[1] the full backup index. if len(self._data) == 2: data = self._data[1] d, l_no, dpath = self.find_path_in_index(data, upath) if not d: self._deltatar.logger.warning('Error restoring file %s from ' 'index, not found in index %s' % (path, data['path'])) return cur_path = d.get('path', '') if cur_path.startswith('delete://'): self._deltatar.logger.warning(('Strange thing happened, file ' '%s was listed in first index but deleted by another ' 'one. Path was ignored and untouched.') % path) return elif cur_path.startswith('snapshot://'): # this code path is reached when the file is unchanged # in the newest index and therefore of type 'list://' self.restore_file(d, data, path, l_no, dpath) # now we restore parent_directory mtime os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime)) return # error code path is reached when: # a) we have more than two indexes (unsupported atm) # b) both indexes contain a list:// entry (logic error) # c) we have just one index and it also contains list:// self._deltatar.logger.warning(('Error restoring file %s from index, ' 'snapshot not found in any index') % path) def find_path_in_index(self, data, upath): # NOTE: we restart the iterator sometimes because the iterator can be # walked over completely multiple times, for example if one path is not # found in one index and we have to go to the next index. it = data['iterator'] if it is None: it = data['iterator'] = self._deltatar.iterate_index_path(data["path"]) d, l_no = it.__next__() else: d = data['last_itelement'] l_no = data['last_lno'] while True: dpath = self._deltatar.unprefixed(d.get('path', '')) if upath == dpath: data['last_itelement'] = d data['last_lno'] = l_no return d, l_no, dpath up, dp = self._deltatar.compare_indexes(upath, dpath) # any time upath should have appeared before current dpath, it means # upath is just not in this index and we should stop if dp is None: data['last_itelement'] = d data['last_lno'] = l_no return None, 0, '' try: d, l_no = it.__next__() except StopIteration: data['last_itelement'] = d data['last_lno'] = l_no return None, 0, '' def restore_directories_permissions(self): ''' Restore directory permissions when everything has been restored ''' try: import grp, pwd except ImportError: grp = pwd = None self._directories.sort(key=operator.attrgetter('name')) self._directories.reverse() # Set correct owner, mtime and filemode on directories.
for member in self._directories: dirpath = member.name try: os.chmod(dirpath, member.mode) os.utime(dirpath, (member.mtime, member.mtime)) if self.canchown: # We have to be root to do so. try: g = grp.getgrnam(member.gname)[2] except KeyError: g = member.gid try: u = pwd.getpwnam(member.uname)[2] except KeyError: u = member.uid try: if member.issym and hasattr(os, "lchown"): os.lchown(dirpath, u, g) else: os.chown(dirpath, u, g) except EnvironmentError: raise tarfile.ExtractError("could not change owner") except tarfile.ExtractError as e: self._deltatar.logger.warning('tarfile: %s' % e) @staticmethod def new_volume_handler(deltarobj, cwd, is_full, backup_path, decryptor, tarobj, base_name, volume_number): ''' Set up a new volume and perform the tasks necessary for transitioning to the next one. ''' volume_name = deltarobj.volume_name_func(backup_path, is_full, volume_number, guess_name=True) volume_path = os.path.join(backup_path, volume_name) # we convert relative paths into absolute because CWD is changed if not os.path.isabs(volume_path): volume_path = os.path.join(cwd, volume_path) tarobj.open_volume(volume_path, encryption=decryptor) def restore_file(self, file_data, index_data, path, l_no, unprefixed_path): ''' Restores a snapshot of a file from a specific backup ''' op_type = file_data.get('type', -1) member = file_data.get('member', None) ismember = bool(member) # when member is set, then we can assume everything is right and we # just have to restore the path if member is None: vol_no = file_data.get('volume', -1) # sanity check if not isinstance(vol_no, int) or vol_no < 0: self._deltatar.logger.warning('unrecognized type to be restored: ' '%s, line %d' % (op_type, l_no)) # setup the volume that needs to be read. only needed when member is # not set if index_data['curr_vol_no'] != vol_no: index_data['curr_vol_no'] = vol_no backup_path = os.path.dirname(index_data['path']) vol_name = self._deltatar.volume_name_func(backup_path, index_data['is_full'], vol_no, guess_name=True) vol_path = os.path.join(backup_path, vol_name) if index_data['vol_fd']: index_data['vol_fd'].close() index_data['vol_fd'] = open(vol_path, 'rb') # force reopen of the tarobj because of new volume if index_data['tarobj']: index_data['tarobj'].close() index_data['tarobj'] = None # seek tarfile if needed offset = file_data.get('offset', -1) if index_data['tarobj']: if self._disaster == tarfile.TOLERANCE_RESCUE: # force a seek and reopen index_data['tarobj'].close() index_data['tarobj'] = None else: try: member = index_data['tarobj'].__iter__().__next__() except tarfile.DecryptionError: pass except tarfile.CompressionError: pass if not member or member.path != file_data['path']: # force a seek and reopen index_data['tarobj'].close() index_data['tarobj'] = None # open the tarfile if needed if not index_data['tarobj']: index_data['vol_fd'].seek(offset) index_data['tarobj'] = tarfile.open(mode="r" + self._deltatar.mode, fileobj=index_data['vol_fd'], format=tarfile.GNU_FORMAT, concat='#' in self._deltatar.mode, encryption=index_data["decryptor"], new_volume_handler=index_data['new_volume_handler'], save_to_members=False, tolerance=self._disaster) member = index_data['tarobj'].__iter__().__next__() member.path = unprefixed_path member.name = unprefixed_path if op_type == 'directory': self.add_member_dir(member) member = copy.copy(member) member.mode = 0o0700 # if it's an existing directory, we then don't need to recreate it # just set the right permissions, mtime and that kind of stuff if os.path.exists(member.path): 
return if not ismember: # set current volume number in tarobj, otherwise the extraction of the # file might fail when trying to extract a multivolume member index_data['tarobj'].volume_number = index_data['curr_vol_no'] def ignore_symlink (member, *_args): self._deltatar.logger.warning("Ignoring symlink %s" % member.name) # finally, restore the file index_data['tarobj'].extract(member, symlink_cb=ignore_symlink, unlink=True) def add_member_dir(self, member): ''' Add member dir to be restored at the end ''' if not self.canchown: self._directories.append(DirItem(name=member.name, mode=member.mode, mtime=member.mtime)) else: self._directories.append(DirItem(name=member.name, mode=member.mode, mtime=member.mtime, gname=member.gname, uname=member.uname, uid=member.uid, gid=member.gid, issym=member.issym())) class DirItem(object): def __init__(self, **kwargs): for k, v in kwargs.items(): setattr(self, k, v)
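# Illustrative end-to-end sketch (not part of the library; the import path,
# directories, password and index file names below are hypothetical examples):
#
#   from deltatar.deltatar import DeltaTar
#
#   dtar = DeltaTar(mode='#gz', password='example-password')
#   dtar.create_full_backup('/srv/data', '/mnt/backups/full')
#   dtar.create_diff_backup('/srv/data', '/mnt/backups/diff',
#                           '/mnt/backups/full/bfull-<date>.index.gz.pdtcrypt')
#   # indexes are passed newest first when restoring
#   dtar.restore_backup('/srv/data.restored',
#                       backup_indexes_paths=[
#                           '/mnt/backups/diff/bdiff-<date>.index.gz.pdtcrypt',
#                           '/mnt/backups/full/bfull-<date>.index.gz.pdtcrypt'])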