# Copyright 2018 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import gzip
import os
import shutil
import subprocess
import threading

class Archiver():
    """
    An instance of this class stores set of files in given directory on local
    filesystem. Stored files are automatically compressed and organized into
    tar.xz archives based on their filenames prefixes. It is a very useful tool
    when one has to deal with many files with similar content that are generated
    continuously. Packing similar files together into tar.xz archive can
    singificantly reduce amount of required disk space (even for gzipped files).
    As a parameter, the constructor takes set of filenames prefixes. These
    prefixes are automatically clustered into archives by their common prefixes
    (yes, prefixes of prefixes). These archives are automatically created, when
    all files assigned to the given set of prefixes is added to Archiver object.
    Methods provided by this class are synchronized and can be called from
    different Python threads.

    """

    def _split_names_by_prefixes(
            self, names, max_names_per_prefix, prefix_length=0):
        """
        Recursive function used to split given set of names into groups by
        common prefixes. It tries to find configuration with minimum number of
        groups (prefixes) where the number of elements (names) in each group is
        not larger than given parameter.

        @param names: list of names to split into groups (names MUST BE sorted
                and unique).
        @param max_names_per_prefix: maximum number of names assigned to
                group (single prefix).
        @param prefix_length: current length of the prefix (for recursive
                calls); all elements in the list given as the parameter 'names'
                MUST HAVE the same prefix with this length.
        @returns dictionary with prefixes (each one represents single group) and
                size (a number of names in the group).

        """
        assert max_names_per_prefix > 1
        # Returns the current prefix if the group is small enough
        if len(names) <= max_names_per_prefix:
            return { names[0][0:prefix_length] : len(names) }
        # Increases prefix_length until a difference is found:
        # - elements in 'names' are sorted and unique
        # - elements in 'names' have a common prefix with a length of
        #   'prefix_length' characters
        while ( len(names[0]) > prefix_length and
                names[0][prefix_length] == names[-1][prefix_length] ):
            prefix_length += 1
        # Checks for special case, when the first name == prefix
        if len(names[0]) == prefix_length:
            return { names[0][0:prefix_length] : len(names) }
        # Calculates resultant list of prefixes
        results = dict()
        i_begin = 0
        # Calculates all prefixes (groups) using recursion:
        # - 'prefix_length' points to the first character that differentiates
        #   elements from the 'names' list
        while i_begin < len(names):
            char = names[i_begin][prefix_length]
            i_end = i_begin + 1
            while i_end < len(names) and char == names[i_end][prefix_length]:
                i_end += 1
            results.update(self._split_names_by_prefixes(names[i_begin:i_end],
                    max_names_per_prefix, prefix_length+1))
            i_begin = i_end
        return results


    def __init__(self, path_directory, prefixes, max_prefixes_per_archive):
        """
        Constructor.

        @param path_directory: directory where files and archives are stored.
                It is created if not exists.
        @param prefixes: a set of allowed filenames prefixes.
        @param max_prefixes_per_archive: maximum number of filenames prefixes
                assigned to single group (archive).

        """
        self._lock = threading.Lock()
        self._path_directory = path_directory
        if not os.path.exists(self._path_directory):
            os.makedirs(self._path_directory)

        prefixes = sorted(set(prefixes))
        self._archives_names = self._split_names_by_prefixes(prefixes,
                max_prefixes_per_archive)
        self._filenames_prefixes = dict()
        prefixes.reverse()
        for ap, fc in sorted(self._archives_names.iteritems()):
            self._archives_names[ap] = [fc, []]
            while fc > 0:
                self._filenames_prefixes[prefixes.pop()] = [ap, set()]
                fc -= 1


    def save_file(self, prefix, name, content, apply_gzip=False):
        """
        Add a new file with given content to the archive.

        @param prefix: prefix of filename that the new file will be saved with
        @param name: the rest of the filename of the new file; in summary, the
                resultant filename of the new file will be prefix+name
        @param content: a content of the file
        @param apply_gzip: if true, the added file will be gzipped, the suffix
                .gz will be added to its resultant filename

        """
        if apply_gzip:
            name += ".gz"
        path_target = os.path.join(self._path_directory, prefix + name)

        with self._lock:
            assert prefix in self._filenames_prefixes
            assert self._filenames_prefixes[prefix][1] is not None
            assert name not in self._filenames_prefixes[prefix][1]
            self._filenames_prefixes[prefix][1].add(name)

        if apply_gzip:
            file_target = gzip.GzipFile(path_target, 'wb', 9, None, 0)
        else:
            file_target = open(path_target, 'wb')
        with file_target:
            file_target.write(content)


    def copy_file(self, prefix, name, path_file, apply_gzip=False):
        """
        Add a new file to the archive. The file is copied from given location.

        @param prefix: prefix of filename that the new file will be saved with
        @param name: the rest of the filename of the new file; in summary, the
                resultant filename of the new file will be prefix+name
        @param path_file: path to the source file
        @param apply_gzip: if true, the added file will be gzipped, the suffix
                .gz will be added to its resultant filename

        """
        with open(path_file, 'rb') as file_source:
            content = file_source.read()
        self.save_file(prefix, name, content, apply_gzip)


    def move_file(self, prefix, name, path_file, apply_gzip=False):
        """
        Add a new file to the archive. The file is moved, i.e. an original
        file is deleted.

        @param prefix: prefix of filename that the new file will be saved with
        @param name: the rest of the filename of the new file; in summary, the
                resultant filename of the new file will be prefix+name
        @param path_file: path to the source file, it will be deleted
        @param apply_gzip: if true, the added file will be gzipped, the suffix
                .gz will be added to its resultant filename

        """
        if apply_gzip:
            self.copy_file(prefix, name, path_file, apply_gzip)
            os.remove(path_file)
        else:
            path_target = os.path.join(self._path_directory, prefix + name)
            with self._lock:
                assert prefix in self._filenames_prefixes
                assert self._filenames_prefixes[prefix][1] is not None
                assert name not in self._filenames_prefixes[prefix][1]
                self._filenames_prefixes[prefix][1].add(name)
            shutil.move(path_file, path_target)


    def finalize_prefix(self, prefix):
        """
        This method is called to mark that there is no more files to add with
        given prefix. This method creates a tar archive when the last prefix
        assigned to the corresponding group is finalized. This method must be
        called for all prefixes given to the constructor.

        @param prefix: prefix to finalize, no more files with this prefix can
                be added to the archive

        """
        with self._lock:
            assert prefix in self._filenames_prefixes
            assert self._filenames_prefixes[prefix][1] is not None

            filenames = []
            for name in sorted(self._filenames_prefixes[prefix][1]):
                filenames.append(prefix + name)
            self._filenames_prefixes[prefix][1] = None
            archive_name = self._filenames_prefixes[prefix][0]

            self._archives_names[archive_name][0] -= 1
            self._archives_names[archive_name][1] += filenames
            if self._archives_names[archive_name][0] == 0:
                archive_is_complete = True
                filenames = self._archives_names[archive_name][1]
            else:
                archive_is_complete = False

        if archive_is_complete and len(filenames) > 0:
            argv = ['tar', 'cJf', 'archive_' + archive_name + '.tar.xz']
            argv += filenames
            process_tar = subprocess.Popen(argv, cwd=self._path_directory)
            if process_tar.wait() != 0:
                raise Exception("Process 'tar cJf' failed!")
            for filename in filenames:
                os.remove(os.path.join(self._path_directory, filename))