# Copyright 2018 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import gzip
import os
import shutil
import subprocess
import threading

class Archiver():
    """
    An instance of this class stores a set of files in a given directory on
    the local filesystem. Stored files are automatically compressed and
    organized into tar.xz archives based on their filename prefixes. It is a
    very useful tool when one has to deal with many continuously generated
    files with similar content. Packing similar files together into a tar.xz
    archive can significantly reduce the amount of required disk space (even
    for gzipped files). As a parameter, the constructor takes a set of
    filename prefixes. These prefixes are automatically clustered into
    archives by their common prefixes (yes, prefixes of prefixes). Each
    archive is created automatically once all files assigned to its set of
    prefixes have been added to the Archiver object. Methods provided by this
    class are synchronized and can be called from different Python threads.
    A usage sketch is provided at the bottom of this file.

    """

    def _split_names_by_prefixes(
            self, names, max_names_per_prefix, prefix_length=0):
        """
        Recursive function used to split a given set of names into groups by
        common prefixes. It tries to find a configuration with a minimum
        number of groups (prefixes) where the number of elements (names) in
        each group is not larger than the given limit.

        @param names: list of names to split into groups (names MUST BE sorted
                and unique).
        @param max_names_per_prefix: maximum number of names assigned to a
                single group (prefix).
        @param prefix_length: current length of the prefix (for recursive
                calls); all elements in the list given as the parameter 'names'
                MUST HAVE the same prefix with this length.
        @returns dictionary mapping prefixes (each one represents a single
                group) to sizes (the number of names in the group).

        """
        assert max_names_per_prefix > 1
        # Returns the current prefix if the group is small enough
        if len(names) <= max_names_per_prefix:
            return { names[0][0:prefix_length] : len(names) }
        # Increases prefix_length until a difference is found:
        # - elements in 'names' are sorted and unique
        # - elements in 'names' have a common prefix with a length of
        #   'prefix_length' characters
        while ( len(names[0]) > prefix_length and
                names[0][prefix_length] == names[-1][prefix_length] ):
            prefix_length += 1
        # Checks for special case, when the first name == prefix
        if len(names[0]) == prefix_length:
            return { names[0][0:prefix_length] : len(names) }
        # Calculates resultant list of prefixes
        results = dict()
        i_begin = 0
        # Calculates all prefixes (groups) using recursion:
        # - 'prefix_length' points to the first character that differentiates
        #   elements from the 'names' list
        while i_begin < len(names):
            char = names[i_begin][prefix_length]
            i_end = i_begin + 1
            while i_end < len(names) and char == names[i_end][prefix_length]:
                i_end += 1
            results.update(self._split_names_by_prefixes(names[i_begin:i_end],
                    max_names_per_prefix, prefix_length+1))
            i_begin = i_end
        return results
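
    # Worked example for the splitting above: with names equal to
    # ['aa', 'ab', 'ba', 'bb', 'bc'] and max_names_per_prefix=2, the two names
    # starting with 'a' fit into a single group, while the three names
    # starting with 'b' have to be split one character deeper, giving
    # {'a': 2, 'ba': 1, 'bb': 1, 'bc': 1}.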


    def __init__(self, path_directory, prefixes, max_prefixes_per_archive):
        """
        Constructor.

        @param path_directory: directory where files and archives are stored.
                It is created if it does not exist.
        @param prefixes: a set of allowed filename prefixes.
        @param max_prefixes_per_archive: maximum number of filename prefixes
                assigned to a single group (archive).

        """
        self._lock = threading.Lock()
        self._path_directory = path_directory
        if not os.path.exists(self._path_directory):
            os.makedirs(self._path_directory)

        prefixes = sorted(set(prefixes))
        self._archives_names = self._split_names_by_prefixes(prefixes,
                max_prefixes_per_archive)
        self._filenames_prefixes = dict()
        prefixes.reverse()
        # Assigns each filename prefix to its archive group; archives are
        # iterated in sorted order while pop() walks the reversed prefix list,
        # so both advance through the prefixes in ascending order.
        for ap, fc in sorted(self._archives_names.iteritems()):
            self._archives_names[ap] = [fc, []]
            while fc > 0:
                self._filenames_prefixes[prefixes.pop()] = [ap, set()]
                fc -= 1
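
    # For illustration: with prefixes=['netA_', 'netB_', 'wifi_'] and
    # max_prefixes_per_archive=2, the prefixes are clustered into the archive
    # groups 'n' and 'w', leaving
    #     self._archives_names == {'n': [2, []], 'w': [1, []]}
    #     self._filenames_prefixes == {'netA_': ['n', set()],
    #                                  'netB_': ['n', set()],
    #                                  'wifi_': ['w', set()]}
    # i.e. each archive entry holds [number of not yet finalized prefixes,
    # filenames collected so far] and each prefix entry holds [archive name,
    # set of names stored under that prefix].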


    def save_file(self, prefix, name, content, apply_gzip=False):
        """
        Add a new file with given content to the archive.

        @param prefix: prefix of the filename that the new file will be saved
                with.
        @param name: the rest of the filename of the new file; in summary, the
                resultant filename of the new file will be prefix+name.
        @param content: the content of the file.
        @param apply_gzip: if True, the added file will be gzipped and the
                suffix .gz will be appended to its resultant filename.

        """
        if apply_gzip:
            name += ".gz"
        path_target = os.path.join(self._path_directory, prefix + name)

        with self._lock:
            assert prefix in self._filenames_prefixes
            assert self._filenames_prefixes[prefix][1] is not None
            assert name not in self._filenames_prefixes[prefix][1]
            self._filenames_prefixes[prefix][1].add(name)

        if apply_gzip:
            # mtime=0 keeps the gzip output deterministic for identical content
            file_target = gzip.GzipFile(path_target, 'wb', 9, None, 0)
        else:
            file_target = open(path_target, 'wb')
        with file_target:
            file_target.write(content)


    def copy_file(self, prefix, name, path_file, apply_gzip=False):
        """
        Add a new file to the archive. The file is copied from the given
        location.

        @param prefix: prefix of the filename that the new file will be saved
                with.
        @param name: the rest of the filename of the new file; in summary, the
                resultant filename of the new file will be prefix+name.
        @param path_file: path to the source file.
        @param apply_gzip: if True, the added file will be gzipped and the
                suffix .gz will be appended to its resultant filename.

        """
        with open(path_file, 'rb') as file_source:
            content = file_source.read()
        self.save_file(prefix, name, content, apply_gzip)


    def move_file(self, prefix, name, path_file, apply_gzip=False):
        """
        Add a new file to the archive. The file is moved, i.e. the original
        file is deleted.

        @param prefix: prefix of the filename that the new file will be saved
                with.
        @param name: the rest of the filename of the new file; in summary, the
                resultant filename of the new file will be prefix+name.
        @param path_file: path to the source file; it will be deleted.
        @param apply_gzip: if True, the added file will be gzipped and the
                suffix .gz will be appended to its resultant filename.

        """
        if apply_gzip:
            # Gzipping requires rewriting the content, so copy and then delete
            self.copy_file(prefix, name, path_file, apply_gzip)
            os.remove(path_file)
        else:
            path_target = os.path.join(self._path_directory, prefix + name)
            with self._lock:
                assert prefix in self._filenames_prefixes
                assert self._filenames_prefixes[prefix][1] is not None
                assert name not in self._filenames_prefixes[prefix][1]
                self._filenames_prefixes[prefix][1].add(name)
            shutil.move(path_file, path_target)


    def finalize_prefix(self, prefix):
        """
        This method is called to mark that there are no more files to add with
        the given prefix. It creates a tar archive when the last prefix
        assigned to the corresponding group is finalized. This method must be
        called for all prefixes given to the constructor.

        @param prefix: prefix to finalize; no more files with this prefix can
                be added to the archive.

        """
        with self._lock:
            assert prefix in self._filenames_prefixes
            assert self._filenames_prefixes[prefix][1] is not None

            filenames = []
            for name in sorted(self._filenames_prefixes[prefix][1]):
                filenames.append(prefix + name)
            self._filenames_prefixes[prefix][1] = None
            archive_name = self._filenames_prefixes[prefix][0]

            self._archives_names[archive_name][0] -= 1
            self._archives_names[archive_name][1] += filenames
            if self._archives_names[archive_name][0] == 0:
                archive_is_complete = True
                filenames = self._archives_names[archive_name][1]
            else:
                archive_is_complete = False

        if archive_is_complete and len(filenames) > 0:
            argv = ['tar', 'cJf', 'archive_' + archive_name + '.tar.xz']
            argv += filenames
            process_tar = subprocess.Popen(argv, cwd=self._path_directory)
            if process_tar.wait() != 0:
                raise Exception("Process 'tar cJf' failed!")
            # The original files are removed once they are safely archived
            for filename in filenames:
                os.remove(os.path.join(self._path_directory, filename))
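

# A minimal usage sketch, assuming this module is run directly under the
# Python 2 interpreter used by autotest (the class relies on dict.iteritems)
# on a host where 'tar' with xz support is available. The directory, prefixes
# and file contents below are purely illustrative.
if __name__ == '__main__':
    archiver = Archiver('/tmp/archiver_demo', ['netA_', 'netB_', 'wifi_'], 2)
    # 'netA_' and 'netB_' are clustered under the common prefix 'n', while
    # 'wifi_' forms its own group 'w', so two archives can be produced:
    # archive_n.tar.xz and archive_w.tar.xz.
    archiver.save_file('netA_', 'log.txt', b'log of the first network test')
    archiver.save_file('netB_', 'log.txt', b'log of the second network test',
                       apply_gzip=True)
    with open('/tmp/wifi_scan.txt', 'wb') as demo_file:
        demo_file.write(b'wifi scan results')
    archiver.move_file('wifi_', 'scan.txt', '/tmp/wifi_scan.txt')
    archiver.finalize_prefix('netA_')
    archiver.finalize_prefix('netB_')   # archive_n.tar.xz is created here
    archiver.finalize_prefix('wifi_')   # archive_w.tar.xz is created here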