# Copyright 2018 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import json
import md5
import os
import requests

# ==================== Documents digests


def _read_lines_with_prefix(document, position, prefix):
    """
    Starting from the given position, parses from the document complete lines
    (with the '\n' character at the end) that start with the given prefix.
    Parsing stops at the first line that does not start with the prefix or
    when there are no more '\n' characters in the document.

    @param document: a document to parse
    @param position: an offset in the document to start from
    @param prefix: a prefix that every parsed line must start with

    @returns a pair (lines, position), where the first element is a list of
        parsed lines (with the '\n' character at the end) and the second
        element is a new offset in the document, pointing at the first
        character after the last parsed line

    """
    lines = []
    while document.startswith(prefix, position):
        position_next_line = document.find('\n', position + len(prefix))
        if position_next_line < 0:
            break
        position_next_line += 1  # to eat the '\n' character
        lines.append(document[position:position_next_line])
        position = position_next_line
    return lines, position


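# A minimal usage sketch for _read_lines_with_prefix(); the document below is
# a hypothetical fragment, not taken from a real print job:
#
#   doc = '@PJL SET COPIES=1\n@PJL ENTER LANGUAGE=PCL\n<binary payload>'
#   lines, position = _read_lines_with_prefix(doc, 0, '@PJL')
#   # lines == ['@PJL SET COPIES=1\n', '@PJL ENTER LANGUAGE=PCL\n']
#   # position points at the first character of '<binary payload>'
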
def _process_PJL_headers(doc, position, out):
    """
    The function tries to find PJL headers in the given document and processes
    them as described in the _normalize_document(doc) function.

    @param doc: see the description of _normalize_document(doc)
    @param position: offset in the document; defines the part of the document
            that is already processed; searching for headers starts from this
            position
    @param out: already processed part of the document (from the beginning to
            the given position)

    @returns new position and output; the position is set at the end of the
            last processed PJL header or it is a copy of the input position if
            no PJL headers have been found; the output is adjusted accordingly.

    """
    PJL_MARKER = b'\x1B%-12345X'
    MARGIN = 2048  # max distance to the header
    position_pjl = doc.find(PJL_MARKER, position, position + MARGIN)
    while position_pjl >= 0:
        out += doc[position:(position_pjl + len(PJL_MARKER))]
        position = position_pjl + len(PJL_MARKER)
        # parse the header and filter out problematic lines
        lines, position = _read_lines_with_prefix(doc, position, '@PJL')
        for line in lines:
            if not (line.startswith('@PJL SET ') or
                    line.startswith('@PJL COMMENT') or
                    line.startswith('@PJL DMINFO') or
                    line.startswith('@PJL JOB NAME') or
                    line.startswith('@PJL JOBNAME')):
                out += line
        # try to find the next PJL header
        position_pjl = doc.find(PJL_MARKER, position, position + MARGIN)
    return position, out


def _process_PS_Adobe_headers(doc, position, out):
    """
    The function tries to find PS-Adobe headers in the given document and
    processes them as described in the _normalize_document(doc) function.

    @param doc: see the description of _normalize_document(doc)
    @param position: offset in the document; defines the part of the document
            that is already processed; searching for headers starts from this
            position
    @param out: already processed part of the document (from the beginning to
            the given position)

    @returns new position and output; the position is set at the end of the
            last processed PS-Adobe header or it is a copy of the input
            position if no PS-Adobe headers have been found; the output is
            adjusted accordingly.

    """
    PS_MARKER = '%!PS-Adobe'
    MARGIN = 2048  # max distance to the header
    position_ps = doc.find(PS_MARKER, position, position + MARGIN)
    while position_ps >= 0:
        # add everything till the end of the first line in the header
        position_next_line = doc.find('\n', position_ps + len(PS_MARKER))
        if position_next_line < 0:
            break  # no more '\n', we finish the parsing here
        position_next_line += 1  # to eat the '\n' character
        out += doc[position:position_next_line]
        # parse the rest of the header and filter out problematic lines
        lines, position = _read_lines_with_prefix(doc, position_next_line, '%')
        for line in lines:
            if not (line.startswith('%%Title:') or line.startswith('%%For:')):
                out += line
        # search for lines with '{setuserinfo}' or '/JobInfo <<'
        position_ps = doc.find(PS_MARKER, position, position + MARGIN)
        position_ui = doc.find('{setuserinfo}', position, position + MARGIN)
        position_ji = doc.find('/JobInfo <<', position, position + MARGIN)
        # if '/JobInfo <<' was found, move the offset to the end of the section
        if position_ji >= 0:
            position_ji = doc.find('>>', position_ji)
        # if the beginning of the next header was found, make sure that the
        # detected sections do not belong to the next header
        if position_ps >= 0:
            if position_ji > position_ps:
                position_ji = -1
            if position_ui > position_ps:
                position_ui = -1
        # choose the farthest section
        position_end = max(position_ji, position_ui)
        if position_end >= 0:
            # find the first '\n' after the farthest section
            position_end = doc.find('\n', position_end)
            if position_end < 0:
                break  # no more '\n', we finish the parsing here
            # split everything from here to the end of the section into lines
            lines = doc[position:position_end].split('\n')
            position = position_end + 1  # +1 is needed to eat the last '\n'
            # filter out problematic lines
            for line in lines:
                if not (line.find('{setuserinfo}') >= 0 or
                        line.find('/UserID') >= 0 or
                        line.find('/Time') >= 0 or
                        line.find('/HostLoginName') >= 0 or
                        line.find('/HostName') >= 0):
                    out += line + '\n'
            # go to the next iteration, position_ps is already set
    return position, out


def _normalize_LIDIL(doc):
    """
    The function tries to process the given document as described in the
    _normalize_document(doc) function, but assuming that the document is in
    LIDIL format. This format is used by some HP printers.

    @param doc: see the description of _normalize_document(doc)

    @returns None if the given document is not in LIDIL format. Otherwise, it
        returns a result for the _normalize_document(doc) function.

    """
    LIDIL_MARKER = b'\x24\x01\x00\x00\x07\x00\x00\x00'
    LIDIL_JOBID_1_OFF = 2348  # first job id, offset from the beginning
    LIDIL_JOBID_2_OFF = 2339  # second job id, offset from the end
    JOBID_SIZE = 4  # number of bytes used to store a job id
    # the document is in LIDIL format <=> it starts with the marker
    if not doc.startswith(LIDIL_MARKER):
        return None
    # remove both JOB IDs and exit
    nd = len(doc)
    if nd > LIDIL_JOBID_1_OFF + LIDIL_JOBID_2_OFF + 2 * JOBID_SIZE:
        doc = ''.join([doc[:LIDIL_JOBID_1_OFF],
                doc[(LIDIL_JOBID_1_OFF + JOBID_SIZE):(nd - LIDIL_JOBID_2_OFF)],
                doc[(nd - LIDIL_JOBID_2_OFF + JOBID_SIZE):]])
    return doc


def _normalize_EJL(doc):
    """
    The function tries to process the given document as described in the
    _normalize_document(doc) function, but assuming that the document is in
    EJL format.

    @param doc: see the description of _normalize_document(doc)

    @returns None if the given document is not in EJL format. Otherwise, it
        returns a result for the _normalize_document(doc) function.

    """
    # EJL - some Epson printers (like eplaser)
    EJL_MARKER = b'\x1B\x01@EJL \n'
    # the document is in EJL format <=> it starts with the marker
    if not doc.startswith(EJL_MARKER):
        return None
    # copy the document to the output; filter lines parsed from the EJL header
    out = EJL_MARKER
    lines, position = _read_lines_with_prefix(doc, len(EJL_MARKER), '@EJL')
    for line in lines:
        if not (line.startswith('@EJL JI ID=') or
                line.startswith('@EJL JI USER=')):
            out += line
    # add the rest of the document and exit
    out += doc[position:]
    return out


def _normalize_document(doc):
    """
    The input document is a raw package sent to the printer. This function
    removes from it all variables that can change when the same content is
    printed. That includes, but is not limited to: user name, host name,
    job id, date, time.

    @param doc: a raw document sent directly to the printer to be printed

    @returns a copy of doc with removed fragments that can vary between
        printing jobs. The returned output is supposed to be identical for the
        same input content sent to the pipeline with the same PPD file.

    """
    # Try to parse the document as LIDIL or EJL and exit if successful.
    out = _normalize_LIDIL(doc)
    if out is not None:
        return out
    out = _normalize_EJL(doc)
    if out is not None:
        return out

    # Try to parse and process PJL and PS headers.
    position = 0
    out = ''
    position, out = _process_PJL_headers(doc, position, out)
    position, out = _process_PS_Adobe_headers(doc, position, out)

    # Go to the tail of the document, add the skipped content to the output.
    if position + 2048 < len(doc):
        position_tail = len(doc) - 2048
        out += doc[position:position_tail]
        position = position_tail

    # Try to find 'trailer << '.
    position_trailer = doc.find('trailer << ', position)
    if position_trailer >= 0:
        # If found, prune the line containing it.
        position_end = doc.find('\n', position_trailer)
        if position_end >= 0:
            out += doc[position:position_trailer]
            position = position_end + 1  # +1 to omit '\n' from the trailer

    # Add the rest of the document to the output.
    out += doc[position:]

    return out


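# A minimal sketch of what _normalize_document() does to a (hypothetical) PJL
# job; only lines carrying variable metadata are dropped, everything else is
# preserved byte for byte:
#
#   job = ('\x1B%-12345X@PJL JOB NAME="doc.pdf"\n'
#          '@PJL SET USERNAME="alice"\n'
#          '@PJL ENTER LANGUAGE=PCL\n'
#          '<PCL payload>')
#   _normalize_document(job)
#   # -> '\x1B%-12345X@PJL ENTER LANGUAGE=PCL\n<PCL payload>'
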
def calculate_digest(doc):
    """
    Calculates a digest of the given document.

    @param doc: document's content

    @returns the calculated digest as a string of hexadecimal digits

    """
    # Prune the variable parts of the document
    out = _normalize_document(doc)

    # Calculate the hash
    return md5.new(out).hexdigest()


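# A minimal usage sketch for calculate_digest() (hypothetical document):
#
#   digest = calculate_digest('\x1B%-12345XHello world')
#   # digest is a 32-character hexadecimal string; two jobs whose content
#   # differs only in the pruned metadata (user name, job id, date, ...) are
#   # expected to produce the same digest.
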
def parse_digests_file(path_digests, denylist):
    """
    Parses digests and output sizes from a file.

    @param path_digests: a path to a file with digests
    @param denylist: list of keys to omit

    @returns two dictionaries, both indexed by ppd filenames: the first one
            contains digests, the second one contains output sizes; returns
            empty dictionaries if the given file does not exist

    """
    digests = dict()
    sizes = dict()
    denylist = set(denylist)
    if os.path.isfile(path_digests):
        with open(path_digests, 'rb') as file_digests:
            lines = file_digests.read().splitlines()
            for line in lines:
                cols = line.split()
                if len(cols) >= 2 and cols[0] not in denylist:
                    digests[cols[0]] = cols[1]
                    if len(cols) > 2 and len(cols[2]) > 0:
                        sizes[cols[0]] = int(cols[2])
    return digests, sizes


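# The digests file parsed above is a plain-text file with one entry per line:
# a PPD filename, its digest and an optional output size, separated by
# whitespace (save_digests_file() below writes them tab-separated), e.g.
#
#   some-printer.ppd.gz  d41d8cd98f00b204e9800998ecf8427e  12345
#
# A minimal usage sketch (hypothetical path and denylist):
#
#   digests, sizes = parse_digests_file('digests.txt', ['broken-model.ppd.gz'])
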
def save_digests_file(path_digests, digests, sizes, denylist):
    """
    Saves digests and output sizes to a file.

    @param path_digests: a path to the output file
    @param digests: dictionary with digests (keys are names)
    @param sizes: dictionary with output sizes (keys are names)
    @param denylist: list of keys to ignore

    """
    digests_content = ''
    names = sorted(set(digests.keys()).difference(denylist))
    for name in names:
        digest = digests[name]
        assert name.find('\t') < 0 and name.find('\n') < 0
        assert digest.find('\t') < 0 and digest.find('\n') < 0
        digests_content += name + '\t' + digest
        if name in sizes:
            assert isinstance(sizes[name], int)
            digests_content += '\t' + str(sizes[name])
        digests_content += '\n'

    with open(path_digests, 'wb') as file_digests:
        file_digests.write(digests_content)


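# A minimal round-trip sketch for save_digests_file() and parse_digests_file()
# (hypothetical names and values):
#
#   save_digests_file('digests.txt', {'a.ppd': '0123abcd'}, {'a.ppd': 7}, [])
#   digests, sizes = parse_digests_file('digests.txt', [])
#   # digests == {'a.ppd': '0123abcd'} and sizes == {'a.ppd': 7}
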
def load_lines_from_file(path):
    """
    Loads strings stored in the given file as separate lines.

    This routine returns lines read from the given file. All leading and
    trailing whitespace characters in each line are removed. Lines consisting
    of whitespace characters only are skipped.

    @param path: a path to the input file

    @returns a list of non-empty strings

    """
    with open(path) as input_file:
        lines = input_file.readlines()

    output_list = []
    for entry in lines:
        entry = entry.strip()
        if entry != '':
            output_list.append(entry)

    return output_list


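# A minimal usage sketch for load_lines_from_file() (hypothetical file that
# lists one PPD filename per line; blank lines and surrounding whitespace are
# dropped):
#
#   ppd_names = load_lines_from_file('ppds_to_skip.txt')
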
# ===================== PPD files on the SCS server


def get_filenames_from_PPD_index(task_id):
    """
    It downloads an index file from the SCS server and extracts names
    of PPD files from it.

    @param task_id: an order number of an index file to process; this is
            an integer from the interval [0..20)

    @returns a list of PPD filenames (may contain duplicates)

    """
    # calculate the URL of the index file
    url_metadata = 'https://www.gstatic.com/chromeos_printing/metadata_v2/'
    url_ppd_index = url_metadata + ('index-%02d.json' % task_id)
    # download and parse the index file
    request = requests.get(url_ppd_index)
    entries = json.loads(request.content)
    # extract PPD filenames (the second element in each index entry)
    output = []
    for entry in entries:
        output.append(entry[1])
    # return a list of extracted filenames
    return output


def download_PPD_file(ppd_file):
    """
    It downloads a PPD file from the SCS server.

    @param ppd_file: a filename of a PPD file (neither a path nor a URL)

    @returns content of the PPD file

    """
    url_ppds = 'https://www.gstatic.com/chromeos_printing/ppds/'
    request = requests.get(url_ppds + ppd_file)
    return request.content


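# A minimal sketch that chains the two helpers above; task_id 0 is just an
# example value from the [0..20) interval described in
# get_filenames_from_PPD_index():
#
#   for ppd_file in set(get_filenames_from_PPD_index(0)):
#       ppd_content = download_PPD_file(ppd_file)
#       # ... feed ppd_content to the printing pipeline under test ...
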
# ==================== Local filesystem


def list_entries_from_directory(
        path,
        with_suffixes=None, nonempty_results=False,
        include_files=True, include_directories=True):
    """
    It returns all filenames from the given directory. Results may be filtered
    by filename suffixes or entry types.

    @param path: a path to the directory to list entries from
    @param with_suffixes: if set, only entries with the given suffixes are
            returned; it must be a tuple
    @param nonempty_results: if True, an Exception is raised if there are no
            results
    @param include_files: if False, regular files and links are omitted
    @param include_directories: if False, directories are omitted

    @returns a nonempty list of entries meeting the given criteria

    @raises Exception if no matching filenames were found and
            nonempty_results is set to True

    """
    # list all entries from the directory and filter them by the given criteria
    list_of_files = []
    for filename in os.listdir(path):
        path_entry = os.path.join(path, filename)
        # check type
        if os.path.isfile(path_entry):
            if not include_files:
                continue
        elif os.path.isdir(path_entry):
            if not include_directories:
                continue
        else:
            continue
        # check suffix
        if with_suffixes is not None:
            if not filename.endswith(with_suffixes):
                continue
        list_of_files.append(filename)
    # raise an exception if no entries were found
    if nonempty_results and len(list_of_files) == 0:
        message = 'Directory %s does not contain any ' % path
        message += 'entries meeting the criteria'
        raise Exception(message)
    # return a non-empty list
    return list_of_files
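

# A minimal usage sketch for list_entries_from_directory() (hypothetical
# directory and suffixes):
#
#   ppd_files = list_entries_from_directory(
#           '/tmp/ppds', with_suffixes=('.ppd', '.ppd.gz'),
#           nonempty_results=True, include_directories=False)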