# Copyright 2018 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import json
import md5
import os
import requests


# ==================== Documents digests

def _read_lines_with_prefix(document, position, prefix):
    """
    Starting from the given position, parses from the document complete lines
    (with a '\n' character at the end) that start with the given prefix. The
    parser stops on the first line that does not start with the prefix or when
    there are no more '\n' characters in the document.

    @param document: a document to parse
    @param position: an offset in the document to start from
    @param prefix: a prefix that every parsed line must start with

    @returns a pair (lines, position), where the first element is a list of
            parsed lines (with the '\n' character at the end) and the second
            element is a new offset in the document, pointing at the first
            character after the last parsed line

    """
    lines = []
    while document.startswith(prefix, position):
        position_next_line = document.find('\n', position + len(prefix))
        if position_next_line < 0:
            break
        position_next_line += 1  # to eat the '\n' character
        lines.append(document[position:position_next_line])
        position = position_next_line
    return lines, position


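# Illustrative example (hypothetical content, not from a real print job): for
#     doc = '@PJL SET COPIES=1\n@PJL COMMENT foo\n%data...'
# the call _read_lines_with_prefix(doc, 0, '@PJL') returns the two complete
# '@PJL ...\n' lines and the offset of the first character after them
# (i.e. the offset of '%data...').

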
def _process_PJL_headers(doc, position, out):
    """
    Tries to find PJL headers in the given document and processes them as
    described in the _normalize_document(doc) function.

    @param doc: see the description of _normalize_document(doc)
    @param position: offset in the document; defines the part of the document
            that is already processed; searching for headers starts from this
            position
    @param out: already processed part of the document (from the beginning to
            the given position)

    @returns new position and output; the position is set at the end of the
            last processed PJL header or it is a copy of the input position if
            no PJL headers have been found; the output is adjusted accordingly.

    """
    PJL_MARKER = b'\x1B%-12345X'
    MARGIN = 2048  # max distance to the header
    position_pjl = doc.find(PJL_MARKER, position, position + MARGIN)
    while position_pjl >= 0:
        out += doc[position:(position_pjl + len(PJL_MARKER))]
        position = position_pjl + len(PJL_MARKER)
        # parse the header and filter out problematic lines
        lines, position = _read_lines_with_prefix(doc, position, '@PJL')
        for line in lines:
            if not (line.startswith('@PJL SET ') or
                    line.startswith('@PJL COMMENT') or
                    line.startswith('@PJL DMINFO') or
                    line.startswith('@PJL JOB NAME') or
                    line.startswith('@PJL JOBNAME')):
                out += line
        # try to find the next PJL header
        position_pjl = doc.find(PJL_MARKER, position, position + MARGIN)
    return position, out


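# Illustrative example (hypothetical job content, not from the original file):
# a PJL header such as
#     \x1B%-12345X@PJL JOB NAME="doc 42"\n
#     @PJL SET USERNAME="alice"\n
#     @PJL ENTER LANGUAGE=POSTSCRIPT\n
# keeps the marker and the '@PJL ENTER LANGUAGE=...' line in the output, while
# the '@PJL JOB NAME' and '@PJL SET ...' lines (which vary between print jobs)
# are dropped by the filter above.

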
def _process_PS_Adobe_headers(doc, position, out):
    """
    Tries to find PS-Adobe headers in the given document and processes them as
    described in the _normalize_document(doc) function.

    @param doc: see the description of _normalize_document(doc)
    @param position: offset in the document; defines the part of the document
            that is already processed; searching for headers starts from this
            position
    @param out: already processed part of the document (from the beginning to
            the given position)

    @returns new position and output; the position is set at the end of the
            last processed PS-Adobe header or it is a copy of the input
            position if no PS-Adobe headers have been found; the output is
            adjusted accordingly.

    """
    PS_MARKER = '%!PS-Adobe'
    MARGIN = 2048  # max distance to the header
    position_ps = doc.find(PS_MARKER, position, position + MARGIN)
    while position_ps >= 0:
        # add everything till the end of the first line in the header
        position_next_line = doc.find('\n', position_ps + len(PS_MARKER))
        if position_next_line < 0:
            break  # no more '\n', we finish the parsing here
        position_next_line += 1  # to eat the '\n' character
        out += doc[position:position_next_line]
        # parse the rest of the header and filter out problematic lines
        lines, position = _read_lines_with_prefix(doc, position_next_line, '%')
        for line in lines:
            if not (line.startswith('%%Title:') or line.startswith('%%For:')):
                out += line
        # search for lines with '{setuserinfo}' or '/JobInfo <<'
        position_ps = doc.find(PS_MARKER, position, position + MARGIN)
        position_ui = doc.find('{setuserinfo}', position, position + MARGIN)
        position_ji = doc.find('/JobInfo <<', position, position + MARGIN)
        # if '/JobInfo <<' was found, move the offset to the end of the section
        if position_ji >= 0:
            position_ji = doc.find('>>', position_ji)
        # if the beginning of the next header was found, make sure that the
        # detected sections do not belong to the next header
        if position_ps >= 0:
            if position_ji > position_ps:
                position_ji = -1
            if position_ui > position_ps:
                position_ui = -1
        # choose the farthest section
        position_end = max(position_ji, position_ui)
        if position_end >= 0:
            # find the first '\n' after the farthest section
            position_end = doc.find('\n', position_end)
            if position_end < 0:
                break  # no more '\n', we finish the parsing here
            # split into lines everything from here to the end of the section
            lines = doc[position:position_end].split('\n')
            position = position_end + 1  # +1 is needed to eat the last '\n'
            # filter out problematic lines
            for line in lines:
                if not (line.find('{setuserinfo}') >= 0 or
                        line.find('/UserID') >= 0 or
                        line.find('/Time') >= 0 or
                        line.find('/HostLoginName') >= 0 or
                        line.find('/HostName') >= 0):
                    out += line + '\n'
        # go to the next iteration, position_ps is already set
    return position, out


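# Illustrative example (hypothetical DSC header, not from the original file):
# in a header such as
#     %!PS-Adobe-3.0\n
#     %%Title: report.pdf\n
#     %%For: alice\n
#     %%Creator: some-driver\n
# the '%%Title:' and '%%For:' lines (job- and user-specific) are dropped,
# while the '%!PS-Adobe-3.0' and '%%Creator:' lines are kept. Lines further
# down that contain '{setuserinfo}', '/UserID', '/Time', '/HostLoginName' or
# '/HostName' are dropped in the same way.

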
def _normalize_LIDIL(doc):
    """
    Tries to process the given document as described in the
    _normalize_document(doc) function, assuming that the document is in the
    LIDIL format. This format is used by some HP printers.

    @param doc: see the description of _normalize_document(doc)

    @returns None if the given document is not in the LIDIL format. Otherwise,
            it returns a result for the _normalize_document(doc) function.

    """
    LIDIL_MARKER = b'\x24\x01\x00\x00\x07\x00\x00\x00'
    LIDIL_JOBID_1_OFF = 2348  # first job id, offset from the beginning
    LIDIL_JOBID_2_OFF = 2339  # second job id, offset from the end
    JOBID_SIZE = 4  # number of bytes used to store a job id
    # the document is in the LIDIL format <=> it starts with the marker
    if not doc.startswith(LIDIL_MARKER):
        return None
    # remove both job ids and exit
    nd = len(doc)
    if nd > LIDIL_JOBID_1_OFF + LIDIL_JOBID_2_OFF + 2*JOBID_SIZE:
        doc = ''.join([doc[:LIDIL_JOBID_1_OFF],
                doc[(LIDIL_JOBID_1_OFF + JOBID_SIZE):(nd - LIDIL_JOBID_2_OFF)],
                doc[(nd - LIDIL_JOBID_2_OFF + JOBID_SIZE):]])
    return doc


def _normalize_EJL(doc):
    """
    Tries to process the given document as described in the
    _normalize_document(doc) function, assuming that the document is in the
    EJL format.

    @param doc: see the description of _normalize_document(doc)

    @returns None if the given document is not in the EJL format. Otherwise,
            it returns a result for the _normalize_document(doc) function.

    """
    # EJL - used by some Epson printers (like eplaser)
    EJL_MARKER = b'\x1B\x01@EJL \n'
    # the document is in the EJL format <=> it starts with the marker
    if not doc.startswith(EJL_MARKER):
        return None
    # copy the document to the output; filter lines parsed from the EJL header
    out = EJL_MARKER
    lines, position = _read_lines_with_prefix(doc, len(EJL_MARKER), '@EJL')
    for line in lines:
        if not (line.startswith('@EJL JI ID=') or
                line.startswith('@EJL JI USER=')):
            out += line
    # add the rest of the document and exit
    out += doc[position:]
    return out


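# Illustrative example (hypothetical EJL header, not from the original file):
# for a document starting with
#     \x1B\x01@EJL \n
#     @EJL JI ID="0123"\n
#     @EJL JI USER="alice"\n
#     @EJL SET RS="ON"\n
# the two '@EJL JI ...' lines (job id and user name) are removed, while the
# marker and the '@EJL SET ...' line are copied to the output unchanged.

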
def _normalize_document(doc):
    """
    The input document is a raw package sent to the printer. This function
    removes from it all variables that can change when the same content is
    printed again. That includes, but is not limited to: user name, host name,
    job id, date and time.

    @param doc: a raw document sent directly to the printer to be printed

    @returns a copy of doc with removed fragments that can vary between print
            jobs. The returned output is supposed to be identical for the same
            input content sent to the pipeline with the same PPD file.

    """
    # Try to parse the document as LIDIL or EJL and exit if successful.
    out = _normalize_LIDIL(doc)
    if out is not None:
        return out
    out = _normalize_EJL(doc)
    if out is not None:
        return out

    # Try to parse and process PJL and PS headers.
    position = 0
    out = ''
    position, out = _process_PJL_headers(doc, position, out)
    position, out = _process_PS_Adobe_headers(doc, position, out)

    # Go to the tail of the document, add the skipped content to the output.
    if position + 2048 < len(doc):
        position_tail = len(doc) - 2048
        out += doc[position:position_tail]
        position = position_tail

    # Try to find 'trailer << '.
    position_trailer = doc.find('trailer << ', position)
    if position_trailer >= 0:
        # If found, prune the line with it.
        position_end = doc.find('\n', position_trailer)
        if position_end >= 0:
            out += doc[position:position_trailer]
            position = position_end + 1  # +1 to omit the '\n' from the trailer

    # Add the rest of the document to the output.
    out += doc[position:]

    return out


def calculate_digest(doc):
    """
    Calculates a digest of the given document.

    @param doc: document's content

    @returns the calculated digest as a string of hexadecimal digits

    """
    # Prune the variable parts of the document
    out = _normalize_document(doc)
    # Calculates the hash
    return md5.new(out).hexdigest()


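# Example usage (illustrative; 'job.ps' is a hypothetical file name):
#     with open('job.ps', 'rb') as f:
#         digest = calculate_digest(f.read())
#     # digest is a 32-character hexadecimal MD5 string that is intended to be
#     # stable across reprints of the same content, since user name, job id
#     # and similar variable fields are pruned first.

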
def parse_digests_file(path_digests, denylist):
    """
    Parses digests and output sizes from a file.

    @param path_digests: a path to a file with digests
    @param denylist: list of keys to omit

    @returns two dictionaries, both indexed by ppd filenames: the first one
            contains digests, the second one contains output sizes; returns
            empty dictionaries if the given file does not exist

    """
    digests = dict()
    sizes = dict()
    denylist = set(denylist)
    if os.path.isfile(path_digests):
        with open(path_digests, 'rb') as file_digests:
            lines = file_digests.read().splitlines()
            for line in lines:
                cols = line.split()
                if len(cols) >= 2 and cols[0] not in denylist:
                    digests[cols[0]] = cols[1]
                    if len(cols) > 2 and len(cols[2]) > 0:
                        sizes[cols[0]] = int(cols[2])
    return digests, sizes


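# Illustrative digests-file content (hypothetical entries) in the format
# written by save_digests_file() below: one PPD filename per line, followed by
# its digest and, optionally, the output size, separated by whitespace, e.g.
#     some-printer.ppd    d41d8cd98f00b204e9800998ecf8427e    12345
#     other-printer.ppd   900150983cd24fb0d6963f7d28e17f72

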
def save_digests_file(path_digests, digests, sizes, denylist):
    """
    Saves the given digests and output sizes to a file. The content is written
    to the file at path_digests.

    @param path_digests: a path to the output file
    @param digests: dictionary with digests (keys are names)
    @param sizes: dictionary with output sizes (keys are names)
    @param denylist: list of keys to ignore

    """
    digests_content = ''
    names = sorted(set(digests.keys()).difference(denylist))
    for name in names:
        digest = digests[name]
        assert name.find('\t') < 0 and name.find('\n') < 0
        assert digest.find('\t') < 0 and digest.find('\n') < 0
        digests_content += name + '\t' + digest
        if name in sizes:
            assert isinstance(sizes[name], int)
            digests_content += '\t' + str(sizes[name])
        digests_content += '\n'

    with open(path_digests, 'wb') as file_digests:
        file_digests.write(digests_content)


def load_lines_from_file(path):
    """
    Loads strings stored in the given file as separate lines.

    This routine returns lines read from the given file. All leading and
    trailing whitespace characters in each line are removed. Lines consisting
    of whitespace characters only are skipped.

    @param path: a path to the input file

    @returns a list of non-empty strings

    """
    with open(path) as input_file:
        lines = input_file.readlines()

    output_list = []
    for entry in lines:
        entry = entry.strip()
        if entry != '':
            output_list.append(entry)

    return output_list


# ===================== PPD files on the SCS server

def get_filenames_from_PPD_index(task_id):
    """
    Downloads an index file from the SCS server and extracts the names of PPD
    files from it.

    @param task_id: an order number of the index file to process; this is
            an integer from the interval [0..20)

    @returns a list of PPD filenames (may contain duplicates)

    """
    # calculates a URL of the index file
    url_metadata = 'https://www.gstatic.com/chromeos_printing/metadata_v2/'
    url_ppd_index = url_metadata + ('index-%02d.json' % task_id)
    # downloads and parses the index file
    request = requests.get(url_ppd_index)
    entries = json.loads(request.content)
    # extracts PPD filenames (the second element in each index entry)
    output = []
    for entry in entries:
        output.append(entry[1])
    # returns a list of extracted filenames
    return output


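# Illustrative note (the exact index schema is not specified in this file):
# the code above only relies on each index entry being a JSON array whose
# second element is a PPD filename, so a hypothetical entry like
#     ["some-key", "some-printer.ppd", ...]
# would contribute "some-printer.ppd" to the returned list.

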
def download_PPD_file(ppd_file):
    """
    Downloads a PPD file from the SCS server.

    @param ppd_file: a filename of the PPD file (neither a path nor a URL)

    @returns content of the PPD file
    """
    url_ppds = 'https://www.gstatic.com/chromeos_printing/ppds/'
    request = requests.get(url_ppds + ppd_file)
    return request.content


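# Example usage (illustrative; the filename is hypothetical and would normally
# come from get_filenames_from_PPD_index()):
#     ppd_content = download_PPD_file('some-printer.ppd')

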
# ==================== Local filesystem

def list_entries_from_directory(
        path,
        with_suffixes=None, nonempty_results=False,
        include_files=True, include_directories=True):
    """
    Returns all filenames from the given directory. Results may be filtered
    by filename suffixes or entry types.

    @param path: a path to the directory to list entries from
    @param with_suffixes: if set, only entries with the given suffixes are
            returned; it must be a tuple
    @param nonempty_results: if True, an Exception is raised when there are
            no results
    @param include_files: if False, regular files and links are omitted
    @param include_directories: if False, directories are omitted

    @returns a list of entries meeting the given criteria

    @raises Exception if no matching entries were found and nonempty_results
            is set to True

    """
    # lists all entries from the directory and filters them by the given
    # criteria
    list_of_files = []
    for filename in os.listdir(path):
        path_entry = os.path.join(path, filename)
        # check the type
        if os.path.isfile(path_entry):
            if not include_files:
                continue
        elif os.path.isdir(path_entry):
            if not include_directories:
                continue
        else:
            continue
        # check the suffix
        if with_suffixes is not None:
            if not filename.endswith(with_suffixes):
                continue
        list_of_files.append(filename)
    # throws an exception if no matching entries were found
    if nonempty_results and len(list_of_files) == 0:
        message = 'Directory %s does not contain any ' % path
        message += 'entries meeting the criteria'
        raise Exception(message)
    # returns the list of matching entries
    return list_of_files
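

# Example usage (illustrative; the directory path and suffix are hypothetical):
#     ppd_files = list_entries_from_directory(
#             '/tmp/ppds', with_suffixes=('.ppd',),
#             nonempty_results=True, include_directories=False)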