#!/usr/bin/env python
#
# Copyright (C) 2018 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Merge multiple CSV files, possibly with different columns.

The inputs are either plain CSV files or, with --zip_input, ZIP archives
whose entries ending in '.uau' are read as CSV. The output header is either
given explicitly with --header or is the union of the input headers in
first-seen order. Rows are concatenated in input order unless --key_field is
given, in which case the (already sorted) inputs are lazily merge-sorted on
that field.
"""

import argparse
import contextlib
import csv
import heapq
import io
import itertools
import operator
import sys

from zipfile import ZipFile

args_parser = argparse.ArgumentParser(
    description='Merge given CSV files into a single one.'
)
args_parser.add_argument(
    '--header',
    help='Comma separated field names; '
    'if missing determines the header from input files.',
)
args_parser.add_argument(
    '--zip_input',
    help='Treat files as ZIP archives containing CSV files to merge.',
    action="store_true",
)
args_parser.add_argument(
    '--key_field',
    help='The name of the field by which the rows should be sorted. '
    'Must be in the field names. '
    'Will be the first field in the output. '
    'All input files must be sorted by that field.',
)
args_parser.add_argument(
    '--output',
    help='Output file for merged CSV.',
    default='-',
    type=argparse.FileType('w'),
)
args_parser.add_argument('files', nargs=argparse.REMAINDER)


def dict_reader(csvfile):
    """Return a csv.DictReader over csvfile using ',' and '|' quoting."""
    return csv.DictReader(csvfile, delimiter=',', quotechar='|')


def _open_readers(paths, zip_input, stack):
    """Open every input and return one DictReader per CSV stream.

    Args:
        paths: Input file names, in the order they should be read.
        zip_input: If true, each path is a ZIP archive and every entry whose
            name ends with '.uau' is read as CSV; otherwise each path is
            itself a CSV file.
        stack: A contextlib.ExitStack that takes ownership of every file
            opened here, so they stay open while rows are read lazily and
            are closed when the stack unwinds.

    Returns:
        A list of csv.DictReader objects.
    """
    readers = []
    for path in paths:
        if not zip_input:
            # newline='' lets the csv module do its own line-ending handling,
            # which keeps newlines embedded in quoted fields intact.
            source = stack.enter_context(open(path, 'r', newline=''))
            readers.append(dict_reader(source))
            continue
        archive = stack.enter_context(ZipFile(path))
        for entry in archive.namelist():
            if entry.endswith('.uau'):
                text = io.TextIOWrapper(archive.open(entry, 'r'), newline='')
                readers.append(dict_reader(stack.enter_context(text)))
    return readers


def _union_fieldnames(readers):
    """Return the union of all readers' field names in first-seen order."""
    seen = {}
    for reader in readers:
        # DictReader.fieldnames is None when the input has no header line
        # (i.e. the input is empty); treat that as contributing no columns.
        seen.update(dict.fromkeys(reader.fieldnames or ()))
    return list(seen)


def main(argv=None):
    """Parse the command line and write the merged CSV to --output.

    Args:
        argv: Argument list to parse; None means use sys.argv[1:].

    Raises:
        SystemExit: On invalid arguments, including a --key_field that is not
            one of the output field names.
        ValueError: Raised by csv.DictWriter if a row contains a field that is
            not in the output header (possible when --header is given).
    """
    args = args_parser.parse_args(argv)

    with contextlib.ExitStack() as stack:
        # Close (and thereby flush) a real output file when done, but leave
        # the process-wide stdout alone.
        if args.output is not sys.stdout:
            stack.enter_context(args.output)

        csv_readers = _open_readers(args.files, args.zip_input, stack)

        if args.header:
            fieldnames = args.header.split(',')
        else:
            fieldnames = _union_fieldnames(csv_readers)

        key_field = args.key_field
        if csv_readers and key_field:
            if key_field not in fieldnames:
                # Explicit check instead of assert: asserts are stripped
                # when Python runs with -O.
                args_parser.error(
                    "--key_field {} not found, must be one of {}".format(
                        key_field, ",".join(fieldnames)
                    )
                )
            # Make the key field the first field in the output.
            fieldnames.insert(0, fieldnames.pop(fieldnames.index(key_field)))
            # Lazy merge sort of the readers on the key field; each input
            # must already be sorted by it. Values are compared as strings.
            all_rows = heapq.merge(
                *csv_readers, key=operator.itemgetter(key_field)
            )
        else:
            # Without a key field the output is simply the concatenation of
            # the rows of each input, in input order.
            all_rows = itertools.chain.from_iterable(csv_readers)

        writer = csv.DictWriter(
            args.output,
            delimiter=',',
            quotechar='|',
            quoting=csv.QUOTE_MINIMAL,
            dialect='unix',
            fieldnames=fieldnames,
        )
        writer.writeheader()
        writer.writerows(all_rows)


if __name__ == '__main__':
    main()