# Copyright 2021 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from enum import IntEnum
import json
from multiprocessing import Pool
import pandas as pd
import pathlib
import numpy as np

BRANCH_ARTIFACTS_DIR = (
    pathlib.Path(__file__).parent.resolve()
    / "googleapiclient"
    / "discovery_cache"
    / "documents"
)
MAIN_ARTIFACTS_DIR = (
    pathlib.Path(__file__).parent.resolve()
    / ".."
    / "main"
    / "googleapiclient"
    / "discovery_cache"
    / "documents"
)

MULTIPROCESSING_NUM_PER_BATCH = 5
MULTIPROCESSING_NUM_AGENTS = 10


class ChangeType(IntEnum):
    UNKNOWN = 0
    DELETED = 1
    ADDED = 2
    CHANGED = 3


class DirectoryDoesNotExist(ValueError):
    """Raised when the specified directory does not exist."""

    pass


class ChangeSummary:
    """Represents the change summary between 2 directories containing \
    artifacts.
    """

    def __init__(self, new_artifacts_dir, current_artifacts_dir, temp_dir, file_list):
        """Initializes an instance of a ChangeSummary.

        Args:
            new_artifacts_dir (str): The relative path to the directory with the
                new discovery artifacts.
            current_artifacts_dir (str): The relative path to the directory with
                the current discovery artifacts.
            temp_dir (str): The relative path to the directory used for
                temporary storage where intermediate files will be stored.
            file_list (list): A list of strings containing files to analyze.
        """

        self._file_list = file_list
        self._new_artifacts_dir = pathlib.Path(new_artifacts_dir)
        self._current_artifacts_dir = pathlib.Path(current_artifacts_dir)
        self._temp_dir = pathlib.Path(temp_dir)

        # Sanity checks to ensure directories exist
        self._raise_if_directory_not_found(self._new_artifacts_dir)
        self._raise_if_directory_not_found(self._current_artifacts_dir)
        self._raise_if_directory_not_found(self._temp_dir)

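    # Illustrative usage (paths and file names are hypothetical):
    #
    #   summary = ChangeSummary(
    #       "branch/googleapiclient/discovery_cache/documents",
    #       "main/googleapiclient/discovery_cache/documents",
    #       "temp",
    #       ["drive.v3.json"],
    #   )
    #   summary.detect_discovery_changes()
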
    def _raise_if_directory_not_found(self, directory):
        """Raises if the `directory` doesn't exist.

        Args:
            directory (str): The relative path to the `directory`.
        """

        if not pathlib.Path(directory).exists():
            raise DirectoryDoesNotExist(
                "Directory does not exist: {0}".format(directory)
            )

    def _load_json_to_dataframe(self, file_path):
        """Returns a pandas dataframe from the json file provided.

        Args:
            file_path (str): The relative path to the discovery artifact to
                parse.
        """

        # Create an empty dataframe as we will need to return it if the file
        # doesn't exist
        dataframe_doc = pd.DataFrame()

        if pathlib.Path(file_path).is_file():
            with open(file_path, "r") as f:
                # Now load the json file into a pandas dataframe as a flat table
                dataframe_doc = pd.json_normalize(json.load(f))
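                # Illustrative example: `pd.json_normalize` flattens nested
                # objects into dot-separated column names, so a document like
                # {"a": {"b": 1}} (hypothetical) yields a single-row dataframe
                # with one column named "a.b".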
        return dataframe_doc

    def _get_discovery_differences(self, filename):
        """Returns a pandas dataframe which contains the differences between
        the current and new discovery artifact directories, corresponding to
        the file name provided.

        Args:
            filename (str): The name of the discovery artifact to parse.
        """
        # The paths of the 2 discovery artifacts to compare
        current_artifact_path = self._current_artifacts_dir / filename
        new_artifact_path = self._new_artifacts_dir / filename

        # Use a helper function to load the discovery artifacts into pandas
        # dataframes
        current_doc = self._load_json_to_dataframe(current_artifact_path)
        new_doc = self._load_json_to_dataframe(new_artifact_path)

        # Concatenate the 2 dataframes, transpose them, and create
        # a new dataframe called combined_docs with columns
        # `Key`, `CurrentValue`, `NewValue`.
        combined_docs = (
            pd.concat([current_doc, new_doc], keys=["CurrentValue", "NewValue"])
            # Drop the original integer index level
            .reset_index(drop=True, level=1)
            # Transpose the DataFrame. The resulting columns should be
            # ["Key", "CurrentValue", "NewValue"]
            .rename_axis(["Key"], axis=1).transpose()
            # Move the `Key` index into a regular column
            .reset_index()
        )

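        # Illustrative shape of `combined_docs` at this point (the key and
        # values below are hypothetical):
        #
        #     Key               CurrentValue  NewValue
        #     schemas.Foo.type  string        integer
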
        # When discovery documents are added, the column `CurrentValue` will
        # not exist. In that case, we'll just populate with `np.nan`.
        if "CurrentValue" not in combined_docs.columns:
            combined_docs["CurrentValue"] = np.nan

        # When discovery documents are deleted, the column `NewValue` will
        # not exist. In that case, we'll just populate with `np.nan`.
        if "NewValue" not in combined_docs.columns:
            combined_docs["NewValue"] = np.nan

        # Split the Key into 2 columns, `Parent` and `Child`, in order
        # to group keys with the same parents together and summarize the
        # changes by parent.
        parent_child_df = combined_docs["Key"].str.rsplit(".", n=1, expand=True)
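        # For example, a (hypothetical) key "schemas.Foo.properties.bar"
        # splits into Parent "schemas.Foo.properties" and Child "bar".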
        # Rename the columns and join them with the combined_docs dataframe.
        # If we only have a `Parent` column, it means that the Key doesn't have
        # any children.
        if len(parent_child_df.columns) == 1:
            parent_child_df.columns = ["Parent"]
        else:
            parent_child_df.columns = ["Parent", "Child"]
        combined_docs = combined_docs.join(parent_child_df)

        # Create a new column `Added` to identify rows which have new keys.
        combined_docs["Added"] = np.where(
            combined_docs["CurrentValue"].isnull(), True, False
        )

        # Create a new column `Deleted` to identify rows which have deleted keys.
        combined_docs["Deleted"] = np.where(
            combined_docs["NewValue"].isnull(), True, False
        )

        # Aggregate the keys added by grouping keys with the same parents
        # together to summarize the changes by parent rather than by key.
        parent_added_agg = (
            combined_docs.groupby("Parent")
            .Added.value_counts(normalize=True)
            .reset_index(name="Proportion")
        )

        # Add a column NumLevels to indicate the number of levels in the tree
        # which will allow us to sort the parents in hierarchical order.
        parent_added_agg["NumLevels"] = (
            parent_added_agg["Parent"].str.split(".").apply(lambda x: len(x))
        )

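        # Illustrative example (hypothetical parent): if 3 of the 4 keys under
        # "schemas.Foo" were added, `parent_added_agg` contains the rows
        # ("schemas.Foo", Added=True, Proportion=0.75) and
        # ("schemas.Foo", Added=False, Proportion=0.25).
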
        # Aggregate the keys deleted by grouping keys with the same parents
        # together to summarize the changes by parent rather than by key.
        parent_deleted_agg = (
            combined_docs.groupby("Parent")
            .Deleted.value_counts(normalize=True)
            .reset_index(name="Proportion")
        )

        # Add a column NumLevels to indicate the number of levels in the tree
        # which will allow us to sort the parents in hierarchical order.
        parent_deleted_agg["NumLevels"] = (
            parent_deleted_agg["Parent"].str.split(".").apply(lambda x: len(x))
        )

        # Create a list of all parents that have been added in hierarchical
        # order. When `Proportion` is 1, it means that the parent is new as all
        # children keys have been added.
        all_added = (
            parent_added_agg[
                (parent_added_agg["Proportion"] == 1)
                & (parent_added_agg["Added"] == True)
            ][["Parent", "NumLevels"]]
            .sort_values("NumLevels", ascending=True)
            .Parent.to_list()
        )

        # Create a list of all parents that have been deleted in hierarchical
        # order. When `Proportion` is 1, it means that the parent is deleted as
        # all children keys have been deleted.
        all_deleted = (
            parent_deleted_agg[
                (parent_deleted_agg["Proportion"] == 1)
                & (parent_deleted_agg["Deleted"] == True)
            ][["Parent", "NumLevels"]]
            .sort_values("NumLevels", ascending=True)
            .Parent.to_list()
        )

        # Go through the list of parents that have been added. If we find any
        # keys whose parent starts with a parent in this list, then the entire
        # parent is new. We don't need verbose information about the children,
        # so we replace the parent.
        for word in all_added:
            combined_docs.Parent = np.where(
                combined_docs["Parent"].str.startswith(word), word, combined_docs.Parent
            )

        # Go through the list of parents that have been deleted. If we find any
        # keys whose parent starts with a parent in this list, then the entire
        # parent is deleted. We don't need verbose information about the
        # children, so we replace the parent.
        for word in all_deleted:
            combined_docs.Parent = np.where(
                combined_docs["Parent"].str.startswith(word), word, combined_docs.Parent
            )

        # Create a new dataframe with only the keys which have changed
        docs_diff = combined_docs[
            combined_docs["CurrentValue"] != combined_docs["NewValue"]
        ].copy(deep=False)

        # Get the API and Version from the file name but exclude the extension.
        api_version_string = filename.split(".")[:-1]
        # Create columns `Name` and `Version` using the version string
        docs_diff["Name"] = api_version_string[0]
        docs_diff["Version"] = ".".join(api_version_string[1:])

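        # For example, a (hypothetical) artifact named "drive.v3.json" yields
        # `Name` "drive" and `Version` "v3".
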
        # These conditions are used as arguments in the `np.where` function
        # below.
        deleted_condition = docs_diff["NewValue"].isnull()
        added_condition = docs_diff["CurrentValue"].isnull()

        # Create a new `ChangeType` column. The `np.where()` function is like a
        # ternary operator. When `deleted_condition` is `True`, the
        # `ChangeType` will be `ChangeType.DELETED`. If `added_condition` is
        # `True`, the `ChangeType` will be `ChangeType.ADDED`; otherwise the
        # `ChangeType` will be `ChangeType.CHANGED`.
        docs_diff["ChangeType"] = np.where(
            deleted_condition,
            ChangeType.DELETED,
            np.where(added_condition, ChangeType.ADDED, ChangeType.CHANGED),
        )

        # Filter out keys which rarely affect functionality. For example:
        # {"description", "documentation", "enum", "etag", "revision", "title",
        # "url", "rootUrl"}
        docs_diff = docs_diff[
            ~docs_diff["Key"].str.contains(
                "|".join(self._get_keys_to_ignore()), case=False
            )
        ]

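        # Note: `str.contains` interprets the joined string as a regular
        # expression, so the "|" acts as alternation and any row whose Key
        # contains one of the ignored keys (case-insensitively) is dropped.
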
        # Group keys with the same parent together and create a new column
        # called 'Count' which indicates the number of keys that have been
        # grouped together. The reason for the count column is that when keys
        # have the same parent, we group them together to improve readability.
        docs_diff_with_count = (
            docs_diff.groupby(
                ["Parent", "Added", "Deleted", "Name", "Version", "ChangeType"]
            )
            .size()
            .reset_index(name="Count")
        )

        # Add counts column
        docs_diff = docs_diff.merge(docs_diff_with_count)

        # When the count is greater than 1, update the key with the name of the
        # parent since we are consolidating keys with the same parent.
        docs_diff.loc[docs_diff["Count"] > 1, "Key"] = docs_diff["Parent"]

        return docs_diff[
            ["Key", "Added", "Deleted", "Name", "Version", "ChangeType", "Count"]
        ].drop_duplicates()

    def _build_summary_message(self, api_name, is_feature):
        """Returns a string containing the summary for a given api. The string
        returned will be in the format `fix(<api_name>): update the api`
        when `is_feature=False` and `feat(<api_name>): update the api`
        when `is_feature=True`.

        Args:
            api_name (str): The name of the api to include in the summary.
            is_feature (bool): If True, use the prefix `feat`, otherwise use
                `fix`.
        """

        # Build the conventional commit string based on the arguments provided
        commit_type = "feat" if is_feature else "fix"
        return "{0}({1}): update the api".format(commit_type, api_name)

    def _get_keys_to_ignore(self):
        """Returns a list of strings with keys to ignore because they rarely
        affect functionality.

        Args: None
        """
        keys_to_ignore = [
            "description",
            "documentation",
            "enum",
            "etag",
            "revision",
            "title",
            "url",
            "rootUrl",
        ]
        return keys_to_ignore

    def _get_stable_versions(self, versions):
        """Returns a pandas series `pd.Series()` of boolean values,
        corresponding to the given series, indicating whether the version is
        considered stable or not.

        Args:
            versions (object): a pandas series containing version
                information for all discovery artifacts.
        """
        # Use a regex on the version to find versions which contain only
        # numeric parts, such as v1, v1.4 and v1.4.5. Any api that matches
        # this pattern will be labeled as stable; versions such as v1b1,
        # v1alpha and v1beta1 are not stable.
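        # For example (illustrative values): the series ["v1", "v1beta1",
        # "v2.1"] maps to [True, False, True].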
        return versions.str.extract(r"(v\d?\.?\d?\.?\d+$)").notnull()

    def _get_summary_and_write_to_disk(self, dataframe, directory):
        """Writes summary information to file about changes made to discovery
        artifacts based on the provided dataframe and returns a dataframe
        with the same. The file `'allapis.dataframe'` is saved to the provided
        directory.

        Args:
            dataframe (object): a pandas dataframe containing summary change
                information for all discovery artifacts
            directory (str): path where the summary file should be saved
        """

        dataframe["IsStable"] = self._get_stable_versions(dataframe["Version"])

        # Create a filter for features, which contains only rows which have keys
        # that have been deleted or added, that will be used as an argument in
        # the `np.where()` call below.
        filter_features = (dataframe["ChangeType"] == ChangeType.DELETED) | (
            dataframe["ChangeType"] == ChangeType.ADDED
        )

        # Create a new column `IsFeature` to indicate which rows should be
        # considered as features.
        dataframe["IsFeature"] = np.where(filter_features, True, np.nan)

        # Create a new column `IsFeatureAggregate` which will be used to
        # summarize the api changes. We can either have a feature or a fix but
        # not both.
        dataframe["IsFeatureAggregate"] = dataframe.groupby("Name").IsFeature.transform(
            lambda x: x.any()
        )

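        # Illustrative example: if any row for a given api, say "drive"
        # (hypothetical), has `IsFeature` set, then every "drive" row gets
        # `IsFeatureAggregate=True` and the whole api is summarized as a
        # feature rather than a fix.
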
        # Create a new column `Summary`, which will contain a string with the
        # conventional commit message.
        dataframe["Summary"] = np.vectorize(self._build_summary_message)(
            dataframe["Name"], dataframe["IsFeatureAggregate"]
        )

        # Write the final dataframe to disk as it will be used in the
        # buildprbody.py script
        dataframe.to_csv(directory / "allapis.dataframe")

        return dataframe

    def _write_verbose_changes_to_disk(self, dataframe, directory, summary_df):
        """Writes verbose information to file about changes made to discovery
        artifacts based on the provided dataframe. A separate file is saved
        for each api in the provided directory. The extension of the
        files will be `'.verbose'`.

        Args:
            dataframe (object): a pandas dataframe containing verbose change
                information for all discovery artifacts
            directory (str): path where the verbose files should be saved
            summary_df (object): A dataframe containing a summary of the changes
        """
        # List of strings which will contain the verbose change information
        # for each api
        verbose_changes = []

        # Sort the dataframe to minimize file operations below.
        dataframe.sort_values(
            by=["Name", "Version", "ChangeType"], ascending=True, inplace=True
        )

        # Select only the relevant columns. We need to create verbose output
        # by api name, version and change type, so we group by these columns.
        change_type_groups = dataframe[
            ["Name", "Version", "ChangeType", "Key", "Count"]
        ].groupby(["Name", "Version", "ChangeType"])

        lastApi = ""
        lastVersion = ""
        lastType = ChangeType.UNKNOWN

        f = None
        for name, group in change_type_groups:
            currentApi = name[0]
            currentVersion = name[1]
            currentType = name[2]

            # We need to handle file opening and closing when processing an API
            # which is different from the previous one
            if lastApi != currentApi:
                # If we are processing a new api, close the file used for
                # processing the previous API
                if f is not None:
                    f.writelines(verbose_changes)
                    f.close()
                    f = None
                # Clear the array of strings with information from the previous
                # api and reset the last version
                verbose_changes = []
                lastVersion = ""
                # Create a file which contains verbose changes for the current
                # API being processed
                filename = "{0}.verbose".format(currentApi)
                f = open(pathlib.Path(directory / filename), "a")
                lastApi = currentApi

            # Create a filter with only the rows for the current API
            current_api_filter = summary_df["Name"] == currentApi

            # Get the string in the `Summary` column for the current api and
            # append it to `verbose_changes`. The `Summary` column contains
            # the conventional commit message. Use pandas.Series.iloc[0] to
            # retrieve only the first element, since all the values in the
            # summary column are the same for a given API.
            verbose_changes.append(summary_df[current_api_filter].Summary.iloc[0])

            # If the version has changed, we need to append a new heading
            # in the verbose summary which contains the api and version.
            if lastVersion != currentVersion:
                # Append a header string with the API and version
                verbose_changes.append(
                    "\n\n#### {0}:{1}\n\n".format(currentApi, currentVersion)
                )

                lastVersion = currentVersion
                lastType = ChangeType.UNKNOWN

            # Whenever the change type is different, we need to create a new
            # heading for the group of keys with the same change type.
            if currentType != lastType:
                if currentType == ChangeType.DELETED:
                    verbose_changes.append("\nThe following keys were deleted:\n")
                elif currentType == ChangeType.ADDED:
                    verbose_changes.append("\nThe following keys were added:\n")
                else:
                    verbose_changes.append("\nThe following keys were changed:\n")

                lastType = currentType

            # Append the keys, and corresponding count, in the same change
            # type group.
            verbose_changes.extend(
                [
                    "- {0} (Total Keys: {1})\n".format(row["Key"], row["Count"])
                    for index, row in group[["Key", "Count"]].iterrows()
                ]
            )

        # Make sure to close the last file and write the changes.
        if f is not None:
            f.writelines(verbose_changes)
            f.close()
            f = None

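        # Illustrative sketch of a generated `<api>.verbose` file (the api,
        # version and key below are hypothetical):
        #
        #   feat(drive): update the api
        #
        #   #### drive:v3
        #
        #   The following keys were added:
        #   - schemas.Foo (Total Keys: 12)
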
    def detect_discovery_changes(self):
        """Writes a summary of the changes to the discovery artifacts to disk
        at the path specified in `temp_dir`.

        Args: None
        """
        result = pd.DataFrame()
        # Process the files in parallel to improve performance. The third
        # argument to `pool.map` is the chunksize handed to each worker.
        with Pool(processes=MULTIPROCESSING_NUM_AGENTS) as pool:
            result = result.append(
                pool.map(
                    self._get_discovery_differences,
                    self._file_list,
                    MULTIPROCESSING_NUM_PER_BATCH,
                )
            )

        if len(result):
            # Sort the resulting dataframe by `Name`, `Version`, `ChangeType`
            # and `Key`
            sort_columns = ["Name", "Version", "ChangeType", "Key"]
            result.sort_values(by=sort_columns, ascending=True, inplace=True)

            # Create a folder which will be used by the `createcommits.sh` and
            # `buildprbody.py` scripts.
            pathlib.Path(self._temp_dir).mkdir(exist_ok=True)

            # Create a summary which contains a conventional commit message
            # for each API and write it to disk.
            summary_df = self._get_summary_and_write_to_disk(result, self._temp_dir)

            # Create verbose change information for each API which contains
            # a list of changes by key and write it to disk.
            self._write_verbose_changes_to_disk(result, self._temp_dir, summary_df)