import os
import argparse
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
class BaseCompare:
    """Compare the CSV files of a "base" results folder against a "feature" folder.

    ``results`` writes per-file difference tables (and, optionally, an
    aggregated summary); ``visualize`` writes base-vs-feature scatter plots
    as HTML files. Both iterate over the files found in ``base_folder`` and
    skip any file that has no counterpart in ``feature_folder``.
    """

    def __init__(self, base_folder, feature_folder, export_folder, export_file):
        """Store the input folders and the output location.

        Args:
            base_folder: Folder holding the baseline CSV files.
            feature_folder: Folder holding the CSV files to compare against the baseline.
            export_folder: Folder that output files are written into.
            export_file: Optional output file name (relative to ``export_folder``).
                When falsy, a name derived from the input file is used.
        """
        self.base_folder = base_folder
        self.feature_folder = feature_folder
        self.export_folder = export_folder
        self.export_file = export_file

    @staticmethod
    def intersect_rows(df1, df2):
        """Return the rows of ``df1`` whose index labels also appear in ``df2``'s index."""
        return df1[df1.index.isin(df2.index)]

    @staticmethod
    def union_columns(df1, df2):
        """Return ``df1`` re-laid-out on the sorted union of both frames' columns.

        Columns that exist only in ``df2`` are added filled with NaN. A new
        frame is returned; ``df1`` itself is left untouched.
        """
        cols = sorted(set(df1.columns) | set(df2.columns))
        return df1.reindex(columns=cols)

    def results(self, aggregate_column=None, aggregate_function=None, excludes=None, enum_maps=None):
        """Write difference CSVs for every file shared by the base and feature folders.

        For each file, rows are restricted to the index labels present in both
        frames and ``feature - base`` is written to ``export_folder`` under the
        same file name. If the subtraction fails, a 0/1 table marking the
        cells that differ is written instead.

        For ``results_output.csv``, when ``aggregate_function`` is given, an
        additional summary (base, feature, diff, % diff, plus a simulation
        count row) is written to ``export_file``, or to
        ``<basename>[_<aggregate_column>]_<aggregate_function>.csv`` when no
        export file was configured.

        Args:
            aggregate_column: Optional column of ``results_characteristics.csv``
                to group the summary by.
            aggregate_function: ``'sum'`` or ``'mean'``; falsy disables the summary.
            excludes: File names in ``base_folder`` to skip.
            enum_maps: Mapping ``column -> {old value: new value}`` applied to
                the aggregate column before grouping.

        Raises:
            ValueError: If grouping is requested but ``results_characteristics.csv``
                was not loaded before ``results_output.csv``.
        """
        excludes = [] if excludes is None else excludes
        enum_maps = {} if enum_maps is None else enum_maps
        aggregate_columns = [aggregate_column] if aggregate_column else []

        files = sorted(f for f in os.listdir(self.base_folder) if f not in excludes)

        # Filled in when 'results_characteristics.csv' is processed; files are
        # handled in sorted order, so it precedes 'results_output.csv'.
        group_df = None

        for file in files:
            base_file = os.path.join(self.base_folder, file)
            feature_file = os.path.join(self.feature_folder, file)

            if not os.path.exists(feature_file):
                print("Warning: %s not found. Skipping..." % feature_file)
                continue

            base_df = read_csv(base_file, index_col=0)
            feature_df = read_csv(feature_file, index_col=0)

            # Only compare rows present in both result sets.
            base_df = self.intersect_rows(base_df, feature_df)
            feature_df = self.intersect_rows(feature_df, base_df)

            if file == 'results_output.csv':
                base_df = base_df.select_dtypes(exclude=['string', 'bool'])
                feature_df = feature_df.select_dtypes(exclude=['string', 'bool'])

            try:
                df = feature_df - base_df
            except Exception:
                # Subtraction is not defined for every column type (typically
                # text columns); fall back to flagging changed cells with 1.
                base_df = self.union_columns(base_df, feature_df)
                feature_df = self.union_columns(feature_df, base_df)
                df = (feature_df != base_df).astype(int)

            df = df.fillna('NA')
            df.to_csv(os.path.join(self.export_folder, file))

            # Get results characteristics of groupby columns
            if file == 'results_characteristics.csv':
                # Copy so the enum mapping below does not write into a slice of base_df.
                group_df = base_df[aggregate_columns].copy()

            # Only 'results_output.csv' gets a grouped & aggregated summary.
            if file != 'results_output.csv':
                continue

            if aggregate_columns and group_df is None:
                raise ValueError(
                    "Cannot aggregate by %s: 'results_characteristics.csv' was not loaded "
                    "from both result folders." % aggregate_columns[0])

            for col, enum_map in enum_maps.items():
                if col in aggregate_columns:
                    group_df[col] = group_df[col].map(enum_map)

            # Merge groupby df and aggregate
            sim_ct_base = len(base_df)
            sim_ct_feature = len(feature_df)
            if aggregate_columns:
                base_df = group_df.merge(base_df, how='outer', left_index=True, right_index=True)\
                    .groupby(aggregate_columns)
                feature_df = group_df.merge(feature_df, how='outer', left_index=True, right_index=True)\
                    .groupby(aggregate_columns)
                if aggregate_function == 'sum':
                    base_df = base_df.sum(min_count=1).stack(dropna=False)
                    feature_df = feature_df.sum(min_count=1).stack(dropna=False)
                elif aggregate_function == 'mean':
                    base_df = base_df.mean(numeric_only=True).stack(dropna=False)
                    feature_df = feature_df.mean(numeric_only=True).stack(dropna=False)
            else:
                if aggregate_function == 'sum':
                    base_df = base_df.sum(min_count=1)
                    feature_df = feature_df.sum(min_count=1)
                elif aggregate_function == 'mean':
                    base_df = base_df.mean(numeric_only=True)
                    feature_df = feature_df.mean(numeric_only=True)

            if not aggregate_function:
                # No summary requested; keep going so later files are still diffed.
                continue

            # Write aggregate results df
            deltas = pd.DataFrame()
            deltas['base'] = base_df
            deltas['feature'] = feature_df
            deltas['diff'] = deltas['feature'] - deltas['base']

            # Percent difference is only defined where the base value is non-zero.
            deltas_non_zero = deltas[deltas['base'] != 0].index
            deltas.loc[deltas_non_zero, '% diff'] = (100 * (deltas.loc[deltas_non_zero, 'diff'] /
                                                            deltas.loc[deltas_non_zero, 'base']))
            deltas = deltas.round(2)
            deltas.reset_index(level=aggregate_columns, inplace=True)
            deltas.index.name = 'enduse'
            deltas.fillna('n/a', inplace=True)

            # First row of the summary: how many simulations each side contributed.
            sims_df = pd.DataFrame({'base': sim_ct_base,
                                    'feature': sim_ct_feature,
                                    'diff': 'n/a',
                                    '% diff': 'n/a'},
                                   index=['simulation_count'])
            sims_df[aggregate_columns] = 'n/a'
            deltas = pd.concat([sims_df, deltas])

            # Move the grouping column(s) to the front.
            for group in aggregate_columns:
                first_col = deltas.pop(group)
                deltas.insert(0, group, first_col)

            basename, ext = os.path.splitext(file)
            if aggregate_columns:
                basename += '_{aggregate_column}'.format(aggregate_column=aggregate_columns[0])

            # Fall back to a derived name when no export file was configured;
            # the suffix keeps it from overwriting the per-file diff written above.
            export_file = self.export_file or '{}_{}{}'.format(basename, aggregate_function, ext)
            deltas.to_csv(os.path.join(self.export_folder, export_file))

    def visualize(self, aggregate_column=None, aggregate_function=None, display_column=None,
                  excludes=None, enum_maps=None, cols_to_ignore=None):
        """Write base-vs-feature scatter plots (one HTML file per compared CSV).

        Each plotted column gets its own subplot row; when ``display_column``
        is given, each of its unique values gets its own subplot column.
        Reference lines for 0% and +/-10% error are drawn in every subplot.

        Args:
            aggregate_column: Optional characteristics column to group points by
                when ``aggregate_function`` is set.
            aggregate_function: ``'sum'`` or ``'mean'``; when set, one marker is
                drawn per aggregate value, sized by the number of base rows.
            display_column: Optional characteristics column used to split the
                figure into one subplot column per unique value.
            excludes: File names in ``base_folder`` to skip.
            enum_maps: Mapping ``column -> {old value: new value}`` applied to
                the display column.
            cols_to_ignore: Substrings; any column whose name contains one of
                them is not plotted.
        """
        excludes = [] if excludes is None else excludes
        enum_maps = {} if enum_maps is None else enum_maps
        cols_to_ignore = [] if cols_to_ignore is None else cols_to_ignore

        colors = px.colors.qualitative.Dark24

        aggregate_columns = [aggregate_column] if aggregate_column else []
        display_columns = [display_column] if display_column else []

        files = sorted(f for f in os.listdir(self.base_folder) if f not in excludes)

        if display_columns or aggregate_columns:
            characteristics_cols = display_columns + aggregate_columns
            base_characteristics_df = read_csv(
                os.path.join(self.base_folder, 'results_characteristics.csv'),
                index_col=0)[characteristics_cols]
            feature_characteristics_df = read_csv(
                os.path.join(self.feature_folder, 'results_characteristics.csv'),
                index_col=0)[characteristics_cols]

        def get_min_max(x_col, y_col, min_value, max_value):
            """Widen (min_value, max_value) to 0.9x the smallest / 1.1x the largest value."""
            try:
                low = 0.9 * np.min([x_col.min(), y_col.min()])
                if low < min_value:
                    min_value = low
            except Exception:
                # Best effort: columns that cannot be reduced numerically keep the given bound.
                pass
            try:
                high = 1.1 * np.max([x_col.max(), y_col.max()])
                if high > max_value:
                    max_value = high
            except Exception:
                pass
            return (min_value, max_value)

        def add_error_lines(fig, showlegend, row, col, min_value, max_value):
            """Draw the y=x line and the 0.9x / 1.1x lines into one subplot."""
            fig.add_trace(go.Scatter(x=[min_value, max_value], y=[min_value, max_value],
                                     line=dict(color='black', dash='dash', width=1), mode='lines',
                                     showlegend=showlegend, name='0% Error'), row=row, col=col)
            fig.add_trace(go.Scatter(x=[min_value, max_value], y=[0.9 * min_value, 0.9 * max_value],
                                     line=dict(color='black', dash='dashdot', width=1), mode='lines',
                                     showlegend=showlegend, name='+/- 10% Error'), row=row, col=col)
            fig.add_trace(go.Scatter(x=[min_value, max_value], y=[1.1 * min_value, 1.1 * max_value],
                                     line=dict(color='black', dash='dashdot', width=1), mode='lines',
                                     showlegend=False), row=row, col=col)

        def remove_columns(cols, base_df, feature_df):
            """Drop columns that are all-zero in both frames or match an ignore pattern."""
            kept = []
            for col in cols:
                all_zero = (all(v == 0 for v in base_df[col].values) and
                            all(v == 0 for v in feature_df[col].values))
                if all_zero:
                    continue
                # any() so a column matching several patterns is dropped only once.
                if any(col_to_ignore in col for col_to_ignore in cols_to_ignore):
                    continue
                kept.append(col)
            return kept

        for file in files:
            base_file = os.path.join(self.base_folder, file)
            feature_file = os.path.join(self.feature_folder, file)

            if not os.path.exists(feature_file):
                print("Warning: %s not found. Skipping..." % feature_file)
                continue

            base_df = read_csv(base_file, index_col=0)
            feature_df = read_csv(feature_file, index_col=0)

            # Only compare rows present in both result sets.
            base_df = self.intersect_rows(base_df, feature_df)
            feature_df = self.intersect_rows(feature_df, base_df)

            # Columns without a single value carry nothing to plot.
            base_df = base_df.dropna(axis=1, how='all')
            feature_df = feature_df.dropna(axis=1, how='all')

            cols = sorted(set(base_df.columns) & set(feature_df.columns))
            cols = remove_columns(cols, base_df, feature_df)
            n_cols = max(len(cols), 1)

            groups = [None]
            if display_columns:
                base_df = base_characteristics_df.join(base_df, how='right')
                feature_df = feature_characteristics_df.join(feature_df, how='right')

                for col, enum_map in enum_maps.items():
                    if col in display_columns:
                        for df in [base_df, feature_df]:
                            df[col] = df[col].map(enum_map)

                groups = list(base_df[display_columns[0]].unique())

            n_groups = max(len(groups), 1)

            vertical_spacing = 0.3 / n_cols
            fig = make_subplots(
                rows=n_cols,
                cols=n_groups,
                subplot_titles=groups * n_cols,
                row_titles=[f'{f}' for f in cols],
                vertical_spacing=vertical_spacing)

            for nrow, col in enumerate(cols, start=1):
                for ncol, group in enumerate(groups, start=1):
                    # Only the first subplot contributes legend entries.
                    showlegend = (ncol == 1 and nrow == 1)

                    x = base_df
                    y = feature_df

                    # `is not None` so falsy group values such as 0 are still filtered on.
                    if group is not None:
                        x = x.loc[x[display_columns[0]] == group, :]
                        y = y.loc[y[display_columns[0]] == group, :]

                    if aggregate_function:
                        # Count base rows per aggregate value; used as marker size.
                        x = x.assign(count=1)
                        sizes = x.groupby(aggregate_columns)[['count']].sum().reset_index()

                        if aggregate_function == 'sum':
                            x = x.groupby(aggregate_columns).sum().reset_index()
                            y = y.groupby(aggregate_columns).sum().reset_index()
                        elif aggregate_function == 'mean':
                            x = x.groupby(aggregate_columns).mean(numeric_only=True).reset_index()
                            y = y.groupby(aggregate_columns).mean(numeric_only=True).reset_index()

                        for agg_col in sorted(x[aggregate_columns[0]].unique()):
                            x_c = x[x[aggregate_columns[0]] == agg_col]
                            y_c = y[y[aggregate_columns[0]] == agg_col]
                            s_c = sizes[sizes[aggregate_columns[0]] == agg_col]
                            fig.add_trace(go.Scatter(x=x_c[col],
                                                     y=y_c[col],
                                                     marker=dict(size=s_c['count'],
                                                                 line=dict(width=1.5,
                                                                           color='DarkSlateGrey')),
                                                     mode='markers',
                                                     text=s_c['count'],
                                                     name=agg_col,
                                                     legendgroup=agg_col,
                                                     showlegend=False),
                                          row=nrow, col=ncol)
                    else:
                        color = [colors[0] for i in y[col]]
                        if 'color_index' in y.columns.values:
                            color = [colors[i] for i in y['color_index']]
                        fig.add_trace(go.Scatter(x=x[col],
                                                 y=y[col],
                                                 marker=dict(size=12,
                                                             color=color,
                                                             line=dict(width=1.5,
                                                                       color='DarkSlateGrey')),
                                                 mode='markers',
                                                 text=x.index,
                                                 name='',
                                                 legendgroup=col,
                                                 showlegend=False),
                                      row=nrow, col=ncol)

                    min_value, max_value = get_min_max(x[col], y[col], 0, 0)
                    add_error_lines(fig, showlegend, nrow, ncol, min_value, max_value)
                    fig.update_xaxes(title_text='base', row=nrow, col=ncol)
                    fig.update_yaxes(title_text='feature', row=nrow, col=ncol)

            fig['layout'].update(template='plotly_white')
            fig.update_layout(width=800 * n_groups, height=600 * n_cols, autosize=False, font=dict(size=12))

            # Re-locate row titles above plots
            increment = (1 / n_cols / 2) * 0.95
            for i in fig['layout']['annotations']:
                if i['text'] in cols:
                    i['textangle'] = 0
                    i['x'] = 0
                    i['y'] += increment

            basename, ext = os.path.splitext(file)
            filename = '{basename}.html'.format(basename=basename)
            if self.export_file:
                filename = self.export_file

            plotly.offline.plot(fig,
                                filename=os.path.join(self.export_folder, filename),
                                auto_open=False)
def read_csv(csv_file_path, **kwargs) -> pd.DataFrame:
    """Read a CSV with pandas while keeping the literal text 'None' as data.

    pandas' built-in NA markers are passed explicitly, minus 'None', and the
    default NA handling is switched off, so a cell containing 'None' is not
    converted to NaN. Extra keyword arguments are forwarded to
    ``pandas.read_csv``.
    """
    na_markers = [marker for marker in pd._libs.parsers.STR_NA_VALUES if marker != 'None']
    return pd.read_csv(csv_file_path, na_values=na_markers, keep_default_na=False, **kwargs)
if __name__ == '__main__':
    # Folder defaults used when the corresponding option is not given.
    default_base_folder = 'workflow/tests/base_results'
    default_feature_folder = 'workflow/tests/results'
    default_export_folder = 'workflow/tests/comparisons'

    # Every non-dunder attribute of BaseCompare is accepted as an action name.
    actions = [name for name in dir(BaseCompare) if not name.startswith('__')]

    parser = argparse.ArgumentParser()
    parser.add_argument('-b', '--base_folder',
                        default=default_base_folder,
                        help='Path of the base folder.')
    parser.add_argument('-f', '--feature_folder',
                        default=default_feature_folder,
                        help='Path of the feature folder.')
    parser.add_argument('-e', '--export_folder',
                        default=default_export_folder,
                        help='Path of the export folder.')
    parser.add_argument('-x', '--export_file',
                        help='Path of the export file.')
    parser.add_argument('-a', '--actions',
                        action='append',
                        choices=actions,
                        help='Method to call.')
    args = parser.parse_args()
    print(args)

    # Make sure the output folder exists before anything is written.
    if not os.path.exists(args.export_folder):
        os.makedirs(args.export_folder)

    compare = BaseCompare(args.base_folder, args.feature_folder, args.export_folder, args.export_file)

    # Only these two actions do anything; other accepted names are ignored.
    handlers = {
        'results': compare.results,
        'visualize': compare.visualize,
    }
    requested = [] if args.actions is None else args.actions
    for action in requested:
        handler = handlers.get(action)
        if handler is not None:
            handler()