import os
import argparse

import numpy as np
import pandas as pd

# NOTE: plotly is imported lazily inside BaseCompare.visualize() so that
# results() and the CLI 'results' action work in environments without plotly.


class BaseCompare:
    """Compare a folder of 'base' result CSVs against a folder of 'feature'
    result CSVs, exporting per-file diffs, aggregated delta tables, and
    base-vs-feature scatter plots.
    """

    def __init__(self, base_folder, feature_folder, export_folder, export_file):
        """Store the input/output locations.

        Args:
            base_folder: Folder containing the baseline result CSVs.
            feature_folder: Folder containing the feature-branch result CSVs.
            export_folder: Folder where diffs/plots are written.
            export_file: Optional fixed output filename; when set it overrides
                the per-file derived name.
        """
        self.base_folder = base_folder
        self.feature_folder = feature_folder
        self.export_folder = export_folder
        self.export_file = export_file

    @staticmethod
    def intersect_rows(df1, df2):
        """Return the rows of df1 whose index values also appear in df2."""
        return df1[df1.index.isin(df2.index)]

    @staticmethod
    def union_columns(df1, df2):
        """Return df1 reindexed to the sorted union of both frames' columns.

        Columns present only in df2 are added to df1 as NaN. NOTE: this adds
        the missing columns to df1 in place (callers here rely on getting the
        widened frame back, not on df1 staying untouched).
        """
        cols = sorted(set(df1.columns) | set(df2.columns))
        for col in cols:
            if col not in df1.columns:
                df1[col] = np.nan
        return df1[cols]

    def results(self, aggregate_column=None, aggregate_function=None,
                excludes=None, enum_maps=None):
        """Write per-file diff CSVs and, for results_output.csv, an aggregated
        deltas CSV (base vs feature, absolute and percent differences).

        Args:
            aggregate_column: Optional characteristics column to group by.
            aggregate_function: 'sum' or 'mean'; when falsy, no deltas CSV is
                written.
            excludes: Filenames in base_folder to skip.
            enum_maps: {column: {raw_value: label}} remappings applied to the
                groupby column.
        """
        # Avoid mutable default arguments.
        excludes = [] if excludes is None else excludes
        enum_maps = {} if enum_maps is None else enum_maps

        aggregate_columns = []
        if aggregate_column:
            aggregate_columns.append(aggregate_column)

        files = [f for f in os.listdir(self.base_folder) if f not in excludes]

        # Sorted order matters: 'results_characteristics.csv' sorts before
        # 'results_output.csv', so group_df is populated before it is used.
        for file in sorted(files):
            base_file = os.path.join(self.base_folder, file)
            feature_file = os.path.join(self.feature_folder, file)
            if not os.path.exists(feature_file):
                print("Warning: %s not found. Skipping..." % feature_file)
                continue

            base_df = read_csv(base_file, index_col=0)
            feature_df = read_csv(feature_file, index_col=0)

            # Compare only rows present in both runs.
            base_df = self.intersect_rows(base_df, feature_df)
            feature_df = self.intersect_rows(feature_df, base_df)

            if file == 'results_output.csv':
                base_df = base_df.select_dtypes(exclude=['string', 'bool'])
                feature_df = feature_df.select_dtypes(exclude=['string', 'bool'])

            try:
                # Numeric frames: element-wise difference.
                df = feature_df - base_df
            except Exception:
                # Non-numeric frames: align columns, then flag changed cells
                # as 1 / unchanged as 0.
                base_df = self.union_columns(base_df, feature_df)
                feature_df = self.union_columns(feature_df, base_df)
                df = feature_df != base_df
                df = df.astype(int)
            df = df.fillna('NA')
            df.to_csv(os.path.join(self.export_folder, file))

            # Remember the groupby column(s) for the later aggregation pass.
            if file == 'results_characteristics.csv':
                group_df = base_df[aggregate_columns]

            if file == 'results_output.csv':
                for col, enum_map in enum_maps.items():
                    if col in aggregate_columns:
                        group_df[col] = group_df[col].map(enum_map)

                sim_ct_base = len(base_df)
                sim_ct_feature = len(feature_df)

                if aggregate_columns:
                    # Attach the groupby column(s) and aggregate per group;
                    # stack(dropna=False) yields one row per (group, enduse).
                    base_df = group_df.merge(base_df, 'outer', left_index=True,
                                             right_index=True).groupby(aggregate_columns)
                    feature_df = group_df.merge(feature_df, 'outer', left_index=True,
                                                right_index=True).groupby(aggregate_columns)
                    if aggregate_function == 'sum':
                        base_df = base_df.sum(min_count=1).stack(dropna=False)
                        feature_df = feature_df.sum(min_count=1).stack(dropna=False)
                    elif aggregate_function == 'mean':
                        base_df = base_df.mean(numeric_only=True).stack(dropna=False)
                        feature_df = feature_df.mean(numeric_only=True).stack(dropna=False)
                else:
                    if aggregate_function == 'sum':
                        base_df = base_df.sum(min_count=1)
                        feature_df = feature_df.sum(min_count=1)
                    elif aggregate_function == 'mean':
                        base_df = base_df.mean(numeric_only=True)
                        feature_df = feature_df.mean(numeric_only=True)

                if not aggregate_function:
                    return

                # Build the deltas table: base, feature, diff, % diff.
                deltas = pd.DataFrame()
                deltas['base'] = base_df
                deltas['feature'] = feature_df
                deltas['diff'] = deltas['feature'] - deltas['base']
                # Percent diff only where the base is non-zero (avoid div by 0).
                deltas_non_zero = deltas[deltas['base'] != 0].index
                deltas.loc[deltas_non_zero, '% diff'] = (
                    100 * (deltas.loc[deltas_non_zero, 'diff']
                           / deltas.loc[deltas_non_zero, 'base']))
                deltas = deltas.round(2)
                deltas.reset_index(level=aggregate_columns, inplace=True)
                deltas.index.name = 'enduse'
                deltas.fillna('n/a', inplace=True)

                # Prepend a simulation-count summary row.
                sims_df = pd.DataFrame({'base': sim_ct_base,
                                        'feature': sim_ct_feature,
                                        'diff': 'n/a',
                                        '% diff': 'n/a'},
                                       index=['simulation_count'])
                sims_df[aggregate_columns] = 'n/a'
                deltas = pd.concat([sims_df, deltas])

                # Move the groupby column(s) to the front.
                for group in aggregate_columns:
                    first_col = deltas.pop(group)
                    deltas.insert(0, group, first_col)

                deltas.to_csv(os.path.join(self.export_folder, self.export_file))

    def visualize(self, aggregate_column=None, aggregate_function=None,
                  display_column=None, excludes=None, enum_maps=None,
                  cols_to_ignore=None):
        """Write base-vs-feature scatter plots (one subplot per shared column,
        optionally one subplot column per display group) as HTML files.

        Args:
            aggregate_column: Optional characteristics column to group points by.
            aggregate_function: 'sum' or 'mean' aggregation within each group.
            display_column: Optional characteristics column that splits subplots
                into one column per unique value.
            excludes: Filenames in base_folder to skip.
            enum_maps: {column: {raw_value: label}} remappings for display columns.
            cols_to_ignore: Substrings; any shared column containing one is skipped.
        """
        # plotly is only needed here; keep it out of module import so the rest
        # of the tool works without it.
        import plotly
        import plotly.graph_objects as go
        from plotly.subplots import make_subplots
        import plotly.express as px

        # Avoid mutable default arguments.
        excludes = [] if excludes is None else excludes
        enum_maps = {} if enum_maps is None else enum_maps
        cols_to_ignore = [] if cols_to_ignore is None else cols_to_ignore

        colors = px.colors.qualitative.Dark24

        aggregate_columns = []
        if aggregate_column:
            aggregate_columns.append(aggregate_column)
        display_columns = []
        if display_column:
            display_columns.append(display_column)

        files = [f for f in os.listdir(self.base_folder) if f not in excludes]

        if display_columns or aggregate_columns:
            base_characteristics_df = read_csv(
                os.path.join(self.base_folder, 'results_characteristics.csv'),
                index_col=0)[display_columns + aggregate_columns]
            feature_characteristics_df = read_csv(
                os.path.join(self.feature_folder, 'results_characteristics.csv'),
                index_col=0)[display_columns + aggregate_columns]

        def get_min_max(x_col, y_col, min_value, max_value):
            # Widen [min_value, max_value] to cover both series with a 10%
            # margin; failures (e.g. all-NaN or non-numeric) leave the bound as-is.
            try:
                if 0.9 * np.min([x_col.min(), y_col.min()]) < min_value:
                    min_value = 0.9 * np.min([x_col.min(), y_col.min()])
            except Exception:
                pass
            try:
                if 1.1 * np.max([x_col.max(), y_col.max()]) > max_value:
                    max_value = 1.1 * np.max([x_col.max(), y_col.max()])
            except Exception:
                pass
            return (min_value, max_value)

        def add_error_lines(fig, showlegend, row, col, min_value, max_value):
            # Reference lines: y = x (0% error) and y = 0.9x / 1.1x (+/- 10%).
            fig.add_trace(go.Scatter(x=[min_value, max_value],
                                     y=[min_value, max_value],
                                     line=dict(color='black', dash='dash', width=1),
                                     mode='lines', showlegend=showlegend,
                                     name='0% Error'),
                          row=row, col=col)
            fig.add_trace(go.Scatter(x=[min_value, max_value],
                                     y=[0.9 * min_value, 0.9 * max_value],
                                     line=dict(color='black', dash='dashdot', width=1),
                                     mode='lines', showlegend=showlegend,
                                     name='+/- 10% Error'),
                          row=row, col=col)
            fig.add_trace(go.Scatter(x=[min_value, max_value],
                                     y=[1.1 * min_value, 1.1 * max_value],
                                     line=dict(color='black', dash='dashdot', width=1),
                                     mode='lines', showlegend=False),
                          row=row, col=col)

        def remove_columns(cols):
            # Drop columns that are all-zero in both runs, then columns
            # matching any ignore substring. Closes over base_df/feature_df.
            for col in cols[:]:
                if (all(v == 0 for v in base_df[col].values)
                        and all(v == 0 for v in feature_df[col].values)):
                    cols.remove(col)
            for col in cols[:]:
                for col_to_ignore in cols_to_ignore:
                    if col_to_ignore in col:
                        cols.remove(col)
            return cols

        for file in sorted(files):
            base_file = os.path.join(self.base_folder, file)
            feature_file = os.path.join(self.feature_folder, file)
            if not os.path.exists(feature_file):
                print("Warning: %s not found. Skipping..." % feature_file)
                continue

            base_df = read_csv(base_file, index_col=0)
            feature_df = read_csv(feature_file, index_col=0)
            base_df = self.intersect_rows(base_df, feature_df)
            feature_df = self.intersect_rows(feature_df, base_df)

            # Drop all-NaN columns from each side before intersecting columns.
            for col in base_df.columns:
                if base_df[col].isnull().all():
                    base_df.drop(col, axis=1, inplace=True)
            for col in feature_df.columns:
                if feature_df[col].isnull().all():
                    feature_df.drop(col, axis=1, inplace=True)

            cols = sorted(set(base_df.columns) & set(feature_df.columns))
            cols = remove_columns(cols)
            n_cols = max(len(cols), 1)

            groups = [None]
            if display_columns:
                base_df = base_characteristics_df.join(base_df, how='right')
                feature_df = feature_characteristics_df.join(feature_df, how='right')
                for col, enum_map in enum_maps.items():
                    if col in display_columns:
                        for df in [base_df, feature_df]:
                            df[col] = df[col].map(enum_map)
                groups = list(base_df[display_columns[0]].unique())
            n_groups = max(len(groups), 1)

            vertical_spacing = 0.3 / n_cols
            fig = make_subplots(rows=n_cols, cols=n_groups,
                                subplot_titles=groups * n_cols,
                                row_titles=[f'{f}' for f in cols],
                                vertical_spacing=vertical_spacing)

            nrow = 0
            for col in cols:
                nrow += 1
                for group in groups:
                    ncol = groups.index(group) + 1
                    # Only the first subplot contributes legend entries.
                    showlegend = (ncol == 1 and nrow == 1)

                    x = base_df.copy()
                    y = feature_df.copy()
                    if group:
                        x = x.loc[x[display_columns[0]] == group, :]
                        y = y.loc[y[display_columns[0]] == group, :]

                    if aggregate_function:
                        # Marker size encodes the number of simulations per group.
                        x = x.assign(count=1)
                        sizes = x.groupby(aggregate_columns)[['count']].sum().reset_index()
                        if aggregate_function == 'sum':
                            x = x.groupby(aggregate_columns).sum().reset_index()
                            y = y.groupby(aggregate_columns).sum().reset_index()
                        elif aggregate_function == 'mean':
                            x = x.groupby(aggregate_columns).mean().reset_index()
                            y = y.groupby(aggregate_columns).mean().reset_index()

                        for agg_col in sorted(x[aggregate_columns[0]].unique()):
                            x_c = x[x[aggregate_columns[0]] == agg_col]
                            y_c = y[y[aggregate_columns[0]] == agg_col]
                            s_c = sizes[sizes[aggregate_columns[0]] == agg_col]
                            fig.add_trace(go.Scatter(x=x_c[col], y=y_c[col],
                                                     marker=dict(size=s_c['count'],
                                                                 line=dict(width=1.5,
                                                                           color='DarkSlateGrey')),
                                                     mode='markers', text=s_c['count'],
                                                     name=agg_col, legendgroup=agg_col,
                                                     showlegend=False),
                                          row=nrow, col=ncol)
                    else:
                        # Optional per-point coloring via a 'color_index' column.
                        color = [colors[0] for i in y[col]]
                        if 'color_index' in y.columns.values:
                            color = [colors[i] for i in y['color_index']]
                        fig.add_trace(go.Scatter(x=x[col], y=y[col],
                                                 marker=dict(size=12, color=color,
                                                             line=dict(width=1.5,
                                                                       color='DarkSlateGrey')),
                                                 mode='markers', text=x.index,
                                                 name='', legendgroup=col,
                                                 showlegend=False),
                                      row=nrow, col=ncol)

                    min_value, max_value = get_min_max(x[col], y[col], 0, 0)
                    add_error_lines(fig, showlegend, nrow, ncol, min_value, max_value)
                    fig.update_xaxes(title_text='base', row=nrow, col=ncol)
                    fig.update_yaxes(title_text='feature', row=nrow, col=ncol)

            fig['layout'].update(template='plotly_white')
            fig.update_layout(width=800 * n_groups, height=600 * n_cols,
                              autosize=False, font=dict(size=12))

            # Re-locate row titles above plots instead of rotated at the side.
            increment = (1 / n_cols / 2) * 0.95
            for annotation in fig['layout']['annotations']:
                if annotation['text'] in cols:
                    annotation['textangle'] = 0
                    annotation['x'] = 0
                    annotation['y'] += increment

            basename, _ = os.path.splitext(file)
            filename = '{basename}.html'.format(basename=basename)
            if self.export_file:
                filename = self.export_file
            # Bug fix: the export path previously used a garbled literal
            # instead of the computed filename.
            plotly.offline.plot(fig,
                                filename=os.path.join(self.export_folder, filename),
                                auto_open=False)


def read_csv(csv_file_path, **kwargs) -> pd.DataFrame:
    """Read a results CSV, treating the literal string 'None' as data rather
    than NaN (all other default pandas NA tokens still become NaN).
    """
    # pd._libs.parsers.STR_NA_VALUES is pandas' default NA-token set
    # (private API, but what this tool has always used).
    default_na_values = pd._libs.parsers.STR_NA_VALUES
    df = pd.read_csv(csv_file_path,
                     na_values=list(default_na_values - {'None'}),
                     keep_default_na=False, **kwargs)
    return df


if __name__ == '__main__':
    default_base_folder = 'workflow/tests/base_results'
    default_feature_folder = 'workflow/tests/results'
    default_export_folder = 'workflow/tests/comparisons'

    # All public BaseCompare methods are selectable actions.
    actions = [method for method in dir(BaseCompare)
               if method.startswith('__') is False]

    parser = argparse.ArgumentParser()
    parser.add_argument('-b', '--base_folder', default=default_base_folder,
                        help='Path of the base folder.')
    parser.add_argument('-f', '--feature_folder', default=default_feature_folder,
                        help='Path of the feature folder.')
    parser.add_argument('-e', '--export_folder', default=default_export_folder,
                        help='Path of the export folder.')
    parser.add_argument('-x', '--export_file',
                        help='Path of the export file.')
    parser.add_argument('-a', '--actions', action='append', choices=actions,
                        help='Method to call.')
    args = parser.parse_args()
    print(args)

    if not os.path.exists(args.export_folder):
        os.makedirs(args.export_folder)

    compare = BaseCompare(args.base_folder, args.feature_folder,
                          args.export_folder, args.export_file)

    if args.actions is None:
        args.actions = []
    for action in args.actions:
        if action == 'results':
            compare.results()
        elif action == 'visualize':
            compare.visualize()