-
Notifications
You must be signed in to change notification settings - Fork 0
/
plot_results.py
executable file
·307 lines (232 loc) · 13.5 KB
/
plot_results.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
#!/usr/bin/env python
import sys
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
# Generic function to plot grouped bar charts
def plot_grouped_bar_chart(df, column, results_dir, colors, title, ylabel, filename):
    """Save a vertical grouped bar chart: one group per file, one bar per script.

    Rows of ``df`` that share the same (file, script) pair are averaged before
    plotting. A script with no value for a file is drawn as a zero-height bar.

    Args:
        df: DataFrame containing ``file`` and ``script`` columns plus ``column``.
        column: Name of the numeric column to plot.
        results_dir: Directory the image is written into.
        colors: Mapping of script name to bar color; scripts missing from the
            mapping are drawn in gray.
        title: Title of the figure.
        ylabel: Label of the value (vertical) axis.
        filename: Name of the image file, relative to ``results_dir``.
    """
    grouped = df.groupby(['file', 'script'])[column].mean().reset_index()
    unique_files = grouped['file'].unique()
    unique_scripts = grouped['script'].unique()

    # Index the means by (file, script) once instead of re-filtering the frame
    # for every bar.
    value_of = dict(zip(zip(grouped['file'], grouped['script']), grouped[column]))

    n_scripts = len(unique_scripts)
    # Keep the usual 0.2 width, but shrink it when there are many scripts so a
    # group never spills into the neighbouring group.
    bar_width = min(0.2, 0.8 / max(n_scripts, 1))
    x = np.arange(len(unique_files))  # Left-most bar position of each group

    plt.figure(figsize=(14, 8))
    try:
        for i, script in enumerate(unique_scripts):
            values = [value_of.get((f, script), 0) for f in unique_files]
            plt.bar(x + i * bar_width, values, bar_width, label=script,
                    color=colors.get(script, 'gray'))
        plt.title(title)
        plt.ylabel(ylabel)
        # Put the tick under the middle of each group regardless of how many
        # scripts there are (bars are centred on x, x + w, ..., x + (n-1)*w).
        plt.xticks(x + bar_width * (n_scripts - 1) / 2, unique_files,
                   rotation=45, ha="right")
        # Avoid passing ncol=0 when there is nothing to plot.
        plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.30),
                   ncol=max(n_scripts, 1))
        plt.tight_layout()
        plt.savefig(os.path.join(results_dir, filename))
    finally:
        # Release the figure even if drawing or saving failed.
        plt.close()
# Generic function to plot grouped bar charts with overall averages
def plot_grouped_bar_chart_with_overall(df, column, results_dir, colors, title, ylabel, filename):
    """Save a vertical grouped bar chart with an extra "Overall Average" group.

    Rows of ``df`` that share the same (file, script) pair are averaged first.
    The final group, labelled ``Overall Average``, shows for each script the
    mean of its per-file averages. A script with no value for a file is drawn
    as a zero-height bar.

    Args:
        df: DataFrame containing ``file`` and ``script`` columns plus ``column``.
        column: Name of the numeric column to plot.
        results_dir: Directory the image is written into.
        colors: Mapping of script name to bar color; scripts missing from the
            mapping are drawn in gray.
        title: Title of the figure.
        ylabel: Label of the value (vertical) axis.
        filename: Name of the image file, relative to ``results_dir``.
    """
    grouped = df.groupby(['file', 'script'])[column].mean().reset_index()

    # Per-script mean of the per-file means, appended as a pseudo-file.
    overall_avg = grouped.groupby('script')[column].mean().reset_index()
    overall_avg['file'] = 'Overall Average'
    combined_df = pd.concat([grouped, overall_avg], ignore_index=True)

    unique_files = combined_df['file'].unique()
    unique_scripts = combined_df['script'].unique()

    # Index the values by (file, script) once instead of re-filtering the
    # frame for every bar. setdefault keeps the first row for a key, which
    # matters only if a real file happens to be called "Overall Average".
    value_of = {}
    for f, s, v in zip(combined_df['file'], combined_df['script'], combined_df[column]):
        value_of.setdefault((f, s), v)

    n_scripts = len(unique_scripts)
    # Keep the usual 0.2 width, but shrink it when there are many scripts so a
    # group never spills into the neighbouring group.
    bar_width = min(0.2, 0.8 / max(n_scripts, 1))
    x = np.arange(len(unique_files))  # Left-most bar position of each group

    plt.figure(figsize=(14, 8))
    try:
        for i, script in enumerate(unique_scripts):
            values = [value_of.get((f, script), 0) for f in unique_files]
            plt.bar(x + i * bar_width, values, bar_width, label=script,
                    color=colors.get(script, 'gray'))
        plt.title(title)
        plt.ylabel(ylabel)
        # Put the tick under the middle of each group regardless of how many
        # scripts there are (bars are centred on x, x + w, ..., x + (n-1)*w).
        plt.xticks(x + bar_width * (n_scripts - 1) / 2, unique_files,
                   rotation=45, ha="right")
        # Avoid passing ncol=0 when there is nothing to plot.
        plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.30),
                   ncol=max(n_scripts, 1))
        plt.tight_layout()
        plt.savefig(os.path.join(results_dir, filename))
    finally:
        # Release the figure even if drawing or saving failed.
        plt.close()
# Generic function to plot speed-up per file with grouped bars and overall average
def plot_grouped_bar_chart_speedup_with_overall(df, column, results_dir, colors, title, ylabel, filename, reference_script):
    """Save a grouped bar chart of per-file ratios relative to a reference script.

    For every (file, script) pair the plotted value is
    ``mean(column) of reference_script / mean(column) of script`` for that
    file, so values above 1 mean the script has a smaller ``column`` value
    than the reference. A final ``Overall Average`` group shows each script's
    mean ratio across files.

    Pairs are skipped when the script or the reference has no value for the
    file, or when the script's mean is exactly zero (the ratio is undefined
    and would otherwise turn the overall average into infinity). Skipped pairs
    appear as zero-height bars.

    Args:
        df: DataFrame containing ``file`` and ``script`` columns plus ``column``.
        column: Name of the numeric column the ratio is computed from.
        results_dir: Directory the image is written into.
        colors: Mapping of script name to bar color; scripts missing from the
            mapping are drawn in gray.
        title: Title of the figure.
        ylabel: Label of the value (vertical) axis.
        filename: Name of the image file, relative to ``results_dir``.
        reference_script: Script whose values are the numerator of every ratio.

    Raises:
        KeyError: If no ratio at all can be computed, e.g. because
            ``reference_script`` does not occur in ``df``.
    """
    # Mean per (file, script); the dict is keyed by (file, script) tuples so
    # each lookup below is constant time.
    means = df.groupby(['file', 'script'])[column].mean().to_dict()
    reference_values = {f: v for (f, s), v in means.items() if s == reference_script}

    # Iterate in order of first appearance in df so groups and legend entries
    # keep the order of the input data.
    files_in_order = df['file'].unique()
    speedup_values = []
    for script in df['script'].unique():
        for file in files_in_order:
            script_value = means.get((file, script))
            if script_value is None or file not in reference_values:
                continue
            if script_value == 0:
                # Undefined ratio; leave the pair out instead of plotting inf.
                continue
            speedup_values.append({
                'file': file,
                'script': script,
                'speedup': reference_values[file] / script_value,
            })

    if not speedup_values:
        raise KeyError(
            f"No values of '{column}' can be compared against reference script "
            f"'{reference_script}'"
        )

    speedup_df = pd.DataFrame(speedup_values)

    # Per-script mean ratio across files, appended as a pseudo-file.
    overall_speedup = speedup_df.groupby('script')['speedup'].mean().reset_index()
    overall_speedup['file'] = 'Overall Average'
    combined_df = pd.concat([speedup_df, overall_speedup], ignore_index=True)

    unique_files = combined_df['file'].unique()
    unique_scripts = combined_df['script'].unique()

    # setdefault keeps the first row for a key, which matters only if a real
    # file happens to be called "Overall Average".
    value_of = {}
    for f, s, v in zip(combined_df['file'], combined_df['script'], combined_df['speedup']):
        value_of.setdefault((f, s), v)

    n_scripts = len(unique_scripts)
    # Keep the usual 0.2 width, but shrink it when there are many scripts so a
    # group never spills into the neighbouring group.
    bar_width = min(0.2, 0.8 / max(n_scripts, 1))
    x = np.arange(len(unique_files))  # Left-most bar position of each group

    plt.figure(figsize=(14, 8))
    try:
        for i, script in enumerate(unique_scripts):
            values = [value_of.get((f, script), 0) for f in unique_files]
            plt.bar(x + i * bar_width, values, bar_width, label=script,
                    color=colors.get(script, 'gray'))
        plt.title(title)
        plt.ylabel(ylabel)
        # Put the tick under the middle of each group regardless of how many
        # scripts there are (bars are centred on x, x + w, ..., x + (n-1)*w).
        plt.xticks(x + bar_width * (n_scripts - 1) / 2, unique_files,
                   rotation=45, ha="right")
        plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.30),
                   ncol=max(n_scripts, 1))
        plt.tight_layout()
        plt.savefig(os.path.join(results_dir, filename))
    finally:
        # Release the figure even if drawing or saving failed.
        plt.close()
# Generic function to plot grouped horizontal bar charts
def plot_grouped_horizontal_bar_chart(df, column, results_dir, colors, title, xlabel, filename):
    """Save a horizontal grouped bar chart: one group per file, one bar per script.

    Rows of ``df`` that share the same (file, script) pair are averaged before
    plotting. A script with no value for a file is drawn as a zero-length bar.

    Args:
        df: DataFrame containing ``file`` and ``script`` columns plus ``column``.
        column: Name of the numeric column to plot.
        results_dir: Directory the image is written into.
        colors: Mapping of script name to bar color; scripts missing from the
            mapping are drawn in gray.
        title: Title of the figure.
        xlabel: Label of the value (horizontal) axis.
        filename: Name of the image file, relative to ``results_dir``.
    """
    grouped = df.groupby(['file', 'script'])[column].mean().reset_index()
    unique_files = grouped['file'].unique()
    unique_scripts = grouped['script'].unique()

    # Index the means by (file, script) once instead of re-filtering the frame
    # for every bar.
    value_of = dict(zip(zip(grouped['file'], grouped['script']), grouped[column]))

    n_scripts = len(unique_scripts)
    # Keep the usual 0.2 height, but shrink it when there are many scripts so
    # a group never overlaps the neighbouring group.
    bar_height = min(0.2, 0.8 / max(n_scripts, 1))
    y = np.arange(len(unique_files))  # Centre of each group on the y axis

    plt.figure(figsize=(14, 8))
    try:
        for i, script in enumerate(unique_scripts):
            values = [value_of.get((f, script), 0) for f in unique_files]
            # Offset each script's bar so the whole group is centred on y.
            offset = (i - (n_scripts - 1) / 2) * bar_height
            plt.barh(y + offset, values, bar_height, label=script,
                     color=colors.get(script, 'gray'))
        plt.title(title)
        plt.xlabel(xlabel)
        plt.yticks(y, unique_files, rotation=45)
        # Avoid passing ncol=0 when there is nothing to plot.
        plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.15),
                   ncol=max(n_scripts, 1))
        plt.tight_layout()
        plt.savefig(os.path.join(results_dir, filename))
    finally:
        # Release the figure even if drawing or saving failed.
        plt.close()
# Generic function to plot grouped horizontal bar charts with overall averages
def plot_grouped_horizontal_bar_chart_with_overall(df, column, results_dir, colors, title, xlabel, filename):
    """Save a horizontal grouped bar chart with an extra "Overall Average" group.

    Rows of ``df`` that share the same (file, script) pair are averaged first.
    The final group, labelled ``Overall Average``, shows for each script the
    mean of its per-file averages. A script with no value for a file is drawn
    as a zero-length bar.

    Args:
        df: DataFrame containing ``file`` and ``script`` columns plus ``column``.
        column: Name of the numeric column to plot.
        results_dir: Directory the image is written into.
        colors: Mapping of script name to bar color; scripts missing from the
            mapping are drawn in gray.
        title: Title of the figure.
        xlabel: Label of the value (horizontal) axis.
        filename: Name of the image file, relative to ``results_dir``.
    """
    grouped = df.groupby(['file', 'script'])[column].mean().reset_index()

    # Per-script mean of the per-file means, appended as a pseudo-file.
    overall_avg = grouped.groupby('script')[column].mean().reset_index()
    overall_avg['file'] = 'Overall Average'
    combined_df = pd.concat([grouped, overall_avg], ignore_index=True)

    unique_files = combined_df['file'].unique()
    unique_scripts = combined_df['script'].unique()

    # Index the values by (file, script) once instead of re-filtering the
    # frame for every bar. setdefault keeps the first row for a key, which
    # matters only if a real file happens to be called "Overall Average".
    value_of = {}
    for f, s, v in zip(combined_df['file'], combined_df['script'], combined_df[column]):
        value_of.setdefault((f, s), v)

    n_scripts = len(unique_scripts)
    # Keep the usual 0.2 height, but shrink it when there are many scripts so
    # a group never overlaps the neighbouring group.
    bar_height = min(0.2, 0.8 / max(n_scripts, 1))
    y = np.arange(len(unique_files))  # Centre of each group on the y axis

    plt.figure(figsize=(14, 8))
    try:
        for i, script in enumerate(unique_scripts):
            values = [value_of.get((f, script), 0) for f in unique_files]
            # Offset each script's bar so the whole group is centred on y.
            offset = (i - (n_scripts - 1) / 2) * bar_height
            plt.barh(y + offset, values, bar_height, label=script,
                     color=colors.get(script, 'gray'))
        plt.title(title)
        plt.xlabel(xlabel)
        plt.yticks(y, unique_files, rotation=45)
        # Avoid passing ncol=0 when there is nothing to plot.
        plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.15),
                   ncol=max(n_scripts, 1))
        plt.tight_layout()
        plt.savefig(os.path.join(results_dir, filename))
    finally:
        # Release the figure even if drawing or saving failed.
        plt.close()
# Main function to load data and call plotting functions
def plot_results(results_dir):
    """Read the benchmark CSV files in ``results_dir`` and write all charts there.

    Expects ``memory_usage.csv``, ``timings.csv`` and ``quality_scores.csv`` in
    ``results_dir``. The three tables are joined on their ``file`` and
    ``script`` columns, and every chart is saved as a PNG in the same directory.

    Args:
        results_dir: Directory holding the CSV inputs; also the output directory.
    """
    join_keys = ['file', 'script']
    bytes_per_mb = 1024 * 1024

    memory_usage_df = pd.read_csv(os.path.join(results_dir, 'memory_usage.csv'))
    timings_df = pd.read_csv(os.path.join(results_dir, 'timings.csv'))
    quality_score_df = pd.read_csv(os.path.join(results_dir, 'quality_scores.csv'))

    # Express both memory figures in MB rather than bytes.
    memory_usage_df['peak_memory_MB'] = memory_usage_df['peak_memory'] / bytes_per_mb
    memory_usage_df['total_allocated_MB'] = memory_usage_df['total_allocated_bytes'] / bytes_per_mb

    # Timings are kept at millisecond precision.
    timings_df['avg_seconds'] = timings_df['avg_seconds'].round(3)

    # One row per (file, script) pair present in all three tables.
    merged_df = memory_usage_df.merge(timings_df, on=join_keys)
    merged_df = merged_df.merge(quality_score_df, on=join_keys)

    # Bar color for each benchmarked script.
    colors = {
        'run_unstructured': 'darkturquoise',
        'run_extractous_stream': 'black',
        'run_pypdf2': 'tab:orange',
    }

    # (plot function, column, title, value-axis label, output file), drawn in
    # this order.
    absolute_charts = (
        # Execution times
        (plot_grouped_bar_chart, 'avg_seconds',
         'Execution Times', 'Time (seconds)', 'execution_times.png'),
        (plot_grouped_horizontal_bar_chart, 'avg_seconds',
         'Execution Times', 'Time (seconds)', 'execution_times_hor.png'),
        # Allocations
        (plot_grouped_bar_chart, 'total_allocated_MB',
         'Total Allocated Memory', 'Memory (MB)', 'total_allocated_memory.png'),
        (plot_grouped_horizontal_bar_chart, 'total_allocated_MB',
         'Total Allocated Memory', 'Memory (MB)', 'total_allocated_memory_hor.png'),
        # Peak memory
        (plot_grouped_bar_chart, 'peak_memory_MB',
         'Peak RAM Memory Usage', 'Memory (MB)', 'peak_memory.png'),
        (plot_grouped_horizontal_bar_chart_with_overall, 'peak_memory_MB',
         'Peak RAM Memory Usage', 'Memory (MB)', 'peak_memory_hor.png'),
        # Scores
        (plot_grouped_bar_chart_with_overall, 'score',
         'Quality Scores', 'Score ', 'quality_scores.png'),
    )
    for draw_chart, column, title, axis_label, image_name in absolute_charts:
        draw_chart(merged_df, column, results_dir, colors, title, axis_label, image_name)

    # Charts expressed relative to the run_unstructured baseline:
    # (column, title, value-axis label, output file).
    relative_charts = (
        # Speed-up
        ('avg_seconds',
         'Speedup relative to run_unstructured',
         'Speed-up (Higher is Better)',
         'speedup_relative_to_unstructured.png'),
        # Memory efficiency
        ('total_allocated_MB',
         'Memory Allocations efficiency relative to run_unstructured',
         'Memory Allocations Savings (Higher is Better)',
         'memory_efficiency_relative_to_unstructured.png'),
    )
    for column, title, axis_label, image_name in relative_charts:
        plot_grouped_bar_chart_speedup_with_overall(
            merged_df, column, results_dir, colors, title, axis_label, image_name,
            reference_script='run_unstructured')
# Entry point for the script
if __name__ == "__main__":
    # Exactly one command-line argument is accepted: the results directory.
    if len(sys.argv) == 2:
        results_directory = sys.argv[1]
    else:
        print(f"Usage: '{sys.argv[0]}' <results_dir>")
        sys.exit(1)

    # Refuse to continue when the given path is not an existing directory.
    if os.path.isdir(results_directory):
        plot_results(results_directory)
    else:
        raise FileNotFoundError(f"Directory does not exist: '{results_directory}'")