-
Notifications
You must be signed in to change notification settings - Fork 0
/
benchmark.sh
executable file
·213 lines (162 loc) · 7.7 KB
/
benchmark.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#!/bin/bash
# ---------------------------------------------------------------------------
# Benchmark settings
# ---------------------------------------------------------------------------
# A value of 1 runs the corresponding tool. With any other value the tool run
# is skipped and the JSON report already present in the results directory is
# read instead.
DO_BENCHMARK_SPEED=1
DO_PROFILE_MEMORY=1

WARMUP_RUNS=2 # handed to hyperfine as --warmup
TIMED_RUNS=5 # handed to hyperfine as --min-runs

# Extraction scripts under test; all of them live in SCRIPTS_SRC_DIR.
SCRIPTS_SRC_DIR="src"
#SCRIPTS=("run_pypdfium2.py")
SCRIPTS=(
  "run_extractous_stream.py"
  "run_unstructured.py"
  "run_pypdf2.py"
)

# Active dataset: input documents plus the directory with their reference text.
DATASET_FILES_DIR="dataset/sec10-filings"
DATASET_GT_DIR="dataset/sec10-ground-truth"
INPUT_FILES=(
  "2022_Q3_AAPL.pdf" "2022_Q3_AMZN.pdf" "2022_Q3_INTC.pdf" "2022_Q3_MSFT.pdf" "2022_Q3_NVDA.pdf"
  "2023_Q1_AAPL.pdf" "2023_Q1_AMZN.pdf" "2023_Q1_INTC.pdf" "2023_Q1_MSFT.pdf" "2023_Q1_NVDA.pdf"
  # "2023_Q2_AAPL.pdf" "2023_Q2_AMZN.pdf" "2023_Q2_INTC.pdf" "2023_Q2_MSFT.pdf" "2023_Q2_NVDA.pdf"
  # "2023_Q3_AAPL.pdf" "2023_Q3_AMZN.pdf" "2023_Q3_INTC.pdf" "2023_Q3_MSFT.pdf" "2023_Q3_NVDA.pdf"
)

# Alternative dataset (disabled):
#DATASET_FILES_DIR="dataset/pypdf-files"
#DATASET_GT_DIR="dataset/pypdf-ground-truth"
#INPUT_FILES=(
# "2201.00214.pdf" "2201.00151.pdf" "1707.09725.pdf" "2201.00021.pdf" "2201.00037.pdf"
# "2201.00069.pdf" "2201.00178.pdf" "2201.00201.pdf" "1602.06541.pdf" "2201.00200.pdf"
# "2201.00022.pdf" "2201.00029.pdf" "1601.03642.pdf" "GeoTopo.pdf"
#)

# ---------------------------------------------------------------------------
# Results layout: results/<dataset dir name>_<dd_mm_yyyy>/...
# ---------------------------------------------------------------------------
RESULTS_TAG="$(basename "$DATASET_FILES_DIR")_$(date '+%d_%m_%Y')"
RESULTS_DIR="results/${RESULTS_TAG}"
RESULTS_EXTRACT_OUT_DIR="${RESULTS_DIR}/extract-output"
RESULTS_HYPERFINE_OUT_DIR="${RESULTS_DIR}/hyperfine-output"
RESULTS_MEMRAY_BIN_DIR="${RESULTS_DIR}/memray-profiles"

mkdir -p \
  "$RESULTS_EXTRACT_OUT_DIR" \
  "$RESULTS_HYPERFINE_OUT_DIR" \
  "$RESULTS_MEMRAY_BIN_DIR"

# Summary CSVs. Each one is recreated with only its header row on every start.
# Mean run time per (file, script) pair
TIMINGS_FILE="${RESULTS_DIR}/timings.csv"
printf '%s\n' "file,script,avg_seconds" > "$TIMINGS_FILE"

# Similarity score per (file, script) pair
QUALITY_SCORES_FILE="${RESULTS_DIR}/quality_scores.csv"
printf '%s\n' "file,script,score" > "$QUALITY_SCORES_FILE"

# Memory statistics per (file, script) pair
MEMORY_USAGE_FILE="${RESULTS_DIR}/memory_usage.csv"
printf '%s\n' "file,script,total_allocated_bytes,total_allocations,peak_memory" > "$MEMORY_USAGE_FILE"
#######################################
# Write a short description of the benchmarking environment to
# <results_out_dir>/info.txt, replacing any previous content.
# Reads /proc/cpuinfo and /proc/meminfo and uses `date --utc`.
# Arguments:
#   $1 - dataset directory; only its last path component is recorded
#   $2 - directory in which info.txt is created
# Outputs:
#   info.txt with cpu_model, total_ram, dataset, os_details and timestamp
#   lines; one empty line on stdout.
#######################################
print_experiment_info() {
  local dataset_directory="$1"
  local results_out_dir="$2"
  local info_file="$results_out_dir/info.txt"
  # Declared separately so the helper values stay out of the global scope.
  local cpu_model total_ram curr_date os_details dataset_name

  # Alternative kept for reference: a sanitized CPU name usable inside a tag.
  # cpu_name=$(grep -m 1 'model name' /proc/cpuinfo | awk -F': ' '{print $2}' | \
  # sed -E 's/\(R\)/_/g' | sed 's/CPU//g' | sed 's/Core(TM)//g' | sed 's/@//g' | \
  # sed 's/ //g' | sed 's/ /_/g')
  cpu_model=$(grep -m 1 'model name' /proc/cpuinfo | awk -F': ' '{print $2}')
  # The MemTotal figure is divided by 1000 twice and truncated, so the value
  # written as "GB" is an approximation.
  total_ram=$(grep MemTotal /proc/meminfo | awk '{print int($2 / 1000 / 1000)}')
  curr_date=$(date --utc)
  os_details=$(uname -a)
  dataset_name=$(basename "$dataset_directory")

  # One redirect for the whole report instead of reopening the file per line.
  {
    echo "cpu_model: $cpu_model"
    echo "total_ram: ${total_ram} GB"
    echo "dataset: $dataset_name"
    echo "os_details: $os_details"
    echo "timestamp: $curr_date"
  } > "$info_file"

  # Blank separator line on the console.
  echo ""
}
#######################################
# Turn a file path into a label used for naming output files: the directory
# part is dropped and every "." in the remaining name becomes "_"
# (e.g. "dir/report.pdf" -> "report_pdf").
# Arguments:
#   $1 - file path
# Outputs:
#   The label on stdout.
#######################################
get_file_basename() {
  local input_file="$1"
  local base_name
  # "--" keeps a name that starts with "-" from being parsed as an option.
  base_name=$(basename -- "$input_file")
  # printf instead of echo so a label such as "-n" is printed, not consumed.
  printf '%s\n' "${base_name//./_}"
}
#######################################
# Strip the last ".extension" from a script name
# (e.g. "run_pypdf2.py" -> "run_pypdf2"). Directory components, if any,
# are left in place; a name without a "." is returned unchanged.
# Arguments:
#   $1 - script file name
# Outputs:
#   The shortened name on stdout.
#######################################
get_script_basename() {
  local script="$1"
  # printf instead of echo so a result such as "-n" is printed, not consumed.
  printf '%s\n' "${script%.*}"
}
#######################################
# Time one extraction script on one input file with hyperfine and append the
# mean run time to the timings CSV.
# Globals (read):
#   DO_BENCHMARK_SPEED, WARMUP_RUNS, TIMED_RUNS, SCRIPTS_SRC_DIR,
#   RESULTS_HYPERFINE_OUT_DIR, TIMINGS_FILE
# Arguments:
#   $1 - script file name, relative to SCRIPTS_SRC_DIR
#   $2 - path of the input file, passed to the script as its first argument
#   $3 - output path, passed to the script as its second argument
# Outputs:
#   Progress messages on stdout; hyperfine JSON export
#   <file>_<script>_timing.json in RESULTS_HYPERFINE_OUT_DIR; one
#   "file,script,mean" row appended to TIMINGS_FILE.
#######################################
benchmark_speed() {
  local script="$1"
  local input_file="$2"
  local output_file="$3"
  # Locals, so the caller's own qualifier_name is not overwritten.
  local script_basename file_basename qualifier_name
  local hyperfine_json bench_cmd avg_execution_time

  script_basename=$(get_script_basename "$script")
  file_basename=$(get_file_basename "$input_file")
  qualifier_name="${file_basename}_${script_basename}"

  echo ""
  echo "Benchmarking speed of $qualifier_name using hyperfine..."

  hyperfine_json="${RESULTS_HYPERFINE_OUT_DIR}/${qualifier_name}_timing.json"
  if [[ "$DO_BENCHMARK_SPEED" == "1" ]]; then
    # hyperfine receives the command as one string and hands it to a shell,
    # so each path is escaped with %q. Paths made only of ordinary characters
    # come out unchanged; whitespace is backslash-escaped.
    printf -v bench_cmd 'python %q %q %q' \
      "${SCRIPTS_SRC_DIR}/${script}" "$input_file" "$output_file"
    hyperfine --warmup "$WARMUP_RUNS" --min-runs "$TIMED_RUNS" "$bench_cmd" \
      --export-json "$hyperfine_json"
  fi

  # The JSON is read even when the run above was skipped, which lets an
  # earlier export be reused.
  avg_execution_time=$(jq '.results[0].mean' "$hyperfine_json")
  echo "Average execution time for $script with input $input_file: $avg_execution_time seconds"

  # Quoted target: an unquoted path containing whitespace makes the redirect fail.
  echo "$file_basename,$script_basename,$avg_execution_time" >> "$TIMINGS_FILE"
}
#######################################
# Score one extraction result against its ground truth with
# calc_similarity_score.py and append the score to the quality CSV.
# Globals (read):
#   SCRIPTS_SRC_DIR, QUALITY_SCORES_FILE
# Arguments:
#   $1 - script file name (used only to label the result)
#   $2 - path of the input file (used only to label the result)
#   $3 - path of the extraction output to score
#   $4 - path of the ground-truth file
# Outputs:
#   Progress messages on stdout; one "file,script,score" row appended to
#   QUALITY_SCORES_FILE. The score is whatever the Python script prints.
#######################################
benchmark_quality() {
  local script="$1"
  local input_file="$2"
  local output_file="$3"
  local ground_truth_file="$4"
  # Locals, so the caller's own qualifier_name is not overwritten.
  local script_basename file_basename qualifier_name score

  script_basename=$(get_script_basename "$script")
  file_basename=$(get_file_basename "$input_file")
  qualifier_name="${file_basename}_${script_basename}"

  echo ""
  echo "Calculating similarity score for $qualifier_name ..."
  score=$(python "${SCRIPTS_SRC_DIR}/calc_similarity_score.py" "$output_file" "$ground_truth_file")
  echo "Similarity score for $qualifier_name: $score "

  # Save the quality score. Quoted target: an unquoted path containing
  # whitespace makes the redirect fail.
  echo "$file_basename,$script_basename,$score" >> "$QUALITY_SCORES_FILE"
}
#######################################
# Profile one extraction script on one input file with memray and append the
# allocation statistics to the memory-usage CSV.
# Globals (read):
#   DO_PROFILE_MEMORY, SCRIPTS_SRC_DIR, RESULTS_MEMRAY_BIN_DIR,
#   MEMORY_USAGE_FILE
# Arguments:
#   $1 - script file name, relative to SCRIPTS_SRC_DIR
#   $2 - path of the input file, passed to the script as its first argument
#   $3 - output path, passed to the script as its second argument
# Outputs:
#   Progress messages on stdout; <file>_<script>.bin and
#   <file>_<script>_stats.json in RESULTS_MEMRAY_BIN_DIR (plus whatever
#   `memray flamegraph` emits); one row appended to MEMORY_USAGE_FILE.
#######################################
profile_memory() {
  local script="$1"
  local input_file="$2"
  local output_file="$3"
  # Locals, so the caller's own qualifier_name is not overwritten.
  local script_basename file_basename qualifier_name
  local memray_bin_file memray_json_report
  local total_bytes_allocated total_allocations memory_peak_bytes

  script_basename=$(get_script_basename "$script")
  file_basename=$(get_file_basename "$input_file")
  qualifier_name="${file_basename}_${script_basename}"

  echo ""
  echo "Profiling memory usage of $qualifier_name using memray..."

  memray_bin_file="$RESULTS_MEMRAY_BIN_DIR/${qualifier_name}.bin"
  memray_json_report="${memray_bin_file%.bin}_stats.json"

  if [[ "$DO_PROFILE_MEMORY" == "1" ]]; then
    # Capture the allocation profile (variant with native frames kept for reference).
    #python -m memray run --native --force -o "$memray_bin_file" "./$script" "$input_file" "$output_file"
    python -m memray run --force -o "$memray_bin_file" "${SCRIPTS_SRC_DIR}/${script}" "$input_file" "$output_file"
    # Render a flamegraph from the capture.
    python -m memray flamegraph --force "$memray_bin_file"
    # Export the statistics as JSON for the jq queries below.
    python -m memray stats --force --json --output "$memray_json_report" "$memray_bin_file"
  fi

  # The report is read even when profiling was skipped, which lets an earlier
  # export be reused.
  total_bytes_allocated=$(jq '.total_bytes_allocated' "$memray_json_report")
  total_allocations=$(jq '.total_num_allocations' "$memray_json_report")
  memory_peak_bytes=$(jq '.metadata.peak_memory' "$memray_json_report")
  #stats_total_allocations=$(jq '.metadata.total_allocations' "$memray_json_report")
  echo "Total memory allocated by $qualifier_name: $total_bytes_allocated bytes, $total_allocations allocations"

  # Quoted target: an unquoted path containing whitespace makes the redirect fail.
  echo "$file_basename,$script_basename,$total_bytes_allocated,$total_allocations,$memory_peak_bytes" >> "$MEMORY_USAGE_FILE"
}
# Record machine and dataset details next to the results.
print_experiment_info "$DATASET_FILES_DIR" "$RESULTS_DIR"

# Run every script against every input file.
for script in "${SCRIPTS[@]}"; do
  for input_file in "${INPUT_FILES[@]}"; do
    # Input file, and the ground-truth text that shares its stem.
    file_path="${DATASET_FILES_DIR}/${input_file}"
    ground_truth_file="${DATASET_GT_DIR}/${input_file%.*}.txt"

    # Extraction output is named <script>_<file>_output.txt.
    script_label=$(get_script_basename "$script")
    file_label=$(get_file_basename "$input_file")
    output_file="${RESULTS_EXTRACT_OUT_DIR}/${script_label}_${file_label}_output.txt"

    # Speed first, then quality of the produced output, then memory.
    benchmark_speed "$script" "$file_path" "$output_file"
    benchmark_quality "$script" "$file_path" "$output_file" "$ground_truth_file"
    profile_memory "$script" "$file_path" "$output_file"
  done
done

# Closing hint for the follow-up plotting step.
cat <<EOF
Benchmarking and profiling completed!

You can plot the results using this command:
./plot_results.py $RESULTS_DIR
EOF