Skip to content

Commit 8e69f1a

Browse files
Merge pull request #64 from mkyl/alley-and-cleanup
LSS and Cleanup
2 parents 684fedb + 74df465 commit 8e69f1a

File tree

4 files changed

+104
-21
lines changed

4 files changed

+104
-21
lines changed

Experiments/Scripts/append_lss.py

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
# The LSS results were produced by a separate code base. This script takes the
# resulting CSV files and appends their rows to the end of the overall comparison
# results file, which is used to plot the figures in the paper. It then rewrites
# the parquet copy of that comparison file.
#
# Note: the script appends, so running it more than once adds the LSS rows again.
import csv

import pandas as pd

lss_datasets = ['aids', 'dblp', 'eu2005', 'human', 'lubm80', 'yeast', 'youtube']

build_inference_filename = 'Experiments/Results/LSS/result/build_and_inference_ST.csv'
comparison_filename = 'Experiments/comparison_results.csv'

estimator = 'lss'

# Read the build/inference file once instead of once per dataset. Column 0 holds
# the dataset name and column 3 the value written out as the runtime; when a
# dataset appears on several rows the last one wins, as in a full scan.
runtimes = {}
with open(build_inference_filename, newline='') as build_file_obj:
    for row in csv.reader(build_file_obj):
        # Skip blank or truncated rows rather than failing with an IndexError.
        if len(row) > 3:
            runtimes[row[0]] = row[3]

# newline='' is what the csv module expects for files it reads or writes;
# without it, rows written on Windows end in '\r\r\n'.
with open(comparison_filename, 'a', newline='') as comparison_file_obj:
    writer_obj = csv.writer(comparison_file_obj)
    for dataset in lss_datasets:
        # Datasets missing from the build/inference file keep a runtime of 0.
        runtime = runtimes.get(dataset, 0)
        results_filename = 'Experiments/Results/LSS/result/' + dataset + '/' + dataset + '_NNGINConcat_freq_80_cv.csv'

        with open(results_filename, newline='') as results_file_obj:
            for row in csv.reader(results_file_obj):
                # Skip blank/short rows and rows whose third column is the
                # literal string 'error'.
                if len(row) < 3 or row[2] == 'error':
                    continue
                query = 'query' + row[1]
                value = row[2]
                # Write estimator,dataset,query,value,runtime as a new row in
                # the comparison results file.
                writer_obj.writerow([estimator, dataset, query, value, runtime])

# The `with` blocks above have closed (and flushed) every file, so the CSV is
# complete on disk before it is converted to parquet.
df = pd.read_csv(comparison_filename)
df.to_parquet('Experiments/comparison_results.parquet')

Experiments/Scripts/comparison_exps.jl

+9-9
Original file line numberDiff line numberDiff line change
@@ -60,14 +60,14 @@ println("Estimating...")
6060
run_estimation_experiments(experiment_params; timeout=TIMEOUT_SEC)
6161
run_estimation_experiments(max_bounds_experiment_params; timeout=TIMEOUT_SEC)
6262

63-
comparison_methods = ["alley", "alleyTPI", "wj", "impr", "jsub", "cs", "cset", "sumrdf"]
63+
comparison_methods = ["alley", "alleyTPI", "wj", "impr", "jsub", "cs", "cset", "sumrdf", "lss"]
6464
x_order = [string(data) for data in datasets]
6565
bounds_x_order = [string(data) for data in bounds_datasets]
6666
legend_order = [params.description for params in experiment_params][1:Int(length(experiment_params)/ length(datasets))]
6767
max_bounds_legend_order = [params.description for params in max_bounds_experiment_params][1:Int(length(max_bounds_experiment_params)/ length(bounds_datasets))]
6868
legend_order = vcat(legend_order, comparison_methods)
6969

70-
colors = [:red :yellow :maroon3 :palevioletred1 :dodgerblue :coral :palegreen :mediumpurple2 :darkgreen :cadetblue1]
70+
colors = [:red :yellow :maroon3 :palevioletred1 :dodgerblue :coral :palegreen :mediumpurple2 :darkgreen :cadetblue1 :goldenrod]
7171

7272
println("Graphing figures 3 and 4...")
7373

@@ -131,12 +131,12 @@ graph_grouped_box_plot(max_bounds_experiment_params;
131131
y_label="Inference Latency log\$_{10}\$ (s)",
132132
filename="fig_6") # bounds runtime
133133

134-
comparison_methods = ["alleyTPI", "sumrdf"]
134+
comparison_methods = ["alleyTPI", "sumrdf", "lss"]
135135
x_order = [string(data) for data in datasets]
136136
bar_legend_order = [params.description for params in smaller_experiment_params][1:Int(length(smaller_experiment_params)/ length(datasets))]
137137
bar_legend_order = vcat(bar_legend_order, comparison_methods)
138138
println("bar legend order: ", bar_legend_order)
139-
bar_plot_colors = [:red :palevioletred1 :cadetblue1]
139+
bar_plot_colors = [:red :palevioletred1 :cadetblue1 :goldenrod]
140140

141141
println("Graphing figures 7 and 8")
142142

@@ -146,9 +146,9 @@ graph_grouped_bar_plot(smaller_experiment_params;
146146
x_order = x_order,
147147
legend_order = bar_legend_order,
148148
ylims=[0, 10],
149-
y_ticks = [1, 2, 3, 4, 5, 6, 7, 8, 9],#[20, 40, 60, 80, 100],
149+
y_ticks = [1, 2, 3, 4, 5, 6, 7, 8],
150150
legend_pos=:topleft,
151-
dimensions = (850, 400),
151+
dimensions = (900, 400),
152152
scale_factor = 1000,
153153
log_scale = true,
154154
group_colors = bar_plot_colors,
@@ -161,9 +161,9 @@ graph_grouped_bar_plot(smaller_experiment_params;
161161
x_order = x_order,
162162
legend_order = bar_legend_order,
163163
legend_pos=:topleft,
164-
ylims=[0, 10],
165-
y_ticks = [1, 2, 3, 4, 5, 6, 7, 8, 9], #[100, 200, 300, 400, 500, 600, 700, 800],
166-
dimensions = (850, 400),
164+
ylims=[0, 11],
165+
y_ticks = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
166+
dimensions = (900, 400),
167167
scale_factor = 1000,
168168
log_scale = true,
169169
group_colors = bar_plot_colors,

Experiments/graph_results.jl

+56-11
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ function graph_box_plot(experiment_params_list::Vector{ExperimentParams};
2121
# load the results
2222
results_filename = params_to_results_filename(experiment_params)
2323
results_path = "Experiments/Results/Estimation_" * results_filename
24-
# println("results path: ", results_path)
2524
results_df = CSV.read(results_path, DataFrame; normalizenames=true)
2625

2726
# keep track of the data points
@@ -105,7 +104,6 @@ function graph_grouped_box_plot(experiment_params_list::Vector{ExperimentParams}
105104
# load the results
106105
results_filename = params_to_results_filename(experiment_params)
107106
results_path = "Experiments/Results/Estimation_" * results_filename
108-
# println("results path: ", results_path)
109107
results_df = CSV.read(results_path, DataFrame; normalizenames=true)
110108

111109
# get the x_value and grouping (same for all results in this experiment param)
@@ -186,24 +184,36 @@ function comparison_dataset()
186184
dataset = comparison_results[i, :Dataset]
187185
query_path = comparison_results[i, :Query]
188186
if dataset == "lubm80"
189-
comparison_results[i, :QueryType] = match(r".*/lubm80_(.*).txt", query_path).captures[1]
187+
if !isnothing(match(r".*/lubm80_(.*).txt", query_path))
188+
comparison_results[i, :QueryType] = match(r".*/lubm80_(.*).txt", query_path).captures[1]
189+
else
190+
comparison_results[i, :QueryType] = "n/a"
191+
end
190192
elseif dataset in ["aids", "human", "yago"]
191-
comparison_results[i, :QueryType] = match(r"(.*)_.*/.*", query_path).captures[1]
193+
if !isnothing(match(r"(.*)_.*/.*", query_path))
194+
comparison_results[i, :QueryType] = match(r"(.*)_.*/.*", query_path).captures[1]
195+
else
196+
comparison_results[i, :QueryType] = "n/a"
197+
end
192198
else
193-
comparison_results[i, :QueryType] = match(r".*/query_(.*)_.*", query_path).captures[1]
199+
if !isnothing(match(r".*/query_(.*)_.*", query_path))
200+
comparison_results[i, :QueryType] = match(r".*/query_(.*)_.*", query_path).captures[1]
201+
else
202+
comparison_results[i, :QueryType] = "n/a"
203+
end
194204
end
195205
end
196206
results_dict = Dict()
197207
for i in 1:nrow(comparison_results)
198208
dataset = comparison_results[i, :Dataset]
199209
estimator = comparison_results[i, :Estimator]
200-
query_path = comparison_results[i, :Query]
210+
query_path = (estimator == "lss") ? "query" * string(i) : comparison_results[i, :Query]
201211
results_dict[(dataset, estimator, query_path)] = (Estimate=comparison_results[i, :Value],
202212
Runtime=comparison_results[i, :Runtime],
203213
QueryType=comparison_results[i,:QueryType])
204214
end
205215
estimators = unique(comparison_results[:, :Estimator])
206-
println(estimators)
216+
println("Estimators: ", estimators)
207217
return estimators, results_dict
208218
end
209219

@@ -265,12 +275,12 @@ function graph_grouped_boxplot_with_comparison_methods(experiment_params_list::V
265275
current_y = if y_type == estimate_error
266276
min(10^30, max(1, results_df[i, :Estimate])) / results_df[i, :TrueCard]
267277
else # y_type == runtime
268-
results_df[i, :EstimationTime]
278+
typeof(results_df[i, :EstimationTime]) == String ? parse(Float64, results_df[i, :EstimationTime]) : results_df[i, :EstimationTime]
269279
end
270280
true_card[(data, get_query_id(string(experiment_params.dataset), results_df[i, :QueryPath]))] = (results_df[i, :TrueCard], current_x)
271281
# push the errors and their groupings into the correct vector
272282
push!(x_values, string(current_x))
273-
push!(y_values, current_y)
283+
push!(y_values, typeof(current_y) == String ? parse(Float64, current_y) : current_y)
274284
push!(estimators, current_group)
275285
end
276286
end
@@ -283,6 +293,9 @@ function graph_grouped_boxplot_with_comparison_methods(experiment_params_list::V
283293
card = query_card_and_size[1]
284294
size = query_card_and_size[2]
285295
for estimator in estimator_types
296+
if (estimator == "lss")
297+
continue
298+
end
286299
comp_key = (data, estimator, query_path)
287300
(estimate, runtime) = 1, 60 # TODO: We shouldn't use an arbitrary number for runtime here
288301
if haskey(comparison_results, comp_key)
@@ -302,15 +315,37 @@ function graph_grouped_boxplot_with_comparison_methods(experiment_params_list::V
302315
current_y = if y_type == estimate_error
303316
min(10^30, max(1, estimate)) / card
304317
else # y_type == runtime
305-
runtime / 1000.0
318+
typeof(runtime) == String ? parse(Float64, runtime) / 1000 : runtime / 1000.0
306319
end
320+
307321
# push the errors and their groupings into the correct vector
308322
push!(x_values, string(current_x))
309-
push!(y_values, current_y)
323+
push!(y_values, typeof(current_y) == String ? parse(Float64, current_y) : current_y)
310324
push!(estimators, estimator)
311325
end
312326
end
313327

328+
# now handle leftover lss data
329+
if ("lss" in estimator_types)
330+
for results_key in keys(comparison_results)
331+
# results_dict[(dataset, estimator, query_path)] = (Estimate=comparison_results[i, :Value], Runtime=comparison_results[i, :Runtime], QueryType=comparison_results[i,:QueryType])
332+
# look for all the rows where the estimator is lss, then push the appropriate x and y values.
333+
if (results_key[2] == "lss")
334+
current_results = comparison_results[results_key]
335+
current_x = results_key[1]
336+
current_y = if y_type == estimate_error
337+
current_results[1]
338+
else
339+
current_results[2]
340+
end
341+
estimator = "lss"
342+
push!(x_values, string(current_x))
343+
push!(y_values, typeof(current_y) == String ? parse(Float64, current_y) : current_y)
344+
push!(estimators, estimator)
345+
end
346+
end
347+
end
348+
314349
if isnothing(x_order)
315350
x_order = sort(unique(x_values))
316351
end
@@ -409,13 +444,23 @@ function graph_grouped_bar_plot(experiment_params_list::Vector{ExperimentParams}
409444
append!(x_values, ["aids", "human", "lubm80", "dblp", "eu2005", "patents", "yeast", "youtube"])
410445
append!(y_values, [88, 648, 569, 800, 6600, 6900, 6300, 3200])
411446
append!(groups, ["alleyTPI" for _ in 1:8])
447+
append!(x_values, ["aids", "human", "lubm80", "dblp", "eu2005", "yeast", "youtube"])
448+
append!(y_values, [9.023910, 9.067842, 9.018477, 8.981142, 9.010042, 9.045878, 8.992702]) # units of MB
449+
append!(groups, ["lss" for _ in 1:7])
450+
412451
elseif y_type == build_time
413452
append!(x_values, ["aids", "human", "lubm80", "dblp", "eu2005", "patents", "yeast", "youtube"])
414453
append!(y_values, [.3, 4.5, 9.9, .5, 4.2, 8.5, .1, 2.1])
415454
append!(groups, ["sumrdf" for _ in eachindex(y_values)])
416455
append!(x_values, ["aids", "human", "lubm80", "dblp", "eu2005", "patents", "yeast", "youtube"])
417456
append!(y_values, [221, 2518, 17452, 1061, 14233, 11738, 35585, 11044])
418457
append!(groups, ["alleyTPI" for _ in 1:8])
458+
append!(x_values, ["aids", "human", "lubm80", "dblp", "eu2005", "yeast", "youtube"])
459+
# append!(y_values, [1022.6, 29.5023, 3.6737, 3355.36, 492.89, 7047.44, 3130.0165]) # multithreaded results
460+
append!(y_values, [2207.7717, 50.2491, 5.9976, 8105.503, 328.89, 19839.2887, 2309.733]) # single-threaded results
461+
append!(groups, ["lss" for _ in 1:7])
462+
463+
419464
end
420465
for experiment_params in experiment_params_list
421466
# load the results

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ julia> using Pkg;
3737
julia> Pkg.instantiate();
3838
```
3939

40-
3. Download query graphs, data graphs, and true cardinalities from [G-Care](https://github.com/yspark-dblab/gcare) and [In-Memory Subgraph Matching](https://github.com/RapidsAtHKUST/SubgraphMatching)
40+
3. Download query graphs, data graphs, and true cardinalities from [G-Care](https://github.com/yspark-dblab/gcare) and [In-Memory Subgraph Matching](https://github.com/RapidsAtHKUST/SubgraphMatching). These files are also available as [zipped files](https://drive.google.com/drive/folders/1pjJz9ahXFEd3Nd1OxqLA2YNnXGuCVpEp?usp=sharing).
4141

4242
## API
4343

0 commit comments

Comments
 (0)