Merge pull request #64 from mkyl/alley-and-cleanup

diandremiguels · web-flow · commit 8e69f1a751b8 · 2024-07-31T13:37:25.000-04:00
LSS and Cleanup
diff --git a/Experiments/Scripts/append_lss.py b/Experiments/Scripts/append_lss.py
@@ -0,0 +1,38 @@
+# The LSS results were produced with a separate code base. This script reads the resulting CSV files and appends their rows
+# to the overall comparison results file, which is then used to plot the figures in the paper.
+import csv
+import pandas as pd
+
+lss_datasets = ['aids', 'dblp', 'eu2005', 'human', 'lubm80', 'yeast', 'youtube']
+
+build_inference_filename = 'Experiments/Results/LSS/result/build_and_inference_ST.csv'
+comparison_filename = 'Experiments/comparison_results.csv'
+
+estimator = 'lss'
+
+# Read the build/inference file once (column 0 = dataset, column 3 = runtime); a later row for the same dataset wins.
+runtimes = {}
+with open(build_inference_filename, newline='') as build_file_obj:
+    for row in csv.reader(build_file_obj):
+        if row and row[0] in lss_datasets:
+            runtimes[row[0]] = row[3]
+
+# newline='' is what the csv module asks for on files handed to csv.reader/csv.writer;
+# the `with` blocks close every file, so no explicit close() calls are needed.
+with open(comparison_filename, 'a', newline='') as comparison_file_obj:
+    writer_obj = csv.writer(comparison_file_obj)
+    for dataset in lss_datasets:
+        runtime = runtimes.get(dataset, 0)  # 0 when the dataset has no build/inference row
+        results_filename = 'Experiments/Results/LSS/result/' + dataset + '/' + dataset + '_NNGINConcat_freq_80_cv.csv'
+        with open(results_filename, newline='') as results_file_obj:
+            for row in csv.reader(results_file_obj):
+                if not row or row[2] == 'error':  # skip blank lines and rows flagged as errors
+                    continue
+                query = 'query' + row[1]
+                value = row[2]
+                # append estimator,dataset,query,value,runtime as a new row of comparison_results.csv
+                writer_obj.writerow([estimator, dataset, query, value, runtime])
+
+# Re-read the now-extended CSV and store it as parquet.
+df = pd.read_csv(comparison_filename)
+df.to_parquet('Experiments/comparison_results.parquet')
diff --git a/Experiments/Scripts/comparison_exps.jl b/Experiments/Scripts/comparison_exps.jl
@@ -60,14 +60,14 @@ println("Estimating...")
 run_estimation_experiments(experiment_params; timeout=TIMEOUT_SEC)
 run_estimation_experiments(max_bounds_experiment_params; timeout=TIMEOUT_SEC)
 
-comparison_methods =  ["alley", "alleyTPI", "wj", "impr", "jsub", "cs", "cset", "sumrdf"]
+comparison_methods =  ["alley", "alleyTPI", "wj", "impr", "jsub", "cs", "cset", "sumrdf", "lss"]
 x_order = [string(data) for data in datasets]
 bounds_x_order = [string(data) for data in bounds_datasets]
 legend_order = [params.description for params in experiment_params][1:Int(length(experiment_params)/ length(datasets))]
 max_bounds_legend_order = [params.description for params in max_bounds_experiment_params][1:Int(length(max_bounds_experiment_params)/ length(bounds_datasets))]
 legend_order = vcat(legend_order, comparison_methods)
 
-colors = [:red :yellow :maroon3 :palevioletred1 :dodgerblue :coral :palegreen :mediumpurple2 :darkgreen :cadetblue1]
+colors = [:red :yellow :maroon3 :palevioletred1 :dodgerblue :coral :palegreen :mediumpurple2 :darkgreen :cadetblue1 :goldenrod]
 
 println("Graphing figures 3 and 4...")
 
@@ -131,12 +131,12 @@ graph_grouped_box_plot(max_bounds_experiment_params;
                                                 y_label="Inference Latency log\$_{10}\$ (s)",
                                                 filename="fig_6") # bounds runtime
 
-comparison_methods =  ["alleyTPI", "sumrdf"]
+comparison_methods =  ["alleyTPI", "sumrdf", "lss"]
 x_order = [string(data) for data in datasets]
 bar_legend_order = [params.description for params in smaller_experiment_params][1:Int(length(smaller_experiment_params)/ length(datasets))]
 bar_legend_order = vcat(bar_legend_order, comparison_methods)
 println("bar legend order: ", bar_legend_order)
-bar_plot_colors = [:red :palevioletred1 :cadetblue1]
+bar_plot_colors = [:red :palevioletred1 :cadetblue1 :goldenrod]
 
 println("Graphing figures 7 and 8")
 
@@ -146,9 +146,9 @@ graph_grouped_bar_plot(smaller_experiment_params;
                         x_order = x_order,
                         legend_order = bar_legend_order,
                         ylims=[0, 10],
-                        y_ticks = [1, 2, 3, 4, 5, 6, 7, 8, 9],#[20, 40, 60, 80, 100],
+                        y_ticks = [1, 2, 3, 4, 5, 6, 7, 8],
                         legend_pos=:topleft,
-                        dimensions = (850, 400),
+                        dimensions = (900, 400),
                         scale_factor = 1000,
                         log_scale = true,
                         group_colors = bar_plot_colors,
@@ -161,9 +161,9 @@ graph_grouped_bar_plot(smaller_experiment_params;
                         x_order = x_order,
                         legend_order = bar_legend_order,
                         legend_pos=:topleft,
-                        ylims=[0, 10],
-                        y_ticks = [1, 2, 3, 4, 5, 6, 7, 8, 9], #[100, 200, 300, 400, 500, 600, 700, 800],
-                        dimensions = (850, 400),
+                        ylims=[0, 11],
+                        y_ticks = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+                        dimensions = (900, 400),
                         scale_factor = 1000,
                         log_scale = true,
                         group_colors = bar_plot_colors,
diff --git a/Experiments/graph_results.jl b/Experiments/graph_results.jl
@@ -21,7 +21,6 @@ function graph_box_plot(experiment_params_list::Vector{ExperimentParams};
         # load the results
         results_filename = params_to_results_filename(experiment_params)
         results_path = "Experiments/Results/Estimation_" * results_filename
-        # println("results path: ", results_path)
         results_df = CSV.read(results_path, DataFrame; normalizenames=true)
 
         # keep track of the data points
@@ -105,7 +104,6 @@ function graph_grouped_box_plot(experiment_params_list::Vector{ExperimentParams}
         # load the results
         results_filename = params_to_results_filename(experiment_params)
         results_path = "Experiments/Results/Estimation_" * results_filename
-        # println("results path: ", results_path)
         results_df = CSV.read(results_path, DataFrame; normalizenames=true)
 
         # get the x_value and grouping (same for all results in this experiment param)
@@ -186,24 +184,36 @@ function comparison_dataset()
         dataset = comparison_results[i, :Dataset]
         query_path = comparison_results[i, :Query]
         if dataset == "lubm80"
-            comparison_results[i, :QueryType] = match(r".*/lubm80_(.*).txt", query_path).captures[1]
+            if !isnothing(match(r".*/lubm80_(.*).txt", query_path))
+                comparison_results[i, :QueryType] = match(r".*/lubm80_(.*).txt", query_path).captures[1]
+            else
+                comparison_results[i, :QueryType] = "n/a"
+            end
         elseif dataset in ["aids", "human", "yago"]
-            comparison_results[i, :QueryType] = match(r"(.*)_.*/.*", query_path).captures[1]
+            if !isnothing(match(r"(.*)_.*/.*", query_path))
+                comparison_results[i, :QueryType] = match(r"(.*)_.*/.*", query_path).captures[1]
+            else
+                comparison_results[i, :QueryType] = "n/a"
+            end
         else
-            comparison_results[i, :QueryType] = match(r".*/query_(.*)_.*", query_path).captures[1]
+            if !isnothing(match(r".*/query_(.*)_.*", query_path))
+                comparison_results[i, :QueryType] = match(r".*/query_(.*)_.*", query_path).captures[1]
+            else
+                comparison_results[i, :QueryType] = "n/a"
+            end
         end
     end
     results_dict = Dict()
     for i in 1:nrow(comparison_results)
         dataset = comparison_results[i, :Dataset]
         estimator = comparison_results[i, :Estimator]
-        query_path = comparison_results[i, :Query]
+        query_path = (estimator == "lss") ? "query" * string(i) : comparison_results[i, :Query]
         results_dict[(dataset, estimator, query_path)] = (Estimate=comparison_results[i, :Value],
                                                             Runtime=comparison_results[i, :Runtime],
                                                             QueryType=comparison_results[i,:QueryType])
     end
     estimators = unique(comparison_results[:, :Estimator])
-    println(estimators)
+    println("Estimators: ", estimators)
     return estimators, results_dict
 end
 
@@ -265,12 +275,12 @@ function graph_grouped_boxplot_with_comparison_methods(experiment_params_list::V
             current_y = if y_type == estimate_error
                 min(10^30, max(1, results_df[i, :Estimate])) / results_df[i, :TrueCard]
             else # y_type == runtime
-                results_df[i, :EstimationTime]
+                typeof(results_df[i, :EstimationTime]) == String ? parse(Float64, results_df[i, :EstimationTime]) : results_df[i, :EstimationTime]
             end
             true_card[(data, get_query_id(string(experiment_params.dataset), results_df[i, :QueryPath]))] = (results_df[i, :TrueCard], current_x)
             # push the errors and their groupings into the correct vector
             push!(x_values, string(current_x))
-            push!(y_values, current_y)
+            push!(y_values, typeof(current_y) == String ? parse(Float64, current_y) : current_y)
             push!(estimators, current_group)
         end
     end
@@ -283,6 +293,9 @@ function graph_grouped_boxplot_with_comparison_methods(experiment_params_list::V
         card = query_card_and_size[1]
         size = query_card_and_size[2]
         for estimator in estimator_types
+            if (estimator == "lss")
+                continue
+            end
             comp_key = (data, estimator, query_path)
             (estimate, runtime) = 1, 60 # TODO: We shouldn't use an arbitrary number for runtime here
             if haskey(comparison_results, comp_key)
@@ -302,15 +315,37 @@ function graph_grouped_boxplot_with_comparison_methods(experiment_params_list::V
             current_y = if y_type == estimate_error
                 min(10^30, max(1, estimate)) / card
             else # y_type == runtime
-                runtime / 1000.0
+                typeof(runtime) == String ? parse(Float64, runtime) / 1000 : runtime / 1000.0
             end
+            
             # push the errors and their groupings into the correct vector
             push!(x_values, string(current_x))
-            push!(y_values, current_y)
+            push!(y_values, typeof(current_y) == String ? parse(Float64, current_y) : current_y)
             push!(estimators, estimator)
         end
     end
 
+    # now handle leftover lss data
+    if ("lss" in estimator_types)
+        for results_key in keys(comparison_results)
+            # results_dict[(dataset, estimator, query_path)] = (Estimate=comparison_results[i, :Value], Runtime=comparison_results[i, :Runtime], QueryType=comparison_results[i,:QueryType])
+            # look for all the rows where the estimator is lss, then push the appropriate x and y values.
+            if (results_key[2] == "lss")
+                current_results = comparison_results[results_key]
+                current_x = results_key[1]
+                current_y = if y_type == estimate_error
+                    current_results[1]
+                else
+                    current_results[2]
+                end
+                estimator = "lss"
+                push!(x_values, string(current_x))
+                push!(y_values, typeof(current_y) == String ? parse(Float64, current_y) : current_y)
+                push!(estimators, estimator)
+            end
+        end
+    end
+
     if isnothing(x_order)
         x_order = sort(unique(x_values))
     end
@@ -409,13 +444,23 @@ function graph_grouped_bar_plot(experiment_params_list::Vector{ExperimentParams}
         append!(x_values, ["aids", "human", "lubm80", "dblp", "eu2005", "patents", "yeast", "youtube"])
         append!(y_values, [88, 648, 569, 800, 6600, 6900, 6300, 3200])
         append!(groups, ["alleyTPI" for _ in 1:8])
+        append!(x_values, ["aids", "human", "lubm80", "dblp", "eu2005", "yeast", "youtube"])
+        append!(y_values, [9.023910, 9.067842, 9.018477, 8.981142, 9.010042, 9.045878, 8.992702]) # units of MB
+        append!(groups, ["lss" for _ in 1:7])
+
     elseif y_type == build_time
         append!(x_values, ["aids", "human", "lubm80", "dblp", "eu2005", "patents", "yeast", "youtube"])
         append!(y_values, [.3, 4.5, 9.9, .5, 4.2, 8.5, .1, 2.1])
         append!(groups, ["sumrdf" for _ in eachindex(y_values)])
         append!(x_values, ["aids", "human", "lubm80", "dblp", "eu2005", "patents", "yeast", "youtube"])
         append!(y_values, [221, 2518, 17452, 1061, 14233, 11738, 35585, 11044])
         append!(groups, ["alleyTPI" for _ in 1:8])
+        append!(x_values, ["aids", "human", "lubm80", "dblp", "eu2005", "yeast", "youtube"])
+        # append!(y_values, [1022.6, 29.5023, 3.6737, 3355.36, 492.89, 7047.44, 3130.0165]) # multithreaded results
+        append!(y_values, [2207.7717, 50.2491, 5.9976, 8105.503, 328.89, 19839.2887, 2309.733]) # single-threaded results
+        append!(groups, ["lss" for _ in 1:7])
+
+
     end
     for experiment_params in experiment_params_list
         # load the results
diff --git a/README.md b/README.md
@@ -37,7 +37,7 @@ julia> using Pkg;
 julia> Pkg.instantiate();
 ```
 
-3. Download query graphs, data graphs, and true cardinalities from [G-Care](https://github.com/yspark-dblab/gcare) and [In-Memory Subgraph Matching](https://github.com/RapidsAtHKUST/SubgraphMatching)
+3. Download query graphs, data graphs, and true cardinalities from [G-Care](https://github.com/yspark-dblab/gcare) and [In-Memory Subgraph Matching](https://github.com/RapidsAtHKUST/SubgraphMatching); these are also available as [zipped files](https://drive.google.com/drive/folders/1pjJz9ahXFEd3Nd1OxqLA2YNnXGuCVpEp?usp=sharing).
 
 ## API