uwdb
diff --git a/Colorful_Cardinality_Calculation_Tech_Report.pdf
1.55 MB b/Colorful_Cardinality_Calculation_Tech_Report.pdf
1.55 MB
diff --git a/Experiments/Scripts/comparison_exps.jl
+35-3 b/Experiments/Scripts/comparison_exps.jl
+35-3
diff --git a/Experiments/Scripts/estimator-failure.jl
+36-24 b/Experiments/Scripts/estimator-failure.jl
+36-24
diff --git a/Experiments/comparison_results.parquet
25 KB b/Experiments/comparison_results.parquet
25 KB
diff --git a/Experiments/graph_results.jl
+18-3 b/Experiments/graph_results.jl
+18-3
@@ -46,13 +46,13 @@ end
 
 println("Building...")
 
-build_experiments(experiment_params)
+#build_experiments(experiment_params)
 
 println("Estimating...")
 
-run_estimation_experiments(experiment_params; timeout=TIMEOUT_SEC)
+#run_estimation_experiments(experiment_params; timeout=TIMEOUT_SEC)
 
-comparison_methods =  ["alley", "wj", "impr", "jsub", "cs", "cset", "sumrdf"]
+comparison_methods =  ["alley", "alleyTPI", "wj", "impr", "jsub", "cs", "cset", "sumrdf"]
 x_order = [string(data) for data in datasets]
 legend_order = [params.description for params in experiment_params][1:Int(length(experiment_params)/ length(datasets))]
 legend_order = vcat(legend_order, comparison_methods)
@@ -113,3 +113,35 @@ graph_grouped_boxplot_with_comparison_methods(experiment_params;
                                                 y_label="Relative Error log\$_{10}\$",
                                                 x_label = "Query Size",
                                                 filename="query_size_error")
+
+
+
+# Figure 4 bar plots (memory footprint, build time); baselines go last in the legend.
+comparison_methods = ["alleyTPI", "sumrdf"]
+x_order = [string(d) for d in datasets]
+legend_order = vcat(
+    [p.description for p in experiment_params][1:Int(length(experiment_params) / length(datasets))],
+    comparison_methods)
+println("Graphing figure 4")
+
+graph_grouped_bar_plot(experiment_params;
+                        grouping=description,
+                        y_type=memory_footprint,
+                        x_order=x_order, legend_order=legend_order,
+                        legend_pos=:topright,
+                        ylims=[0, 100],
+                        y_ticks=collect(20:20:100),
+                        dimensions=(1000, 550),
+                        y_label="Memory (MBs)",
+                        filename="overall_memory")
+
+graph_grouped_bar_plot(experiment_params;
+                        grouping=description,
+                        y_type=build_time,
+                        x_order=x_order, legend_order=legend_order,
+                        legend_pos=:topleft,
+                        ylims=[0, 800],
+                        y_ticks=collect(100:100:800),
+                        dimensions=(1000, 550),
+                        y_label="Build Time (s)",
+                        filename="overall_build_time")
@@ -3,30 +3,40 @@ include("../Experiments.jl")
 #datasets = [human, aids]
 datasets = [human, aids, lubm80, yeast, dblp, youtube, eu2005, patents]
 #datasets = [human, aids, yeast, dblp, youtube, eu2005, patents]
-datasets = [yeast]
+#datasets = [human, aids, yeast, dblp, youtube, ]
 queries = load_querysets(datasets)
 num_queries = Dict(string(dataset)=>length(queries[dataset]) for dataset in datasets)
 
 methods, comparison_results = comparison_dataset()
 
 failure_counts = Dict()
 failure_probabilities = Dict()
+missing_counts = Dict()
+zero_counts = Dict()
 for method in methods
     failure_counts[method] = counter(String)
+    missing_counts[method] = counter(String)
+    zero_counts[method] = counter(String)
     failure_probabilities[method] = Dict()
     for dataset in datasets
         string_dataset = string(dataset)
-        for query in queries[dataset]
-            qid = get_query_id(string_dataset, query.query_path)
-            comp_key = (string_dataset, method, qid)
-            if !haskey(comparison_results, comp_key)
-                inc!(failure_counts[method], string_dataset)
-            elseif comparison_results[comp_key].Estimate == 0
-                inc!(failure_counts[method], string_dataset)
-            elseif comparison_results[comp_key].Estimate == Inf
-                inc!(failure_counts[method], string_dataset)
-            elseif comparison_results[comp_key].Estimate == NaN
-                inc!(failure_counts[method], string_dataset)
+        if method == "cset"
+            failure_counts[method][string_dataset] = 0
+        else
+            for query in queries[dataset]
+                qid = get_query_id(string_dataset, query.query_path)
+                comp_key = (string_dataset, method, qid)
+                if !haskey(comparison_results, comp_key)
+                    inc!(failure_counts[method], string_dataset)
+                    inc!(missing_counts[method], string_dataset)
+                elseif comparison_results[comp_key].Estimate == 0
+                    inc!(failure_counts[method], string_dataset)
+                    inc!(zero_counts[method], string_dataset)
+                elseif comparison_results[comp_key].Estimate == Inf ||
+                       isnan(comparison_results[comp_key].Estimate)
+                    # `x == NaN` is always false in Julia, so NaN must be detected with `isnan`.
+                    inc!(failure_counts[method], string_dataset)
+                end
             end
         end
         failure_probabilities[method][string_dataset] = failure_counts[method][string_dataset] / num_queries[string_dataset]
@@ -35,10 +45,10 @@ end
 
 failure_counts["BSK"] = counter(String)
 failure_counts["BSK++"] = counter(String)
-failure_counts["AvgMix64"] = counter(String)
+failure_counts["AvgMix32"] = counter(String)
 failure_probabilities["BSK"] = Dict()
 failure_probabilities["BSK++"] = Dict()
-failure_probabilities["AvgMix64"] = Dict()
+failure_probabilities["AvgMix32"] = Dict()
 for dataset in datasets
     string_dataset = string(dataset)
     bsk_params = ExperimentParams(deg_stats_type=MaxDegStats,
@@ -83,30 +93,31 @@ for dataset in datasets
     failure_probabilities["BSK++"][string_dataset] = failure_counts["BSK++"][string_dataset] / num_queries[string_dataset]
 
 
-    mix_scheme = [(QuasiStable, 32), (NeighborNodeLabels, 16), (NodeLabels, 16)]
+    mix_scheme = [(Degree, 8), (QuasiStable, 8), (NeighborNodeLabels, 8), (NodeLabels, 8)]
     avg_params = ExperimentParams(dataset=dataset,
-                                    n_replications=2,
-                                    partitioning_scheme=mix_scheme)
+                                    partitioning_scheme=mix_scheme,
+                                    inference_max_paths=500)
 #    build_experiments([avg_params])
 #    run_estimation_experiments([avg_params]; timeout=TIMEOUT_SEC)
     avg_filename = params_to_results_filename(avg_params)
     avg_path = "Experiments/Results/Estimation_" * avg_filename
     avg_df = CSV.read(avg_path, DataFrame; normalizenames=true)
     for i in 1:nrow(avg_df)
         if avg_df[i, :Failure]
-            inc!(failure_counts["AvgMix64"], string_dataset)
+            inc!(failure_counts["AvgMix32"], string_dataset)
         end
     end
-    failure_probabilities["AvgMix64"][string_dataset] = failure_counts["AvgMix64"][string_dataset] / num_queries[string_dataset]
+    failure_probabilities["AvgMix32"][string_dataset] = failure_counts["AvgMix32"][string_dataset] / num_queries[string_dataset]
 end
 
-estimators = ["cs", "wj", "jsub", "impr", "cset", "alley", "BSK", "BSK++", "sumrdf", "AvgMix64"]
+estimators = ["cs", "wj", "jsub", "impr", "cset", "alley", "alleyTPI", "BSK++", "sumrdf", "AvgMix32"]
 
 global latex_table = """
 \\begin{table*}[]
+\\caption{Estimator Failure Rates}
 \\begin{tabular}{|l|l|l|l|l|l|l|l|l|l|l|}
 \\hline
-\\textbf{Dataset\\textbackslash{}Method} """
+\\textbf{Dataset\\textbackslash{}Method} \n"""
 for estimator in estimators
     global latex_table *= """& \\textbf{""" * string(estimator) * """} """
 end
@@ -115,13 +126,14 @@ global latex_table *= """\\\\
 for dataset in datasets
     global latex_table *= """\\textbf{""" * string(dataset) * """} """
     for estimator in estimators
-        global latex_table *= " & " * @sprintf("%.2f", failure_probabilities[estimator][string(dataset)])
+        failure_prob = failure_probabilities[estimator][string(dataset)]
+        red_percent = floor(Int, 100 * failure_prob^.5)  # red share grows with the square root of the rate
+        global latex_table *= " & \\cellcolor{red!$(red_percent)!green!50}" * @sprintf("%.2f", failure_prob)
     end
-    global latex_table *= """\\\\ \\hline """
+    global latex_table *= """\\\\ \\hline \n"""
 end
 global latex_table *= """
 \\end{tabular}
-\\caption{Estimator Failure Rates}
 \\label{tbl:estimator-failure}
 \\end{table*}
 """
 
@@ -197,6 +197,7 @@ function comparison_dataset()
                                                             QueryType=comparison_results[i,:QueryType])
     end
     estimators = unique(comparison_results[:, :Estimator])
+    println(estimators)
     return estimators, results_dict
 end
 
@@ -275,14 +276,13 @@ function graph_grouped_boxplot_with_comparison_methods(experiment_params_list::V
         size = query_card_and_size[2]
         for estimator in estimator_types
             comp_key = (data, estimator, query_path)
-            (estimate, runtime) = 1, 10 # TODO: We shouldn't use an arbitrary number for runtime here
+            (estimate, runtime) = 1, 60 # TODO: We shouldn't use an arbitrary number for runtime here
             if haskey(comparison_results, comp_key)
                 result = comparison_results[comp_key]
                 estimate = result.Estimate
                 runtime = result.Runtime
             else
                 push!(estimator_dataset_missing, (estimator, data))
-                continue
             end
 
             current_x = if x_type == dataset
@@ -368,6 +368,21 @@ function graph_grouped_bar_plot(experiment_params_list::Vector{ExperimentParams}
     x_values = []
     y_values = Float64[]
     groups = []
+    if y_type == memory_footprint
+        append!(x_values, ["aids", "human", "lubm80", "dblp", "eu2005", "patents", "yeast", "youtube"])
+        append!(y_values, [1.6, 0.1, 19.5, 2, 5.8, 28, .2, 7.8])
+        append!(groups, ["sumrdf" for _ in 1:8])
+        append!(x_values, ["aids", "human", "lubm80"])
+        append!(y_values, [88, 648, 483])
+        append!(groups, ["alleyTPI" for _ in 1:3])
+    elseif y_type == build_time
+        append!(x_values, ["aids", "human", "lubm80", "dblp", "eu2005", "patents", "yeast", "youtube"])
+        append!(y_values, [.3, 4.5, 9.9, .5, 4.2, 8.5, .1, 2.1])
+        append!(groups, ["sumrdf" for _ in eachindex(y_values)])
+        append!(x_values, ["aids", "human", "lubm80"])
+        append!(y_values, [49, 614, 313])
+        append!(groups, ["alleyTPI" for _ in 1:3])
+    end
     for experiment_params in experiment_params_list
         # load the results
         results_filename = params_to_results_filename(experiment_params)
@@ -516,4 +531,4 @@ function convert_dataset_to_string(data::DATASET)
     else
         return "unknown"
     end
-end
+end