Skip to content

Commit c6ef887

Browse files
committed
Fix cuckoo filter overflow issue
1 parent ff8fdd0 commit c6ef887

File tree

4 files changed

+38
-15
lines changed

4 files changed

+38
-15
lines changed

src/CardinalityWithColors.jl

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
using Probably
12
using Probably: BloomFilter, constrain
23
using DataStructures: counter, Dict, Set, Vector, inc!, Queue
34
using AutoHashEquals

src/ColorSummary.jl

+2-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
using Graphs
2-
using Probably
3-
42
"""
53
The ColorSummary struct holds statistical information associated with the colored graph.
64
It keeps detailed information about the number of edges between colors of a particular color and which land in
@@ -14,6 +12,7 @@ mutable struct ColorSummary{DS}
1412
color_label_cardinality::Dict{Color, Dict{Int, Int}} # color_label_cardinality[c][v] = num_vertices
1513
edge_deg::Dict{Int, Dict{Int, Dict{Color, Dict{Color, DS}}}} # edge_deg[e][v2][c1][c2] = degreestat
1614
color_filters::Dict{Color, SmallCuckoo} # color_filters[c] = filter
15+
color_full::Set{Color} # Set of colors whose filter is considered full (load factor above 0.95)
1716
cycle_probabilities::Dict{CyclePathAndColors, Float64} # cycle_probabilities[[c1, c2], path] = likelihood
1817
cycle_length_probabilities::Dict{Int, Float64} # cycle_length_probabilities[path_length] = likelihood
1918
max_cycle_size::Int
@@ -176,7 +175,7 @@ function generate_color_summary(g::DataGraph, params::ColorSummaryParams=ColorSu
176175
edge_stats_time = time() - edge_stats_time
177176
push!(timing_vec, edge_stats_time)
178177

179-
return ColorSummary{DS}(color_label_cardinality, edge_deg, color_filters,
178+
return ColorSummary{DS}(color_label_cardinality, edge_deg, color_filters, Set{Color}(),
180179
cycle_probabilities, cycle_length_probabilities, params.max_cycle_size,
181180
ne(g.graph), nv(g.graph), num_colors, 0)
182181
end

src/QuasiStableCardinalityEstimator.jl

+3-3
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ to the magnitude of the original weight of the output paths.
5353
- partial_paths::Matrix{Color} - the matrix of partial paths describing the current traversal over the lifted color graph.
5454
- partial_weights::Vector{W} - the list of weights for each partial path.
5555
- num_samples::Int - the number of paths to sample, determining the size of the output.
56-
- sampling_strategy::SAMPLING_STRATEGY - how to select samples from the partial paths. Will only check if this is set to "uniform" to sample uniformly.
56+
- sampling_strategy::SAMPLING_STRATEGY - how to select samples from the partial paths. Will only check if this is set to "uniform" to sample uniformly.
5757
Otherwise, just selects samples by prioritizing those with higher weights.
5858
"""
5959
function sample_paths(partial_paths::Matrix{Color}, partial_weights::Vector{W}, num_samples::Int, sampling_strategy::SAMPLING_STRATEGY) where W
@@ -403,7 +403,7 @@ function get_cardinality_bounds(query::QueryGraph, summary::ColorSummary{DS}; ma
403403
data_label_is_in_color = true
404404
continue
405405
end
406-
if data_label in summary.color_filters[color]
406+
if data_label in summary.color_filters[color] || color in summary.color_full
407407
data_label_is_in_color = true
408408
end
409409
end
@@ -489,7 +489,7 @@ function get_cardinality_bounds(query::QueryGraph, summary::ColorSummary{DS}; ma
489489
data_label_in_color = true
490490
continue
491491
end
492-
if data_label in summary.color_filters[new_color]
492+
if data_label in summary.color_filters[new_color] || new_color in summary.color_full
493493
data_label_in_color = true
494494
end
495495
end

src/UpdateSummary.jl

+32-9
Original file line numberDiff line numberDiff line change
@@ -42,21 +42,26 @@ function add_summary_node!(summary::ColorSummary{AvgDegStats}, node_labels, node
4242
color = choose_color(summary)
4343
# add to the color's cuckoo filter
4444
push!(summary.color_filters[color], data_label)
45+
if Probably.loadfactor(summary.color_filters[color]) > .95
46+
push!(summary.color_full, color)
47+
end
4548
# for edge degrees, it decreases the average.
46-
# we want to update all the avg out degrees where this is the landing node (c2 == color && v2 == label),
47-
# and update all the avg in degrees where this is the starting node (c2 == color && v2 == label)
49+
# we want to update all the avg out/in degrees where this is the starting node (c1 == color && v1 == label).
4850
for edge_label in keys(summary.edge_deg)
4951
for node_label in node_labels
5052
if !haskey(summary.edge_deg[edge_label], node_label)
5153
summary.edge_deg[edge_label][node_label] = Dict()
5254
end
53-
for other_color in keys(summary.edge_deg[edge_label][node_label])
54-
current_ds = get(summary.edge_deg[edge_label][node_label][other_color], color, AvgDegStats(0, 0))
55+
if !haskey(summary.edge_deg[edge_label][node_label], color)
56+
summary.edge_deg[edge_label][node_label][color] = Dict()
57+
end
58+
for other_color in keys(summary.edge_deg[edge_label][node_label][color])
59+
current_ds = get(summary.edge_deg[edge_label][node_label][color], other_color, AvgDegStats(0, 0))
5560
current_cardinality = get(summary.color_label_cardinality[color], node_label, 0)
5661
avg_in = current_ds.avg_in * (current_cardinality / (current_cardinality + 1))
5762
avg_out = current_ds.avg_out * (current_cardinality / (current_cardinality + 1))
5863
new_ds = AvgDegStats(avg_in, avg_out)
59-
summary.edge_deg[edge_label][node_label][other_color][color] = new_ds
64+
summary.edge_deg[edge_label][node_label][color][other_color] = new_ds
6065
end
6166
end
6267
end
@@ -79,17 +84,35 @@ Finds and returns the color of the node in the lifted graph summary.
7984
- node - the vertex ID of the node.
8085
"""
8186
function get_node_summary_color(summary::ColorSummary, node)
82-
possible_colors = []
87+
possible_colors = collect(summary.color_full)
8388
for color in keys(summary.color_filters)
8489
filter = summary.color_filters[color]
8590
# in the data graph, the node's data label is just its id - 1
8691
if (node - 1) in filter
8792
push!(possible_colors, color)
8893
end
8994
end
95+
9096
# Since Cuckoo filters are used, there is a chance that there will be false positive results.
91-
# In that case, randomly select one of the colors that indicates it contains this node.
92-
return length(possible_colors) == 0 ? rand(keys(summary.color_filters)) : rand(possible_colors)
97+
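# If several colors match, we select the one with the smallest color_label_cardinality[color][-1]; if none match, we fall back to the largest color.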
98+
true_color = 0
99+
if length(possible_colors) == 0
100+
println("HERE")
101+
true_color = get_largest_color(summary)
102+
elseif length(possible_colors) == 1
103+
true_color = possible_colors[1]
104+
elseif length(possible_colors) > 1
105+
min_size = Inf
106+
for color in possible_colors
107+
color_size = summary.color_label_cardinality[color][-1]
108+
if color_size < min_size
109+
true_color = color
110+
min_size = color_size
111+
end
112+
end
113+
end
114+
115+
return true_color
93116
end
94117

95118
"""
@@ -159,4 +182,4 @@ function add_summary_edge!(summary::ColorSummary{AvgDegStats}, start_node, end_n
159182
end
160183
end
161184
summary.total_added_edges += 1
162-
end
185+
end

0 commit comments

Comments
 (0)