Skip to content

Commit 434c1da

Browse files
committed
Add: Compactions
1 parent 20566e0 commit 434c1da

File tree

11 files changed

+380
-197
lines changed

11 files changed

+380
-197
lines changed

cpp/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ Instead of spawning additional threads within USearch, we focus on the thread sa
8181
```cpp
8282
#pragma omp parallel for
8383
for (std::size_t i = 0; i < n; ++i)
84-
native.add(key, span_t{vector, dims}, index_add_config_t { .thread = omp_get_thread_num() });
84+
native.add(key, span_t{vector, dims}, index_update_config_t { .thread = omp_get_thread_num() });
8585
```
8686

8787
During initialization, we allocate enough temporary memory for all the cores on the machine.

cpp/bench.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,7 @@ void index_many(index_at& index, std::size_t n, vector_id_at const* ids, real_at
286286
#pragma omp parallel for schedule(static, 32)
287287
#endif
288288
for (std::size_t i = 0; i < n; ++i) {
289-
index_add_config_t config;
289+
index_update_config_t config;
290290
#if USEARCH_USE_OPENMP
291291
config.thread = omp_get_thread_num();
292292
#endif
@@ -373,7 +373,7 @@ static void single_shot(dataset_at& dataset, index_at& index, bool construct = t
373373
if (progress % 1000 == 0)
374374
printer.print(progress, total);
375375
});
376-
join_attempts = result.cycles;
376+
join_attempts = result.visited_members;
377377
}
378378
}
379379
// Evaluate join quality

cpp/test.cpp

+5-2
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ void test_cosine(index_at& index, std::vector<std::vector<scalar_at>> const& vec
9696
executor_default_t executor;
9797
index.reserve({vectors.size(), executor.size()});
9898
executor.execute_bulk(vectors.size() - 3, [&](std::size_t thread, std::size_t task) {
99-
index_add_config_t config;
99+
index_update_config_t config;
100100
config.thread = thread;
101101
index.add(key_max - task - 3, vectors[task + 3].data(), args..., config);
102102
});
@@ -114,6 +114,9 @@ void test_cosine(index_at& index, std::vector<std::vector<scalar_at>> const& vec
114114
std::vector<scalar_t> vec_recovered_from_view(dimensions);
115115
index.get(key_second, vec_recovered_from_view.data());
116116
expect(std::equal(vector_second, vector_second + dimensions, vec_recovered_from_view.data()));
117+
118+
auto compaction_result = index.compact();
119+
expect(bool(compaction_result));
117120
}
118121

119122
expect(index.memory_usage() > 0);
@@ -201,7 +204,7 @@ template <typename key_at, typename slot_at> void test_tanimoto(std::size_t dime
201204

202205
index.reserve({batch_size + index.size(), executor.size()});
203206
executor.execute_bulk(batch_size, [&](std::size_t thread, std::size_t task) {
204-
index_add_config_t config;
207+
index_update_config_t config;
205208
config.thread = thread;
206209
index.add(task + 25000, scalars.data() + index.scalar_words() * task, config);
207210
});

include/usearch/index.hpp

+133-41
Large diffs are not rendered by default.

include/usearch/index_dense.hpp

+112-83
Large diffs are not rendered by default.

python/README.md

+8-3
Original file line numberDiff line numberDiff line change
@@ -268,10 +268,15 @@ One may often want to evaluate the quality of the constructed index before runni
268268
The trivial way is to measure `recall@1` on the entries already present in the index.
269269

270270
```py
271-
from usearch.eval import recall_members
271+
from usearch.eval import self_recall
272272

273-
assert recall_members(index, exact=True) == (1, 0)
274-
print(recall_members(index, exact=False))
273+
stats: SearchStats = self_recall(index, exact=True)
274+
assert stats.visited_members == 0, "Exact search won't attend index nodes"
275+
assert stats.computed_distances == len(index), "And will compute the distance to every node"
276+
277+
stats: SearchStats = self_recall(index, exact=False)
278+
assert stats.visited_members > 0
279+
assert stats.computed_distances <= len(index)
275280
```
276281

277282
If you have ground-truth data for more than one entry, you can compare search results against expected values:

python/lib.cpp

+39-31
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ static void add_typed_to_index( //
186186
byte_t const* keys_data = reinterpret_cast<byte_t const*>(keys_info.ptr);
187187

188188
executor_default_t{threads}.execute_bulk(vectors_count, [&](std::size_t thread_idx, std::size_t task_idx) {
189-
index_dense_add_config_t config;
189+
index_dense_update_config_t config;
190190
config.force_vector_copy = force_copy;
191191
config.thread = thread_idx;
192192
key_t key = *reinterpret_cast<key_t const*>(keys_data + task_idx * keys_info.strides[0]);
@@ -246,7 +246,7 @@ static void search_typed( //
246246
dense_index_py_t& index, py::buffer_info& vectors_info, //
247247
std::size_t wanted, bool exact, std::size_t threads, //
248248
py::array_t<key_t>& keys_py, py::array_t<distance_t>& distances_py, py::array_t<Py_ssize_t>& counts_py,
249-
std::atomic<std::size_t>& stats_lookups, std::atomic<std::size_t>& stats_measurements) {
249+
std::atomic<std::size_t>& stats_visited_members, std::atomic<std::size_t>& stats_computed_distances) {
250250

251251
auto keys_py2d = keys_py.template mutable_unchecked<2>();
252252
auto distances_py2d = distances_py.template mutable_unchecked<2>();
@@ -270,8 +270,8 @@ static void search_typed( //
270270
counts_py1d(task_idx) =
271271
static_cast<Py_ssize_t>(result.dump_to(&keys_py2d(task_idx, 0), &distances_py2d(task_idx, 0)));
272272

273-
stats_lookups += result.lookups;
274-
stats_measurements += result.measurements;
273+
stats_visited_members += result.visited_members;
274+
stats_computed_distances += result.computed_distances;
275275
if (PyErr_CheckSignals() != 0)
276276
throw py::error_already_set();
277277
});
@@ -282,7 +282,7 @@ static void search_typed( //
282282
dense_indexes_py_t& indexes, py::buffer_info& vectors_info, //
283283
std::size_t wanted, bool exact, std::size_t threads, //
284284
py::array_t<key_t>& keys_py, py::array_t<distance_t>& distances_py, py::array_t<Py_ssize_t>& counts_py,
285-
std::atomic<std::size_t>& stats_lookups, std::atomic<std::size_t>& stats_measurements) {
285+
std::atomic<std::size_t>& stats_visited_members, std::atomic<std::size_t>& stats_computed_distances) {
286286

287287
auto keys_py2d = keys_py.template mutable_unchecked<2>();
288288
auto distances_py2d = distances_py.template mutable_unchecked<2>();
@@ -324,8 +324,8 @@ static void search_typed( //
324324
wanted));
325325
}
326326

327-
stats_lookups += result.lookups;
328-
stats_measurements += result.measurements;
327+
stats_visited_members += result.visited_members;
328+
stats_computed_distances += result.computed_distances;
329329
if (PyErr_CheckSignals() != 0)
330330
throw py::error_already_set();
331331
}
@@ -363,16 +363,16 @@ static py::tuple search_many_in_index( //
363363
py::array_t<key_t> keys_py({vectors_count, static_cast<Py_ssize_t>(wanted)});
364364
py::array_t<distance_t> distances_py({vectors_count, static_cast<Py_ssize_t>(wanted)});
365365
py::array_t<Py_ssize_t> counts_py(vectors_count);
366-
std::atomic<std::size_t> stats_lookups(0);
367-
std::atomic<std::size_t> stats_measurements(0);
366+
std::atomic<std::size_t> stats_visited_members(0);
367+
std::atomic<std::size_t> stats_computed_distances(0);
368368

369369
// clang-format off
370370
switch (numpy_string_to_kind(vectors_info.format)) {
371-
case scalar_kind_t::b1x8_k: search_typed<b1x8_t>(index, vectors_info, wanted, exact, threads, keys_py, distances_py, counts_py, stats_lookups, stats_measurements); break;
372-
case scalar_kind_t::f8_k: search_typed<f8_bits_t>(index, vectors_info, wanted, exact, threads, keys_py, distances_py, counts_py, stats_lookups, stats_measurements); break;
373-
case scalar_kind_t::f16_k: search_typed<f16_t>(index, vectors_info, wanted, exact, threads, keys_py, distances_py, counts_py, stats_lookups, stats_measurements); break;
374-
case scalar_kind_t::f32_k: search_typed<f32_t>(index, vectors_info, wanted, exact, threads, keys_py, distances_py, counts_py, stats_lookups, stats_measurements); break;
375-
case scalar_kind_t::f64_k: search_typed<f64_t>(index, vectors_info, wanted, exact, threads, keys_py, distances_py, counts_py, stats_lookups, stats_measurements); break;
371+
case scalar_kind_t::b1x8_k: search_typed<b1x8_t>(index, vectors_info, wanted, exact, threads, keys_py, distances_py, counts_py, stats_visited_members, stats_computed_distances); break;
372+
case scalar_kind_t::f8_k: search_typed<f8_bits_t>(index, vectors_info, wanted, exact, threads, keys_py, distances_py, counts_py, stats_visited_members, stats_computed_distances); break;
373+
case scalar_kind_t::f16_k: search_typed<f16_t>(index, vectors_info, wanted, exact, threads, keys_py, distances_py, counts_py, stats_visited_members, stats_computed_distances); break;
374+
case scalar_kind_t::f32_k: search_typed<f32_t>(index, vectors_info, wanted, exact, threads, keys_py, distances_py, counts_py, stats_visited_members, stats_computed_distances); break;
375+
case scalar_kind_t::f64_k: search_typed<f64_t>(index, vectors_info, wanted, exact, threads, keys_py, distances_py, counts_py, stats_visited_members, stats_computed_distances); break;
376376
default: throw std::invalid_argument("Incompatible scalars in the query matrix: " + vectors_info.format);
377377
}
378378
// clang-format on
@@ -381,8 +381,8 @@ static py::tuple search_many_in_index( //
381381
results[0] = keys_py;
382382
results[1] = distances_py;
383383
results[2] = counts_py;
384-
results[3] = stats_lookups.load();
385-
results[4] = stats_measurements.load();
384+
results[3] = stats_visited_members.load();
385+
results[4] = stats_computed_distances.load();
386386
return results;
387387
}
388388

@@ -391,21 +391,18 @@ static std::unordered_map<key_t, key_t> join_index( //
391391
std::size_t max_proposals, bool exact) {
392392

393393
std::unordered_map<key_t, key_t> a_to_b;
394+
dummy_label_to_label_mapping_t b_to_a;
394395
a_to_b.reserve((std::min)(a.size(), b.size()));
395396

396-
// index_join_config_t config;
397-
398-
// config.max_proposals = max_proposals;
399-
// config.exact = exact;
400-
// config.expansion = (std::max)(a.expansion_search(), b.expansion_search());
401-
// std::size_t threads = (std::min)(a.limits().threads(), b.limits().threads());
402-
// executor_default_t executor{threads};
403-
// join_result_t result = dense_index_py_t::join( //
404-
// a, b, config, //
405-
// a_to_b, //
406-
// dummy_label_to_label_mapping_t{}, //
407-
// executor);
408-
// result.error.raise();
397+
index_join_config_t config;
398+
config.max_proposals = max_proposals;
399+
config.exact = exact;
400+
config.expansion = (std::max)(a.expansion_search(), b.expansion_search());
401+
std::size_t threads = (std::min)(a.limits().threads(), b.limits().threads());
402+
executor_default_t executor{threads};
403+
join_result_t result = a.join(b, config, a_to_b, b_to_a, executor);
404+
result.error.raise();
405+
409406
return a_to_b;
410407
}
411408

@@ -418,6 +415,16 @@ static dense_index_py_t copy_index(dense_index_py_t const& index) {
418415
return std::move(result.index);
419416
}
420417

418+
static void compact_index(dense_index_py_t& index, std::size_t threads) {
419+
420+
if (!threads)
421+
threads = std::thread::hardware_concurrency();
422+
if (!index.reserve(index_limits_t(index.size(), threads)))
423+
throw std::invalid_argument("Out of memory!");
424+
425+
index.compact(executor_default_t{threads});
426+
}
427+
421428
// clang-format off
422429
template <typename index_at> void save_index(index_at const& index, std::string const& path) { index.save(path.c_str()).error.raise(); }
423430
template <typename index_at> void load_index(index_at& index, std::string const& path) { index.load(path.c_str()).error.raise(); }
@@ -601,7 +608,7 @@ PYBIND11_MODULE(compiled, m) {
601608
if (!index.reserve(index_limits_t(index.size(), threads)))
602609
throw std::invalid_argument("Out of memory!");
603610

604-
index.compact(executor_default_t{threads});
611+
index.isolate(executor_default_t{threads});
605612
return result.completed;
606613
},
607614
py::arg("key"), py::arg("compact"), py::arg("threads"));
@@ -619,7 +626,7 @@ PYBIND11_MODULE(compiled, m) {
619626
if (!index.reserve(index_limits_t(index.size(), threads)))
620627
throw std::invalid_argument("Out of memory!");
621628

622-
index.compact(executor_default_t{threads});
629+
index.isolate(executor_default_t{threads});
623630
return result.completed;
624631
},
625632
py::arg("key"), py::arg("compact"), py::arg("threads"));
@@ -651,6 +658,7 @@ PYBIND11_MODULE(compiled, m) {
651658
i.def("reset", &reset_index<dense_index_py_t>, py::call_guard<py::gil_scoped_release>());
652659
i.def("clear", &clear_index<dense_index_py_t>, py::call_guard<py::gil_scoped_release>());
653660
i.def("copy", &copy_index, py::call_guard<py::gil_scoped_release>());
661+
i.def("compact", &compact_index, py::call_guard<py::gil_scoped_release>());
654662
i.def("join", &join_index, py::arg("other"), py::arg("max_proposals") = 0, py::arg("exact") = false,
655663
py::call_guard<py::gil_scoped_release>());
656664

python/scripts/test.py

+14-7
Original file line numberDiff line numberDiff line change
@@ -108,21 +108,23 @@ def test_index(
108108
index.add(42, vector)
109109

110110
assert len(index) == 1, "Size after addition"
111-
assert 42 in index, "Presense in the index"
111+
assert 42 in index, "Presence in the index"
112112
assert 42 in index.keys, "Presence among keys"
113-
assert 43 not in index, "Presense in the index, false positive"
113+
assert 43 not in index, "Presence in the index, false positive"
114114
assert index[42] is not None, "Vector recovery"
115115
assert index[43] is None, "Vector recovery, false positive"
116116
assert len(index[42]) == ndim
117117
if numpy_type != np.byte:
118118
assert np.allclose(index[42], vector, atol=0.1)
119119

120-
matches = index.search(vector, 10)
120+
matches: Matches = index.search(vector, 10)
121121
assert len(matches.keys) == 1, "Number of matches"
122-
assert len(matches.keys) == len(matches.distances), "Symmetric match subarrays"
122+
assert len(matches.keys) == len(matches.distances), "Symmetric match sub-arrays"
123123
assert len({match.key for match in matches}) == 1, "Iteration over matches"
124124
assert matches[0].key == 42
125125
assert matches[0].distance == pytest.approx(0, abs=1e-3)
126+
assert matches.computed_distances <= 2
127+
assert matches.visited_members <= 2
126128

127129
# Validating the index structure and metadata:
128130
assert index.max_level >= 0
@@ -137,7 +139,7 @@ def test_index(
137139
index.remove(43)
138140
assert len(index) == 1
139141

140-
# Try insreting back
142+
# Try inserting back
141143
index.add(43, other_vector)
142144
assert len(index) == 2
143145
index.remove(43)
@@ -183,7 +185,7 @@ def test_index(
183185
index = Index.restore(temporary_usearch_filename)
184186
assert index is None
185187

186-
# Try openning a corrupt file
188+
# Try opening a corrupt file
187189
with open(temporary_usearch_filename, "w") as file:
188190
file.write("Some random string")
189191
meta = Index.metadata(temporary_usearch_filename)
@@ -280,15 +282,20 @@ def test_exact_recall(
280282
matches: Matches = index.search(vectors[i], 10, exact=True)
281283
found_labels = matches.keys
282284
assert found_labels[0] == i
285+
assert matches.computed_distances == len(index)
286+
assert matches.visited_members == 0, "Exact search won't traverse the graph"
283287

284288
# Search the whole batch
285289
if batch_size > 1:
286290
matches: BatchMatches = index.search(vectors, 10, exact=True)
291+
assert matches.computed_distances == len(index) * len(vectors)
292+
assert matches.visited_members == 0, "Exact search won't traverse the graph"
293+
287294
found_labels = matches.keys
288295
for i in range(batch_size):
289296
assert found_labels[i, 0] == i
290297

291-
# Match entries aginst themselves
298+
# Match entries against themselves
292299
index_copy: Index = index.copy()
293300
mapping: dict = index.join(index_copy, exact=True)
294301
for man, woman in mapping.items():

python/usearch/eval.py

+42-10
Original file line numberDiff line numberDiff line change
@@ -55,24 +55,51 @@ def random_vectors(
5555
return x
5656

5757

58-
def recall_members(index: Index, sample: float = 1, **kwargs) -> Tuple[float, float]:
59-
"""Simplest benchmark for a quality of search, which queries every
60-
existing member of the index, to make sure approximate search finds
61-
the point itself. Reports 2 metrics - "self-recall" and "efficiency".
58+
@dataclass
59+
class SearchStats:
60+
"""
61+
Contains statistics for one or more search runs, including the number of
62+
internal nodes that were fetched (`visited_members`) and the number
63+
of times the distance metric was invoked (`computed_distances`).
6264
63-
Self-recall is the share of queried vectors, that were succesfully found.
65+
Other derivative metrics include the `mean_recall` and `mean_efficiency`.
66+
Recall is the share of queried vectors that were successfully found.
6467
Efficiency describes the number of distances that had to be computed for
6568
each query, normalized to the size of the `index`. Highest efficiency is 0.(9),
6669
lowest is zero. Highest is achieved, when the distance metric was computed
6770
just once per query. Lowest happens during exact search, when every distance
6871
to every present vector had to be computed.
72+
"""
73+
74+
index_size: int
75+
count_queries: int
76+
count_matches: int
77+
78+
visited_members: int
79+
computed_distances: int
80+
81+
@property
82+
def mean_efficiency(self) -> float:
83+
return 1 - float(self.computed_distances) / (
84+
self.count_queries * self.index_size
85+
)
86+
87+
@property
88+
def mean_recall(self) -> float:
89+
return self.count_matches / self.count_queries
90+
91+
92+
def self_recall(index: Index, sample: float = 1, **kwargs) -> SearchStats:
93+
"""Simplest benchmark for a quality of search, which queries every
94+
existing member of the index, to make sure approximate search finds
95+
the point itself.
6996
7097
:param index: Non-empty pre-constructed index
7198
:type index: Index
7299
:param sample: Share of vectors to search, defaults to 1
73100
:type sample: float
74-
:return: Value from 0 to 1, for the share of found self-references
75-
:rtype: float, float
101+
:return: Evaluation report with key metrics
102+
:rtype: SearchStats
76103
"""
77104
if len(index) == 0:
78105
return 0
@@ -85,9 +112,14 @@ def recall_members(index: Index, sample: float = 1, **kwargs) -> Tuple[float, fl
85112

86113
queries = index.get_vectors(keys, index.dtype)
87114
matches: BatchMatches = index.search(queries, **kwargs)
88-
recall_first: float = matches.recall_first(keys)
89-
efficiency: float = 1 - float(matches.measurements) / (len(keys) * len(index))
90-
return recall_first, efficiency
115+
count_matches: float = matches.count_matches(keys)
116+
return SearchStats(
117+
index_size=len(index),
118+
count_queries=len(keys),
119+
count_matches=count_matches,
120+
visited_members=matches.visited_members,
121+
computed_distances=matches.computed_distances,
122+
)
91123

92124

93125
def measure_seconds(f: Callable) -> Tuple[float, Any]:

0 commit comments

Comments
 (0)