From 878872950fc91efdeccfd1faeb02522c5be7773e Mon Sep 17 00:00:00 2001 From: Maksym Medvied Date: Wed, 25 Oct 2023 20:06:17 +0400 Subject: [PATCH] validate-index: Implement a function to validate index data structures (#208) * validate-index: Implement a function to validate index data structures Example: ``` CREATE EXTENSION lantern; CREATE TABLE small_world ( id SERIAL PRIMARY KEY, v REAL[2] ); INSERT INTO small_world (v) VALUES ('{0,0,1}'), ('{0,1,0}'); CREATE INDEX ON small_world USING hnsw (v); SELECT _lantern_internal.validate_index('small_world_v_idx'); ``` The output of the last command: ``` INFO: validate_index() start for small_world_v_idx INFO: index_header = HnswIndexHeaderPage(version=1 vector_dim=3 m=16 ef_construction=128 ef=64 metric_kind=1 num_vectors=2 last_data_block=2 blockmap_page_groups=0) INFO: blocks_nr=3 nodes_nr=2 INFO: blocks for: header 1 blockmap 1 nodes 1 INFO: nodes per block: last block 2 INFO: level=0: nodes 2 directed neighbor edges 2 min neighbors 1 max neighbors 1 INFO: validate_index() done, no issues found. validate_index ---------------- (1 row) ``` To see the indexes that could be passed to the function: ``` postgres=# \d small_world; Table "public.small_world" Column | Type | Collation | Nullable | Default --------+---------+-----------+----------+----------------------------------------- id | integer | | not null | nextval('small_world_id_seq'::regclass) v | real[] | | | Indexes: "small_world_pkey" PRIMARY KEY, btree (id) "small_world_v_idx" hnsw (v) ``` This patch also adds the validate_index() call to existing tests. Because of the use of RNG in hnsw_generate_new_level() the number of levels in the newly INSERTed nodes is not deterministic, and validate_index() output may change between runs, because it prints the number of nodes for each level. If you see sporadic test failures due to different validate_index() info output, please remove the validate_index() call from the test.
Another solution would be to add an option to validate_index() to tell if elog() for the additional info is needed. * src/hnsw/validate_index: run clang-format * src/hnsw/validate_index: use unsigned batch_size and group_node_first_index They are compared and are used in the same expressions as other unsigned variables anyway. There is no good reason for them to be signed. * src/hnsw/validate_index: change PRIu64 to ul Reference: https://gitlab.com/wireshark/wireshark/-/issues/17895 * src/hnsw/validate_index: remove dangling " " after clang-format * src/hnsw/validate_index: include access/heapam.h instead of access/relation.h for PostgreSQL 11 * src/hnsw/validate_index: clang-format * src/hnsw/validate_index: make elog(INFO, ...) prints optional and enabled by default This is required because some tests are building the HNSW index in a non-deterministic way. * test: make validate_index() output deterministic * src/hnsw/validate_index: use ldb_invariant() instead of assert() * src/hnsw/validate_index: reduce the scope of what's done in LDB_VI_READ_NODE_CHUNK() macro * src/hnsw/validate_index: validate vn_dim properly * src/hnsw/validate_index: add a comment about assumptions and storage format for struct ldb_vi_node * src/hnsw/validate_index: describe what vi here is * src/hnsw/validate_index: cast ldb_HnswGetM() to uint32 to compare with HnswIndexHeaderPage.m * use FirstOffsetNumber and OffsetNumberNext() in the loop over page --- sql/lantern.sql | 4 + src/hnsw.c | 11 + src/hnsw/validate_index.c | 687 +++++++++++++++++++++++++ src/hnsw/validate_index.h | 25 + test/expected/ext_relocation.out | 25 +- test/expected/hnsw_config.out | 9 + test/expected/hnsw_correct.out | 9 + test/expected/hnsw_cost_estimate.out | 32 ++ test/expected/hnsw_create.out | 24 + test/expected/hnsw_create_expr.out | 8 + test/expected/hnsw_dist_func.out | 32 ++ test/expected/hnsw_index_from_file.out | 24 + test/expected/hnsw_insert.out | 26 +- test/expected/hnsw_select.out | 25 +
test/expected/hnsw_todo.out | 26 + test/sql/ext_relocation.sql | 2 + test/sql/hnsw_config.sql | 3 + test/sql/hnsw_correct.sql | 5 +- test/sql/hnsw_cost_estimate.sql | 4 + test/sql/hnsw_create.sql | 4 + test/sql/hnsw_create_expr.sql | 1 + test/sql/hnsw_dist_func.sql | 5 + test/sql/hnsw_index_from_file.sql | 3 + test/sql/hnsw_insert.sql | 7 +- test/sql/hnsw_select.sql | 5 + test/sql/hnsw_todo.sql | 4 + 26 files changed, 1003 insertions(+), 7 deletions(-) create mode 100644 src/hnsw/validate_index.c create mode 100644 src/hnsw/validate_index.h diff --git a/sql/lantern.sql b/sql/lantern.sql index b2c9c5b21..d3ceb5846 100644 --- a/sql/lantern.sql +++ b/sql/lantern.sql @@ -30,6 +30,10 @@ CREATE OPERATOR <-> ( ); CREATE SCHEMA _lantern_internal; + +CREATE FUNCTION _lantern_internal.validate_index(index regclass, print_info boolean DEFAULT true) RETURNS VOID + AS 'MODULE_PATHNAME', 'lantern_internal_validate_index' LANGUAGE C STABLE STRICT PARALLEL UNSAFE; + -- operator classes CREATE OR REPLACE FUNCTION _lantern_internal._create_ldb_operator_classes(access_method_name TEXT) RETURNS BOOLEAN AS $$ DECLARE diff --git a/src/hnsw.c b/src/hnsw.c index 6e475ff1e..1da92f794 100644 --- a/src/hnsw.c +++ b/src/hnsw.c @@ -19,6 +19,7 @@ #include "hnsw/options.h" #include "hnsw/scan.h" #include "hnsw/utils.h" +#include "hnsw/validate_index.h" #include "hnsw/vector.h" #include "usearch.h" @@ -358,6 +359,16 @@ Datum vector_l2sq_dist(PG_FUNCTION_ARGS) PG_RETURN_FLOAT8((double)vector_dist(a, b, usearch_metric_l2sq_k)); } +PGDLLEXPORT PG_FUNCTION_INFO_V1(lantern_internal_validate_index); +Datum lantern_internal_validate_index(PG_FUNCTION_ARGS) +{ + Oid indrelid = PG_GETARG_OID(0); + bool print_info = PG_GETARG_BOOL(1); + + ldb_validate_index(indrelid, print_info); + PG_RETURN_VOID(); +} + /* * Get data type for give oid * */ diff --git a/src/hnsw/validate_index.c b/src/hnsw/validate_index.c new file mode 100644 index 000000000..f6fd51e61 --- /dev/null +++ b/src/hnsw/validate_index.c @@ -0,0 
+1,687 @@ +#include + +#include "hnsw/validate_index.h" + +#include /* relation_open */ +#include /* IndexGetRelation */ +#include /* PRIu32 */ +#include /* UINT32_MAX */ +#include /* bzero */ +#include /* AllocSetContextCreate */ + +#include "hnsw/external_index.h" /* HnswIndexHeaderPage */ +#include "hnsw/options.h" /* ldb_HnswGetM */ +#include "hnsw/utils.h" /* ldb_invariant */ + +/* vi infix here is for Validate Index */ + +enum ldb_vi_block_type +{ + LDB_VI_BLOCK_UNKNOWN, + LDB_VI_BLOCK_HEADER, + LDB_VI_BLOCK_BLOCKMAP, + LDB_VI_BLOCK_NODES, + LDB_VI_BLOCK_NR, +}; + +/* represents PostgreSQL block in the index */ +struct ldb_vi_block +{ + enum ldb_vi_block_type vp_type; + uint32_t vp_nodes_nr; +}; + +/* + * Represents a stored usearch node. + * Assumes that usearch node has label, dim (size in bytes of the vector + * at the end) and neighbors on different levels. + * + * Please adjust ldb_vi_read_node_carefully() in case if on-storage format changes. + */ +struct ldb_vi_node +{ + BlockNumber vn_block; /* in the index */ + OffsetNumber vn_offset; /* within vn_block */ + uint32 vn_id; /* HnswIndexTuple.id */ + usearch_label_t vn_label; + uint32 vn_dim; /* usearch index_gt::dim_t */ + uint32 vn_level; /* HnswIndexTuple.level, usearch index_gt::level_t */ + uint32 *vn_neighbors_nr; /* number of neighbors for each level */ + uint32 **vn_neighbors; /* array of arrays of neighbors for each level */ +}; + +/* + * TODO add const to parameters wherever needed + * TODO validate groups after max group no + * TODO export entire index into json (for Python) + * TODO test for non-default M + * TODO add execution times for each stage + * TODO check that the vectors are the same as in the table relation + */ + +static void ldb_vi_analyze_blockmap(HnswBlockmapPage *blockmap, + struct ldb_vi_block *vi_blocks, + BlockNumber blocks_nr, + struct ldb_vi_node *vi_nodes, + uint32 nodes_nr) +{ + for(uint32 node_id_in_blockmap = 0; node_id_in_blockmap < HNSW_BLOCKMAP_BLOCKS_PER_PAGE; 
++node_id_in_blockmap) { + uint32 node_id = blockmap->first_id + node_id_in_blockmap; + BlockNumber blockno = blockmap->blocknos[ node_id_in_blockmap ]; + if(node_id < nodes_nr) { + if(blockno == 0) { + elog(ERROR, + "blockmap.blocknos[%" PRIu32 + "] == 0 for " + "node_id=%" PRIu32 " nodes_nr=%" PRIu32, + node_id_in_blockmap, + node_id, + nodes_nr); + } + if(blockno >= blocks_nr) { + elog(ERROR, + "blockmap.blocknos[%" PRIu32 "]=%" PRIu32 " >= blocks_nr=%" PRIu32 + " for " + "node_id=%" PRIu32 " nodes_nr=%" PRIu32, + node_id_in_blockmap, + blockno, + blocks_nr, + node_id, + nodes_nr); + } + if(vi_blocks[ blockno ].vp_type == LDB_VI_BLOCK_UNKNOWN) vi_blocks[ blockno ].vp_type = LDB_VI_BLOCK_NODES; + if(vi_blocks[ blockno ].vp_type != LDB_VI_BLOCK_NODES) { + elog(ERROR, + "vi_blocks[%" PRIu32 + "].vp_type=%d != %d for " + "blocks_nr=%" PRIu32 " node_id_in_blockmap=%" PRIu32 " node_id=%" PRIu32 " nodes_nr=%" PRIu32, + blockno, + vi_blocks[ blockno ].vp_type, + LDB_VI_BLOCK_NODES, + blocks_nr, + node_id_in_blockmap, + node_id, + nodes_nr); + } + vi_nodes[ node_id ].vn_block = blockno; + } else if(blockno != 0) { + elog(ERROR, + "blockmap.blocknos[%" PRIu32 "]=%" PRIu32 + " != 0 for " + "node_id=%" PRIu32 " nodes_nr=%" PRIu32, + node_id_in_blockmap, + blockno, + node_id, + nodes_nr); + } + } +} + +static void ldb_vi_read_blockmaps(Relation index, + HnswIndexHeaderPage *index_header, + struct ldb_vi_block *vi_blocks, + BlockNumber blocks_nr, + struct ldb_vi_node *vi_nodes, + uint32 nodes_nr) +{ + /* TODO the outer loop math is mostly copy-pasted from StoreExternalIndex() */ + uint32 blockmap_groupno = 0; + uint32 group_node_first_index = 0; + uint32 nodes_remaining = nodes_nr; + uint32 batch_size = HNSW_BLOCKMAP_BLOCKS_PER_PAGE; + + if(blocks_nr == 0) return; + vi_blocks[ 0 ].vp_type = LDB_VI_BLOCK_HEADER; + while(nodes_remaining != 0) { + if(blockmap_groupno > index_header->blockmap_page_groups) { + elog(ERROR, + "blockmap_groupno=%d > 
index_header->blockmap_page_groups=%d", + blockmap_groupno, + index_header->blockmap_page_groups); + } + /* TODO see the loop in CreateBlockMapGroup() */ + BlockNumber number_of_blockmaps_in_group = 1u << blockmap_groupno; + BlockNumber group_start = index_header->blockmap_page_group_index[ blockmap_groupno ]; + for(unsigned blockmap_id = 0; blockmap_id < number_of_blockmaps_in_group; ++blockmap_id) { + BlockNumber blockmap_block = group_start + blockmap_id; + BlockNumber expected_special_nextblockno; + + if(blockmap_block >= blocks_nr) { + elog(ERROR, + "blockmap_block=%" PRIu32 " >= blocks_nr=%" PRIu32 " (blockmap_groupno=%d blockmap_id=%d)", + blockmap_block, + blocks_nr, + blockmap_groupno, + blockmap_id); + } + if(vi_blocks[ blockmap_block ].vp_type != LDB_VI_BLOCK_UNKNOWN) { + elog(ERROR, + "vi_blocks[%" PRIu32 "].vp_type=%d (should be %d)", + blockmap_block, + vi_blocks[ blockmap_block ].vp_type, + LDB_VI_BLOCK_UNKNOWN); + } + vi_blocks[ blockmap_block ].vp_type = LDB_VI_BLOCK_BLOCKMAP; + Buffer buf = ReadBuffer(index, blockmap_block); + LockBuffer(buf, BUFFER_LOCK_SHARE); + Page page = BufferGetPage(buf); + + /* see StoreExternalIndexBlockMapGroup() */ + if(PageGetMaxOffsetNumber(page) < FirstOffsetNumber) { + elog(ERROR, + "blockmap_block=%" PRIu32 + " for blockmap_groupno=%d blockmap_id=%d " + "doesn't have HnswBlockmapPage inside", + blockmap_block, + blockmap_groupno, + blockmap_id); + } + HnswBlockmapPage *blockmap = (HnswBlockmapPage *)PageGetItem(page, PageGetItemId(page, FirstOffsetNumber)); + if(blockmap->first_id != group_node_first_index + blockmap_id * HNSW_BLOCKMAP_BLOCKS_PER_PAGE) { + elog(ERROR, + "blockmap->first_id=%" PRIu32 + " != " + "group_node_first_index=%d + blockmap_id=%u * HNSW_BLOCKMAP_BLOCKS_PER_PAGE=%d", + blockmap->first_id, + group_node_first_index, + blockmap_id, + HNSW_BLOCKMAP_BLOCKS_PER_PAGE); + } + HnswIndexPageSpecialBlock *special = (HnswIndexPageSpecialBlock *)PageGetSpecialPointer(page); + if(special->firstId !=
blockmap->first_id) { + elog(ERROR, + "special->firstId=%" PRIu32 " != blockmap->first_id=%" PRIu32 + " for " + "blockmap_block=%" PRIu32 " blockmap_groupno=%d blockmap_id=%d", + special->firstId, + blockmap->first_id, + blockmap_block, + blockmap_groupno, + blockmap_id); + } + if(special->lastId != special->firstId + HNSW_BLOCKMAP_BLOCKS_PER_PAGE - 1) { + elog(ERROR, + "special->lastId=%" PRIu32 " != (special->first_id=%" PRIu32 + " + HNSW_BLOCKMAP_BLOCKS_PER_PAGE=%d - 1) for " + "blockmap_block=%" PRIu32 " blockmap_groupno=%d blockmap_id=%d", + special->lastId, + special->firstId, + HNSW_BLOCKMAP_BLOCKS_PER_PAGE, + blockmap_block, + blockmap_groupno, + blockmap_id); + } + /* TODO confirm this */ + /* + expected_special_nextblockno = blockmap_id == number_of_blockmaps_in_group - 1 ? + InvalidBlockNumber : blockmap_block + 1; + */ + expected_special_nextblockno = blockmap_block + 1; + if(special->nextblockno != expected_special_nextblockno) { + elog(ERROR, + "special->nextblockno=%" PRIu32 " != expected_special_nextblockno=%" PRIu32 + " for " + "blockmap_block=%" PRIu32 " blockmap_groupno=%d blockmap_id=%d", + special->nextblockno, + expected_special_nextblockno, + blockmap_block, + blockmap_groupno, + blockmap_id); + } + ldb_vi_analyze_blockmap(blockmap, vi_blocks, blocks_nr, vi_nodes, nodes_nr); + + UnlockReleaseBuffer(buf); + } + nodes_remaining -= Min(batch_size, nodes_remaining); + group_node_first_index += batch_size; + batch_size = batch_size * 2; + blockmap_groupno++; + } +} + +/* Read a part of the node. Also advance tape_pos by chunk_size. 
*/ +static void ldb_vi_read_node_chunk(const struct ldb_vi_node *vi_node, + void *chunk, + size_t chunk_size, + const char *chunk_name, + void *tape, + unsigned *tape_pos, + unsigned tape_size) +{ + if(*tape_pos + chunk_size > tape_size) { + elog(ERROR, + "Error reading %s: tape_pos=%u + _chunk_size=%zu > tape_size=%u for " + "block=%" PRIu32 " offset=%" PRIu16 " node_id=%" PRIu32, + chunk_name, + *tape_pos, + chunk_size, + tape_size, + vi_node->vn_block, + vi_node->vn_offset, + vi_node->vn_id); + } + memcpy(chunk, (char *)tape + *tape_pos, chunk_size); + *tape_pos += chunk_size; +} + +#define LDB_VI_READ_NODE_CHUNK(_vi_node, _chunk, _tape, _tape_pos, _tape_size) \ + ldb_vi_read_node_chunk((_vi_node), &(_chunk), sizeof(_chunk), #_chunk, (_tape), (_tape_pos), (_tape_size)) + +/* See "Load nodes one by one" loop in usearch index_gt::load() */ +static void ldb_vi_read_node_carefully(void *node_tape, + unsigned node_tape_size, + uint32 vector_dim, + size_t scalar_size, + const uint32 M, + struct ldb_vi_node *vi_node, + uint32 nodes_nr) +{ + unsigned tape_pos = 0; + uint32 level_on_tape; + uint32 neighbors_nr; + uint32 neighbors_max; + uint32 *neighbors; + uint32 unused; + + LDB_VI_READ_NODE_CHUNK(vi_node, vi_node->vn_label, node_tape, &tape_pos, node_tape_size); + LDB_VI_READ_NODE_CHUNK(vi_node, vi_node->vn_dim, node_tape, &tape_pos, node_tape_size); + LDB_VI_READ_NODE_CHUNK(vi_node, level_on_tape, node_tape, &tape_pos, node_tape_size); + + if(vi_node->vn_dim != vector_dim * scalar_size) { + elog(ERROR, + "vi_node->vn_dim=%" PRIu32 " != vector_dim=%" PRIu32 + " * scalar_size=%zu " + "for node_id=%" PRIu32 " block=%" PRIu32 " offset=%" PRIu16, + vi_node->vn_dim, + vector_dim, + scalar_size, + vi_node->vn_id, + vi_node->vn_block, + vi_node->vn_offset); + } + if(level_on_tape != vi_node->vn_level) { + elog(ERROR, + "level_on_tape=%" PRIu32 " != vi_node->vn_level=%" PRIu32 + " for " + "node_id=%" PRIu32 " block=%" PRIu32 " offset=%" PRIu16, + level_on_tape, + 
vi_node->vn_level, + vi_node->vn_id, + vi_node->vn_block, + vi_node->vn_offset); + } + /* + * Now read lists of neighbors for each level. + * See the comment for usearch neighbors_ref_t for the description of neighbors for one level. + * See usearch precompute_ for the max numbers of neighbors for each level. + * See usearch neighbors_ for the layout of neighbors for different levels on the tape. + * + * connectivity is M + * connectivity_max_base is M * base_level_multiple() + * + * base_level_multiple() in usearch is 2. + */ + vi_node->vn_neighbors_nr = palloc_array(typeof(*(vi_node->vn_neighbors_nr)), vi_node->vn_level + 1); + vi_node->vn_neighbors = palloc_array(typeof(*(vi_node->vn_neighbors)), vi_node->vn_level + 1); + for(uint32 level = 0; level <= vi_node->vn_level; ++level) { + neighbors_max = level == 0 ? M * 2 : M; + LDB_VI_READ_NODE_CHUNK(vi_node, neighbors_nr, node_tape, &tape_pos, node_tape_size); + + if(neighbors_nr > neighbors_max) { + elog(ERROR, + "neighbors_nr=%" PRIu32 " > neighbors_max=%" PRIu32 + " for " + "level=%" PRIu32 + " tape_pos=%u node_tape_size=%u " + "node_id=%" PRIu32 " block=%" PRIu32 " offset=%" PRIu16, + neighbors_nr, + neighbors_max, + level, + tape_pos, + node_tape_size, + vi_node->vn_id, + vi_node->vn_block, + vi_node->vn_offset); + } + neighbors = palloc_array(typeof(*neighbors), neighbors_nr); + for(uint32 i = 0; i < neighbors_nr; ++i) { + LDB_VI_READ_NODE_CHUNK(vi_node, neighbors[ i ], node_tape, &tape_pos, node_tape_size); + if(neighbors[ i ] >= nodes_nr) { + elog(ERROR, + "neighbors[%" PRIu32 "]=%" PRIu32 " >= nodes_nr=%" PRIu32 + " for " + "neighbors_nr=%" PRIu32 " neighbors_max=%" PRIu32 " level=%" PRIu32 + " tape_pos=%u node_tape_size=%u " + "node_id=%" PRIu32 " block=%" PRIu32 " offset=%" PRIu16, + i, + neighbors[ i ], + nodes_nr, + neighbors_nr, + neighbors_max, + level, + tape_pos, + node_tape_size, + vi_node->vn_id, + vi_node->vn_block, + vi_node->vn_offset); + } + } + for(uint32 i = neighbors_nr; i <
neighbors_max; ++i) + LDB_VI_READ_NODE_CHUNK(vi_node, unused, node_tape, &tape_pos, node_tape_size); + vi_node->vn_neighbors_nr[ level ] = neighbors_nr; + vi_node->vn_neighbors[ level ] = neighbors; + } + /* the vector of floats is at the end */ + tape_pos += vi_node->vn_dim; + if(tape_pos != node_tape_size) { + elog(ERROR, + "tape_pos=%u != node_tape_size=%u for " + "node_id=%" PRIu32 " block=%" PRIu32 " offset=%" PRIu16, + tape_pos, + node_tape_size, + vi_node->vn_id, + vi_node->vn_block, + vi_node->vn_offset); + } +} + +#undef LDB_VI_READ_NODE_CHUNK + +static void ldb_vi_read_nodes(Relation index, + const HnswIndexHeaderPage *index_header, + struct ldb_vi_block *vi_blocks, + BlockNumber blocks_nr, + struct ldb_vi_node *vi_nodes, + uint32 nodes_nr) +{ + /* see usearch_init_options_t.quantization in PopulateUsearchOpts() */ + const size_t scalar_size = sizeof(float); + + for(uint32_t i = 0; i < nodes_nr; ++i) { + if(vi_nodes[ i ].vn_block == InvalidBlockNumber) + elog(ERROR, "vi_nodes[%" PRIu32 "].vn_block == InvalidBlockNumber", i); /* print the node id; vn_block is InvalidBlockNumber here */ + } + for(BlockNumber block = 0; block < blocks_nr; ++block) { + if(vi_blocks[ block ].vp_type != LDB_VI_BLOCK_NODES) continue; + Buffer buf = ReadBuffer(index, block); + LockBuffer(buf, BUFFER_LOCK_SHARE); + Page page = BufferGetPage(buf); + + if(PageGetMaxOffsetNumber(page) < FirstOffsetNumber) + elog(ERROR, "block=%" PRIu32 " is supposed to have nodes but it doesn't have any", block); + + for(OffsetNumber offset = FirstOffsetNumber; offset <= PageGetMaxOffsetNumber(page); + offset = OffsetNumberNext(offset)) { + ItemId item_id = PageGetItemId(page, offset); + HnswIndexTuple *index_tuple = (HnswIndexTuple *)PageGetItem(page, item_id); + unsigned index_tuple_length = ItemIdGetLength(item_id); + uint32 node_id; + + if(sizeof(*index_tuple) > index_tuple_length) { + elog(ERROR, + "sizeof(*index_tuple)=%zu > index_tuple_length=%u for " + "block=%" PRIu32 " offset=%" PRIu16, + sizeof(*index_tuple), +
index_tuple_length, + block, + offset); + } + node_id = index_tuple->id; + if(node_id >= nodes_nr) { + elog(ERROR, + "node_id=%" PRIu32 " >= nodes_nr=%" PRIu32 + " for " + "block=%" PRIu32 " offset=%" PRIu16, + node_id, + nodes_nr, + block, + offset); + } + if(vi_nodes[ node_id ].vn_block != block) { + elog(ERROR, + "vi_nodes[%" PRIu32 "].vn_block=%" PRIu32 " != block=%" PRIu32 + " for " + "offset=%" PRIu16, + node_id, + vi_nodes[ node_id ].vn_block, + block, + offset); + } + if(vi_nodes[ node_id ].vn_offset != InvalidOffsetNumber) { + elog(ERROR, + "vi_nodes[%" PRIu32 "].vn_offset=%" PRIu32 " != InvalidOffsetNumber=%" PRIu32 + " for " + "block=%" PRIu32, + node_id, + vi_nodes[ node_id ].vn_offset, + InvalidOffsetNumber, + block); + } + if(sizeof(*index_tuple) + index_tuple->size != index_tuple_length) { + elog(ERROR, + "sizeof(*index_tuple)=%zu + index_tuple->size=%" PRIu32 + " != index_tuple_length=%u for " + "node_id=%" PRIu32 " nodes_nr=%" PRIu32 " block=%" PRIu32 " offset=%" PRIu16, + sizeof(*index_tuple), + index_tuple->size, + index_tuple_length, + node_id, + nodes_nr, + block, + offset); + } + vi_nodes[ node_id ].vn_offset = offset; + vi_nodes[ node_id ].vn_id = node_id; + vi_nodes[ node_id ].vn_level = index_tuple->level; + ldb_vi_read_node_carefully(&index_tuple->node, + index_tuple->size, + index_header->vector_dim, + scalar_size, + index_header->m, + &vi_nodes[ node_id ], + nodes_nr); + } + UnlockReleaseBuffer(buf); + } +} + +static void ldb_vi_print_statistics(struct ldb_vi_block *vi_blocks, + BlockNumber blocks_nr, + struct ldb_vi_node *vi_nodes, + uint32 nodes_nr) +{ + BlockNumber last_block = InvalidBlockNumber; + uint32 blocks_per_blocktype[ LDB_VI_BLOCK_NR ]; + uint32 min_nodes_per_block = UINT32_MAX; + uint32 max_nodes_per_block = 0; + uint32 max_level = 0; + uint32 *nodes_per_level; + uint64 *edges_per_level; + uint32 *min_neighbors_per_level; + uint32 *max_neighbors_per_level; + + bzero(&blocks_per_blocktype, sizeof(blocks_per_blocktype)); + 
for(BlockNumber block = 0; block < blocks_nr; ++block) ++blocks_per_blocktype[ vi_blocks[ block ].vp_type ]; + elog(INFO, + "blocks for: header %" PRIu32 " blockmap %" PRIu32 " nodes %" PRIu32, + blocks_per_blocktype[ LDB_VI_BLOCK_HEADER ], + blocks_per_blocktype[ LDB_VI_BLOCK_BLOCKMAP ], + blocks_per_blocktype[ LDB_VI_BLOCK_NODES ]); + + for(uint32 i = 0; i < nodes_nr; ++i) ++vi_blocks[ vi_nodes[ i ].vn_block ].vp_nodes_nr; + /* because in the next loop the condition is "block > 0" */ + ldb_invariant(vi_blocks[ 0 ].vp_type == LDB_VI_BLOCK_HEADER, "block 0 should be the header"); + for(BlockNumber block = blocks_nr - 1; block > 0; --block) { + if(vi_blocks[ block ].vp_type == LDB_VI_BLOCK_NODES) { + last_block = block; + break; + } + } + for(BlockNumber block = 0; block < blocks_nr; ++block) { + if(vi_blocks[ block ].vp_type == LDB_VI_BLOCK_NODES && block != last_block) { + min_nodes_per_block = Min(min_nodes_per_block, vi_blocks[ block ].vp_nodes_nr); + max_nodes_per_block = Max(max_nodes_per_block, vi_blocks[ block ].vp_nodes_nr); + } + } + if(blocks_per_blocktype[ LDB_VI_BLOCK_NODES ] == 0) { + elog(INFO, "nodes per block: 0 blocks with nodes"); + } else if(blocks_per_blocktype[ LDB_VI_BLOCK_NODES ] == 1) { + elog(INFO, "nodes per block: last block %" PRIu32, vi_blocks[ last_block ].vp_nodes_nr); + } else { + elog(INFO, + "nodes per block: min (except last) %" PRIu32 " max (except last) %" PRIu32 " last %" PRIu32, + min_nodes_per_block, + max_nodes_per_block, + vi_blocks[ last_block ].vp_nodes_nr); + } + + for(uint32 i = 0; i < nodes_nr; ++i) max_level = Max(max_level, vi_nodes[ i ].vn_level); + + nodes_per_level = palloc0_array(typeof(*nodes_per_level), max_level + 1); + edges_per_level = palloc0_array(typeof(*edges_per_level), max_level + 1); + min_neighbors_per_level = palloc0_array(typeof(*min_neighbors_per_level), max_level + 1); + max_neighbors_per_level = palloc0_array(typeof(*max_neighbors_per_level), max_level + 1); + for(uint32 level = 0; level <= 
max_level; ++level) { + min_neighbors_per_level[ level ] = UINT32_MAX; + max_neighbors_per_level[ level ] = 0; + } + for(uint32 i = 0; i < nodes_nr; ++i) { + struct ldb_vi_node *node = &vi_nodes[ i ]; + + ++nodes_per_level[ node->vn_level ]; + for(uint32 level = 0; level <= node->vn_level; ++level) { + edges_per_level[ level ] += node->vn_neighbors_nr[ level ]; + min_neighbors_per_level[ level ] = Min(min_neighbors_per_level[ level ], node->vn_neighbors_nr[ level ]); + max_neighbors_per_level[ level ] = Max(max_neighbors_per_level[ level ], node->vn_neighbors_nr[ level ]); + if(0) { + /* useful for debugging */ + for(uint32 n = 0; n < node->vn_neighbors_nr[ level ]; ++n) { + elog(INFO, + "node %" PRIu32 " level %" PRIu32 " neighbor %" PRIu32 ": %" PRIu32, + i, + level, + n, + node->vn_neighbors[ level ][ n ]); + } + } + } + } + for(uint32 level = 0; level <= max_level; ++level) { + if(min_neighbors_per_level[ level ] == UINT32_MAX) min_neighbors_per_level[ level ] = 0; + } + for(uint32 level = 0; level <= max_level; ++level) { + elog(INFO, + "level=%" PRIu32 ": nodes %" PRIu32 + " directed neighbor edges %lu " + "min neighbors %" PRIu32 " max neighbors %" PRIu32, + level, + nodes_per_level[ level ], + edges_per_level[ level ], + min_neighbors_per_level[ level ], + max_neighbors_per_level[ level ]); + } + pfree(max_neighbors_per_level); + pfree(min_neighbors_per_level); + pfree(edges_per_level); + pfree(nodes_per_level); +} + +void ldb_vi_free_neighbors(struct ldb_vi_node *vi_nodes, uint32 nodes_nr) +{ + for(uint32 i = 0; i < nodes_nr; ++i) { + struct ldb_vi_node *node = &vi_nodes[ i ]; + + for(uint32 level = 0; level <= node->vn_level; ++level) pfree(node->vn_neighbors[ level ]); + pfree(node->vn_neighbors); + pfree(node->vn_neighbors_nr); + } +} + +void ldb_validate_index(Oid indrelid, bool print_info) +{ + Relation index; + BlockNumber header_blockno = 0; + Buffer header_buf; + Page header_page; + HnswIndexHeaderPage *index_header; + MemoryContext memCtx; + 
MemoryContext saveCtx; + BlockNumber blocks_nr; + uint32 nodes_nr; + struct ldb_vi_block *vi_blocks; + struct ldb_vi_node *vi_nodes; + + /* the code here doesn't change the index, so AccessShareLock is enough */ + index = relation_open(indrelid, AccessShareLock); + + if(print_info) { + elog(INFO, "validate_index() start for %s with Oid=%u", RelationGetRelationName(index), indrelid); + } else { + elog(INFO, "validate_index() start for %s", RelationGetRelationName(index)); + } + memCtx = AllocSetContextCreate(CurrentMemoryContext, "hnsw validate_index context", ALLOCSET_DEFAULT_SIZES); + saveCtx = MemoryContextSwitchTo(memCtx); + + header_buf = ReadBuffer(index, header_blockno); + LockBuffer(header_buf, BUFFER_LOCK_SHARE); /* the header is only read here, so a shared content lock is enough */ + header_page = BufferGetPage(header_buf); + index_header = (HnswIndexHeaderPage *)PageGetContents(header_page); + if(index_header->magicNumber != LDB_WAL_MAGIC_NUMBER) { + elog(ERROR, + "Invalid HnswIndexHeaderPage.magicNumber (page %" PRIu32 ", got %x, expected %x)", + header_blockno, + index_header->magicNumber, + LDB_WAL_MAGIC_NUMBER); + } + if(index_header->m != (uint32)ldb_HnswGetM(index)) { + elog(ERROR, "index_header->m=%" PRIu32 " != ldb_HnswGetM(index)=%d", index_header->m, ldb_HnswGetM(index)); + } + if(print_info) { + elog(INFO, + "index_header = HnswIndexHeaderPage(" + "version=%" PRIu32 " vector_dim=%" PRIu32 " m=%" PRIu32 " ef_construction=%" PRIu32 " ef=%" PRIu32 + " metric_kind=%d num_vectors=%" PRIu32 " last_data_block=%" PRIu32 " blockmap_page_groups=%" PRIu32 ")", + index_header->version, + index_header->vector_dim, + index_header->m, + index_header->ef_construction, + index_header->ef, + index_header->metric_kind, + index_header->num_vectors, + index_header->last_data_block, + index_header->blockmap_page_groups); + } + + blocks_nr = RelationGetNumberOfBlocksInFork(index, MAIN_FORKNUM); + nodes_nr = index_header->num_vectors; + if(print_info) { + elog(INFO, "blocks_nr=%" PRIu32 " nodes_nr=%" PRIu32, blocks_nr, nodes_nr); + }
+ /* TODO check nodes_nr against index_header->blockmap_page_groups */ + + vi_blocks = palloc0_array(typeof(*vi_blocks), blocks_nr); + vi_nodes = palloc0_array(typeof(*vi_nodes), nodes_nr); + for(uint32 i = 0; i < nodes_nr; ++i) { + vi_nodes[ i ].vn_block = InvalidBlockNumber; + vi_nodes[ i ].vn_offset = InvalidOffsetNumber; + } + + ldb_vi_read_blockmaps(index, index_header, vi_blocks, blocks_nr, vi_nodes, nodes_nr); + for(BlockNumber block = 0; block < blocks_nr; ++block) { + if(vi_blocks[ block ].vp_type == LDB_VI_BLOCK_UNKNOWN) { + elog(ERROR, "vi_blocks[%" PRIu32 "].vp_type == LDB_VI_BLOCK_UNKNOWN (but it should be known now)", block); + } + } + ldb_vi_read_nodes(index, index_header, vi_blocks, blocks_nr, vi_nodes, nodes_nr); + if(print_info) ldb_vi_print_statistics(vi_blocks, blocks_nr, vi_nodes, nodes_nr); + + ldb_vi_free_neighbors(vi_nodes, nodes_nr); + pfree(vi_nodes); + pfree(vi_blocks); + + UnlockReleaseBuffer(header_buf); + MemoryContextSwitchTo(saveCtx); + MemoryContextDelete(memCtx); + elog(INFO, "validate_index() done, no issues found."); + relation_close(index, AccessShareLock); +} diff --git a/src/hnsw/validate_index.h b/src/hnsw/validate_index.h new file mode 100644 index 000000000..8dba6ef8c --- /dev/null +++ b/src/hnsw/validate_index.h @@ -0,0 +1,25 @@ +#ifndef LDB_HNSW_VALIDATE_INDEX_H +#define LDB_HNSW_VALIDATE_INDEX_H + +#include + +/* + * This function checks integrity of the data structures in the index relation. + * + * How it works: + * - it creates ldb_vi_block for each block of the index relation and + * ldb_vi_node for each node inside the index relation; + * - it loads all blockmap groups and analyzes mappings between nodes and + * blocks; + * - it loads all the nodes with their neighbors; + * - it also prints statistics about blocks and nodes, which is useful for + * understanding of what's inside the index; + * - it assumes that PostgreSQL-level data structures are intact (i.e. 
the page + * header and the mapping between offsets and items is correct for each page); + * - in case if a corruption of the data structure is found the function prints + * an error message with details about the place and surrounding data + * structures. + */ +void ldb_validate_index(Oid indrelid, bool print_info); + +#endif diff --git a/test/expected/ext_relocation.out b/test/expected/ext_relocation.out index c5e0db07c..9dc24635e 100644 --- a/test/expected/ext_relocation.out +++ b/test/expected/ext_relocation.out @@ -34,14 +34,15 @@ WHERE d.deptype = 'e' AND e.extname = 'lantern' ORDER BY 1, 3; extschema | proname | proschema -----------+------------------------------+------------------- + schema1 | validate_index | _lantern_internal schema1 | _create_ldb_operator_classes | _lantern_internal schema1 | ldb_generic_dist | schema1 - schema1 | ldb_generic_dist | schema1 + schema1 | l2sq_dist | schema1 schema1 | hnsw_handler | schema1 - schema1 | cos_dist | schema1 schema1 | hamming_dist | schema1 - schema1 | l2sq_dist | schema1 -(7 rows) + schema1 | cos_dist | schema1 + schema1 | ldb_generic_dist | schema1 +(8 rows) -- show all the extension operators SELECT ne.nspname AS extschema, op.oprname, np.nspname AS proschema @@ -70,6 +71,14 @@ CREATE INDEX hnsw_index ON small_world USING hnsw(v) WITH (dim=3); INFO: done init usearch index INFO: inserted 8 elements INFO: done saving 8 vectors +SELECT _lantern_internal.validate_index('hnsw_index', false); +INFO: validate_index() start for hnsw_index +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + \set ON_ERROR_STOP off -- lantern does not support relocation. 
-- Postgres will not allow it to support this since its objects span over more than one schema @@ -99,6 +108,14 @@ CREATE INDEX hnsw_index2 ON small_world USING hnsw(v) WITH (dim=3); INFO: done init usearch index INFO: inserted 8 elements INFO: done saving 8 vectors +SELECT _lantern_internal.validate_index('hnsw_index2', false); +INFO: validate_index() start for hnsw_index2 +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + \set ON_ERROR_STOP off -- extension function cannot be found without schema-qualification SELECT l2sq_dist(ARRAY[1.0, 2.0, 3.0], ARRAY[4.0, 5.0, 6.0]); diff --git a/test/expected/hnsw_config.out b/test/expected/hnsw_config.out index 9069e6c1e..02b2c9120 100644 --- a/test/expected/hnsw_config.out +++ b/test/expected/hnsw_config.out @@ -52,3 +52,12 @@ SHOW hnsw.init_k; 10 (1 row) +-- Validate the index data structures +SELECT _lantern_internal.validate_index('small_world_v_idx', false); +INFO: validate_index() start for small_world_v_idx +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + diff --git a/test/expected/hnsw_correct.out b/test/expected/hnsw_correct.out index 2719efe09..05eb5483d 100644 --- a/test/expected/hnsw_correct.out +++ b/test/expected/hnsw_correct.out @@ -47,3 +47,12 @@ WHERE ---------+---------------+------------------+-----------------+-------------------- (0 rows) +-- Validate the index data structures +SELECT _lantern_internal.validate_index('small_world_v_idx', false); +INFO: validate_index() start for small_world_v_idx +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + diff --git a/test/expected/hnsw_cost_estimate.out b/test/expected/hnsw_cost_estimate.out index 7940f805e..7f91ce2e9 100644 --- a/test/expected/hnsw_cost_estimate.out +++ b/test/expected/hnsw_cost_estimate.out @@ -69,6 +69,14 @@ DEBUG: LANTERN - --------------------- t (1 row) +SELECT _lantern_internal.validate_index('empty_idx', false); +INFO: validate_index() start for empty_idx +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + DROP INDEX empty_idx; -- Case 1, more data in index. -- Should see higher cost than Case 0. @@ -89,6 +97,14 @@ DEBUG: LANTERN - --------------------- t (1 row) +SELECT _lantern_internal.validate_index('hnsw_idx', false); +INFO: validate_index() start for hnsw_idx +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + DROP INDEX hnsw_idx; -- Case 2, higher M. -- Should see higher cost than Case 1. @@ -109,6 +125,14 @@ DEBUG: LANTERN - --------------------- t (1 row) +SELECT _lantern_internal.validate_index('hnsw_idx', false); +INFO: validate_index() start for hnsw_idx +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + DROP INDEX hnsw_idx; -- Case 3, higher ef. -- Should see higher cost than Case 2. @@ -129,4 +153,12 @@ DEBUG: LANTERN - --------------------- t (1 row) +SELECT _lantern_internal.validate_index('hnsw_idx', false); +INFO: validate_index() start for hnsw_idx +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + DROP INDEX hnsw_idx; diff --git a/test/expected/hnsw_create.out b/test/expected/hnsw_create.out index 28eb6a63a..84b77c3fd 100644 --- a/test/expected/hnsw_create.out +++ b/test/expected/hnsw_create.out @@ -34,6 +34,14 @@ SELECT * FROM ldb_get_indexes('sift_base1k'); sift_base1k_v_idx | 632 kB | CREATE INDEX sift_base1k_v_idx ON public.sift_base1k USING hnsw (v) WITH (dim='128', m='4') | 632 kB (1 row) +SELECT _lantern_internal.validate_index('sift_base1k_v_idx', false); +INFO: validate_index() start for sift_base1k_v_idx +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + -- Validate that index creation works with a larger number of vectors \ir utils/sift10k_array.sql CREATE TABLE IF NOT EXISTS sift_base10k ( @@ -54,6 +62,14 @@ EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v <-> :'v4444' LIMIT 1 Order By: (v <-> '{55,61,11,4,5,2,13,24,65,49,13,9,23,37,94,38,54,11,14,14,40,31,50,44,53,4,0,0,27,17,8,34,12,10,4,4,22,52,68,53,9,2,0,0,2,116,119,64,119,2,0,0,2,30,119,119,116,5,0,8,47,9,5,60,7,7,10,23,56,50,23,5,28,68,6,18,24,65,50,9,119,75,3,0,1,8,12,85,119,11,4,6,8,9,5,74,25,11,8,20,18,12,2,21,11,90,25,32,33,15,2,9,84,67,8,4,22,31,11,33,119,30,3,6,0,0,0,26}'::real[]) (3 rows) +SELECT _lantern_internal.validate_index('hnsw_idx', false); +INFO: validate_index() start for hnsw_idx +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + --- Validate that M values inside the allowed range [2, 128] do not throw an error CREATE INDEX ON small_world USING hnsw (v) WITH (M=2); INFO: done init usearch index @@ -117,6 +133,14 @@ INSERT INTO small_world4 (id, vector) VALUES ('000', '{1,0,0,0}'), ('001', '{1,0,0,1}'), ('010', '{1,0,1,0}'); +SELECT _lantern_internal.validate_index('small_world4_hnsw_idx', false); +INFO: validate_index() start for small_world4_hnsw_idx +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + -- without the index, I can change the dimension of a vector element DROP INDEX small_world4_hnsw_idx; UPDATE small_world4 SET vector = '{0,0,0}' WHERE id = '001'; diff --git a/test/expected/hnsw_create_expr.out b/test/expected/hnsw_create_expr.out index 70a942c85..7527d2080 100644 --- a/test/expected/hnsw_create_expr.out +++ b/test/expected/hnsw_create_expr.out @@ -68,6 +68,14 @@ CREATE INDEX ON test_table USING hnsw (int_to_fixed_binary_real_array(id)) WITH INFO: done init usearch index INFO: inserted 3 elements INFO: done saving 3 vectors +SELECT _lantern_internal.validate_index('test_table_int_to_fixed_binary_real_array_idx', false); +INFO: validate_index() start for test_table_int_to_fixed_binary_real_array_idx +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + \set ON_ERROR_STOP off -- This should result in an error that dimensions does not match CREATE INDEX ON test_table USING hnsw (int_to_dynamic_binary_real_array(id)) WITH (M=2); diff --git a/test/expected/hnsw_dist_func.out b/test/expected/hnsw_dist_func.out index 3006d4ffd..a649c6dd1 100644 --- a/test/expected/hnsw_dist_func.out +++ b/test/expected/hnsw_dist_func.out @@ -239,3 +239,35 @@ SELECT ROUND(hamming_dist(v, '{0,0}')::numeric, 2) FROM extra_small_world_ham OR 4.00 (4 rows) +SELECT _lantern_internal.validate_index('small_world_l2_v_idx', false); +INFO: validate_index() start for small_world_l2_v_idx +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + +SELECT _lantern_internal.validate_index('small_world_cos_v_idx', false); +INFO: validate_index() start for small_world_cos_v_idx +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + +SELECT _lantern_internal.validate_index('small_world_ham_v_idx', false); +INFO: validate_index() start for small_world_ham_v_idx +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + +SELECT _lantern_internal.validate_index('extra_small_world_ham_v_idx', false); +INFO: validate_index() start for extra_small_world_ham_v_idx +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + diff --git a/test/expected/hnsw_index_from_file.out b/test/expected/hnsw_index_from_file.out index b05f0482a..918f49915 100644 --- a/test/expected/hnsw_index_from_file.out +++ b/test/expected/hnsw_index_from_file.out @@ -30,6 +30,14 @@ CREATE INDEX hnsw_l2_index ON sift_base1k USING hnsw (v) WITH (_experimental_ind INFO: done init usearch index INFO: done loading usearch index INFO: done saving 1000 vectors +SELECT _lantern_internal.validate_index('hnsw_l2_index', false); +INFO: validate_index() start for hnsw_l2_index +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + SELECT * FROM ldb_get_indexes('sift_base1k'); indexname | size | indexdef | total_index_size ---------------+--------+----------------------------------------------------------------------------------------------------------------------------------------------+------------------ @@ -94,6 +102,14 @@ CREATE INDEX hnsw_cos_index ON sift_base1k USING hnsw (v) WITH (_experimental_in INFO: done init usearch index INFO: done loading usearch index INFO: done saving 1000 vectors +SELECT _lantern_internal.validate_index('hnsw_cos_index', false); +INFO: validate_index() start for hnsw_cos_index +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + SELECT * FROM ldb_get_indexes('sift_base1k'); indexname | size | indexdef | total_index_size ----------------+--------+------------------------------------------------------------------------------------------------------------------------------------------------+------------------ @@ -142,6 +158,14 @@ CREATE INDEX hnsw_l2_index ON sift_base1k USING hnsw (v) WITH (_experimental_ind INFO: done init usearch index INFO: done loading usearch index INFO: done saving 1000 vectors +SELECT _lantern_internal.validate_index('hnsw_l2_index', false); +INFO: validate_index() start for hnsw_l2_index +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + -- This should not throw error, but the first result will not be 0 as vector 777 is deleted from the table SELECT ROUND(l2sq_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v <-> :'v777' LIMIT 10; round diff --git a/test/expected/hnsw_insert.out b/test/expected/hnsw_insert.out index 6140c130b..4ceb8896c 100644 --- a/test/expected/hnsw_insert.out +++ b/test/expected/hnsw_insert.out @@ -15,6 +15,14 @@ CREATE INDEX ON small_world USING hnsw (v) WITH (dim=3); INFO: done init usearch index INFO: inserted 0 elements INFO: done saving 0 vectors +SELECT _lantern_internal.validate_index('small_world_v_idx', false); +INFO: validate_index() start for small_world_v_idx +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + -- Insert rows with valid vector data INSERT INTO small_world (v) VALUES ('{0,0,1}'), ('{0,1,0}'); INSERT INTO small_world (v) VALUES (NULL); @@ -101,6 +109,14 @@ LIMIT 10; Order By: (v <-> '{0,0,0}'::real[]) (3 rows) +SELECT _lantern_internal.validate_index('small_world_v_idx', false); +INFO: validate_index() start for small_world_v_idx +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + -- Test the index with a larger number of vectors CREATE TABLE sift_base10k ( id SERIAL PRIMARY KEY, @@ -112,10 +128,18 @@ INFO: inserted 0 elements INFO: done saving 0 vectors \COPY sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' WITH CSV; SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset -EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v <-> :'v4444' +EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v <-> :'v4444'; QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Index Scan using hnsw_idx on sift_base10k Order By: (v <-> '{55,61,11,4,5,2,13,24,65,49,13,9,23,37,94,38,54,11,14,14,40,31,50,44,53,4,0,0,27,17,8,34,12,10,4,4,22,52,68,53,9,2,0,0,2,116,119,64,119,2,0,0,2,30,119,119,116,5,0,8,47,9,5,60,7,7,10,23,56,50,23,5,28,68,6,18,24,65,50,9,119,75,3,0,1,8,12,85,119,11,4,6,8,9,5,74,25,11,8,20,18,12,2,21,11,90,25,32,33,15,2,9,84,67,8,4,22,31,11,33,119,30,3,6,0,0,0,26}'::real[]) (2 rows) +SELECT _lantern_internal.validate_index('hnsw_idx', false); +INFO: validate_index() start for hnsw_idx +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + diff --git a/test/expected/hnsw_select.out b/test/expected/hnsw_select.out index 8ca6810fd..1fcc450a7 100644 --- a/test/expected/hnsw_select.out +++ b/test/expected/hnsw_select.out @@ -206,3 +206,28 @@ SELECT has_index_scan('EXPLAIN WITH t AS (SELECT id FROM test1 ORDER BY v <-> '' -- todo:: Verify joins work and still use index -- todo:: Verify incremental sorts work +-- Validate index data structures +SELECT _lantern_internal.validate_index('small_world_v_idx', false); +INFO: validate_index() start for small_world_v_idx +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + +SELECT _lantern_internal.validate_index('sift_base1k_v_idx', false); +INFO: validate_index() start for sift_base1k_v_idx +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + +SELECT _lantern_internal.validate_index('test1_v_idx', false); +INFO: validate_index() start for test1_v_idx +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + diff --git a/test/expected/hnsw_todo.out b/test/expected/hnsw_todo.out index 70f0824ee..cb3a610cd 100644 --- a/test/expected/hnsw_todo.out +++ b/test/expected/hnsw_todo.out @@ -20,9 +20,19 @@ CREATE INDEX ON small_world_l2 USING hnsw (vector dist_l2sq_ops); INFO: done init usearch index INFO: inserted 8 elements INFO: done saving 8 vectors +SELECT _lantern_internal.validate_index('small_world_l2_vector_idx', false); +INFO: validate_index() start for small_world_l2_vector_idx +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + -- this should be supported CREATE INDEX ON small_world_l2 USING hnsw (vector_int dist_l2sq_int_ops); ERROR: operator class "dist_l2sq_int_ops" does not exist for access method "hnsw" +SELECT _lantern_internal.validate_index('small_world_l2_vector_int_idx', false); +ERROR: relation "small_world_l2_vector_int_idx" does not exist at character 41 -- this should use index EXPLAIN (COSTS FALSE) SELECT id, ROUND(l2sq_dist(vector_int, array[0,1,0])::numeric, 2) as dist @@ -52,6 +62,14 @@ CREATE INDEX hnsw_l2_index ON sift_base1k USING hnsw (v) WITH (_experimental_ind INFO: done init usearch index INFO: done loading usearch index INFO: done saving 1000 vectors +SELECT _lantern_internal.validate_index('hnsw_l2_index', false); +INFO: validate_index() start for hnsw_l2_index +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + -- The 1001 and 1002 vectors will be ignored in search, so the first row will not be 0 in result SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 1; round @@ -75,6 +93,14 @@ CREATE INDEX hnsw_l2_index ON sift_base1k USING hnsw (v) WITH (_experimental_ind INFO: done init usearch index INFO: done loading usearch index INFO: done saving 1000 vectors +SELECT _lantern_internal.validate_index('hnsw_l2_index', false); +INFO: validate_index() start for hnsw_l2_index +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + -- The first row will not be 0 now as the vector under id=777 was updated to 1,1,1,1... but it was indexed with different vector -- So the usearch index can not find 1,1,1,1,1.. 
vector in the index and wrong results will be returned -- This is an expected behaviour for now diff --git a/test/sql/ext_relocation.sql b/test/sql/ext_relocation.sql index 1748da2f9..4f2f8d8fb 100644 --- a/test/sql/ext_relocation.sql +++ b/test/sql/ext_relocation.sql @@ -37,6 +37,7 @@ SET search_path TO public, schema1; SELECT l2sq_dist(ARRAY[1.0, 2.0, 3.0], ARRAY[4.0, 5.0, 6.0]); CREATE INDEX hnsw_index ON small_world USING hnsw(v) WITH (dim=3); +SELECT _lantern_internal.validate_index('hnsw_index', false); \set ON_ERROR_STOP off -- lantern does not support relocation. @@ -58,6 +59,7 @@ ORDER BY 1, 3; SET search_path TO public, schema2; --extension access method is still accessible since access methods are not schema-qualified CREATE INDEX hnsw_index2 ON small_world USING hnsw(v) WITH (dim=3); +SELECT _lantern_internal.validate_index('hnsw_index2', false); \set ON_ERROR_STOP off -- extension function cannot be found without schema-qualification diff --git a/test/sql/hnsw_config.sql b/test/sql/hnsw_config.sql index a2660e662..4240c535b 100644 --- a/test/sql/hnsw_config.sql +++ b/test/sql/hnsw_config.sql @@ -19,3 +19,6 @@ SHOW hnsw.init_k; -- Reset all parameters and verify that hnsw.init_k was reset RESET ALL; SHOW hnsw.init_k; + +-- Validate the index data structures +SELECT _lantern_internal.validate_index('small_world_v_idx', false); diff --git a/test/sql/hnsw_correct.sql b/test/sql/hnsw_correct.sql index ef07aebc6..14fed457a 100644 --- a/test/sql/hnsw_correct.sql +++ b/test/sql/hnsw_correct.sql @@ -44,4 +44,7 @@ JOIN results_wo_index b USING (row_num) WHERE - a.id != b.id; \ No newline at end of file + a.id != b.id; + +-- Validate the index data structures +SELECT _lantern_internal.validate_index('small_world_v_idx', false); diff --git a/test/sql/hnsw_cost_estimate.sql b/test/sql/hnsw_cost_estimate.sql index e38d18ff6..59e08dac4 100644 --- a/test/sql/hnsw_cost_estimate.sql +++ b/test/sql/hnsw_cost_estimate.sql @@ -55,22 +55,26 @@ CREATE TABLE empty_table(id 
SERIAL PRIMARY KEY, v REAL[2]); CREATE INDEX empty_idx ON empty_table USING hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=2, dim=2); SET _lantern_internal.is_test = true; SELECT is_cost_estimate_within_error('EXPLAIN SELECT * FROM empty_table ORDER BY v <-> ''{1,2}'' LIMIT 10', 0.47); +SELECT _lantern_internal.validate_index('empty_idx', false); DROP INDEX empty_idx; -- Case 1, more data in index. -- Should see higher cost than Case 0. CREATE INDEX hnsw_idx ON sift_base10k USING hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=4, dim=128); SELECT is_cost_estimate_within_error(format(:'explain_query_template', :'v4444'), 3.00); +SELECT _lantern_internal.validate_index('hnsw_idx', false); DROP INDEX hnsw_idx; -- Case 2, higher M. -- Should see higher cost than Case 1. CREATE INDEX hnsw_idx ON sift_base10k USING hnsw (v dist_l2sq_ops) WITH (M=20, ef_construction=10, ef=4, dim=128); SELECT is_cost_estimate_within_error(format(:'explain_query_template', :'v4444'), 3.27); +SELECT _lantern_internal.validate_index('hnsw_idx', false); DROP INDEX hnsw_idx; -- Case 3, higher ef. -- Should see higher cost than Case 2. 
CREATE INDEX hnsw_idx ON sift_base10k USING hnsw (v dist_l2sq_ops) WITH (M=20, ef_construction=10, ef=16, dim=128); SELECT is_cost_estimate_within_error(format(:'explain_query_template', :'v4444'), 3.91); +SELECT _lantern_internal.validate_index('hnsw_idx', false); DROP INDEX hnsw_idx; diff --git a/test/sql/hnsw_create.sql b/test/sql/hnsw_create.sql index f85713458..f0082d8be 100644 --- a/test/sql/hnsw_create.sql +++ b/test/sql/hnsw_create.sql @@ -9,12 +9,14 @@ -- Validate that creating a secondary index works CREATE INDEX ON sift_base1k USING hnsw (v) WITH (dim=128, M=4); SELECT * FROM ldb_get_indexes('sift_base1k'); +SELECT _lantern_internal.validate_index('sift_base1k_v_idx', false); -- Validate that index creation works with a larger number of vectors \ir utils/sift10k_array.sql CREATE INDEX hnsw_idx ON sift_base10k USING hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=4, dim=128); SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v <-> :'v4444' LIMIT 10; +SELECT _lantern_internal.validate_index('hnsw_idx', false); --- Validate that M values inside the allowed range [2, 128] do not throw an error @@ -65,6 +67,8 @@ INSERT INTO small_world4 (id, vector) VALUES ('001', '{1,0,0,1}'), ('010', '{1,0,1,0}'); +SELECT _lantern_internal.validate_index('small_world4_hnsw_idx', false); + -- without the index, I can change the dimension of a vector element DROP INDEX small_world4_hnsw_idx; UPDATE small_world4 SET vector = '{0,0,0}' WHERE id = '001'; diff --git a/test/sql/hnsw_create_expr.sql b/test/sql/hnsw_create_expr.sql index ecabfbaf9..cae3ad888 100644 --- a/test/sql/hnsw_create_expr.sql +++ b/test/sql/hnsw_create_expr.sql @@ -71,6 +71,7 @@ INSERT INTO test_table VALUES (0), (1), (7); -- This should success CREATE INDEX ON test_table USING hnsw (int_to_fixed_binary_real_array(id)) WITH (M=2); +SELECT _lantern_internal.validate_index('test_table_int_to_fixed_binary_real_array_idx', false); \set 
ON_ERROR_STOP off -- This should result in an error that dimensions does not match diff --git a/test/sql/hnsw_dist_func.sql b/test/sql/hnsw_dist_func.sql index c2767c4a4..c7b908cf8 100644 --- a/test/sql/hnsw_dist_func.sql +++ b/test/sql/hnsw_dist_func.sql @@ -98,3 +98,8 @@ CREATE TABLE extra_small_world_ham ( INSERT INTO extra_small_world_ham (v) VALUES ('{0,0}'), ('{1,1}'), ('{2,2}'), ('{3,3}'); CREATE INDEX ON extra_small_world_ham USING hnsw (v dist_hamming_ops) WITH (dim=2); SELECT ROUND(hamming_dist(v, '{0,0}')::numeric, 2) FROM extra_small_world_ham ORDER BY v <-> '{0,0}'; + +SELECT _lantern_internal.validate_index('small_world_l2_v_idx', false); +SELECT _lantern_internal.validate_index('small_world_cos_v_idx', false); +SELECT _lantern_internal.validate_index('small_world_ham_v_idx', false); +SELECT _lantern_internal.validate_index('extra_small_world_ham_v_idx', false); diff --git a/test/sql/hnsw_index_from_file.sql b/test/sql/hnsw_index_from_file.sql index a2e72adf6..863e6d181 100644 --- a/test/sql/hnsw_index_from_file.sql +++ b/test/sql/hnsw_index_from_file.sql @@ -18,6 +18,7 @@ CREATE INDEX hnsw_l2_index ON sift_base1k USING hnsw (v) WITH (_experimental_ind \set ON_ERROR_STOP on -- Validate that creating an index from file works CREATE INDEX hnsw_l2_index ON sift_base1k USING hnsw (v) WITH (_experimental_index_path='/tmp/lantern/files/index-sift1k-l2.usearch'); +SELECT _lantern_internal.validate_index('hnsw_l2_index', false); SELECT * FROM ldb_get_indexes('sift_base1k'); SET enable_seqscan = false; @@ -38,6 +39,7 @@ DROP TABLE sift_base1k CASCADE; -- Validate that creating an index from file works with cosine distance function CREATE INDEX hnsw_cos_index ON sift_base1k USING hnsw (v) WITH (_experimental_index_path='/tmp/lantern/files/index-sift1k-cos.usearch'); +SELECT _lantern_internal.validate_index('hnsw_cos_index', false); SELECT * FROM ldb_get_indexes('sift_base1k'); SELECT v AS v777 FROM sift_base1k WHERE id = 777 \gset @@ -55,5 +57,6 @@ DROP TABLE 
sift_base1k CASCADE; \ir utils/sift1k_array.sql DELETE FROM sift_base1k WHERE id=777; CREATE INDEX hnsw_l2_index ON sift_base1k USING hnsw (v) WITH (_experimental_index_path='/tmp/lantern/files/index-sift1k-l2.usearch'); +SELECT _lantern_internal.validate_index('hnsw_l2_index', false); -- This should not throw error, but the first result will not be 0 as vector 777 is deleted from the table SELECT ROUND(l2sq_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v <-> :'v777' LIMIT 10; diff --git a/test/sql/hnsw_insert.sql b/test/sql/hnsw_insert.sql index 39e72187b..96f931a88 100644 --- a/test/sql/hnsw_insert.sql +++ b/test/sql/hnsw_insert.sql @@ -13,6 +13,7 @@ CREATE TABLE small_world ( v REAL[2] ); CREATE INDEX ON small_world USING hnsw (v) WITH (dim=3); +SELECT _lantern_internal.validate_index('small_world_v_idx', false); -- Insert rows with valid vector data INSERT INTO small_world (v) VALUES ('{0,0,1}'), ('{0,1,0}'); @@ -69,6 +70,8 @@ ORDER BY v <-> '{0,0,0}' LIMIT 10; +SELECT _lantern_internal.validate_index('small_world_v_idx', false); + -- Test the index with a larger number of vectors CREATE TABLE sift_base10k ( id SERIAL PRIMARY KEY, @@ -77,4 +80,6 @@ CREATE TABLE sift_base10k ( CREATE INDEX hnsw_idx ON sift_base10k USING hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=4, dim=128); \COPY sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' WITH CSV; SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset -EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v <-> :'v4444' +EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v <-> :'v4444'; + +SELECT _lantern_internal.validate_index('hnsw_idx', false); diff --git a/test/sql/hnsw_select.sql b/test/sql/hnsw_select.sql index 6143407e2..d0d0efecd 100644 --- a/test/sql/hnsw_select.sql +++ b/test/sql/hnsw_select.sql @@ -73,3 +73,8 @@ SELECT has_index_scan('EXPLAIN WITH t AS (SELECT id FROM test1 ORDER BY v <-> '' -- todo:: Verify joins work and still use index -- 
todo:: Verify incremental sorts work + +-- Validate index data structures +SELECT _lantern_internal.validate_index('small_world_v_idx', false); +SELECT _lantern_internal.validate_index('sift_base1k_v_idx', false); +SELECT _lantern_internal.validate_index('test1_v_idx', false); diff --git a/test/sql/hnsw_todo.sql b/test/sql/hnsw_todo.sql index 07753c60d..9a6e637b1 100644 --- a/test/sql/hnsw_todo.sql +++ b/test/sql/hnsw_todo.sql @@ -22,9 +22,11 @@ SET enable_seqscan = false; \set ON_ERROR_STOP off CREATE INDEX ON small_world_l2 USING hnsw (vector dist_l2sq_ops); +SELECT _lantern_internal.validate_index('small_world_l2_vector_idx', false); -- this should be supported CREATE INDEX ON small_world_l2 USING hnsw (vector_int dist_l2sq_int_ops); +SELECT _lantern_internal.validate_index('small_world_l2_vector_int_idx', false); -- this should use index EXPLAIN (COSTS FALSE) @@ -48,6 +50,7 @@ INSERT INTO sift_base1k (id, v) VALUES (1102, array_fill(2, ARRAY[128])); SELECT v AS v1001 FROM sift_base1k WHERE id = 1001 \gset CREATE INDEX hnsw_l2_index ON sift_base1k USING hnsw (v) WITH (_experimental_index_path='/tmp/lantern/files/index-sift1k-l2.usearch'); +SELECT _lantern_internal.validate_index('hnsw_l2_index', false); -- The 1001 and 1002 vectors will be ignored in search, so the first row will not be 0 in result SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 1; @@ -59,6 +62,7 @@ DROP TABLE sift_base1k CASCADE; \ir utils/sift1k_array.sql UPDATE sift_base1k SET v=:'v1001' WHERE id=777; CREATE INDEX hnsw_l2_index ON sift_base1k USING hnsw (v) WITH (_experimental_index_path='/tmp/lantern/files/index-sift1k-l2.usearch'); +SELECT _lantern_internal.validate_index('hnsw_l2_index', false); -- The first row will not be 0 now as the vector under id=777 was updated to 1,1,1,1... but it was indexed with different vector -- So the usearch index can not find 1,1,1,1,1.. 
vector in the index and wrong results will be returned -- This is an expected behaviour for now