From 3bf70237684b4a5ee5d5ada0f8cd88e174165cd5 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Wed, 26 Jul 2023 18:53:26 -0700 Subject: [PATCH] Bidirectional Iterators (#588) * tweak iterator api to make it easier to add bidirectionality * debugging btree reverse iteration * reduce time we hold locks during btree_split_child_leaf * further refine the locking in btree node splits and fix reverse iterator bug * btree iterator init at key other than min and add btree_iterator_seek * splinterdb_iterator_prev implmentated and working * clang formatting * improve the trunk iterator logic * corrections for pull request * more pull request fixes * assert fix * more pull request feedback * iterator stress test, bug fixes, formatting * final bit of pr feedback * formatting --------- Co-authored-by: Evan West --- include/splinterdb/splinterdb.h | 28 +- src/btree.c | 543 +++++++++++++++++++------- src/btree.h | 7 +- src/btree_private.h | 1 + src/iterator.h | 68 +++- src/merge.c | 382 ++++++++++++------ src/merge.h | 4 +- src/routing_filter.c | 10 +- src/shard_log.c | 35 +- src/splinterdb.c | 41 +- src/trunk.c | 431 +++++++++++++------- src/trunk.h | 8 +- tests/functional/btree_test.c | 81 ++-- tests/functional/log_test.c | 10 +- tests/functional/test_functionality.c | 50 +-- tests/unit/btree_stress_test.c | 192 +++++++-- tests/unit/btree_test.c | 2 +- tests/unit/splinterdb_quick_test.c | 125 +++++- tests/unit/splinterdb_stress_test.c | 76 ++++ 19 files changed, 1537 insertions(+), 557 deletions(-) diff --git a/include/splinterdb/splinterdb.h b/include/splinterdb/splinterdb.h index 7277cb29b..ccb125643 100644 --- a/include/splinterdb/splinterdb.h +++ b/include/splinterdb/splinterdb.h @@ -330,12 +330,36 @@ splinterdb_iterator_deinit(splinterdb_iterator *iter); // Checks that the iterator status is OK (no errors) and that get_current() // will succeed. If false, there are two possibilities: -// 1. Iterator has passed the final item. In this case, status() == 0 +// 1. 
Iterator is out of bounds. In this case, status() == 0 // 2. Iterator has encountered an error. In this case, status() != 0 _Bool splinterdb_iterator_valid(splinterdb_iterator *iter); -// Attempts to advance the iterator to the next item. +/* + * splinterdb_iterator_can_next -- + * splinterdb_iterator_can_prev -- + * + * Knowing the iterator is invalid does not provide enough information to + * determine if next and prev are safe operations. + * + * These functions provide granular information on which operations are safe. + * splinterdb_iterator_can_next == TRUE <-> splinterdb_iterator_next is safe + * splinterdb_iterator_can_prev == TRUE <-> splinterdb_iterator_prev is safe + */ +_Bool +splinterdb_iterator_can_prev(splinterdb_iterator *iter); + +_Bool +splinterdb_iterator_can_next(splinterdb_iterator *iter); + +// Moves the iterator to the previous item. +// Precondition for calling: splinterdb_iterator_can_prev() returns TRUE +// Any error will cause valid() == false and be visible with status() +void +splinterdb_iterator_prev(splinterdb_iterator *iter); + +// Moves the iterator to the next item. +// Precondition for calling: splinterdb_iterator_can_next() returns TRUE // Any error will cause valid() == false and be visible with status() void splinterdb_iterator_next(splinterdb_iterator *iter); diff --git a/src/btree.c b/src/btree.c index fbdbc5a5b..c2999ea2e 100644 --- a/src/btree.c +++ b/src/btree.c @@ -44,6 +44,43 @@ * ***************************************************************** */ +/* + * ***************************************************************** + * Locking rules for BTree: + * 1. Locks must be acquired in the following order: read->claim->write + * 2. If a thread holds two locks, it must hold the lock that dominates + * both locks. For instance, if it has locked two children, we must + * lock their parent. + * 3. 
Threads may traverse from one node to the next without acquiring + * a lock upon the node that dominates them by first releasing the + * held lock and then taking a leap of faith by acquiring the lock + * on the second. + * 4. They may also traverse down the tree to a single leaf using hand + * over hand locking. + * 5. All threads follow the locking patterns in 3 or 4. They only hold + * a single lock at a time. + * + * Exceptions to these rules: + * 1. find_btree_node_and_get_idx_bounds(): To find the end_idx of the + * range iterator, we acquire a read lock on the leaf which holds + * the max_key. However, at the same time we hold a claim on the + * current leaf. We may not be holding the node that dominates these + * two leaves. + * 2. btree_split_child_leaf(): When splitting a leaf we hold write locks + * on the leaf we're splitting, its parent, and its original next leaf. + * However, this next leaf may have a different parent than the leaf + * we split. + * + * Why are these exceptions okay: + * Because by (5) we know that every other thread is holding only a single + * lock or is either an iterator finding the end_idx or performing a split. + * In either of these exception cases, we always acquire locks in increasing + * leaf order, thus, a thread will never hold a lock while attempting to + * acquire a lock on a previous leaf. As such, we can always safely wait for + * other threads to complete their work. + * ***************************************************************** + */ + /* Threshold for splitting instead of defragmenting. 
*/ #define BTREE_SPLIT_THRESHOLD(page_size) ((page_size) / 2) @@ -840,16 +877,18 @@ btree_splitting_pivot(const btree_config *cfg, // IN } static inline void -btree_split_leaf_build_right_node(const btree_config *cfg, // IN - const btree_hdr *left_hdr, // IN - leaf_incorporate_spec *spec, // IN - leaf_splitting_plan plan, // IN +btree_split_leaf_build_right_node(const btree_config *cfg, // IN + const btree_hdr *left_hdr, // IN + uint64 left_addr, // IN + leaf_incorporate_spec *spec, // IN + leaf_splitting_plan plan, // IN btree_hdr *right_hdr, uint64 *generation) // IN/OUT { /* Build the right node. */ memmove(right_hdr, left_hdr, sizeof(*right_hdr)); right_hdr->generation++; + right_hdr->prev_addr = left_addr; btree_reset_node_entries(cfg, right_hdr); uint64 num_left_entries = btree_num_entries(left_hdr); uint64 dst_idx = 0; @@ -1284,6 +1323,8 @@ btree_unblock_dec_ref(cache *cc, btree_config *cfg, uint64 root_addr) * Upon completion: * - all nodes unlocked * - the insertion is complete + * + * This function violates our locking rules. See comment at top of file. 
*/ static inline int btree_split_child_leaf(cache *cc, @@ -1298,13 +1339,20 @@ btree_split_child_leaf(cache *cc, { btree_node right_child; - /* p: claim, c: claim, rc: - */ + /* + * We indicate locking using these labels + * c = child, the leaf we're splitting + * p = parent, the parent of child + * rc = right child, the new leaf we're adding + * cn = child next, the child's original next + * + * starting locks: + * p: claim, c: claim, rc: -, cn: - + */ leaf_splitting_plan plan = btree_build_leaf_splitting_plan(cfg, child->hdr, spec); - /* p: claim, c: claim, rc: - */ - btree_alloc(cc, mini, btree_height(child->hdr), @@ -1312,10 +1360,30 @@ btree_split_child_leaf(cache *cc, NULL, PAGE_TYPE_MEMTABLE, &right_child); - - /* p: claim, c: claim, rc: write */ + /* p: claim, c: claim, rc: write, cn: - */ btree_node_lock(cc, cfg, parent); + /* p: write, c: claim, rc: write, cn: - */ + + btree_node child_next; + child_next.addr = child->hdr->next_addr; + if (child_next.addr != 0) { + btree_node_get(cc, cfg, &child_next, PAGE_TYPE_MEMTABLE); + uint64 child_next_wait = 1; + while (!btree_node_claim(cc, cfg, &child_next)) { + btree_node_unget(cc, cfg, &child_next); + platform_sleep_ns(child_next_wait); + child_next_wait = + child_next_wait > 2048 ? child_next_wait : 2 * child_next_wait; + btree_node_get(cc, cfg, &child_next, PAGE_TYPE_MEMTABLE); + } + btree_node_lock(cc, cfg, &child_next); + } + /* p: write, c: claim, rc: write, cn: write if exists */ + + btree_node_lock(cc, cfg, child); + /* p: write, c: write, rc: write, cn: write if exists */ + { /* limit the scope of pivot_key, since subsequent mutations of the nodes * may invalidate the memory it points to. 
@@ -1330,19 +1398,20 @@ btree_split_child_leaf(cache *cc, platform_assert(success); } btree_node_full_unlock(cc, cfg, parent); + /* p: unlocked, c: write, rc: write, cn: write if exists */ - /* p: fully unlocked, c: claim, rc: write */ + // set prev pointer from child's original next to right_child + if (child_next.addr != 0) { + child_next.hdr->prev_addr = right_child.addr; + btree_node_full_unlock(cc, cfg, &child_next); + } + /* p: unlocked, c: write, rc: write, cn: unlocked */ btree_split_leaf_build_right_node( - cfg, child->hdr, spec, plan, right_child.hdr, generation); - - /* p: fully unlocked, c: claim, rc: write */ - + cfg, child->hdr, child->addr, spec, plan, right_child.hdr, generation); btree_node_full_unlock(cc, cfg, &right_child); + /* p: unlocked, c: write, rc: unlocked, cn: unlocked */ - /* p: fully unlocked, c: claim, rc: fully unlocked */ - - btree_node_lock(cc, cfg, child); btree_split_leaf_cleanup_left_node( cfg, scratch, child->hdr, spec, plan, right_child.addr); if (plan.insertion_goes_left) { @@ -1351,8 +1420,7 @@ btree_split_child_leaf(cache *cc, platform_assert(incorporated); } btree_node_full_unlock(cc, cfg, child); - - /* p: fully unlocked, c: fully unlocked, rc: fully unlocked */ + /* p: unlocked, c: unlocked, rc: unlocked, cn: unlocked */ return 0; } @@ -2357,10 +2425,12 @@ btree_lookup_and_merge_async(cache *cc, // IN /* *----------------------------------------------------------------------------- - * btree_iterator_init -- - * btree_iterator_get_curr -- - * btree_iterator_advance -- - * btree_iterator_at_end + * btree_iterator_init -- + * btree_iterator_prev -- + * btree_iterator_curr -- + * btree_iterator_next -- + * btree_iterator_can_prev -- + * btree_iterator_can_next -- * * This iterator implementation supports an upper bound key ub. 
Given * an upper bound, the iterator will return only keys strictly less @@ -2391,13 +2461,24 @@ btree_lookup_and_merge_async(cache *cc, // IN *----------------------------------------------------------------------------- */ static bool32 -btree_iterator_is_at_end(btree_iterator *itor) +btree_iterator_can_prev(iterator *base_itor) { - return itor->curr.addr == itor->end_addr && itor->idx == itor->end_idx; + btree_iterator *itor = (btree_iterator *)base_itor; + return itor->idx >= itor->curr_min_idx + && (itor->curr_min_idx != itor->end_idx + || itor->curr.addr != itor->end_addr); +} + +static bool32 +btree_iterator_can_next(iterator *base_itor) +{ + btree_iterator *itor = (btree_iterator *)base_itor; + return itor->curr.addr != itor->end_addr + || (itor->idx < itor->end_idx && itor->curr_min_idx != itor->end_idx); } void -btree_iterator_get_curr(iterator *base_itor, key *curr_key, message *data) +btree_iterator_curr(iterator *base_itor, key *curr_key, message *data) { debug_assert(base_itor != NULL); btree_iterator *itor = (btree_iterator *)base_itor; @@ -2407,7 +2488,7 @@ btree_iterator_get_curr(iterator *base_itor, key *curr_key, message *data) btree_print_tree(itor->cc, itor->cfg, itor->root_addr); } */ - debug_assert(!btree_iterator_is_at_end(itor)); + debug_assert(iterator_can_curr(base_itor)); debug_assert(itor->idx < btree_num_entries(itor->curr.hdr)); debug_assert(itor->curr.page != NULL); debug_assert(itor->curr.page->disk_addr == itor->curr.addr); @@ -2427,6 +2508,51 @@ btree_iterator_get_curr(iterator *base_itor, key *curr_key, message *data) } } +// helper function to find a key within a btree node +// at a height specified by the iterator +static inline int64 +find_key_in_node(btree_iterator *itor, + btree_hdr *hdr, + key target, + comparison position_rule, + bool32 *found) +{ + bool32 loc_found; + if (found == NULL) { + found = &loc_found; + } + + int64 tmp; + if (itor->height == 0) { + tmp = btree_find_tuple(itor->cfg, hdr, target, found); + } else 
if (itor->height > hdr->height) { + // so we will always exceed height in future lookups + itor->height = (uint32)-1; + return 0; // this iterator is invalid, so return 0 for all lookups + } else { + tmp = btree_find_pivot(itor->cfg, hdr, itor->min_key, found); + } + + switch (position_rule) { + case less_than: + if (*found) { + --tmp; + } + // fallthrough + case less_than_or_equal: + break; + case greater_than_or_equal: + if (!*found) { + ++tmp; + } + break; + case greater_than: + ++tmp; + break; + } + return tmp; +} + static void btree_iterator_find_end(btree_iterator *itor) { @@ -2446,24 +2572,8 @@ btree_iterator_find_end(btree_iterator *itor) if (key_is_positive_infinity(itor->max_key)) { itor->end_idx = btree_num_entries(end.hdr); } else { - bool32 found; - int64 tmp; - if (itor->height == 0) { - tmp = btree_find_tuple(itor->cfg, end.hdr, itor->max_key, &found); - if (!found) { - tmp++; - } - } else if (itor->height > end.hdr->height) { - tmp = 0; - itor->height = - (uint32)-1; // So we will always exceed height in future lookups - } else { - tmp = btree_find_pivot(itor->cfg, end.hdr, itor->max_key, &found); - if (!found) { - tmp++; - } - } - itor->end_idx = tmp; + itor->end_idx = find_key_in_node( + itor, end.hdr, itor->max_key, greater_than_or_equal, NULL); } btree_node_unget(itor->cc, itor->cfg, &end); @@ -2476,7 +2586,7 @@ btree_iterator_find_end(btree_iterator *itor) * ---------------------------------------------------------------------------- */ static void -btree_iterator_advance_leaf(btree_iterator *itor) +btree_iterator_next_leaf(btree_iterator *itor) { cache *cc = itor->cc; btree_config *cfg = itor->cfg; @@ -2486,7 +2596,8 @@ btree_iterator_advance_leaf(btree_iterator *itor) btree_node_unget(cc, cfg, &itor->curr); itor->curr.addr = next_addr; btree_node_get(cc, cfg, &itor->curr, itor->page_type); - itor->idx = 0; + itor->idx = 0; + itor->curr_min_idx = -1; while (itor->curr.addr == itor->end_addr && itor->curr.hdr->generation != 
itor->end_generation) @@ -2533,36 +2644,237 @@ btree_iterator_advance_leaf(btree_iterator *itor) } } +/* + * ---------------------------------------------------------------------------- + * Move to the previous leaf when we've reached the beginning of one leaf. + * ---------------------------------------------------------------------------- + */ +static void +btree_iterator_prev_leaf(btree_iterator *itor) +{ + cache *cc = itor->cc; + btree_config *cfg = itor->cfg; + + debug_only uint64 curr_addr = itor->curr.addr; + uint64 prev_addr = itor->curr.hdr->prev_addr; + btree_node_unget(cc, cfg, &itor->curr); + itor->curr.addr = prev_addr; + btree_node_get(cc, cfg, &itor->curr, itor->page_type); + + /* + * The previous leaf may have split in between our release of the + * old curr node and the new one. In this case, we can just walk + * forward until we find the leaf whose successor is our old leaf. + */ + while (itor->curr.hdr->next_addr != curr_addr) { + uint64 next_addr = itor->curr.hdr->next_addr; + btree_node_unget(cc, cfg, &itor->curr); + itor->curr.addr = next_addr; + btree_node_get(cc, cfg, &itor->curr, itor->page_type); + } + + itor->idx = btree_num_entries(itor->curr.hdr) - 1; + + /* Do a quick check whether this entire leaf is within the range. */ + key first_key = itor->height ? btree_get_pivot(cfg, itor->curr.hdr, 0) + : btree_get_tuple_key(cfg, itor->curr.hdr, 0); + if (btree_key_compare(cfg, itor->min_key, first_key) < 0) { + itor->curr_min_idx = -1; + } else { + bool32 found; + itor->curr_min_idx = + itor->height + ? btree_find_pivot(cfg, itor->curr.hdr, itor->min_key, &found) + : btree_find_tuple(cfg, itor->curr.hdr, itor->min_key, &found); + if (!found) { + itor->curr_min_idx++; + } + } + if (itor->curr.hdr->prev_addr == 0 && itor->curr_min_idx == -1) { + itor->curr_min_idx = 0; + } + + // FIXME: To prefetch: + // 1. we just moved from one extent to the next + // 2. 
this can't be the last extent + /* if (itor->do_prefetch */ + /* && !btree_addrs_share_extent(cc, last_addr, itor->curr.addr) */ + /* && itor->curr.hdr->next_extent_addr != 0 */ + /* && !btree_addrs_share_extent(cc, itor->curr.addr, itor->end_addr)) */ + /* { */ + /* // IO prefetch the next extent */ + /* cache_prefetch(cc, itor->curr.hdr->next_extent_addr, itor->page_type); + */ + /* } */ +} + platform_status -btree_iterator_advance(iterator *base_itor) +btree_iterator_next(iterator *base_itor) { debug_assert(base_itor != NULL); btree_iterator *itor = (btree_iterator *)base_itor; // We should not be calling advance on an empty iterator - debug_assert(!btree_iterator_is_at_end(itor)); + debug_assert(btree_iterator_can_next(base_itor)); debug_assert(itor->idx < btree_num_entries(itor->curr.hdr)); itor->idx++; - if (!btree_iterator_is_at_end(itor) - && itor->idx == btree_num_entries(itor->curr.hdr)) + if (itor->idx == btree_num_entries(itor->curr.hdr) + && btree_iterator_can_next(base_itor)) { - btree_iterator_advance_leaf(itor); + btree_iterator_next_leaf(itor); } - debug_assert(btree_iterator_is_at_end(itor) - || itor->idx < btree_num_entries(itor->curr.hdr)); + debug_assert( + !btree_iterator_can_next(base_itor) + || (0 <= itor->idx && itor->idx < btree_num_entries(itor->curr.hdr))); return STATUS_OK; } +platform_status +btree_iterator_prev(iterator *base_itor) +{ + debug_assert(base_itor != NULL); + btree_iterator *itor = (btree_iterator *)base_itor; + + // We should not be calling prev on an empty iterator + debug_assert(btree_iterator_can_prev(base_itor)); + debug_assert(itor->idx >= 0); + + itor->idx--; + if (itor->curr_min_idx == -1 && itor->idx == -1) { + btree_iterator_prev_leaf(itor); + } + + debug_assert( + !btree_iterator_can_prev(base_itor) + || (0 <= itor->idx && itor->idx < btree_num_entries(itor->curr.hdr))); + return STATUS_OK; +} + +// This function voilates our locking rules. See comment at top of file. 
+static inline void +find_btree_node_and_get_idx_bounds(btree_iterator *itor, + key target, + comparison position_rule) +{ + // lookup the node that contains target + btree_lookup_node(itor->cc, + itor->cfg, + itor->root_addr, + target, + itor->height, + itor->page_type, + &itor->curr, + NULL); + + /* + * We have to claim curr in order to prevent possible deadlocks + * with insertion threads while finding the end node. + * + * Note that we can't lookup end first because, if there's a split + * between looking up end and looking up curr, we could end up in a + * situation where end comes before curr in the tree! (We could + * prevent this by holding a claim on end while looking up curr, + * but that would essentially be the same as the code below.) + * + * Note that the approach in advance (i.e. releasing and reaquiring + * a lock on curr) is not viable here because we are not + * necessarily searching for the 0th entry in curr. Thus a split + * of curr while we have released it could mean that we really want + * to start at curr's right sibling (after the split). So we'd + * have to redo the search from scratch after releasing curr. + * + * So we take a claim on curr instead. + */ + while (!btree_node_claim(itor->cc, itor->cfg, &itor->curr)) { + btree_node_unget(itor->cc, itor->cfg, &itor->curr); + btree_lookup_node(itor->cc, + itor->cfg, + itor->root_addr, + target, + itor->height, + itor->page_type, + &itor->curr, + NULL); + } + + btree_iterator_find_end(itor); + + /* Once we've found end, we can unclaim curr. */ + btree_node_unclaim(itor->cc, itor->cfg, &itor->curr); + + // find the index of the minimum key + bool32 found; + int64 tmp = find_key_in_node( + itor, itor->curr.hdr, itor->min_key, greater_than_or_equal, &found); + // If min key doesn't exist in current node, but is: + // 1) in range: Min idx = smallest key > min_key + // 2) out of range: Min idx = -1 + itor->curr_min_idx = !found && tmp == 0 ? 
--tmp : tmp; + // if min_key is not within the current node but there is no previous node + // then set curr_min_idx to 0 + if (itor->curr_min_idx == -1 && itor->curr.hdr->prev_addr == 0) { + itor->curr_min_idx = 0; + } + + // find the index of the actual target + itor->idx = + find_key_in_node(itor, itor->curr.hdr, target, position_rule, &found); + + // check if we already need to move to the prev/next leaf + if (itor->curr.addr != itor->end_addr + && itor->idx == btree_num_entries(itor->curr.hdr)) + { + btree_iterator_next_leaf(itor); + itor->curr_min_idx = 0; // we came from an irrelevant leaf + } + if (itor->curr_min_idx == -1 && itor->idx == -1) { + btree_iterator_prev_leaf(itor); + } +} + +/* + * Seek to a given key within the btree + * seek_type defines where the iterator is positioned relative to the target + * key. + */ platform_status -btree_iterator_at_end(iterator *itor, bool32 *at_end) +btree_iterator_seek(iterator *base_itor, key seek_key, comparison seek_type) { - debug_assert(itor != NULL); - *at_end = btree_iterator_is_at_end((btree_iterator *)itor); + debug_assert(base_itor != NULL); + btree_iterator *itor = (btree_iterator *)base_itor; + + if (btree_key_compare(itor->cfg, seek_key, itor->min_key) < 0 + || btree_key_compare(itor->cfg, seek_key, itor->max_key) > 0) + { + return STATUS_BAD_PARAM; + } + + // check if seek_key is within our current node + key first_key = itor->height + ? btree_get_pivot(itor->cfg, itor->curr.hdr, 0) + : btree_get_tuple_key(itor->cfg, itor->curr.hdr, 0); + key last_key = + itor->height + ? btree_get_pivot(itor->cfg, itor->curr.hdr, itor->end_idx - 1) + : btree_get_tuple_key(itor->cfg, itor->curr.hdr, itor->end_idx - 1); + + if (btree_key_compare(itor->cfg, seek_key, first_key) >= 0 + && btree_key_compare(itor->cfg, seek_key, last_key) <= 0) + { + // seek_key is within our current leaf. 
So just directly search for it + bool32 found; + itor->idx = + find_key_in_node(itor, itor->curr.hdr, seek_key, seek_type, &found); + platform_assert(0 <= itor->idx); + } else { + // seek key is not within our current leaf. So find the correct leaf + find_btree_node_and_get_idx_bounds(itor, seek_key, seek_type); + } return STATUS_OK; } @@ -2590,9 +2902,12 @@ btree_iterator_print(iterator *itor) } const static iterator_ops btree_iterator_ops = { - .get_curr = btree_iterator_get_curr, - .at_end = btree_iterator_at_end, - .advance = btree_iterator_advance, + .curr = btree_iterator_curr, + .can_prev = btree_iterator_can_prev, + .can_next = btree_iterator_can_next, + .next = btree_iterator_next, + .prev = btree_iterator_prev, + .seek = btree_iterator_seek, .print = btree_iterator_print, }; @@ -2600,7 +2915,7 @@ const static iterator_ops btree_iterator_ops = { /* *----------------------------------------------------------------------------- * Caller must guarantee: - * max_key needs to be valid until at_end() returns true + * min_key and max_key need to be valid until iterator deinitialized *----------------------------------------------------------------------------- */ void @@ -2611,6 +2926,8 @@ btree_iterator_init(cache *cc, page_type page_type, key min_key, key max_key, + key start_key, + comparison start_type, bool32 do_prefetch, uint32 height) { @@ -2618,11 +2935,18 @@ btree_iterator_init(cache *cc, debug_assert(page_type == PAGE_TYPE_MEMTABLE || page_type == PAGE_TYPE_BRANCH); - debug_assert(!key_is_null(min_key) && !key_is_null(max_key)); + debug_assert(!key_is_null(min_key) && !key_is_null(max_key) + && !key_is_null(start_key)); if (btree_key_compare(cfg, min_key, max_key) > 0) { max_key = min_key; } + if (btree_key_compare(cfg, start_key, min_key) < 0) { + start_key = min_key; + } + if (btree_key_compare(cfg, start_key, max_key) > 0) { + start_key = max_key; + } ZERO_CONTENTS(itor); itor->cc = cc; @@ -2635,73 +2959,7 @@ btree_iterator_init(cache *cc, 
itor->page_type = page_type; itor->super.ops = &btree_iterator_ops; - btree_lookup_node(itor->cc, - itor->cfg, - itor->root_addr, - min_key, - itor->height, - itor->page_type, - &itor->curr, - NULL); - /* - * We have to claim curr in order to prevent possible deadlocks - * with insertion threads while finding the end node. - * - * Note that we can't lookup end first because, if there's a split - * between looking up end and looking up curr, we could end up in a - * situation where end comes before curr in the tree! (We could - * prevent this by holding a claim on end while looking up curr, - * but that would essentially be the same as the code below.) - * - * Note that the approach in advance (i.e. releasing and reaquiring - * a lock on curr) is not viable here because we are not - * necessarily searching for the 0th entry in curr. Thus a split - * of curr while we have released it could mean that we really want - * to start at curr's right sibling (after the split). So we'd - * have to redo the search from scratch after releasing curr. - * - * So we take a claim on curr instead. - */ - while (!btree_node_claim(cc, cfg, &itor->curr)) { - btree_node_unget(cc, cfg, &itor->curr); - btree_lookup_node(itor->cc, - itor->cfg, - itor->root_addr, - min_key, - itor->height, - itor->page_type, - &itor->curr, - NULL); - } - - btree_iterator_find_end(itor); - - /* Once we've found end, we can unclaim curr. 
*/ - btree_node_unclaim(cc, cfg, &itor->curr); - - bool32 found; - int64 tmp; - if (itor->height == 0) { - tmp = btree_find_tuple(itor->cfg, itor->curr.hdr, min_key, &found); - if (!found) { - tmp++; - } - } else if (itor->height > itor->curr.hdr->height) { - tmp = 0; - } else { - tmp = btree_find_pivot(itor->cfg, itor->curr.hdr, min_key, &found); - if (!found) { - tmp++; - } - platform_assert(0 <= tmp); - } - itor->idx = tmp; - - if (!btree_iterator_is_at_end(itor) - && itor->idx == btree_num_entries(itor->curr.hdr)) - { - btree_iterator_advance_leaf(itor); - } + find_btree_node_and_get_idx_bounds(itor, start_key, start_type); if (itor->do_prefetch && itor->curr.hdr->next_extent_addr != 0 && !btree_addrs_share_extent(cc, itor->curr.addr, itor->end_addr)) @@ -2710,7 +2968,7 @@ btree_iterator_init(cache *cc, cache_prefetch(cc, itor->curr.hdr->next_extent_addr, itor->page_type); } - debug_assert(btree_iterator_is_at_end(itor) + debug_assert(!iterator_can_curr((iterator *)itor) || itor->idx < btree_num_entries(itor->curr.hdr)); } @@ -2846,6 +3104,7 @@ btree_pack_create_next_node(btree_pack_req *req, uint64 height, key pivot) if (0 < req->num_edges[height]) { btree_node *old_node = btree_pack_get_current_node(req, height); old_node->hdr->next_addr = new_node.addr; + new_node.hdr->prev_addr = old_node->addr; if (!btree_addrs_share_extent(req->cc, old_node->addr, new_node.addr)) { btree_pack_link_extent(req, height, new_node.addr); } @@ -2981,10 +3240,9 @@ btree_pack(btree_pack_req *req) key tuple_key = NEGATIVE_INFINITY_KEY; message data; - bool32 at_end; - while (SUCCESS(iterator_at_end(req->itor, &at_end)) && !at_end) { - iterator_get_curr(req->itor, &tuple_key, &data); + while (iterator_can_next(req->itor)) { + iterator_curr(req->itor, &tuple_key, &data); if (!btree_pack_can_fit_tuple(req, tuple_key, data)) { platform_error_log("%s(): req->num_tuples=%lu exceeded output size " "limit, req->max_tuples=%lu\n", @@ -3000,7 +3258,12 @@ btree_pack(btree_pack_req *req) 
btree_pack_abort(req); return rc; } - iterator_advance(req->itor); + rc = iterator_next(req->itor); + if (!SUCCESS(rc)) { + platform_error_log("%s error status: %d\n", __func__, rc.r); + btree_pack_abort(req); + return rc; + } } btree_pack_post_loop(req, tuple_key); @@ -3081,22 +3344,22 @@ btree_count_in_range_by_iterator(cache *cc, PAGE_TYPE_BRANCH, min_key, max_key, + min_key, + TRUE, TRUE, 0); memset(stats, 0, sizeof(*stats)); - bool32 at_end; - iterator_at_end(itor, &at_end); - while (!at_end) { + while (iterator_can_next(itor)) { key curr_key; message msg; - iterator_get_curr(itor, &curr_key, &msg); + iterator_curr(itor, &curr_key, &msg); stats->num_kvs++; stats->key_bytes += key_length(curr_key); stats->message_bytes += message_length(msg); - iterator_advance(itor); - iterator_at_end(itor, &at_end); + platform_status rc = iterator_next(itor); + platform_assert_status_ok(rc); } btree_iterator_deinit(&btree_itor); } diff --git a/src/btree.h b/src/btree.h index beb7318bb..30f533b34 100644 --- a/src/btree.h +++ b/src/btree.h @@ -139,7 +139,8 @@ typedef struct btree_iterator { uint64 root_addr; btree_node curr; - uint64 idx; + int64 idx; + int64 curr_min_idx; uint64 end_addr; uint64 end_idx; uint64 end_generation; @@ -312,11 +313,13 @@ btree_lookup_and_merge_async(cache *cc, // IN void btree_iterator_init(cache *cc, btree_config *cfg, - btree_iterator *iterator, + btree_iterator *itor, uint64 root_addr, page_type page_type, key min_key, key max_key, + key start_key, + comparison start_type, bool32 do_prefetch, uint32 height); diff --git a/src/btree_private.h b/src/btree_private.h index bd2ed1c6c..f6b27bd91 100644 --- a/src/btree_private.h +++ b/src/btree_private.h @@ -33,6 +33,7 @@ typedef node_offset table_entry; * ************************************************************************* */ struct ONDISK btree_hdr { + uint64 prev_addr; uint64 next_addr; uint64 next_extent_addr; uint64 generation; diff --git a/src/iterator.h b/src/iterator.h index 
7c2e714b0..7a2c69f65 100644 --- a/src/iterator.h +++ b/src/iterator.h @@ -8,19 +8,31 @@ typedef struct iterator iterator; -typedef void (*iterator_get_curr_fn)(iterator *itor, - key *curr_key, - message *msg); -typedef platform_status (*iterator_at_end_fn)(iterator *itor, bool32 *at_end); -typedef platform_status (*iterator_advance_fn)(iterator *itor); +// for seek +typedef enum comparison { + less_than, + less_than_or_equal, + greater_than, + greater_than_or_equal, +} comparison; + +typedef void (*iterator_curr_fn)(iterator *itor, key *curr_key, message *msg); +typedef bool32 (*iterator_bound_fn)(iterator *itor); +typedef platform_status (*iterator_step_fn)(iterator *itor); +typedef platform_status (*iterator_seek_fn)(iterator *itor, + key seek_key, + comparison from_above); typedef void (*iterator_print_fn)(iterator *itor); typedef struct iterator_ops { /* Callers should not modify data pointed to by *key or *data */ - iterator_get_curr_fn get_curr; - iterator_at_end_fn at_end; - iterator_advance_fn advance; - iterator_print_fn print; + iterator_curr_fn curr; + iterator_bound_fn can_prev; + iterator_bound_fn can_next; + iterator_step_fn next; + iterator_step_fn prev; + iterator_seek_fn seek; + iterator_print_fn print; } iterator_ops; // To sub-class iterator, make an iterator your first field @@ -28,22 +40,48 @@ struct iterator { const iterator_ops *ops; }; +// It is safe to call curr whenever iterator_in_range() returns true +// otherwise the behavior of iterator_curr is undefined static inline void -iterator_get_curr(iterator *itor, key *curr_key, message *msg) +iterator_curr(iterator *itor, key *curr_key, message *msg) +{ + itor->ops->curr(itor, curr_key, msg); +} + +static inline bool32 +iterator_can_prev(iterator *itor) +{ + return itor->ops->can_prev(itor); +} + +static inline bool32 +iterator_can_next(iterator *itor) +{ + return itor->ops->can_next(itor); +} + +static inline bool32 +iterator_can_curr(iterator *itor) +{ + return itor->ops->can_next(itor) && 
itor->ops->can_prev(itor); +} + +static inline platform_status +iterator_next(iterator *itor) { - itor->ops->get_curr(itor, curr_key, msg); + return itor->ops->next(itor); } static inline platform_status -iterator_at_end(iterator *itor, bool32 *at_end) +iterator_prev(iterator *itor) { - return itor->ops->at_end(itor, at_end); + return itor->ops->prev(itor); } static inline platform_status -iterator_advance(iterator *itor) +iterator_seek(iterator *itor, key seek_key, comparison seek_type) { - return itor->ops->advance(itor); + return itor->ops->seek(itor, seek_key, seek_type); } static inline void diff --git a/src/merge.c b/src/merge.c index 3cd2f4945..753d87231 100644 --- a/src/merge.c +++ b/src/merge.c @@ -18,18 +18,26 @@ struct merge_behavior { /* Function declarations and iterator_ops */ void -merge_get_curr(iterator *itor, key *curr_key, message *data); +merge_curr(iterator *itor, key *curr_key, message *data); + +bool32 +merge_can_prev(iterator *itor); + +bool32 +merge_can_next(iterator *itor); platform_status -merge_at_end(iterator *itor, bool32 *at_end); +merge_next(iterator *itor); platform_status -merge_advance(iterator *itor); +merge_prev(iterator *itor); static iterator_ops merge_ops = { - .get_curr = merge_get_curr, - .at_end = merge_at_end, - .advance = merge_advance, + .curr = merge_curr, + .can_prev = merge_can_prev, + .can_next = merge_can_next, + .next = merge_next, + .prev = merge_prev, }; /* @@ -43,10 +51,12 @@ static iterator_ops merge_ops = { static inline int bsearch_comp(const ordered_iterator *itor_one, const ordered_iterator *itor_two, + const bool32 forwards, const data_config *cfg, bool32 *keys_equal) { int cmp = data_key_compare(cfg, itor_one->curr_key, itor_two->curr_key); + cmp = forwards ? 
cmp : -cmp; *keys_equal = (cmp == 0); if (cmp == 0) { cmp = itor_two->seq - itor_one->seq; @@ -57,15 +67,22 @@ bsearch_comp(const ordered_iterator *itor_one, return cmp; } +struct merge_ctxt { + bool32 forwards; + data_config *cfg; +}; + /* Comparison function for sort of the min ritor array */ static int merge_comp(const void *one, const void *two, void *ctxt) { + struct merge_ctxt *m_ctxt = (struct merge_ctxt *)ctxt; const ordered_iterator *itor_one = *(ordered_iterator **)one; const ordered_iterator *itor_two = *(ordered_iterator **)two; - data_config *cfg = (data_config *)ctxt; + bool32 forwards = m_ctxt->forwards; + data_config *cfg = m_ctxt->cfg; bool32 ignore_keys_equal; - return bsearch_comp(itor_one, itor_two, cfg, &ignore_keys_equal); + return bsearch_comp(itor_one, itor_two, forwards, cfg, &ignore_keys_equal); } // Returns index (from base0) where key belongs @@ -74,6 +91,7 @@ bsearch_insert(register const ordered_iterator *key, ordered_iterator **base0, const size_t nmemb, const data_config *cfg, + bool32 forwards, bool32 *prev_equal_out, bool32 *next_equal_out) { @@ -87,7 +105,7 @@ bsearch_insert(register const ordered_iterator *key, for (lim = nmemb; lim != 0; lim >>= 1) { p = base + (lim >> 1); bool32 keys_equal; - cmp = bsearch_comp(key, *p, cfg, &keys_equal); + cmp = bsearch_comp(key, *p, forwards, cfg, &keys_equal); debug_assert(cmp != 0); if (cmp > 0) { /* key > p: move right */ @@ -107,7 +125,7 @@ bsearch_insert(register const ordered_iterator *key, static inline void set_curr_ordered_iterator(const data_config *cfg, ordered_iterator *itor) { - iterator_get_curr(itor->itor, &itor->curr_key, &itor->curr_data); + iterator_curr(itor->itor, &itor->curr_key, &itor->curr_data); debug_assert(key_is_user_key(itor->curr_key)); } @@ -138,7 +156,10 @@ debug_verify_sorted(debug_only merge_iterator *merge_itor, if (merge_itor->ordered_iterators[index]->next_key_equal) { debug_assert(cmp == 0); } else { - debug_assert(cmp < 0); + if (merge_itor->forwards) + 
debug_assert(cmp < 0); + else + debug_assert(cmp > 0); } #endif } @@ -154,19 +175,18 @@ advance_and_resort_min_ritor(merge_iterator *merge_itor) merge_itor->ordered_iterators[0]->next_key_equal = FALSE; merge_itor->ordered_iterators[0]->curr_key = NULL_KEY; merge_itor->ordered_iterators[0]->curr_data = NULL_MESSAGE; - rc = iterator_advance(merge_itor->ordered_iterators[0]->itor); - if (!SUCCESS(rc)) { - return rc; + if (merge_itor->forwards) { + rc = iterator_next(merge_itor->ordered_iterators[0]->itor); + } else { + rc = iterator_prev(merge_itor->ordered_iterators[0]->itor); } - bool32 at_end; - // if it's exhausted, kill it and move the ritors up the queue. - rc = iterator_at_end(merge_itor->ordered_iterators[0]->itor, &at_end); if (!SUCCESS(rc)) { return rc; } - if (UNLIKELY(at_end)) { + // if it's exhausted, kill it and move the ritors up the queue. + if (UNLIKELY(!iterator_can_curr(merge_itor->ordered_iterators[0]->itor))) { merge_itor->num_remaining--; ordered_iterator *tmp = merge_itor->ordered_iterators[0]; for (int i = 0; i < merge_itor->num_remaining; ++i) { @@ -192,6 +212,7 @@ advance_and_resort_min_ritor(merge_iterator *merge_itor) merge_itor->ordered_iterators + 1, merge_itor->num_remaining - 1, merge_itor->cfg, + merge_itor->forwards, &prev_equal, &next_equal); debug_assert(index >= 0); @@ -339,9 +360,13 @@ static platform_status advance_one_loop(merge_iterator *merge_itor, bool32 *retry) { *retry = FALSE; - // Determine whether we're at the end. + // Determine whether we're no longer in range. if (merge_itor->num_remaining == 0) { - merge_itor->at_end = TRUE; + if (merge_itor->forwards) { + merge_itor->can_next = FALSE; + } else { + merge_itor->can_prev = FALSE; + } return STATUS_OK; } @@ -379,6 +404,106 @@ advance_one_loop(merge_iterator *merge_itor, bool32 *retry) return STATUS_OK; } +/* + * Given a merge_iterator with initialized ordered_iterators restore the + * following invariants: + * 1. Each iterator holds the appropriate key and value + * 2. 
Dead iterators come after live iterators. + * 3. Live iterators are in sorted order. (depends on merge_itor->forwards) + * 4. Live iterators that have equal keys indicate so. + * 5. The first key in the sorted order has all its iterators merged. + */ +static platform_status +setup_ordered_iterators(merge_iterator *merge_itor) +{ + platform_status rc = STATUS_OK; + ordered_iterator *temp; + merge_itor->can_prev = FALSE; + merge_itor->can_next = FALSE; + + // Move all the dead iterators to the end and count how many are still alive. + merge_itor->num_remaining = merge_itor->num_trees; + int i = 0; + while (i < merge_itor->num_remaining) { + // determine if the merge itor can go prev/next based upon ordered_itors + if (iterator_can_prev(merge_itor->ordered_iterators[i]->itor)) { + merge_itor->can_prev = TRUE; + } + if (iterator_can_next(merge_itor->ordered_iterators[i]->itor)) { + merge_itor->can_next = TRUE; + } + + if (!iterator_can_curr(merge_itor->ordered_iterators[i]->itor)) { + ordered_iterator *tmp = + merge_itor->ordered_iterators[merge_itor->num_remaining - 1]; + merge_itor->ordered_iterators[merge_itor->num_remaining - 1] = + merge_itor->ordered_iterators[i]; + merge_itor->ordered_iterators[i] = tmp; + merge_itor->num_remaining--; + } else { + set_curr_ordered_iterator(merge_itor->cfg, + merge_itor->ordered_iterators[i]); + i++; + } + } + struct merge_ctxt merge_args; + merge_args.forwards = merge_itor->forwards; + merge_args.cfg = merge_itor->cfg; + platform_sort_slow(merge_itor->ordered_iterators, + merge_itor->num_remaining, + sizeof(*merge_itor->ordered_iterators), + merge_comp, + &merge_args, + &temp); + // Generate initial value for next_key_equal bits + for (int i = 0; i + 1 < merge_itor->num_remaining; ++i) { + int cmp = + data_key_compare(merge_itor->cfg, + merge_itor->ordered_iterators[i]->curr_key, + merge_itor->ordered_iterators[i + 1]->curr_key); + if (merge_itor->forwards) { + debug_assert(cmp <= 0); + } else { + debug_assert(cmp >= 0); + } + 
merge_itor->ordered_iterators[i]->next_key_equal = (cmp == 0); + } + + bool32 retry; + rc = advance_one_loop(merge_itor, &retry); + + if (retry && SUCCESS(rc)) { + if (merge_itor->forwards) { + rc = merge_next((iterator *)merge_itor); + } else { + rc = merge_prev((iterator *)merge_itor); + } + } + + if (!SUCCESS(rc)) { + // destroy the iterator + platform_status merge_iterator_rc = + merge_iterator_destroy(platform_get_heap_id(), &merge_itor); + if (!SUCCESS(merge_iterator_rc)) { + platform_error_log( + "setup_ordered_iterators: exception while releasing\n"); + if (SUCCESS(rc)) { + platform_error_log("setup_ordered_iterators: clobbering rc\n"); + rc = merge_iterator_rc; + } + } + platform_error_log("setup_ordered_iterators: exception: %s\n", + platform_status_to_string(rc)); + return rc; + } + + if (!key_is_null(merge_itor->curr_key)) { + debug_assert_message_type_valid(merge_itor); + } + + return rc; +} + /* *----------------------------------------------------------------------------- * merge_iterator_create -- @@ -400,10 +525,9 @@ merge_iterator_create(platform_heap_id hid, merge_behavior merge_mode, merge_iterator **out_itor) { - int i; - platform_status rc = STATUS_OK, merge_iterator_rc; - merge_iterator *merge_itor; - ordered_iterator *temp; + int i; + platform_status rc = STATUS_OK; + merge_iterator *merge_itor; if (!out_itor || !itor_arr || !cfg || num_trees < 0 || num_trees >= ARRAY_SIZE(merge_itor->ordered_iterator_stored)) @@ -436,9 +560,9 @@ merge_iterator_create(platform_heap_id hid, merge_itor->finalize_updates = merge_mode == MERGE_FULL; merge_itor->emit_deletes = merge_mode != MERGE_FULL; - merge_itor->at_end = FALSE; merge_itor->cfg = cfg; merge_itor->curr_key = NULL_KEY; + merge_itor->forwards = TRUE; // index -1 initializes the pad variable for (i = -1; i < num_trees; i++) { @@ -453,75 +577,12 @@ merge_iterator_create(platform_heap_id hid, &merge_itor->ordered_iterator_stored[i]; } - // Move all the dead iterators to the end and count how many 
are still alive. - merge_itor->num_remaining = num_trees; - i = 0; - while (i < merge_itor->num_remaining) { - bool32 at_end; - rc = iterator_at_end(merge_itor->ordered_iterators[i]->itor, &at_end); - if (!SUCCESS(rc)) { - goto destroy; - } - if (at_end) { - ordered_iterator *tmp = - merge_itor->ordered_iterators[merge_itor->num_remaining - 1]; - merge_itor->ordered_iterators[merge_itor->num_remaining - 1] = - merge_itor->ordered_iterators[i]; - merge_itor->ordered_iterators[i] = tmp; - merge_itor->num_remaining--; - } else { - set_curr_ordered_iterator(cfg, merge_itor->ordered_iterators[i]); - i++; - } - } - platform_sort_slow(merge_itor->ordered_iterators, - merge_itor->num_remaining, - sizeof(*merge_itor->ordered_iterators), - merge_comp, - merge_itor->cfg, - &temp); - // Generate initial value for next_key_equal bits - for (i = 0; i + 1 < merge_itor->num_remaining; ++i) { - int cmp = - data_key_compare(merge_itor->cfg, - merge_itor->ordered_iterators[i]->curr_key, - merge_itor->ordered_iterators[i + 1]->curr_key); - debug_assert(cmp <= 0); - merge_itor->ordered_iterators[i]->next_key_equal = (cmp == 0); - } - - bool32 retry; - rc = advance_one_loop(merge_itor, &retry); + rc = setup_ordered_iterators(merge_itor); if (!SUCCESS(rc)) { - goto out; - } - - if (retry) { - rc = merge_advance((iterator *)merge_itor); - } - - goto out; - -destroy: - merge_iterator_rc = merge_iterator_destroy(hid, &merge_itor); - if (!SUCCESS(merge_iterator_rc)) { - platform_error_log("merge_iterator_create: exception while releasing\n"); - if (SUCCESS(rc)) { - platform_error_log("merge_iterator_create: clobbering rc\n"); - rc = merge_iterator_rc; - } + return rc; } -out: - if (!SUCCESS(rc)) { - platform_error_log("merge_iterator_create: exception: %s\n", - platform_status_to_string(rc)); - } else { - *out_itor = merge_itor; - if (!merge_itor->at_end) { - debug_assert_message_type_valid(merge_itor); - } - } + *out_itor = merge_itor; return rc; } @@ -543,70 +604,99 @@ 
merge_iterator_destroy(platform_heap_id hid, merge_iterator **merge_itor) return STATUS_OK; } - /* *----------------------------------------------------------------------------- - * merge_at_end -- + * merge_iterator_set_direction -- * - * Checks if more values are left in the merge itor. + * Prepares the merge iterator for traveling either forwards or backwards + * by calling next or prev respectively on each iterator and then + * resorting. * * Results: - * Returns TRUE if the itor is at end, FALSE otherwise. - * - * Side effects: - * None. + * 0 if successful, error otherwise *----------------------------------------------------------------------------- */ -platform_status -merge_at_end(iterator *itor, // IN - bool32 *at_end) // OUT +static platform_status +merge_iterator_set_direction(merge_iterator *merge_itor, bool32 forwards) { - merge_iterator *merge_itor = (merge_iterator *)itor; - *at_end = merge_itor->at_end; - debug_assert(*at_end == key_is_null(merge_itor->curr_key)); + debug_assert(merge_itor->forwards != forwards); + platform_status rc; + merge_itor->forwards = forwards; - return STATUS_OK; + // Step every iterator, both alive and dead, in the appropriate direction. + for (int i = 0; i < merge_itor->num_trees; i++) { + if (forwards && iterator_can_next(merge_itor->ordered_iterators[i]->itor)) + { + iterator_next(merge_itor->ordered_iterators[i]->itor); + } + if (!forwards + && iterator_can_prev(merge_itor->ordered_iterators[i]->itor)) { + iterator_prev(merge_itor->ordered_iterators[i]->itor); + } + } + + // restore iterator invariants + rc = setup_ordered_iterators(merge_itor); + + return rc; } + /* *----------------------------------------------------------------------------- - * merge_get_curr -- + * merge_can_prev and merge_can_next -- * - * Returns current key and data from the merge itor. + * Checks if the iterator is able to move prev or next. + * The half open range [start_key, end_key) defines the iterator's bounds. 
* * Results: - * Current key and data. + * Returns TRUE if the iterator can move as requested, FALSE otherwise. * * Side effects: * None. *----------------------------------------------------------------------------- */ -void -merge_get_curr(iterator *itor, key *curr_key, message *data) +bool32 +merge_can_prev(iterator *itor) { merge_iterator *merge_itor = (merge_iterator *)itor; - debug_assert(!merge_itor->at_end); - *curr_key = merge_itor->curr_key; - *data = merge_itor->curr_data; + return merge_itor->can_prev; +} + +bool32 +merge_can_next(iterator *itor) +{ + merge_iterator *merge_itor = (merge_iterator *)itor; + return merge_itor->can_next; } /* *----------------------------------------------------------------------------- - * merge_advance -- + * merge_get_curr -- * - * Merges the next key from the array of input iterators. + * Returns current key and data from the merge itor. * * Results: - * 0 if successful, error otherwise + * Current key and data. + * + * Side effects: + * None. *----------------------------------------------------------------------------- */ -platform_status -merge_advance(iterator *itor) +void +merge_curr(iterator *itor, key *curr_key, message *data) { - platform_status rc = STATUS_OK; + debug_assert(iterator_can_curr(itor)); merge_iterator *merge_itor = (merge_iterator *)itor; + *curr_key = merge_itor->curr_key; + *data = merge_itor->curr_data; +} - bool32 retry; +static inline platform_status +merge_advance_helper(merge_iterator *merge_itor) +{ + platform_status rc = STATUS_OK; + bool32 retry; do { merge_itor->curr_key = NULL_KEY; merge_itor->curr_data = NULL_MESSAGE; @@ -622,7 +712,49 @@ merge_advance(iterator *itor) } } while (retry); - return STATUS_OK; + return rc; +} + +/* + *----------------------------------------------------------------------------- + * merge_next -- + * + * Merges the next key from the array of input iterators. 
+ * + * Results: + * 0 if successful, error otherwise + *----------------------------------------------------------------------------- + */ +platform_status +merge_next(iterator *itor) +{ + merge_iterator *merge_itor = (merge_iterator *)itor; + + if (!merge_itor->forwards) { + return merge_iterator_set_direction(merge_itor, TRUE); + } + return merge_advance_helper(merge_itor); +} + +/* + *----------------------------------------------------------------------------- + * merge_prev -- + * + * Merges the prev key from the array of input iterators. + * + * Results: + * 0 if successful, error otherwise + *----------------------------------------------------------------------------- + */ +platform_status +merge_prev(iterator *itor) +{ + merge_iterator *merge_itor = (merge_iterator *)itor; + + if (merge_itor->forwards) { + return merge_iterator_set_direction(merge_itor, FALSE); + } + return merge_advance_helper(merge_itor); } void @@ -632,7 +764,7 @@ merge_iterator_print(merge_iterator *merge_itor) key curr_key; message data; data_config *data_cfg = merge_itor->cfg; - iterator_get_curr(&merge_itor->super, &curr_key, &data); + iterator_curr(&merge_itor->super, &curr_key, &data); platform_default_log("****************************************\n"); platform_default_log("** merge iterator\n"); @@ -642,16 +774,14 @@ merge_iterator_print(merge_iterator *merge_itor) platform_default_log("** curr: %s\n", key_string(data_cfg, curr_key)); platform_default_log("----------------------------------------\n"); for (i = 0; i < merge_itor->num_trees; i++) { - bool32 at_end; - iterator_at_end(merge_itor->ordered_iterators[i]->itor, &at_end); platform_default_log("%u: ", merge_itor->ordered_iterators[i]->seq); - if (at_end) { + if (!iterator_can_curr(merge_itor->ordered_iterators[i]->itor)) { platform_default_log("# : "); } else { platform_default_log("_ : "); } if (i < merge_itor->num_remaining) { - iterator_get_curr( + iterator_curr( merge_itor->ordered_iterators[i]->itor, &curr_key, 
&data); platform_default_log("%s\n", key_string(data_cfg, curr_key)); } else { diff --git a/src/merge.h b/src/merge.h index d9ab6a58f..0556e0fa2 100644 --- a/src/merge.h +++ b/src/merge.h @@ -62,11 +62,13 @@ typedef struct merge_iterator { bool32 merge_messages; bool32 finalize_updates; bool32 emit_deletes; - bool32 at_end; + bool32 can_prev; + bool32 can_next; int num_remaining; // number of ritors not at end data_config *cfg; // point message tree data config key curr_key; // current key message curr_data; // current data + bool32 forwards; // Padding so ordered_iterators[-1] is valid ordered_iterator ordered_iterator_stored_pad; diff --git a/src/routing_filter.c b/src/routing_filter.c index 9c8838cec..04b5d15f7 100644 --- a/src/routing_filter.c +++ b/src/routing_filter.c @@ -1215,20 +1215,18 @@ routing_filter_verify(cache *cc, uint16 value, iterator *itor) { - bool32 at_end; - iterator_at_end(itor, &at_end); - while (!at_end) { + while (iterator_can_next(itor)) { key curr_key; message msg; - iterator_get_curr(itor, &curr_key, &msg); + iterator_curr(itor, &curr_key, &msg); debug_assert(key_is_user_key(curr_key)); uint64 found_values; platform_status rc = routing_filter_lookup(cc, cfg, filter, curr_key, &found_values); platform_assert_status_ok(rc); platform_assert(routing_filter_is_value_found(found_values, value)); - iterator_advance(itor); - iterator_at_end(itor, &at_end); + rc = iterator_next(itor); + platform_assert_status_ok(rc); } } diff --git a/src/shard_log.c b/src/shard_log.c index ca6718a74..7249bb6e2 100644 --- a/src/shard_log.c +++ b/src/shard_log.c @@ -38,16 +38,20 @@ static log_ops shard_log_ops = { }; void -shard_log_iterator_get_curr(iterator *itor, key *curr_key, message *msg); -platform_status -shard_log_iterator_at_end(iterator *itor, bool32 *at_end); +shard_log_iterator_curr(iterator *itor, key *curr_key, message *msg); +bool32 +shard_log_iterator_can_prev(iterator *itor); +bool32 +shard_log_iterator_can_next(iterator *itor); platform_status 
-shard_log_iterator_advance(iterator *itor); +shard_log_iterator_next(iterator *itor); + const static iterator_ops shard_log_iterator_ops = { - .get_curr = shard_log_iterator_get_curr, - .at_end = shard_log_iterator_at_end, - .advance = shard_log_iterator_advance, + .curr = shard_log_iterator_curr, + .can_prev = shard_log_iterator_can_prev, + .can_next = shard_log_iterator_can_next, + .next = shard_log_iterator_next, .print = NULL, }; @@ -428,24 +432,29 @@ shard_log_iterator_deinit(platform_heap_id hid, shard_log_iterator *itor) } void -shard_log_iterator_get_curr(iterator *itorh, key *curr_key, message *msg) +shard_log_iterator_curr(iterator *itorh, key *curr_key, message *msg) { shard_log_iterator *itor = (shard_log_iterator *)itorh; *curr_key = log_entry_key(itor->entries[itor->pos]); *msg = log_entry_message(itor->entries[itor->pos]); } -platform_status -shard_log_iterator_at_end(iterator *itorh, bool32 *at_end) +bool32 +shard_log_iterator_can_prev(iterator *itorh) { shard_log_iterator *itor = (shard_log_iterator *)itorh; - *at_end = itor->pos == itor->num_entries; + return itor->pos >= 0; +} - return STATUS_OK; +bool32 +shard_log_iterator_can_next(iterator *itorh) +{ + shard_log_iterator *itor = (shard_log_iterator *)itorh; + return itor->pos < itor->num_entries; } platform_status -shard_log_iterator_advance(iterator *itorh) +shard_log_iterator_next(iterator *itorh) { shard_log_iterator *itor = (shard_log_iterator *)itorh; itor->pos++; diff --git a/src/splinterdb.c b/src/splinterdb.c index b3654ad67..4c2656c2c 100644 --- a/src/splinterdb.c +++ b/src/splinterdb.c @@ -614,8 +614,13 @@ splinterdb_iterator_init(const splinterdb *kvs, // IN start_key = key_create_from_slice(user_start_key); } - platform_status rc = trunk_range_iterator_init( - kvs->spl, range_itor, start_key, POSITIVE_INFINITY_KEY, UINT64_MAX); + platform_status rc = trunk_range_iterator_init(kvs->spl, + range_itor, + NEGATIVE_INFINITY_KEY, + POSITIVE_INFINITY_KEY, + start_key, + 
greater_than_or_equal, + UINT64_MAX); if (!SUCCESS(rc)) { platform_free(kvs->spl->heap_id, *iter); return platform_status_to_int(rc); @@ -642,20 +647,42 @@ splinterdb_iterator_valid(splinterdb_iterator *kvi) if (!SUCCESS(kvi->last_rc)) { return FALSE; } - bool32 at_end; iterator *itor = &(kvi->sri.super); - kvi->last_rc = iterator_at_end(itor, &at_end); + return iterator_can_curr(itor); +} + +_Bool +splinterdb_iterator_can_prev(splinterdb_iterator *kvi) +{ + if (!SUCCESS(kvi->last_rc)) { + return FALSE; + } + iterator *itor = &(kvi->sri.super); + return iterator_can_prev(itor); +} + +_Bool +splinterdb_iterator_can_next(splinterdb_iterator *kvi) +{ if (!SUCCESS(kvi->last_rc)) { return FALSE; } - return !at_end; + iterator *itor = &(kvi->sri.super); + return iterator_can_next(itor); } void splinterdb_iterator_next(splinterdb_iterator *kvi) { iterator *itor = &(kvi->sri.super); - kvi->last_rc = iterator_advance(itor); + kvi->last_rc = iterator_next(itor); +} + +void +splinterdb_iterator_prev(splinterdb_iterator *kvi) +{ + iterator *itor = &(kvi->sri.super); + kvi->last_rc = iterator_prev(itor); } int @@ -674,7 +701,7 @@ splinterdb_iterator_get_current(splinterdb_iterator *iter, // IN message msg; iterator *itor = &(iter->sri.super); - iterator_get_curr(itor, &result_key, &msg); + iterator_curr(itor, &result_key, &msg); *value = message_slice(msg); *outkey = key_slice(result_key); } diff --git a/src/trunk.c b/src/trunk.c index d087e842d..2bc1447eb 100644 --- a/src/trunk.c +++ b/src/trunk.c @@ -737,15 +737,6 @@ typedef struct trunk_btree_skiperator { btree_iterator itor[TRUNK_MAX_PIVOTS]; } trunk_btree_skiperator; -// for find_pivot -typedef enum lookup_type { - invalid = 0, - less_than, - less_than_or_equal, - greater_than, - greater_than_or_equal, -} lookup_type; - // for for_each_node typedef bool32 (*node_fn)(trunk_handle *spl, uint64 addr, void *arg); @@ -785,7 +776,7 @@ typedef union { static inline uint64 trunk_pivot_size (trunk_handle *spl); static inline key 
trunk_get_pivot (trunk_handle *spl, trunk_node *node, uint16 pivot_no); static inline trunk_pivot_data *trunk_get_pivot_data (trunk_handle *spl, trunk_node *node, uint16 pivot_no); -static inline uint16 trunk_find_pivot (trunk_handle *spl, trunk_node *node, key target, lookup_type comp); +static inline uint16 trunk_find_pivot (trunk_handle *spl, trunk_node *node, key target, comparison comp); platform_status trunk_add_pivot (trunk_handle *spl, trunk_node *parent, trunk_node *child, uint16 pivot_no); static inline uint16 trunk_num_children (trunk_handle *spl, trunk_node *node); static inline uint16 trunk_num_pivot_keys (trunk_handle *spl, trunk_node *node); @@ -846,22 +837,24 @@ void trunk_print_node (platform_log static void trunk_print_pivots (platform_log_handle *log_handle, trunk_handle *spl, trunk_node *node); static void trunk_print_branches_and_bundles(platform_log_handle *log_handle, trunk_handle *spl, trunk_node *node); static void trunk_btree_skiperator_init (trunk_handle *spl, trunk_btree_skiperator *skip_itor, trunk_node *node, uint16 branch_idx, key_buffer pivots[static TRUNK_MAX_PIVOTS]); -void trunk_btree_skiperator_get_curr (iterator *itor, key *curr_key, message *data); -platform_status trunk_btree_skiperator_advance (iterator *itor); -platform_status trunk_btree_skiperator_at_end (iterator *itor, bool32 *at_end); +void trunk_btree_skiperator_curr (iterator *itor, key *curr_key, message *data); +platform_status trunk_btree_skiperator_next (iterator *itor); +bool32 trunk_btree_skiperator_can_prev (iterator *itor); +bool32 trunk_btree_skiperator_can_next (iterator *itor); void trunk_btree_skiperator_print (iterator *itor); void trunk_btree_skiperator_deinit (trunk_handle *spl, trunk_btree_skiperator *skip_itor); bool32 trunk_verify_node (trunk_handle *spl, trunk_node *node); void trunk_maybe_reclaim_space (trunk_handle *spl); +// clang-format on + const static iterator_ops trunk_btree_skiperator_ops = { - .get_curr = trunk_btree_skiperator_get_curr, - 
.at_end = trunk_btree_skiperator_at_end, - .advance = trunk_btree_skiperator_advance, + .curr = trunk_btree_skiperator_curr, + .can_prev = trunk_btree_skiperator_can_prev, + .can_next = trunk_btree_skiperator_can_next, + .next = trunk_btree_skiperator_next, .print = trunk_btree_skiperator_print, }; -// clang-format on - static inline data_config * trunk_data_config(trunk_handle *spl) { @@ -1586,7 +1579,7 @@ lowerbound(uint32 size) * Used by find_pivot */ static inline void -trunk_update_lowerbound(uint16 *lo, uint16 *mid, int cmp, lookup_type comp) +trunk_update_lowerbound(uint16 *lo, uint16 *mid, int cmp, comparison comp) { switch (comp) { case less_than: @@ -1613,7 +1606,7 @@ static inline uint16 trunk_find_pivot(trunk_handle *spl, trunk_node *node, key target, - lookup_type comp) + comparison comp) { debug_assert(node != NULL); uint16 lo_idx = 0, mid_idx; @@ -3336,6 +3329,8 @@ trunk_memtable_iterator_init(trunk_handle *spl, uint64 root_addr, key min_key, key max_key, + key start_key, + comparison start_type, bool32 is_live, bool32 inc_ref) { @@ -3349,6 +3344,8 @@ trunk_memtable_iterator_init(trunk_handle *spl, PAGE_TYPE_MEMTABLE, min_key, max_key, + start_key, + start_type, FALSE, 0); } @@ -3443,6 +3440,8 @@ trunk_memtable_compact_and_build_filter(trunk_handle *spl, memtable_root_addr, NEGATIVE_INFINITY_KEY, POSITIVE_INFINITY_KEY, + NEGATIVE_INFINITY_KEY, + greater_than_or_equal, FALSE, FALSE); btree_pack_req req; @@ -4818,6 +4817,8 @@ trunk_branch_iterator_init(trunk_handle *spl, trunk_branch *branch, key min_key, key max_key, + key start_key, + comparison start_type, bool32 do_prefetch, bool32 should_inc_ref) { @@ -4834,6 +4835,8 @@ trunk_branch_iterator_init(trunk_handle *spl, PAGE_TYPE_BRANCH, min_key, max_key, + start_key, + start_type, do_prefetch, 0); } @@ -4906,6 +4909,8 @@ trunk_btree_skiperator_init(trunk_handle *spl, &skip_itor->branch, pivot_min_key, pivot_max_key, + pivot_min_key, + greater_than_or_equal, TRUE, TRUE); iterator_started = FALSE; @@ 
-4914,13 +4919,13 @@ trunk_btree_skiperator_init(trunk_handle *spl, bool32 at_end; if (skip_itor->curr != skip_itor->end) { - iterator_at_end(&skip_itor->itor[skip_itor->curr].super, &at_end); + at_end = !iterator_can_next(&skip_itor->itor[skip_itor->curr].super); } else { at_end = TRUE; } while (skip_itor->curr != skip_itor->end && at_end) { - iterator_at_end(&skip_itor->itor[skip_itor->curr].super, &at_end); + at_end = !iterator_can_next(&skip_itor->itor[skip_itor->curr].super); if (!at_end) { break; } @@ -4929,28 +4934,26 @@ trunk_btree_skiperator_init(trunk_handle *spl, } void -trunk_btree_skiperator_get_curr(iterator *itor, key *curr_key, message *data) +trunk_btree_skiperator_curr(iterator *itor, key *curr_key, message *data) { debug_assert(itor != NULL); trunk_btree_skiperator *skip_itor = (trunk_btree_skiperator *)itor; - iterator_get_curr(&skip_itor->itor[skip_itor->curr].super, curr_key, data); + iterator_curr(&skip_itor->itor[skip_itor->curr].super, curr_key, data); } platform_status -trunk_btree_skiperator_advance(iterator *itor) +trunk_btree_skiperator_next(iterator *itor) { debug_assert(itor != NULL); trunk_btree_skiperator *skip_itor = (trunk_btree_skiperator *)itor; - platform_status rc = - iterator_advance(&skip_itor->itor[skip_itor->curr].super); + platform_status rc = iterator_next(&skip_itor->itor[skip_itor->curr].super); if (!SUCCESS(rc)) { return rc; } - bool32 at_end; - iterator_at_end(&skip_itor->itor[skip_itor->curr].super, &at_end); + bool32 at_end = !iterator_can_next(&skip_itor->itor[skip_itor->curr].super); while (skip_itor->curr != skip_itor->end && at_end) { - iterator_at_end(&skip_itor->itor[skip_itor->curr].super, &at_end); + at_end = !iterator_can_next(&skip_itor->itor[skip_itor->curr].super); if (!at_end) break; skip_itor->curr++; @@ -4959,17 +4962,26 @@ trunk_btree_skiperator_advance(iterator *itor) return STATUS_OK; } -platform_status -trunk_btree_skiperator_at_end(iterator *itor, bool32 *at_end) +bool32 
+trunk_btree_skiperator_can_prev(iterator *itor) { trunk_btree_skiperator *skip_itor = (trunk_btree_skiperator *)itor; if (skip_itor->curr == skip_itor->end) { - *at_end = TRUE; - return STATUS_OK; + return FALSE; } - iterator_at_end(&skip_itor->itor[skip_itor->curr].super, at_end); - return STATUS_OK; + return iterator_can_prev(&skip_itor->itor[skip_itor->curr].super); +} + +bool32 +trunk_btree_skiperator_can_next(iterator *itor) +{ + trunk_btree_skiperator *skip_itor = (trunk_btree_skiperator *)itor; + if (skip_itor->curr == skip_itor->end) { + return FALSE; + } + + return iterator_can_next(&skip_itor->itor[skip_itor->curr].super); } void @@ -5728,6 +5740,8 @@ trunk_split_leaf(trunk_handle *spl, PAGE_TYPE_BRANCH, min_key, max_key, + min_key, + greater_than_or_equal, TRUE, 1); rough_itor[branch_offset] = &rough_btree_itor[branch_offset].super; @@ -5745,8 +5759,7 @@ trunk_split_leaf(trunk_handle *spl, /* * 2. Use rough merge iterator to determine pivots for new leaves */ - bool32 at_end; - rc = iterator_at_end(&rough_merge_itor->super, &at_end); + bool32 at_end = !iterator_can_next(&rough_merge_itor->super); platform_assert_status_ok(rc); uint64 rough_count_kv_bytes; @@ -5760,7 +5773,7 @@ trunk_split_leaf(trunk_handle *spl, { key curr_key; message pivot_data_message; - iterator_get_curr( + iterator_curr( &rough_merge_itor->super, &curr_key, &pivot_data_message); const btree_pivot_data *pivot_data = @@ -5768,8 +5781,9 @@ trunk_split_leaf(trunk_handle *spl, rough_count_num_tuples += pivot_data->stats.num_kvs; rough_count_kv_bytes += pivot_data->stats.key_bytes + pivot_data->stats.message_bytes; - iterator_advance(&rough_merge_itor->super); - iterator_at_end(&rough_merge_itor->super, &at_end); + rc = iterator_next(&rough_merge_itor->super); + platform_assert_status_ok(rc); + at_end = !iterator_can_next(&rough_merge_itor->super); } if (num_leaves == 0) { @@ -5780,7 +5794,7 @@ trunk_split_leaf(trunk_handle *spl, if (!at_end) { key curr_key; message dummy_data; - 
iterator_get_curr(&rough_merge_itor->super, &curr_key, &dummy_data); + iterator_curr(&rough_merge_itor->super, &curr_key, &dummy_data); debug_assert(key_length(curr_key) <= trunk_max_key_size(spl)); // copy new pivot (in parent) of new leaf key_buffer_init_from_key( @@ -6018,18 +6032,24 @@ trunk_split_root(trunk_handle *spl, trunk_node *root) *----------------------------------------------------------------------------- */ void -trunk_range_iterator_get_curr(iterator *itor, key *curr_key, message *data); +trunk_range_iterator_curr(iterator *itor, key *curr_key, message *data); +bool32 +trunk_range_iterator_can_prev(iterator *itor); +bool32 +trunk_range_iterator_can_next(iterator *itor); platform_status -trunk_range_iterator_at_end(iterator *itor, bool32 *at_end); +trunk_range_iterator_next(iterator *itor); platform_status -trunk_range_iterator_advance(iterator *itor); +trunk_range_iterator_prev(iterator *itor); void trunk_range_iterator_deinit(trunk_range_iterator *range_itor); const static iterator_ops trunk_range_iterator_ops = { - .get_curr = trunk_range_iterator_get_curr, - .at_end = trunk_range_iterator_at_end, - .advance = trunk_range_iterator_advance, + .curr = trunk_range_iterator_curr, + .can_prev = trunk_range_iterator_can_prev, + .can_next = trunk_range_iterator_can_next, + .next = trunk_range_iterator_next, + .prev = trunk_range_iterator_prev, }; platform_status @@ -6037,24 +6057,34 @@ trunk_range_iterator_init(trunk_handle *spl, trunk_range_iterator *range_itor, key min_key, key max_key, + key start_key, + comparison start_type, uint64 num_tuples) { debug_assert(!key_is_null(min_key)); debug_assert(!key_is_null(max_key)); + debug_assert(!key_is_null(start_key)); range_itor->spl = spl; range_itor->super.ops = &trunk_range_iterator_ops; range_itor->num_branches = 0; range_itor->num_tuples = num_tuples; - key_buffer_init_from_key(&range_itor->min_key, spl->heap_id, min_key); - key_buffer_init_from_key(&range_itor->max_key, spl->heap_id, max_key); + 
range_itor->merge_itor = NULL; + range_itor->can_prev = TRUE; + range_itor->can_next = TRUE; - if (trunk_key_compare(spl, max_key, min_key) <= 0) { - range_itor->at_end = TRUE; - return STATUS_OK; + if (trunk_key_compare(spl, min_key, start_key) > 0) { + // in bounds, start at min + start_key = min_key; + } + if (trunk_key_compare(spl, max_key, start_key) <= 0) { + // out of bounds, start at max + start_key = max_key; } - range_itor->at_end = FALSE; + // copy over global min and max + key_buffer_init_from_key(&range_itor->min_key, spl->heap_id, min_key); + key_buffer_init_from_key(&range_itor->max_key, spl->heap_id, max_key); ZERO_ARRAY(range_itor->compacted); @@ -6102,8 +6132,12 @@ trunk_range_iterator_init(trunk_handle *spl, // index btrees uint16 height = trunk_node_height(&node); for (uint16 h = height; h > 0; h--) { - uint16 pivot_no = trunk_find_pivot( - spl, &node, key_buffer_key(&range_itor->min_key), less_than_or_equal); + uint16 pivot_no; + if (start_type != less_than) { + pivot_no = trunk_find_pivot(spl, &node, start_key, less_than_or_equal); + } else { + pivot_no = trunk_find_pivot(spl, &node, start_key, less_than); + } debug_assert(pivot_no < trunk_num_children(spl, &node)); trunk_pivot_data *pdata = trunk_get_pivot_data(spl, &node, pivot_no); @@ -6152,19 +6186,19 @@ trunk_range_iterator_init(trunk_handle *spl, range_itor->num_branches++; } - // have a leaf, use to get rebuild key - key rebuild_key = + // have a leaf, use to establish local bounds + key local_min = + trunk_key_compare(spl, trunk_min_key(spl, &node), min_key) > 0 + ? trunk_min_key(spl, &node) + : min_key; + key local_max = trunk_key_compare(spl, trunk_max_key(spl, &node), max_key) < 0 ? 
trunk_max_key(spl, &node) : max_key; key_buffer_init_from_key( - &range_itor->rebuild_key, spl->heap_id, rebuild_key); - key_buffer local_max_key; - if (trunk_key_compare(spl, max_key, rebuild_key) < 0) { - key_buffer_init_from_key(&local_max_key, spl->heap_id, max_key); - } else { - key_buffer_init_from_key(&local_max_key, spl->heap_id, rebuild_key); - } + &range_itor->local_min_key, spl->heap_id, local_min); + key_buffer_init_from_key( + &range_itor->local_max_key, spl->heap_id, local_max); trunk_node_unget(spl->cc, &node); @@ -6180,24 +6214,28 @@ trunk_range_iterator_init(trunk_handle *spl, trunk_branch_iterator_init(spl, btree_itor, branch, - key_buffer_key(&range_itor->min_key), - key_buffer_key(&local_max_key), + key_buffer_key(&range_itor->local_min_key), + key_buffer_key(&range_itor->local_max_key), + start_key, + start_type, do_prefetch, FALSE); } else { uint64 mt_root_addr = branch->root_addr; bool32 is_live = branch_no == 0; - trunk_memtable_iterator_init(spl, - btree_itor, - mt_root_addr, - key_buffer_key(&range_itor->min_key), - key_buffer_key(&local_max_key), - is_live, - FALSE); + trunk_memtable_iterator_init( + spl, + btree_itor, + mt_root_addr, + key_buffer_key(&range_itor->local_min_key), + key_buffer_key(&range_itor->local_max_key), + start_key, + start_type, + is_live, + FALSE); } range_itor->itor[i] = &btree_itor->super; } - key_buffer_deinit(&local_max_key); platform_status rc = merge_iterator_create(spl->heap_id, spl->cfg.data_cfg, @@ -6209,62 +6247,80 @@ trunk_range_iterator_init(trunk_handle *spl, return rc; } - bool32 at_end; - iterator_at_end(&range_itor->merge_itor->super, &at_end); + bool32 in_range = iterator_can_curr(&range_itor->merge_itor->super); /* * if the merge itor is already exhausted, and there are more keys in the - * db/range, move to next leaf + * db/range, move to prev/next leaf */ - if (at_end) { - KEY_CREATE_LOCAL_COPY(rc, - rebuild_key, - spl->heap_id, - key_buffer_key(&range_itor->rebuild_key)); - if (!SUCCESS(rc)) { 
- return rc; - } - uint64 temp_tuples = range_itor->num_tuples; - trunk_range_iterator_deinit(range_itor); - if (trunk_key_compare(spl, rebuild_key, max_key) < 0) { - rc = trunk_range_iterator_init( - spl, range_itor, rebuild_key, max_key, temp_tuples); + if (!in_range && start_type >= greater_than) { + if (trunk_key_compare(spl, local_max, max_key) < 0) { + trunk_range_iterator_deinit(range_itor); + rc = trunk_range_iterator_init(spl, + range_itor, + min_key, + max_key, + local_max, + start_type, + range_itor->num_tuples); + if (!SUCCESS(rc)) { + return rc; + } + } else { + range_itor->can_next = FALSE; + range_itor->can_prev = + iterator_can_prev(&range_itor->merge_itor->super); + } + } + if (!in_range && start_type <= less_than_or_equal) { + if (trunk_key_compare(spl, local_min, min_key) > 0) { + trunk_range_iterator_deinit(range_itor); + rc = trunk_range_iterator_init(spl, + range_itor, + min_key, + max_key, + local_min, + start_type, + range_itor->num_tuples); if (!SUCCESS(rc)) { return rc; } - at_end = range_itor->at_end; + } else { + range_itor->can_prev = FALSE; + range_itor->can_next = + iterator_can_next(&range_itor->merge_itor->super); } } - - range_itor->at_end = at_end; - return rc; } void -trunk_range_iterator_get_curr(iterator *itor, key *curr_key, message *data) +trunk_range_iterator_curr(iterator *itor, key *curr_key, message *data) { debug_assert(itor != NULL); trunk_range_iterator *range_itor = (trunk_range_iterator *)itor; - iterator_get_curr(&range_itor->merge_itor->super, curr_key, data); + iterator_curr(&range_itor->merge_itor->super, curr_key, data); } platform_status -trunk_range_iterator_advance(iterator *itor) +trunk_range_iterator_next(iterator *itor) { - debug_assert(itor != NULL); trunk_range_iterator *range_itor = (trunk_range_iterator *)itor; - iterator_advance(&range_itor->merge_itor->super); + debug_assert(range_itor != NULL); + platform_assert(range_itor->can_next); + + platform_status rc = 
iterator_next(&range_itor->merge_itor->super); + if (!SUCCESS(rc)) { + return rc; + } range_itor->num_tuples++; - bool32 at_end; - iterator_at_end(&range_itor->merge_itor->super, &at_end); - platform_status rc; - // robj: shouldn't this be a while loop, like in the init function? - if (at_end) { + range_itor->can_prev = TRUE; + range_itor->can_next = iterator_can_next(&range_itor->merge_itor->super); + if (!range_itor->can_next) { KEY_CREATE_LOCAL_COPY(rc, - rebuild_key, + min_key, range_itor->spl->heap_id, - key_buffer_key(&range_itor->rebuild_key)); + key_buffer_key(&range_itor->min_key)); if (!SUCCESS(rc)) { return rc; } @@ -6275,18 +6331,30 @@ trunk_range_iterator_advance(iterator *itor) if (!SUCCESS(rc)) { return rc; } - trunk_range_iterator_deinit(range_itor); - rc = trunk_range_iterator_init(range_itor->spl, - range_itor, - rebuild_key, - max_key, - range_itor->num_tuples); + KEY_CREATE_LOCAL_COPY(rc, + local_max_key, + range_itor->spl->heap_id, + key_buffer_key(&range_itor->local_max_key)); if (!SUCCESS(rc)) { return rc; } - if (!range_itor->at_end) { - iterator_at_end(&range_itor->merge_itor->super, &at_end); - platform_assert(!at_end); + + // if there is more data to get, rebuild the iterator for next leaf + if (trunk_key_compare(range_itor->spl, local_max_key, max_key) < 0) { + uint64 temp_tuples = range_itor->num_tuples; + trunk_range_iterator_deinit(range_itor); + rc = trunk_range_iterator_init(range_itor->spl, + range_itor, + min_key, + max_key, + local_max_key, + greater_than_or_equal, + temp_tuples); + if (!SUCCESS(rc)) { + return rc; + } + debug_assert(range_itor->can_next + == iterator_can_next(&range_itor->merge_itor->super)); } } @@ -6294,40 +6362,104 @@ trunk_range_iterator_advance(iterator *itor) } platform_status -trunk_range_iterator_at_end(iterator *itor, bool32 *at_end) +trunk_range_iterator_prev(iterator *itor) { - debug_assert(itor != NULL); trunk_range_iterator *range_itor = (trunk_range_iterator *)itor; + debug_assert(itor != NULL); + 
platform_assert(range_itor->can_prev); + + platform_status rc = iterator_prev(&range_itor->merge_itor->super); + if (!SUCCESS(rc)) { + return rc; + } + range_itor->num_tuples++; + range_itor->can_next = TRUE; + range_itor->can_prev = iterator_can_prev(&range_itor->merge_itor->super); + if (!range_itor->can_prev) { + KEY_CREATE_LOCAL_COPY(rc, + min_key, + range_itor->spl->heap_id, + key_buffer_key(&range_itor->min_key)); + if (!SUCCESS(rc)) { + return rc; + } + KEY_CREATE_LOCAL_COPY(rc, + max_key, + range_itor->spl->heap_id, + key_buffer_key(&range_itor->max_key)); + if (!SUCCESS(rc)) { + return rc; + } + KEY_CREATE_LOCAL_COPY(rc, + local_min_key, + range_itor->spl->heap_id, + key_buffer_key(&range_itor->local_min_key)); + if (!SUCCESS(rc)) { + return rc; + } + + // if there is more data to get, rebuild the iterator for prev leaf + if (trunk_key_compare(range_itor->spl, local_min_key, min_key) > 0) { + trunk_range_iterator_deinit(range_itor); + rc = trunk_range_iterator_init(range_itor->spl, + range_itor, + min_key, + max_key, + local_min_key, + less_than, + range_itor->num_tuples); + if (!SUCCESS(rc)) { + return rc; + } + debug_assert(range_itor->can_prev + == iterator_can_prev(&range_itor->merge_itor->super)); + } + } - *at_end = range_itor->at_end; return STATUS_OK; } +bool32 +trunk_range_iterator_can_prev(iterator *itor) +{ + debug_assert(itor != NULL); + trunk_range_iterator *range_itor = (trunk_range_iterator *)itor; + + return range_itor->can_prev; +} + +bool32 +trunk_range_iterator_can_next(iterator *itor) +{ + debug_assert(itor != NULL); + trunk_range_iterator *range_itor = (trunk_range_iterator *)itor; + + return range_itor->can_next; +} + void trunk_range_iterator_deinit(trunk_range_iterator *range_itor) { - // If the iterator is at end, then it has already been deinitialized - if (range_itor->at_end) { - return; - } trunk_handle *spl = range_itor->spl; - merge_iterator_destroy(range_itor->spl->heap_id, &range_itor->merge_itor); - for (uint64 i = 0; i < 
range_itor->num_branches; i++) { - btree_iterator *btree_itor = &range_itor->btree_itor[i]; - if (range_itor->compacted[i]) { - uint64 root_addr = btree_itor->root_addr; - trunk_branch_iterator_deinit(spl, btree_itor, FALSE); - btree_unblock_dec_ref(spl->cc, &spl->cfg.btree_cfg, root_addr); - } else { - uint64 mt_gen = range_itor->memtable_start_gen - i; - trunk_memtable_iterator_deinit(spl, btree_itor, mt_gen, FALSE); - trunk_memtable_dec_ref(spl, mt_gen); + if (range_itor->merge_itor != NULL) { + merge_iterator_destroy(range_itor->spl->heap_id, &range_itor->merge_itor); + for (uint64 i = 0; i < range_itor->num_branches; i++) { + btree_iterator *btree_itor = &range_itor->btree_itor[i]; + if (range_itor->compacted[i]) { + uint64 root_addr = btree_itor->root_addr; + trunk_branch_iterator_deinit(spl, btree_itor, FALSE); + btree_unblock_dec_ref(spl->cc, &spl->cfg.btree_cfg, root_addr); + } else { + uint64 mt_gen = range_itor->memtable_start_gen - i; + trunk_memtable_iterator_deinit(spl, btree_itor, mt_gen, FALSE); + trunk_memtable_dec_ref(spl, mt_gen); + } } + key_buffer_deinit(&range_itor->min_key); + key_buffer_deinit(&range_itor->max_key); + key_buffer_deinit(&range_itor->local_min_key); + key_buffer_deinit(&range_itor->local_max_key); } - - key_buffer_deinit(&range_itor->min_key); - key_buffer_deinit(&range_itor->max_key); - key_buffer_deinit(&range_itor->rebuild_key); } /* @@ -7380,22 +7512,27 @@ trunk_range(trunk_handle *spl, void *arg) { trunk_range_iterator *range_itor = TYPED_MALLOC(spl->heap_id, range_itor); - platform_status rc = trunk_range_iterator_init( - spl, range_itor, start_key, POSITIVE_INFINITY_KEY, num_tuples); + platform_status rc = trunk_range_iterator_init(spl, + range_itor, + start_key, + POSITIVE_INFINITY_KEY, + start_key, + greater_than_or_equal, + num_tuples); if (!SUCCESS(rc)) { goto destroy_range_itor; } - bool32 at_end; - iterator_at_end(&range_itor->super, &at_end); - - for (int i = 0; i < num_tuples && !at_end; i++) { + for (int i = 0; 
i < num_tuples && iterator_can_next(&range_itor->super); i++) + { key curr_key; message data; - iterator_get_curr(&range_itor->super, &curr_key, &data); + iterator_curr(&range_itor->super, &curr_key, &data); func(curr_key, data, arg); - iterator_advance(&range_itor->super); - iterator_at_end(&range_itor->super, &at_end); + rc = iterator_next(&range_itor->super); + if (!SUCCESS(rc)) { + goto destroy_range_itor; + } } destroy_range_itor: diff --git a/src/trunk.h b/src/trunk.h index 13a1fd9e8..15b6ad3a2 100644 --- a/src/trunk.h +++ b/src/trunk.h @@ -231,10 +231,12 @@ typedef struct trunk_range_iterator { uint64 memtable_end_gen; bool32 compacted[TRUNK_RANGE_ITOR_MAX_BRANCHES]; merge_iterator *merge_itor; - bool32 at_end; + bool32 can_prev; + bool32 can_next; key_buffer min_key; key_buffer max_key; - key_buffer rebuild_key; + key_buffer local_min_key; + key_buffer local_max_key; btree_iterator btree_itor[TRUNK_RANGE_ITOR_MAX_BRANCHES]; trunk_branch branch[TRUNK_RANGE_ITOR_MAX_BRANCHES]; @@ -350,6 +352,8 @@ trunk_range_iterator_init(trunk_handle *spl, trunk_range_iterator *range_itor, key min_key, key max_key, + key start_key, + comparison start_type, uint64 num_tuples); void trunk_range_iterator_deinit(trunk_range_iterator *range_itor); diff --git a/tests/functional/btree_test.c b/tests/functional/btree_test.c index e1997e805..b9bee3c6d 100644 --- a/tests/functional/btree_test.c +++ b/tests/functional/btree_test.c @@ -664,6 +664,8 @@ test_btree_basic(cache *cc, PAGE_TYPE_MEMTABLE, NEGATIVE_INFINITY_KEY, POSITIVE_INFINITY_KEY, + NEGATIVE_INFINITY_KEY, + greater_than_or_equal, FALSE, 0); platform_default_log("btree iterator init time %luns\n", @@ -838,6 +840,8 @@ test_btree_create_packed_trees(cache *cc, PAGE_TYPE_MEMTABLE, NEGATIVE_INFINITY_KEY, POSITIVE_INFINITY_KEY, + NEGATIVE_INFINITY_KEY, + greater_than_or_equal, FALSE, 0); @@ -866,8 +870,9 @@ test_count_tuples_in_range(cache *cc, key high_key, uint64 *count) // OUTPUT { - btree_iterator itor; - uint64 i; + 
platform_status rc; + btree_iterator itor; + uint64 i; *count = 0; for (i = 0; i < num_trees; i++) { if (!btree_verify_tree(cc, cfg, root_addr[i], type)) { @@ -878,15 +883,22 @@ test_count_tuples_in_range(cache *cc, PAGE_TYPE_BRANCH); platform_assert(0); } - btree_iterator_init( - cc, cfg, &itor, root_addr[i], type, low_key, high_key, TRUE, 0); - bool32 at_end; - iterator_at_end(&itor.super, &at_end); + btree_iterator_init(cc, + cfg, + &itor, + root_addr[i], + type, + low_key, + high_key, + low_key, + greater_than_or_equal, + TRUE, + 0); key last_key = NULL_KEY; - while (!at_end) { + while (iterator_can_curr(&itor.super)) { key curr_key; message data; - iterator_get_curr(&itor.super, &curr_key, &data); + iterator_curr(&itor.super, &curr_key, &data); if (!key_is_null(last_key) && data_key_compare(cfg->data_cfg, last_key, curr_key) > 0) { @@ -938,13 +950,17 @@ test_count_tuples_in_range(cache *cc, platform_assert(0); } (*count)++; - iterator_advance(&itor.super); - iterator_at_end(&itor.super, &at_end); + rc = iterator_next(&itor.super); + if (!SUCCESS(rc)) { + btree_iterator_deinit(&itor); + goto out; + } } btree_iterator_deinit(&itor); } - return STATUS_OK; +out: + return rc; } static inline int @@ -960,19 +976,29 @@ test_btree_print_all_keys(cache *cc, uint64 i; for (i = 0; i < num_trees; i++) { platform_default_log("tree number %lu\n", i); - btree_iterator_init( - cc, cfg, &itor, root_addr[i], type, low_key, high_key, TRUE, 0); - bool32 at_end; - iterator_at_end(&itor.super, &at_end); - while (!at_end) { + btree_iterator_init(cc, + cfg, + &itor, + root_addr[i], + type, + low_key, + high_key, + low_key, + greater_than_or_equal, + TRUE, + 0); + while (iterator_can_curr(&itor.super)) { key curr_key; message data; - iterator_get_curr(&itor.super, &curr_key, &data); + iterator_curr(&itor.super, &curr_key, &data); char key_str[128]; data_key_to_string(cfg->data_cfg, curr_key, key_str, 128); platform_default_log("%s\n", key_str); - iterator_advance(&itor.super); - 
iterator_at_end(&itor.super, &at_end); + platform_status rc = iterator_next(&itor.super); + if (!SUCCESS(rc)) { + btree_iterator_deinit(&itor); + return -1; + } } btree_iterator_deinit(&itor); } @@ -1034,6 +1060,8 @@ test_btree_merge_basic(cache *cc, PAGE_TYPE_BRANCH, lo, hi, + lo, + greater_than_or_equal, TRUE, 0); itor_arr[tree_no] = &btree_itor_arr[tree_no].super; @@ -1244,7 +1272,6 @@ test_btree_rough_iterator(cache *cc, iterator **rough_itor = TYPED_ARRAY_MALLOC(hid, rough_itor, num_trees); platform_assert(rough_itor); - bool32 at_end; for (uint64 tree_no = 0; tree_no < num_trees; tree_no++) { btree_iterator_init(cc, btree_cfg, @@ -1253,14 +1280,14 @@ test_btree_rough_iterator(cache *cc, PAGE_TYPE_BRANCH, NEGATIVE_INFINITY_KEY, POSITIVE_INFINITY_KEY, + NEGATIVE_INFINITY_KEY, + greater_than_or_equal, TRUE, 1); - if (SUCCESS(iterator_at_end(&rough_btree_itor[tree_no].super, &at_end)) - && !at_end) - { + if (iterator_can_curr(&rough_btree_itor[tree_no].super)) { key curr_key; message msg; - iterator_get_curr(&rough_btree_itor[tree_no].super, &curr_key, &msg); + iterator_curr(&rough_btree_itor[tree_no].super, &curr_key, &msg); platform_default_log("key size: %lu\n", key_length(curr_key)); } rough_itor[tree_no] = &rough_btree_itor[tree_no].super; @@ -1277,14 +1304,14 @@ test_btree_rough_iterator(cache *cc, // uint64 target_num_pivots = // cfg->mt_cfg->max_tuples_per_memtable / btree_cfg->tuples_per_leaf; - iterator_at_end(&rough_merge_itor->super, &at_end); + bool32 at_end = !iterator_can_curr(&rough_merge_itor->super); uint64 pivot_no; for (pivot_no = 0; !at_end; pivot_no++) { // uint64 rough_count_pivots = 0; key curr_key; message dummy_data; - iterator_get_curr(&rough_merge_itor->super, &curr_key, &dummy_data); + iterator_curr(&rough_merge_itor->super, &curr_key, &dummy_data); if (key_length(curr_key) != btree_cfg->data_cfg->max_key_size) { platform_default_log("Weird key length: %lu should be: %lu\n", key_length(curr_key), @@ -1413,6 +1440,8 @@ 
test_btree_merge_perf(cache *cc, PAGE_TYPE_BRANCH, min_key, max_key, + min_key, + greater_than_or_equal, TRUE, 0); itor_arr[tree_no] = &btree_itor_arr[tree_no].super; diff --git a/tests/functional/log_test.c b/tests/functional/log_test.c index c1fed2d83..17b3a3b8a 100644 --- a/tests/functional/log_test.c +++ b/tests/functional/log_test.c @@ -45,7 +45,6 @@ test_log_crash(clockcache *cc, iterator *itorh = (iterator *)&itor; char key_str[128]; char data_str[128]; - bool32 at_end; merge_accumulator msg; DECLARE_AUTO_KEY_BUFFER(keybuffer, hid); @@ -82,8 +81,7 @@ test_log_crash(clockcache *cc, platform_assert_status_ok(rc); itorh = (iterator *)&itor; - iterator_at_end(itorh, &at_end); - for (i = 0; i < num_entries && !at_end; i++) { + for (i = 0; i < num_entries && iterator_can_curr(itorh); i++) { key skey = test_key(&keybuffer, TEST_RANDOM, i, @@ -93,7 +91,7 @@ test_log_crash(clockcache *cc, 0); generate_test_message(gen, i, &msg); message mmessage = merge_accumulator_to_message(&msg); - iterator_get_curr(itorh, &returned_key, &returned_message); + iterator_curr(itorh, &returned_key, &returned_message); if (data_key_compare(cfg->data_cfg, skey, returned_key) || message_lex_cmp(mmessage, returned_message)) { @@ -106,8 +104,8 @@ test_log_crash(clockcache *cc, platform_default_log("actual: %s -- %s\n", key_str, data_str); platform_assert(0); } - iterator_advance(itorh); - iterator_at_end(itorh, &at_end); + rc = iterator_next(itorh); + platform_assert_status_ok(rc); } platform_default_log("log returned %lu of %lu entries\n", i, num_entries); diff --git a/tests/functional/test_functionality.c b/tests/functional/test_functionality.c index 7cb79c090..bfd95fa67 100644 --- a/tests/functional/test_functionality.c +++ b/tests/functional/test_functionality.c @@ -32,20 +32,25 @@ static void search_for_key_via_iterator(trunk_handle *spl, key target) { trunk_range_iterator iter; - bool32 at_end; - trunk_range_iterator_init( - spl, &iter, NEGATIVE_INFINITY_KEY, POSITIVE_INFINITY_KEY, 
UINT64_MAX); + trunk_range_iterator_init(spl, + &iter, + NEGATIVE_INFINITY_KEY, + POSITIVE_INFINITY_KEY, + NEGATIVE_INFINITY_KEY, + greater_than_or_equal, + UINT64_MAX); uint64 count = 0; - while (SUCCESS(iterator_at_end((iterator *)&iter, &at_end)) && !at_end) { + while (iterator_can_curr((iterator *)&iter)) { key curr_key; message value; - iterator_get_curr((iterator *)&iter, &curr_key, &value); + iterator_curr((iterator *)&iter, &curr_key, &value); if (data_key_compare(spl->cfg.data_cfg, target, curr_key) == 0) { platform_error_log("Found missing key %s\n", key_string(spl->cfg.data_cfg, target)); } - iterator_advance((iterator *)&iter); + platform_status rc = iterator_next((iterator *)&iter); + platform_assert_status_ok(rc); count++; } platform_default_log("Saw a total of %lu keys\n", count); @@ -227,15 +232,19 @@ verify_range_against_shadow(trunk_handle *spl, const data_handle *splinter_data_handle; uint64 splinter_key; uint64 i; - bool32 at_end; platform_assert(start_index <= sharr->nkeys); platform_assert(end_index <= sharr->nkeys); trunk_range_iterator *range_itor = TYPED_MALLOC(hid, range_itor); platform_assert(range_itor != NULL); - status = trunk_range_iterator_init( - spl, range_itor, start_key, end_key, end_index - start_index); + status = trunk_range_iterator_init(spl, + range_itor, + start_key, + end_key, + start_key, + greater_than_or_equal, + end_index - start_index); if (!SUCCESS(status)) { platform_error_log("failed to create range itor: %s\n", platform_status_to_string(status)); @@ -250,19 +259,13 @@ verify_range_against_shadow(trunk_handle *spl, continue; } - status = iterator_at_end((iterator *)range_itor, &at_end); - if (!SUCCESS(status) || at_end) { - platform_error_log("ERROR: range itor failed or terminated " - "early (at_end = %d): %s\n", - at_end, - platform_status_to_string(status)); - if (SUCCESS(status)) { - status = STATUS_NO_PERMISSION; - } + if (!iterator_can_curr((iterator *)range_itor)) { + platform_error_log("ERROR: range itor 
terminated early\n"); + status = STATUS_NO_PERMISSION; goto destroy; } - iterator_get_curr( + iterator_curr( (iterator *)range_itor, &splinter_keybuf, &splinter_message); splinter_key = be64toh(*(uint64 *)key_data(splinter_keybuf)); splinter_data_handle = message_data(splinter_message); @@ -287,16 +290,15 @@ verify_range_against_shadow(trunk_handle *spl, verify_tuple( spl, splinter_keybuf, splinter_message, shadow_refcount, &status); - status = iterator_advance((iterator *)range_itor); + status = iterator_next((iterator *)range_itor); if (!SUCCESS(status)) { goto destroy; } } - while (SUCCESS(iterator_at_end((iterator *)range_itor, &at_end)) && !at_end) - { + while (iterator_can_curr((iterator *)range_itor)) { status = STATUS_LIMIT_EXCEEDED; - iterator_get_curr( + iterator_curr( (iterator *)range_itor, &splinter_keybuf, &splinter_message); splinter_key = be64toh(*(uint64 *)key_data(splinter_keybuf)); @@ -304,7 +306,7 @@ verify_range_against_shadow(trunk_handle *spl, "Tree Refcount %3d\n", splinter_key, splinter_data_handle->ref_count); - if (!SUCCESS(iterator_advance((iterator *)range_itor))) { + if (!SUCCESS(iterator_next((iterator *)range_itor))) { goto destroy; } } diff --git a/tests/unit/btree_stress_test.c b/tests/unit/btree_stress_test.c index dff2d6387..7f9c8d08b 100644 --- a/tests/unit/btree_stress_test.c +++ b/tests/unit/btree_stress_test.c @@ -64,8 +64,16 @@ iterator_tests(cache *cc, btree_config *cfg, uint64 root_addr, int nkvs, + bool32 start_front, platform_heap_id hid); +static int +iterator_seek_tests(cache *cc, + btree_config *cfg, + uint64 root_addr, + int nkvs, + platform_heap_id hid); + static uint64 pack_tests(cache *cc, btree_config *cfg, @@ -232,10 +240,29 @@ CTEST2(btree_stress, test_random_inserts_concurrent) nkvs); ASSERT_NOT_EQUAL(0, rc, "Invalid tree\n"); - if (!iterator_tests( + if (!iterator_tests((cache *)&data->cc, + &data->dbtree_cfg, + root_addr, + nkvs, + TRUE, + data->hid)) + { + CTEST_ERR("invalid ranges in original tree, 
starting at front\n"); + } + if (!iterator_tests((cache *)&data->cc, + &data->dbtree_cfg, + root_addr, + nkvs, + FALSE, + data->hid)) + { + CTEST_ERR("invalid ranges in original tree, starting at back\n"); + } + + if (!iterator_seek_tests( (cache *)&data->cc, &data->dbtree_cfg, root_addr, nkvs, data->hid)) { - CTEST_ERR("invalid ranges in original tree\n"); + CTEST_ERR("invalid ranges when seeking in original tree\n"); } uint64 packed_root_addr = pack_tests( @@ -252,8 +279,12 @@ CTEST2(btree_stress, test_random_inserts_concurrent) nkvs); ASSERT_NOT_EQUAL(0, rc, "Invalid tree\n"); - rc = iterator_tests( - (cache *)&data->cc, &data->dbtree_cfg, packed_root_addr, nkvs, data->hid); + rc = iterator_tests((cache *)&data->cc, + &data->dbtree_cfg, + packed_root_addr, + nkvs, + TRUE, + data->hid); ASSERT_NOT_EQUAL(0, rc, "Invalid ranges in packed tree\n"); // Exercise print method to verify that it basically continues to work. @@ -404,39 +435,24 @@ query_tests(cache *cc, return 1; } -static int -iterator_tests(cache *cc, - btree_config *cfg, - uint64 root_addr, - int nkvs, - platform_heap_id hid) +static uint64 +iterator_test(platform_heap_id hid, + btree_config *cfg, + uint64 nkvs, + iterator *iter, + bool32 forwards) { - btree_iterator dbiter; - - btree_iterator_init(cc, - cfg, - &dbiter, - root_addr, - PAGE_TYPE_MEMTABLE, - NEGATIVE_INFINITY_KEY, - POSITIVE_INFINITY_KEY, - FALSE, - 0); - - iterator *iter = (iterator *)&dbiter; - - uint64 seen = 0; - bool32 at_end; + uint64 seen = 0; uint8 *prevbuf = TYPED_MANUAL_MALLOC(hid, prevbuf, btree_page_size(cfg)); key prev = NULL_KEY; uint8 *keybuf = TYPED_MANUAL_MALLOC(hid, keybuf, btree_page_size(cfg)); uint8 *msgbuf = TYPED_MANUAL_MALLOC(hid, msgbuf, btree_page_size(cfg)); - while (SUCCESS(iterator_at_end(iter, &at_end)) && !at_end) { + while (iterator_can_curr(iter)) { key curr_key; message msg; - iterator_get_curr(iter, &curr_key, &msg); + iterator_curr(iter, &curr_key, &msg); uint64 k = ungen_key(curr_key); ASSERT_TRUE(k < 
nkvs); @@ -449,24 +465,128 @@ iterator_tests(cache *cc, rc = message_lex_cmp(msg, gen_msg(cfg, k, msgbuf, btree_page_size(cfg))); ASSERT_EQUAL(0, rc); - ASSERT_TRUE(key_is_null(prev) - || data_key_compare(cfg->data_cfg, prev, curr_key) < 0); + if (forwards) { + ASSERT_TRUE(key_is_null(prev) + || data_key_compare(cfg->data_cfg, prev, curr_key) < 0); + } else { + ASSERT_TRUE(key_is_null(prev) + || data_key_compare(cfg->data_cfg, curr_key, prev) < 0); + } seen++; prev = key_create(key_length(curr_key), prevbuf); key_copy_contents(prevbuf, curr_key); - if (!SUCCESS(iterator_advance(iter))) { - break; + if (forwards) { + if (!SUCCESS(iterator_next(iter))) { + break; + } + } else { + if (!SUCCESS(iterator_prev(iter))) { + break; + } } } - ASSERT_EQUAL(nkvs, seen); - - btree_iterator_deinit(&dbiter); platform_free(hid, prevbuf); platform_free(hid, keybuf); platform_free(hid, msgbuf); + return seen; +} + +static int +iterator_tests(cache *cc, + btree_config *cfg, + uint64 root_addr, + int nkvs, + bool32 start_front, + platform_heap_id hid) +{ + btree_iterator dbiter; + + key start_key; + if (start_front) + start_key = NEGATIVE_INFINITY_KEY; + else + start_key = POSITIVE_INFINITY_KEY; + + btree_iterator_init(cc, + cfg, + &dbiter, + root_addr, + PAGE_TYPE_MEMTABLE, + NEGATIVE_INFINITY_KEY, + POSITIVE_INFINITY_KEY, + start_key, + greater_than_or_equal, + FALSE, + 0); + + iterator *iter = (iterator *)&dbiter; + + if (!start_front) { + iterator_prev(iter); + } + bool32 nonempty = iterator_can_curr(iter); + + ASSERT_EQUAL(nkvs, iterator_test(hid, cfg, nkvs, iter, start_front)); + if (nonempty) { + if (start_front) { + iterator_prev(iter); + } else { + iterator_next(iter); + } + } + ASSERT_EQUAL(nkvs, iterator_test(hid, cfg, nkvs, iter, !start_front)); + + btree_iterator_deinit(&dbiter); + + return 1; +} + +static int +iterator_seek_tests(cache *cc, + btree_config *cfg, + uint64 root_addr, + int nkvs, + platform_heap_id hid) +{ + btree_iterator dbiter; + + int keybuf_size = 
btree_page_size(cfg); + uint8 *keybuf = TYPED_MANUAL_MALLOC(hid, keybuf, keybuf_size); + + // start in the "middle" of the range + key start_key = gen_key(cfg, nkvs / 2, keybuf, keybuf_size); + + btree_iterator_init(cc, + cfg, + &dbiter, + root_addr, + PAGE_TYPE_MEMTABLE, + NEGATIVE_INFINITY_KEY, + POSITIVE_INFINITY_KEY, + start_key, + greater_than_or_equal, + FALSE, + 0); + iterator *iter = (iterator *)&dbiter; + + // go down + uint64 found_down = iterator_test(hid, cfg, nkvs, iter, FALSE); + + // seek back to start_key + iterator_seek(iter, start_key, TRUE); + + // skip start_key + iterator_next(iter); + + // go up + uint64 found_up = iterator_test(hid, cfg, nkvs, iter, TRUE); + + ASSERT_EQUAL(nkvs, found_up + found_down); + + btree_iterator_deinit(&dbiter); return 1; } @@ -488,6 +608,8 @@ pack_tests(cache *cc, PAGE_TYPE_MEMTABLE, NEGATIVE_INFINITY_KEY, POSITIVE_INFINITY_KEY, + NEGATIVE_INFINITY_KEY, + greater_than_or_equal, FALSE, 0); diff --git a/tests/unit/btree_test.c b/tests/unit/btree_test.c index 41b7200a1..541e11587 100644 --- a/tests/unit/btree_test.c +++ b/tests/unit/btree_test.c @@ -170,7 +170,7 @@ leaf_hdr_tests(btree_config *cfg, btree_scratch *scratch, platform_heap_id hid) * or the size of a btree leafy entry, then this number will need * to be changed, and that's fine. 
*/ - int nkvs = 209; + int nkvs = 208; btree_init_hdr(cfg, hdr); diff --git a/tests/unit/splinterdb_quick_test.c b/tests/unit/splinterdb_quick_test.c index 97d998042..b9a8e13d7 100644 --- a/tests/unit/splinterdb_quick_test.c +++ b/tests/unit/splinterdb_quick_test.c @@ -44,10 +44,10 @@ #define TEST_MAX_VALUE_SIZE 32 // Hard-coded format strings to generate key and values -static const char key_fmt[] = "key-%02x"; -static const char val_fmt[] = "val-%02x"; -#define KEY_FMT_LENGTH (6) -#define VAL_FMT_LENGTH (6) +static const char key_fmt[] = "key-%04x"; +static const char val_fmt[] = "val-%04x"; +#define KEY_FMT_LENGTH (8) +#define VAL_FMT_LENGTH (8) #define TEST_INSERT_KEY_LENGTH (KEY_FMT_LENGTH + 1) #define TEST_INSERT_VAL_LENGTH (VAL_FMT_LENGTH + 1) @@ -65,6 +65,14 @@ insert_keys(splinterdb *kvsb, const int minkey, int numkeys, const int incr); static int check_current_tuple(splinterdb_iterator *it, const int expected_i); +static int +test_two_step_iterator(splinterdb *kvsb, + slice start_key, + int num_keys, + int minkey, + int start_i, + int hop_i); + static int custom_key_comparator(const data_config *cfg, slice key1, slice key2); @@ -441,6 +449,24 @@ CTEST2(splinterdb_quick, test_basic_iterator) splinterdb_iterator_deinit(it); } +/* + * empty iterator test case. 
+ */ +CTEST2(splinterdb_quick, test_empty_iterator) +{ + splinterdb_iterator *it = NULL; + int rc = splinterdb_iterator_init(data->kvsb, &it, NULL_SLICE); + ASSERT_EQUAL(0, rc); + + ASSERT_FALSE(splinterdb_iterator_valid(it)); + ASSERT_FALSE(splinterdb_iterator_can_next(it)); + ASSERT_FALSE(splinterdb_iterator_can_prev(it)); + rc = splinterdb_iterator_status(it); + ASSERT_EQUAL(0, rc); + + splinterdb_iterator_deinit(it); +} + /* * Test case to exercise and verify that splinterdb iterator interfaces with a * non-NULL start key correctly sets up the start scan at the requested @@ -626,6 +652,39 @@ CTEST2(splinterdb_quick, } } +CTEST2(splinterdb_quick, test_iterator_prev_and_next) +{ + const int num_inserts = 1 << 14; + // Should insert keys: 1, 4, 7, 10 13, 16, 19, ... + int minkey = 1; + int hop_amt = 3; + int rc = insert_keys(data->kvsb, minkey, num_inserts, 3); + ASSERT_EQUAL(0, rc); + + char key[TEST_INSERT_KEY_LENGTH]; + + // test starting with a null key + ASSERT_EQUAL(0, + test_two_step_iterator( + data->kvsb, NULL_SLICE, num_inserts, minkey, 0, hop_amt)); + + // test starting with key < minkey + snprintf(key, sizeof(key), key_fmt, 0); + slice start_key = slice_create(strlen(key), key); + ASSERT_EQUAL(0, + test_two_step_iterator( + data->kvsb, start_key, num_inserts, minkey, 0, hop_amt)); + + // test starting between two keys + int start_i = num_inserts / 4; + snprintf(key, sizeof(key), key_fmt, hop_amt * start_i + minkey - 1); + start_key = slice_create(strlen(key), key); + ASSERT_EQUAL( + 0, + test_two_step_iterator( + data->kvsb, start_key, num_inserts, minkey, start_i, hop_amt)); +} + /* * Test case to verify the interfaces to close() and reopen() a KVS work * as expected. 
After reopening the KVS, we should be able to retrieve data @@ -1010,6 +1069,64 @@ check_current_tuple(splinterdb_iterator *it, const int expected_i) return rc; } +// Test moving iterator 2 steps up, 1 step back and then all the way back down +static int +test_two_step_iterator(splinterdb *kvsb, + slice start_key, + int num_keys, + int minkey, + int start_i, + int hop_i) +{ + int rc; + splinterdb_iterator *it = NULL; + rc = splinterdb_iterator_init(kvsb, &it, start_key); + ASSERT_EQUAL(0, rc); + + for (int i = start_i; i < num_keys; i++) { + bool32 is_valid = splinterdb_iterator_valid(it); + ASSERT_TRUE(is_valid); + check_current_tuple(it, i * hop_i + minkey); + splinterdb_iterator_next(it); + + if (i < num_keys - 2) { + is_valid = splinterdb_iterator_valid(it); + ASSERT_TRUE(is_valid); + check_current_tuple(it, (i + 1) * hop_i + minkey); + splinterdb_iterator_next(it); + + is_valid = splinterdb_iterator_valid(it); + ASSERT_TRUE(is_valid); + check_current_tuple(it, (i + 2) * hop_i + minkey); + splinterdb_iterator_prev(it); + } + } + + bool32 is_valid = splinterdb_iterator_valid(it); + ASSERT_FALSE(is_valid); + rc = splinterdb_iterator_status(it); + ASSERT_EQUAL(0, rc); + ASSERT_TRUE(splinterdb_iterator_can_prev(it)); + + // Start going down + splinterdb_iterator_prev(it); + for (int i = num_keys - 1; i >= 0; i--) { + bool32 is_valid = splinterdb_iterator_valid(it); + ASSERT_TRUE(is_valid); + check_current_tuple(it, i * hop_i + minkey); + + splinterdb_iterator_prev(it); + } + + is_valid = splinterdb_iterator_valid(it); + ASSERT_FALSE(is_valid); + rc = splinterdb_iterator_status(it); + ASSERT_EQUAL(0, rc); + + splinterdb_iterator_deinit(it); + return rc; +} + // A user-specified spy comparator static int custom_key_comparator(const data_config *cfg, slice key1, slice key2) diff --git a/tests/unit/splinterdb_stress_test.c b/tests/unit/splinterdb_stress_test.c index b4061dad7..63ac09509 100644 --- a/tests/unit/splinterdb_stress_test.c +++ 
b/tests/unit/splinterdb_stress_test.c
@@ -139,6 +139,82 @@ CTEST2(splinterdb_stress, test_naive_range_delete)
    }
 }
 
+/*
+ * Test case that inserts a large number of KV-pairs and then performs a
+ * range query over them, backwards and then forwards.
+ */
+#define KEY_SIZE 13
+CTEST2(splinterdb_stress, test_iterator_over_many_kvs)
+{
+   char  key_str[KEY_SIZE];
+   char *value_str = "This is the value string\0";
+   const uint32 inserts = 1 << 25; // ~33.5 million (2^25)
+   for (int i = 0; i < inserts; i++) {
+      snprintf(key_str, sizeof(key_str), "key-%08x", i);
+      slice key = slice_create(sizeof(key_str), key_str);
+      slice val = slice_create(sizeof(value_str), value_str);
+      ASSERT_EQUAL(0, splinterdb_insert(data->kvsb, key, val));
+   }
+
+   // create an iterator at end of keys
+   splinterdb_iterator *it = NULL;
+   snprintf(key_str, sizeof(key_str), "key-%08x", inserts);
+   slice start_key = slice_create(sizeof(key_str), key_str);
+   ASSERT_EQUAL(0, splinterdb_iterator_init(data->kvsb, &it, start_key));
+
+   // assert that the iterator is in the state we expect
+   ASSERT_FALSE(splinterdb_iterator_valid(it));
+   ASSERT_FALSE(splinterdb_iterator_can_next(it));
+   ASSERT_TRUE(splinterdb_iterator_can_prev(it));
+   ASSERT_EQUAL(0, splinterdb_iterator_status(it));
+
+   splinterdb_iterator_prev(it);
+
+   // iterate down all the keys
+   for (int i = inserts - 1; i >= 0; i--) {
+      ASSERT_TRUE(splinterdb_iterator_valid(it));
+      slice key;
+      slice val;
+      splinterdb_iterator_get_current(it, &key, &val);
+      snprintf(key_str, sizeof(key_str), "key-%08x", i);
+
+      ASSERT_EQUAL(KEY_SIZE, slice_length(key));
+      int comp = memcmp(slice_data(key), key_str, slice_length(key));
+      ASSERT_EQUAL(0, comp);
+      splinterdb_iterator_prev(it);
+   }
+
+   // assert that the iterator is in the state we expect
+   ASSERT_FALSE(splinterdb_iterator_valid(it));
+   ASSERT_TRUE(splinterdb_iterator_can_next(it));
+   ASSERT_FALSE(splinterdb_iterator_can_prev(it));
+   ASSERT_EQUAL(0, splinterdb_iterator_status(it));
+
+   splinterdb_iterator_next(it);
+
+   // iterate back up all the keys
+   for (int i = 0; i < inserts; i++) {
+      ASSERT_TRUE(splinterdb_iterator_valid(it));
+      slice key;
+      slice val;
+      splinterdb_iterator_get_current(it, &key, &val);
+      snprintf(key_str, sizeof(key_str), "key-%08x", i);
+
+      ASSERT_EQUAL(KEY_SIZE, slice_length(key));
+      int comp = memcmp(slice_data(key), key_str, slice_length(key));
+      ASSERT_EQUAL(0, comp);
+      splinterdb_iterator_next(it);
+   }
+
+   // assert that the iterator is in the state we expect
+   ASSERT_FALSE(splinterdb_iterator_valid(it));
+   ASSERT_FALSE(splinterdb_iterator_can_next(it));
+   ASSERT_TRUE(splinterdb_iterator_can_prev(it));
+   ASSERT_EQUAL(0, splinterdb_iterator_status(it));
+
+   splinterdb_iterator_deinit(it);
+}
+
 /*
  * Test case that inserts large # of KV-pairs, and goes into a code path
  * reported by issue# 458, tripping a debug assert. This test case also