From dea654b94591ac918982897ade29e1137bf58ebf Mon Sep 17 00:00:00 2001 From: Zachary Dremann Date: Sun, 14 Jan 2024 03:28:43 -0500 Subject: [PATCH] update amalg & bindings again --- .../CRoaring/bindgen_bundled_version.rs | 95 +- croaring-sys/CRoaring/roaring.c | 829 +++++++++++++++--- croaring-sys/CRoaring/roaring.h | 178 +++- croaring-sys/CRoaring/roaring.hh | 2 +- 4 files changed, 974 insertions(+), 130 deletions(-) diff --git a/croaring-sys/CRoaring/bindgen_bundled_version.rs b/croaring-sys/CRoaring/bindgen_bundled_version.rs index b9be1db..f816e46 100644 --- a/croaring-sys/CRoaring/bindgen_bundled_version.rs +++ b/croaring-sys/CRoaring/bindgen_bundled_version.rs @@ -1,4 +1,4 @@ -/* automatically generated by rust-bindgen 0.69.1 */ +/* automatically generated by rust-bindgen 0.69.2 */ pub const ROARING_VERSION: &[u8; 6] = b"2.1.2\0"; pub const ROARING_VERSION_MAJOR: _bindgen_ty_1 = 2; @@ -523,7 +523,7 @@ extern "C" { ) -> *mut roaring_bitmap_t; } extern "C" { - #[doc = " Read bitmap from a serialized buffer safely (reading up to maxbytes).\n In case of failure, NULL is returned.\n\n This is meant to be compatible with the Java and Go versions:\n https://github.com/RoaringBitmap/RoaringFormatSpec\n\n The function itself is safe in the sense that it will not cause buffer overflows.\n However, for correct operations, it is assumed that the bitmap read was once\n serialized from a valid bitmap (i.e., it follows the format specification).\n If you provided an incorrect input (garbage), then the bitmap read may not be in\n a valid state and following operations may not lead to sensible results.\n In particular, the serialized array containers need to be in sorted order, and the\n run containers should be in sorted non-overlapping order. This is is guaranteed to\n happen when serializing an existing bitmap, but not for random inputs.\n\n This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x),\n the data format is going to be big-endian and not compatible with little-endian systems."] + #[doc = " Read bitmap from a serialized buffer safely (reading up to maxbytes).\n In case of failure, NULL is returned.\n\n This is meant to be compatible with the Java and Go versions:\n https://github.com/RoaringBitmap/RoaringFormatSpec\n\n The function itself is safe in the sense that it will not cause buffer overflows.\n However, for correct operations, it is assumed that the bitmap read was once\n serialized from a valid bitmap (i.e., it follows the format specification).\n If you provided an incorrect input (garbage), then the bitmap read may not be in\n a valid state and following operations may not lead to sensible results.\n In particular, the serialized array containers need to be in sorted order, and the\n run containers should be in sorted non-overlapping order. This is is guaranteed to\n happen when serializing an existing bitmap, but not for random inputs.\n\n You may use roaring_bitmap_internal_validate to check the validity of the bitmap prior\n to using it. You may also use other strategies to check for corrupted inputs (e.g.,\n checksums).\n\n This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x),\n the data format is going to be big-endian and not compatible with little-endian systems."] pub fn roaring_bitmap_portable_deserialize_safe( buf: *const ::std::os::raw::c_char, maxbytes: usize, @@ -682,7 +682,7 @@ extern "C" { pub fn roaring_bitmap_statistics(r: *const roaring_bitmap_t, stat: *mut roaring_statistics_t); } extern "C" { - #[doc = " Perform internal consistency checks. Returns true if the bitmap is consistent.\n\n Note that some operations intentionally leave bitmaps in an inconsistent state temporarily,\n for example, `roaring_bitmap_lazy_*` functions, until `roaring_bitmap_repair_after_lazy` is called.\n\n If reason is non-null, it will be set to a string describing the first inconsistency found if any."] + #[doc = " Perform internal consistency checks. Returns true if the bitmap is consistent.\n It may be useful to call this after deserializing bitmaps from untrusted sources.\n If roaring_bitmap_internal_validate returns true, then the bitmap should be consistent\n and can be trusted not to cause crashes or memory corruption.\n\n Note that some operations intentionally leave bitmaps in an inconsistent state temporarily,\n for example, `roaring_bitmap_lazy_*` functions, until `roaring_bitmap_repair_after_lazy` is called.\n\n If reason is non-null, it will be set to a string describing the first inconsistency found if any."] pub fn roaring_bitmap_internal_validate( r: *const roaring_bitmap_t, reason: *mut *const ::std::os::raw::c_char, @@ -815,6 +815,12 @@ pub struct roaring64_leaf_s { _unused: [u8; 0], } pub type roaring64_leaf_t = roaring64_leaf_s; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct roaring64_iterator_s { + _unused: [u8; 0], +} +pub type roaring64_iterator_t = roaring64_iterator_s; #[doc = " A bit of context usable with `roaring64_bitmap_*_bulk()` functions.\n\n Should be initialized with `{0}` (or `memset()` to all zeros).\n Callers should treat it as an opaque type.\n\n A context may only be used with a single bitmap (unless re-initialized to\n zero), and any modification to a bitmap (other than modifications performed\n with `_bulk()` functions with the context passed) will invalidate any\n contexts associated with that bitmap."] #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -959,10 +965,6 @@ extern "C" { #[doc = " Returns true if the result has at least one run container."] pub fn roaring64_bitmap_run_optimize(r: *mut roaring64_bitmap_t) -> bool; } -extern "C" { - #[doc = " Returns the in-memory size of the bitmap.\n TODO: Return the serialized size."] - pub fn roaring64_bitmap_size_in_bytes(r: *const roaring64_bitmap_t) -> usize; -} extern "C" { #[doc = " Return true if the two bitmaps contain the same elements."] pub fn roaring64_bitmap_equals( @@ -1073,6 +1075,31 @@ extern "C" { r2: *const roaring64_bitmap_t, ); } +extern "C" { + #[doc = " How many bytes are required to serialize this bitmap.\n\n This is meant to be compatible with other languages:\n https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations"] + pub fn roaring64_bitmap_portable_size_in_bytes(r: *const roaring64_bitmap_t) -> usize; +} +extern "C" { + #[doc = " Write a bitmap to a buffer. The output buffer should refer to at least\n `roaring64_bitmap_portable_size_in_bytes(r)` bytes of allocated memory.\n\n Returns how many bytes were written, which should match\n `roaring64_bitmap_portable_size_in_bytes(r)`.\n\n This is meant to be compatible with other languages:\n https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations\n\n This function is endian-sensitive. If you have a big-endian system (e.g., a\n mainframe IBM s390x), the data format is going to be big-endian and not\n compatible with little-endian systems."] + pub fn roaring64_bitmap_portable_serialize( + r: *const roaring64_bitmap_t, + buf: *mut ::std::os::raw::c_char, + ) -> usize; +} +extern "C" { + #[doc = " Check how many bytes would be read (up to maxbytes) at this pointer if there\n is a valid bitmap, returns zero if there is no valid bitmap.\n\n This is meant to be compatible with other languages\n https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations"] + pub fn roaring64_bitmap_portable_deserialize_size( + buf: *const ::std::os::raw::c_char, + maxbytes: usize, + ) -> usize; +} +extern "C" { + #[doc = " Read a bitmap from a serialized buffer safely (reading up to maxbytes).\n In case of failure, NULL is returned.\n\n This is meant to be compatible with other languages\n https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations\n\n The function itself is safe in the sense that it will not cause buffer\n overflows. However, for correct operations, it is assumed that the bitmap\n read was once serialized from a valid bitmap (i.e., it follows the format\n specification). If you provided an incorrect input (garbage), then the bitmap\n read may not be in a valid state and following operations may not lead to\n sensible results. In particular, the serialized array containers need to be\n in sorted order, and the run containers should be in sorted non-overlapping\n order. This is is guaranteed to happen when serializing an existing bitmap,\n but not for random inputs.\n\n This function is endian-sensitive. If you have a big-endian system (e.g., a\n mainframe IBM s390x), the data format is going to be big-endian and not\n compatible with little-endian systems."] + pub fn roaring64_bitmap_portable_deserialize_safe( + buf: *const ::std::os::raw::c_char, + maxbytes: usize, + ) -> *mut roaring64_bitmap_t; +} extern "C" { #[doc = " Iterate over the bitmap elements. The function `iterator` is called once for\n all the values with `ptr` (can be NULL) as the second parameter of each call.\n\n `roaring_iterator64` is simply a pointer to a function that returns a bool\n and takes `(uint64_t, void*)` as inputs. True means that the iteration should\n continue, while false means that it should stop.\n\n Returns true if the `roaring64_iterator` returned true throughout (so that\n all data points were necessarily visited).\n\n Iteration is ordered from the smallest to the largest elements."] pub fn roaring64_bitmap_iterate( @@ -1081,3 +1108,57 @@ extern "C" { ptr: *mut ::std::os::raw::c_void, ) -> bool; } +extern "C" { + #[doc = " Create an iterator object that can be used to iterate through the values.\n Caller is responsible for calling `roaring64_iterator_free()`.\n\n The iterator is initialized. If there is a value, then this iterator points\n to the first value and `roaring64_iterator_has_value()` returns true. The\n value can be retrieved with `roaring64_iterator_value()`."] + pub fn roaring64_iterator_create(r: *const roaring64_bitmap_t) -> *mut roaring64_iterator_t; +} +extern "C" { + #[doc = " Create an iterator object that can be used to iterate through the values.\n Caller is responsible for calling `roaring64_iterator_free()`.\n\n The iterator is initialized. If there is a value, then this iterator points\n to the last value and `roaring64_iterator_has_value()` returns true. The\n value can be retrieved with `roaring64_iterator_value()`."] + pub fn roaring64_iterator_create_last( + r: *const roaring64_bitmap_t, + ) -> *mut roaring64_iterator_t; +} +extern "C" { + #[doc = " Re-initializes an existing iterator. Functionally the same as\n `roaring64_iterator_create` without a allocation."] + pub fn roaring64_iterator_reinit(r: *const roaring64_bitmap_t, it: *mut roaring64_iterator_t); +} +extern "C" { + #[doc = " Re-initializes an existing iterator. Functionally the same as\n `roaring64_iterator_create_last` without a allocation."] + pub fn roaring64_iterator_reinit_last( + r: *const roaring64_bitmap_t, + it: *mut roaring64_iterator_t, + ); +} +extern "C" { + #[doc = " Creates a copy of the iterator. Caller is responsible for calling\n `roaring64_iterator_free()` on the resulting iterator."] + pub fn roaring64_iterator_copy(it: *const roaring64_iterator_t) -> *mut roaring64_iterator_t; +} +extern "C" { + #[doc = " Free the iterator."] + pub fn roaring64_iterator_free(it: *mut roaring64_iterator_t); +} +extern "C" { + #[doc = " Returns true if the iterator currently points to a value. If so, calling\n `roaring64_iterator_value()` returns the value."] + pub fn roaring64_iterator_has_value(it: *const roaring64_iterator_t) -> bool; +} +extern "C" { + #[doc = " Returns the value the iterator currently points to. Should only be called if\n `roaring64_iterator_has_value()` returns true."] + pub fn roaring64_iterator_value(it: *const roaring64_iterator_t) -> u64; +} +extern "C" { + #[doc = " Advance the iterator. If there is a new value, then\n `roaring64_iterator_has_value()` returns true. Values are traversed in\n increasing order. For convenience, returns the result of\n `roaring64_iterator_has_value()`.\n\n Once this returns false, `roaring64_iterator_advance` should not be called on\n the iterator again. Calling `roaring64_iterator_previous` is allowed."] + pub fn roaring64_iterator_advance(it: *mut roaring64_iterator_t) -> bool; +} +extern "C" { + #[doc = " Decrement the iterator. If there is a new value, then\n `roaring64_iterator_has_value()` returns true. Values are traversed in\n decreasing order. For convenience, returns the result of\n `roaring64_iterator_has_value()`.\n\n Once this returns false, `roaring64_iterator_previous` should not be called\n on the iterator again. Calling `roaring64_iterator_advance` is allowed."] + pub fn roaring64_iterator_previous(it: *mut roaring64_iterator_t) -> bool; +} +extern "C" { + #[doc = " Move the iterator to the first value greater than or equal to `val`, if it\n exists at or after the current position of the iterator. If there is a new\n value, then `roaring64_iterator_has_value()` returns true. Values are\n traversed in increasing order. For convenience, returns the result of\n `roaring64_iterator_has_value()`."] + pub fn roaring64_iterator_move_equalorlarger(it: *mut roaring64_iterator_t, val: u64) -> bool; +} +extern "C" { + #[doc = " Reads up to `count` values from the iterator into the given `buf`. Returns\n the number of elements read. The number of elements read can be smaller than\n `count`, which means that there are no more elements in the bitmap.\n\n This function can be used together with other iterator functions."] + pub fn roaring64_iterator_read(it: *mut roaring64_iterator_t, buf: *mut u64, count: u64) + -> u64; +} diff --git a/croaring-sys/CRoaring/roaring.c b/croaring-sys/CRoaring/roaring.c index 4e85fa3..699f8b1 100644 --- a/croaring-sys/CRoaring/roaring.c +++ b/croaring-sys/CRoaring/roaring.c @@ -1,5 +1,5 @@ // !!! DO NOT EDIT - THIS IS AN AUTO-GENERATED FILE !!! -// Created by amalgamation.sh on 2024-01-14T07:28:05Z +// Created by amalgamation.sh on 2024-01-20T09:04:13Z /* * The CRoaring project is under a dual license (Apache/MIT). @@ -5617,10 +5617,10 @@ static inline container_t *container_lazy_xor( * If the returned pointer is identical to c1, then the container has been * modified. * If the returned pointer is different from c1, then a new container has been - * created and the caller is responsible for freeing it. - * The type of the first container may change. Returns the modified - * (and possibly new) container -*/ + * created. The original container is freed by container_ixor. + * The type of the first container may change. Returns the modified (and + * possibly new) container. + */ static inline container_t *container_ixor( container_t *c1, uint8_t type1, const container_t *c2, uint8_t type2, @@ -5849,10 +5849,10 @@ static inline container_t *container_andnot( * If the returned pointer is identical to c1, then the container has been * modified. * If the returned pointer is different from c1, then a new container has been - * created and the caller is responsible for freeing it. - * The type of the first container may change. Returns the modified - * (and possibly new) container -*/ + * created. The original container is freed by container_iandnot. + * The type of the first container may change. Returns the modified (and + * possibly new) container. + */ static inline container_t *container_iandnot( container_t *c1, uint8_t type1, const container_t *c2, uint8_t type2, @@ -6463,6 +6463,18 @@ bool container_iterator_read_into_uint32(const container_t *c, uint8_t typecode, uint32_t count, uint32_t *consumed, uint16_t *value_out); +/** + * Reads up to `count` entries from the container, and writes them into `buf` + * as `high48 | entry`. Returns true and sets `value_out` if a value is present + * after reading the entries. Sets `consumed` to the number of values read. + * `count` should be greater than zero. + */ +bool container_iterator_read_into_uint64(const container_t *c, uint8_t typecode, + roaring_container_iterator_t *it, + uint64_t high48, uint64_t *buf, + uint64_t count, uint32_t *consumed, + uint16_t *value_out); + #ifdef __cplusplus } } } // extern "C" { namespace roaring { namespace internal { #endif @@ -9153,6 +9165,8 @@ CROARING_UNTARGET_AVX512 #define SET_LEAF(p) ((art_node_t *)((uintptr_t)(p) | 1)) #define CAST_LEAF(p) ((art_leaf_t *)((void *)((uintptr_t)(p) & ~1))) +#define NODE48_AVAILABLE_CHILDREN_MASK ((UINT64_C(1) << 48) - 1) + #ifdef __cplusplus extern "C" { namespace roaring { @@ -9197,6 +9211,8 @@ typedef struct art_node16_s { typedef struct art_node48_s { art_inner_node_t base; uint8_t count; + // Bitset where the ith bit is set if children[i] is available + uint64_t available_children; uint8_t keys[256]; art_node_t *children[48]; } art_node48_t; @@ -9213,12 +9229,13 @@ typedef struct art_node256_s { typedef struct art_indexed_child_s { art_node_t *child; uint8_t index; + art_key_chunk_t key_chunk; } art_indexed_child_t; static inline bool art_is_leaf(const art_node_t *node) { return IS_LEAF(node); } static void art_leaf_populate(art_leaf_t *leaf, const art_key_chunk_t key[]) { - memcpy(&leaf->key, key, ART_KEY_BYTES); + memcpy(leaf->key, key, ART_KEY_BYTES); } static inline uint8_t art_get_type(const art_inner_node_t *node) { @@ -9327,8 +9344,9 @@ static inline art_node_t *art_node4_erase(art_node4_t *node, if (node->count == 2) { // Only one child remains after erasing, so compress the path by // removing this node. - art_node_t *remaining_child = node->children[idx ^ 1]; - art_key_chunk_t remaining_child_key = node->keys[idx ^ 1]; + uint8_t other_idx = idx ^ 1; + art_node_t *remaining_child = node->children[other_idx]; + art_key_chunk_t remaining_child_key = node->keys[other_idx]; if (!art_is_leaf(remaining_child)) { // Correct the prefix of the child node. art_inner_node_t *inner_node = (art_inner_node_t *)remaining_child; @@ -9373,6 +9391,7 @@ static inline art_indexed_child_t art_node4_next_child(const art_node4_t *node, } indexed_child.index = index; indexed_child.child = node->children[index]; + indexed_child.key_chunk = node->keys[index]; return indexed_child; } @@ -9389,6 +9408,7 @@ static inline art_indexed_child_t art_node4_prev_child(const art_node4_t *node, } indexed_child.index = index; indexed_child.child = node->children[index]; + indexed_child.key_chunk = node->keys[index]; return indexed_child; } @@ -9401,6 +9421,7 @@ static inline art_indexed_child_t art_node4_child_at(const art_node4_t *node, } indexed_child.index = index; indexed_child.child = node->children[index]; + indexed_child.key_chunk = node->keys[index]; return indexed_child; } @@ -9411,6 +9432,7 @@ static inline art_indexed_child_t art_node4_lower_bound( if (node->keys[i] >= key_chunk) { indexed_child.index = i; indexed_child.child = node->children[i]; + indexed_child.key_chunk = node->keys[i]; return indexed_child; } } @@ -9522,6 +9544,7 @@ static inline art_indexed_child_t art_node16_next_child( } indexed_child.index = index; indexed_child.child = node->children[index]; + indexed_child.key_chunk = node->keys[index]; return indexed_child; } @@ -9538,6 +9561,7 @@ static inline art_indexed_child_t art_node16_prev_child( } indexed_child.index = index; indexed_child.child = node->children[index]; + indexed_child.key_chunk = node->keys[index]; return indexed_child; } @@ -9550,6 +9574,7 @@ static inline art_indexed_child_t art_node16_child_at(const art_node16_t *node, } indexed_child.index = index; indexed_child.child = node->children[index]; + indexed_child.key_chunk = node->keys[index]; return indexed_child; } @@ -9560,6 +9585,7 @@ static inline art_indexed_child_t art_node16_lower_bound( if (node->keys[i] >= key_chunk) { indexed_child.index = i; indexed_child.child = node->children[i]; + indexed_child.key_chunk = node->keys[i]; return indexed_child; } } @@ -9572,6 +9598,7 @@ static art_node48_t *art_node48_create(const art_key_chunk_t prefix[], art_node48_t *node = (art_node48_t *)roaring_malloc(sizeof(art_node48_t)); art_init_inner_node(&node->base, ART_NODE48_TYPE, prefix, prefix_size); node->count = 0; + node->available_children = NODE48_AVAILABLE_CHILDREN_MASK; for (size_t i = 0; i < 256; ++i) { node->keys[i] = ART_NODE48_EMPTY_VAL; } @@ -9579,11 +9606,11 @@ static art_node48_t *art_node48_create(const art_key_chunk_t prefix[], } static void art_free_node48(art_node48_t *node) { - for (size_t i = 0; i < 256; ++i) { - uint8_t val_idx = node->keys[i]; - if (val_idx != ART_NODE48_EMPTY_VAL) { - art_free_node(node->children[val_idx]); - } + uint64_t used_children = (node->available_children) ^ NODE48_AVAILABLE_CHILDREN_MASK; + while (used_children != 0) { + uint8_t child_idx = roaring_trailing_zeroes(used_children); + art_free_node(node->children[child_idx]); + used_children &= ~(UINT64_C(1) << child_idx); } roaring_free(node); } @@ -9600,10 +9627,11 @@ static inline art_node_t *art_node48_find_child(const art_node48_t *node, static art_node_t *art_node48_insert(art_node48_t *node, art_node_t *child, uint8_t key) { if (node->count < 48) { - uint8_t val_idx = node->count; + uint8_t val_idx = roaring_trailing_zeroes(node->available_children); node->keys[key] = val_idx; node->children[val_idx] = child; node->count++; + node->available_children &= ~(UINT64_C(1) << val_idx); return (art_node_t *)node; } art_node256_t *new_node = @@ -9624,8 +9652,8 @@ static inline art_node_t *art_node48_erase(art_node48_t *node, if (val_idx == ART_NODE48_EMPTY_VAL) { return (art_node_t *)node; } - node->children[val_idx] = NULL; node->keys[key_chunk] = ART_NODE48_EMPTY_VAL; + node->available_children |= UINT64_C(1) << val_idx; node->count--; if (node->count > 16) { return (art_node_t *)node; @@ -9657,8 +9685,9 @@ static inline art_indexed_child_t art_node48_next_child( index++; for (size_t i = index; i < 256; ++i) { if (node->keys[i] != ART_NODE48_EMPTY_VAL) { - indexed_child.child = node->children[node->keys[i]]; indexed_child.index = i; + indexed_child.child = node->children[node->keys[i]]; + indexed_child.key_chunk = i; return indexed_child; } } @@ -9675,8 +9704,9 @@ static inline art_indexed_child_t art_node48_prev_child( art_indexed_child_t indexed_child; for (int i = index; i >= 0; --i) { if (node->keys[i] != ART_NODE48_EMPTY_VAL) { - indexed_child.child = node->children[node->keys[i]]; indexed_child.index = i; + indexed_child.child = node->children[node->keys[i]]; + indexed_child.key_chunk = i; return indexed_child; } } @@ -9693,6 +9723,7 @@ static inline art_indexed_child_t art_node48_child_at(const art_node48_t *node, } indexed_child.index = index; indexed_child.child = node->children[node->keys[index]]; + indexed_child.key_chunk = index; return indexed_child; } @@ -9703,6 +9734,7 @@ static inline art_indexed_child_t art_node48_lower_bound( if (node->keys[i] != ART_NODE48_EMPTY_VAL) { indexed_child.index = i; indexed_child.child = node->children[node->keys[i]]; + indexed_child.key_chunk = i; return indexed_child; } } @@ -9774,8 +9806,9 @@ static inline art_indexed_child_t art_node256_next_child( index++; for (size_t i = index; i < 256; ++i) { if (node->children[i] != NULL) { - indexed_child.child = node->children[i]; indexed_child.index = i; + indexed_child.child = node->children[i]; + indexed_child.key_chunk = i; return indexed_child; } } @@ -9792,8 +9825,9 @@ static inline art_indexed_child_t art_node256_prev_child( art_indexed_child_t indexed_child; for (int i = index; i >= 0; --i) { if (node->children[i] != NULL) { - indexed_child.child = node->children[i]; indexed_child.index = i; + indexed_child.child = node->children[i]; + indexed_child.key_chunk = i; return indexed_child; } } @@ -9810,6 +9844,7 @@ static inline art_indexed_child_t art_node256_child_at( } indexed_child.index = index; indexed_child.child = node->children[index]; + indexed_child.key_chunk = index; return indexed_child; } @@ -9820,6 +9855,7 @@ static inline art_indexed_child_t art_node256_lower_bound( if (node->children[i] != NULL) { indexed_child.index = i; indexed_child.child = node->children[i]; + indexed_child.key_chunk = i; return indexed_child; } } @@ -10283,7 +10319,15 @@ static void art_node_print_type(const art_node_t *node) { } void art_node_printf(const art_node_t *node, uint8_t depth) { - printf("%*s", depth, ""); + if (art_is_leaf(node)) { + printf("{ type: Leaf, key: "); + art_leaf_t *leaf = CAST_LEAF(node); + for (size_t i = 0; i < ART_KEY_BYTES; ++i) { + printf("%02x", leaf->key[i]); + } + printf(" }\n"); + return; + } printf("{\n"); depth++; @@ -10292,19 +10336,6 @@ void art_node_printf(const art_node_t *node, uint8_t depth) { art_node_print_type(node); printf("\n"); - if (art_is_leaf(node)) { - art_leaf_t *leaf = CAST_LEAF(node); - printf("%*s", depth, ""); - printf("key: "); - for (size_t i = 0; i < ART_KEY_BYTES; ++i) { - printf("%x", leaf->key[i]); - } - printf("\n"); - depth--; - printf("%*s", depth, ""); - printf("}\n"); - return; - } art_inner_node_t *inner_node = (art_inner_node_t *)node; printf("%*s", depth, ""); printf("prefix_size: %d\n", inner_node->prefix_size); @@ -10312,7 +10343,7 @@ void art_node_printf(const art_node_t *node, uint8_t depth) { printf("%*s", depth, ""); printf("prefix: "); for (uint8_t i = 0; i < inner_node->prefix_size; ++i) { - printf("%x", (char)inner_node->prefix[i]); + printf("%02x", inner_node->prefix[i]); } printf("\n"); @@ -10321,7 +10352,7 @@ void art_node_printf(const art_node_t *node, uint8_t depth) { art_node4_t *node4 = (art_node4_t *)node; for (uint8_t i = 0; i < node4->count; ++i) { printf("%*s", depth, ""); - printf("key: %x\n", node4->keys[i]); + printf("key: %02x ", node4->keys[i]); art_node_printf(node4->children[i], depth); } } break; @@ -10329,7 +10360,7 @@ void art_node_printf(const art_node_t *node, uint8_t depth) { art_node16_t *node16 = (art_node16_t *)node; for (uint8_t i = 0; i < node16->count; ++i) { printf("%*s", depth, ""); - printf("key: %x\n", node16->keys[i]); + printf("key: %02x ", node16->keys[i]); art_node_printf(node16->children[i], depth); } } break; @@ -10338,8 +10369,9 @@ void art_node_printf(const art_node_t *node, uint8_t depth) { for (int i = 0; i < 256; ++i) { if (node48->keys[i] != ART_NODE48_EMPTY_VAL) { printf("%*s", depth, ""); - printf("key: %x\n", node48->keys[i]); - art_node_printf(node48->children[i], depth); + printf("key: %02x ", i); + printf("child: %02x ", node48->keys[i]); + art_node_printf(node48->children[node48->keys[i]], depth); } } } break; @@ -10348,7 +10380,7 @@ void art_node_printf(const art_node_t *node, uint8_t depth) { for (int i = 0; i < 256; ++i) { if (node256->children[i] != NULL) { printf("%*s", depth, ""); - printf("key: %x\n", i); + printf("key: %02x ", i); art_node_printf(node256->children[i], depth); } } @@ -10485,6 +10517,16 @@ static bool art_iterator_up(art_iterator_t *iterator) { return true; } +// Moves the iterator one level, followed by a move to the next / previous leaf. +// Sets the status of the iterator. +static bool art_iterator_up_and_move(art_iterator_t *iterator, bool forward) { + if (!art_iterator_up(iterator)) { + // We're at the root. + return art_iterator_invalid_loc(iterator); + } + return art_iterator_move(iterator, forward); +} + // Initializes the iterator at the first / last leaf of the given node. // Returns true for convenience. static bool art_node_init_iterator(const art_node_t *node, @@ -10523,12 +10565,7 @@ bool art_iterator_move(art_iterator_t *iterator, bool forward) { return art_node_init_iterator(neighbor_child, iterator, forward); } // No more children at this level, go up. - bool went_up = art_iterator_up(iterator); - if (!went_up) { - // We're at the root. - return art_iterator_invalid_loc(iterator); - } - return art_iterator_move(iterator, forward); + return art_iterator_up_and_move(iterator, forward); } // Assumes the iterator is positioned at a node with an equal prefix path up to @@ -10545,7 +10582,7 @@ static bool art_node_iterator_lower_bound(const art_node_t *node, // Prefix so far has been equal, but we've found a smaller key. // Since we take the lower bound within each node, we can return the // next leaf. - return art_iterator_move(iterator, true); + return art_iterator_up_and_move(iterator, true); } else if (prefix_comparison > 0) { // No key equal to the key we're looking for, return the first leaf. return art_node_init_iterator(node, iterator, true); @@ -10557,23 +10594,25 @@ static bool art_node_iterator_lower_bound(const art_node_t *node, art_node_lower_bound(node, key_chunk); if (indexed_child.child == NULL) { // Only smaller keys among children. - bool went_up = art_iterator_up(iterator); - if (!went_up) { - return art_iterator_invalid_loc(iterator); - } - return art_iterator_move(iterator, true); + return art_iterator_up_and_move(iterator, true); } - // We found a child with a greater or equal prefix. + if (indexed_child.key_chunk > key_chunk) { + // Only larger children, return the first larger child. + art_iterator_down(iterator, inner_node, indexed_child.index); + return art_node_init_iterator(indexed_child.child, iterator, true); + } + // We found a child with an equal prefix. art_iterator_down(iterator, inner_node, indexed_child.index); node = indexed_child.child; } art_leaf_t *leaf = CAST_LEAF(node); - // Technically we don't have to re-compare the prefix if we arrived here - // through the while loop, but it simplifies the code. if (art_compare_keys(leaf->key, key) >= 0) { + // Leaf has an equal or larger key. return art_iterator_valid_loc(iterator, leaf); } - return art_iterator_invalid_loc(iterator); + // Leaf has an equal prefix, but the full key is smaller. Move to the next + // leaf. + return art_iterator_up_and_move(iterator, true); } art_iterator_t art_init_iterator(const art_t *art, bool first) { @@ -10595,19 +10634,22 @@ bool art_iterator_prev(art_iterator_t *iterator) { bool art_iterator_lower_bound(art_iterator_t *iterator, const art_key_chunk_t *key) { - int compare_result = art_compare_keys(iterator->key, key); + int compare_result = + art_compare_prefix(iterator->key, 0, key, 0, ART_KEY_BYTES); // Move up until we have an equal or greater prefix, after which we can do a // normal lower bound search. - while (compare_result < 0 && iterator->frame > 0) { + while (compare_result < 0) { if (!art_iterator_up(iterator)) { // Only smaller keys found. - return art_node_iterator_lower_bound(art_iterator_node(iterator), - iterator, key); + return art_iterator_invalid_loc(iterator); } // Since we're only moving up, we can keep comparing against the // iterator key. + art_inner_node_t *inner_node = + (art_inner_node_t *)art_iterator_node(iterator); compare_result = - art_compare_prefix(iterator->key, 0, key, 0, iterator->depth); + art_compare_prefix(iterator->key, 0, key, 0, + iterator->depth + inner_node->prefix_size); } if (compare_result > 0) { return art_node_init_iterator(art_iterator_node(iterator), iterator, @@ -10651,42 +10693,50 @@ art_val_t *art_iterator_erase(art_t *art, art_iterator_t *iterator) { if (iterator->value == NULL) { return NULL; } + art_key_chunk_t initial_key[ART_KEY_BYTES]; + memcpy(initial_key, iterator->key, ART_KEY_BYTES); + art_val_t *value_erased = iterator->value; bool went_up = art_iterator_up(iterator); if (!went_up) { + // We're erasing the root. art->root = NULL; art_iterator_invalid_loc(iterator); return value_erased; } // Erase the leaf. - art_node_t *child_to_replace; - { - art_inner_node_t *node = - (art_inner_node_t *)art_iterator_node(iterator); - art_key_chunk_t key_chunk = - iterator->key[iterator->depth + node->prefix_size]; - child_to_replace = art_node_erase(node, key_chunk); + art_inner_node_t *parent_node = + (art_inner_node_t *)art_iterator_node(iterator); + art_key_chunk_t key_chunk_in_parent = + iterator->key[iterator->depth + parent_node->prefix_size]; + art_node_t *new_parent_node = + art_node_erase(parent_node, key_chunk_in_parent); + + if (new_parent_node != ((art_node_t *)parent_node)) { + // Replace the pointer to the inner node we erased from in its + // parent (it may be a leaf now). + iterator->frames[iterator->frame].node = new_parent_node; + went_up = art_iterator_up(iterator); + if (went_up) { + art_inner_node_t *grandparent_node = + (art_inner_node_t *)art_iterator_node(iterator); + art_key_chunk_t key_chunk_in_grandparent = + iterator->key[iterator->depth + grandparent_node->prefix_size]; + art_replace(grandparent_node, key_chunk_in_grandparent, + new_parent_node); + } else { + // We were already at the rootmost node. + art->root = new_parent_node; + } } - // Replace the pointer to the inner node we erased from in its parent (it - // may be a leaf now). - went_up = art_iterator_up(iterator); - if (went_up) { - art_inner_node_t *node = - (art_inner_node_t *)art_iterator_node(iterator); - art_key_chunk_t key_chunk = - iterator->key[iterator->depth + node->prefix_size]; - art_replace(node, key_chunk, child_to_replace); - } else { - // This node was the rootmost node. - art->root = child_to_replace; - iterator->frames[0].node = child_to_replace; - } - art_key_chunk_t initial_key[ART_KEY_BYTES]; - memcpy(initial_key, iterator->key, ART_KEY_BYTES); - // Search for the first key after the one we erased. - art_iterator_lower_bound(iterator, initial_key); + iterator->frame = 0; + iterator->depth = 0; + // Do a lower bound search for the initial key, which will find the first + // greater key if it exists. This can likely be mildly faster if we instead + // start from the current position. + art_node_iterator_lower_bound(art->root, iterator, initial_key); return value_erased; } @@ -12747,7 +12797,8 @@ bool array_container_validate(const array_container_t *v, const char **reason) { return false; } if (v->cardinality == 0) { - return true; + *reason = "zero cardinality"; + return false; } if (v->array == NULL) { @@ -13854,6 +13905,10 @@ bool bitset_container_validate(const bitset_container_t *v, const char **reason) *reason = "cardinality is incorrect"; return false; } + if (v->cardinality <= DEFAULT_MAX_SIZE) { + *reason = "cardinality is too small for a bitmap container"; + return false; + } // Attempt to forcibly load the first and last words, hopefully causing // a segfault or an address sanitizer error if words is not allocated. volatile uint64_t *words = v->words; @@ -14758,6 +14813,95 @@ bool container_iterator_read_into_uint32(const container_t *c, uint8_t typecode, } } +bool container_iterator_read_into_uint64(const container_t *c, uint8_t typecode, + roaring_container_iterator_t *it, + uint64_t high48, uint64_t *buf, + uint64_t count, uint32_t *consumed, + uint16_t *value_out) { + *consumed = 0; + if (count == 0) { + return false; + } + switch (typecode) { + case BITSET_CONTAINER_TYPE: { + const bitset_container_t *bc = const_CAST_bitset(c); + uint32_t wordindex = it->index / 64; + uint64_t word = + bc->words[wordindex] & (UINT64_MAX << (it->index % 64)); + do { + // Read set bits. + while (word != 0 && *consumed < count) { + *buf = high48 | + (wordindex * 64 + roaring_trailing_zeroes(word)); + word = word & (word - 1); + buf++; + (*consumed)++; + } + // Skip unset bits. + while (word == 0 && + wordindex + 1 < BITSET_CONTAINER_SIZE_IN_WORDS) { + wordindex++; + word = bc->words[wordindex]; + } + } while (word != 0 && *consumed < count); + + if (word != 0) { + it->index = wordindex * 64 + roaring_trailing_zeroes(word); + *value_out = it->index; + return true; + } + return false; + } + case ARRAY_CONTAINER_TYPE: { + const array_container_t *ac = const_CAST_array(c); + uint32_t num_values = + minimum_uint32(ac->cardinality - it->index, count); + for (uint32_t i = 0; i < num_values; i++) { + buf[i] = high48 | ac->array[it->index + i]; + } + *consumed += num_values; + it->index += num_values; + if (it->index < ac->cardinality) { + *value_out = ac->array[it->index]; + return true; + } + return false; + } + case RUN_CONTAINER_TYPE: { + const run_container_t *rc = const_CAST_run(c); + do { + uint32_t largest_run_value = + rc->runs[it->index].value + rc->runs[it->index].length; + uint32_t num_values = minimum_uint32( + largest_run_value - *value_out + 1, count - *consumed); + for (uint32_t i = 0; i < num_values; i++) { + buf[i] = high48 | (*value_out + i); + } + *value_out += num_values; + buf += num_values; + *consumed += num_values; + + // We check for `value == 0` because `it->value += num_values` + // can overflow when `value == UINT16_MAX`, and `count > + // length`. In this case `value` will overflow to 0. + if (*value_out > largest_run_value || *value_out == 0) { + it->index++; + if (it->index < rc->n_runs) { + *value_out = rc->runs[it->index].value; + } else { + return false; + } + } + } while (*consumed < count); + return true; + } + default: + assert(false); + roaring_unreachable; + return 0; + } +} + #ifdef __cplusplus } } } // extern "C" { namespace roaring { namespace internal { #endif @@ -18028,7 +18172,8 @@ bool run_container_validate(const run_container_t *run, const char **reason) { } if (run->n_runs == 0) { - return true; + *reason = "zero run count"; + return false; } if (run->runs == NULL) { *reason = "NULL runs"; @@ -20411,7 +20556,7 @@ bool roaring_iterate64(const roaring_bitmap_t *r, roaring_iterator64 iterator, * 1. Invalid due to `has_value = false`, or * 2. At a container, with the high bits set, `has_value = true`. */ -static bool iter_new_container_partial_init(roaring_uint32_iterator_t *newit) { +CROARING_WARN_UNUSED static bool iter_new_container_partial_init(roaring_uint32_iterator_t *newit) { newit->current_value = 0; if (newit->container_index >= newit->parent->high_low_container.size || newit->container_index < 0) { @@ -20440,7 +20585,7 @@ static bool iter_new_container_partial_init(roaring_uint32_iterator_t *newit) { * Positions the iterator at the first value of the current container that the * iterator points at, if available. */ -static bool loadfirstvalue(roaring_uint32_iterator_t *newit) { +CROARING_WARN_UNUSED static bool loadfirstvalue(roaring_uint32_iterator_t *newit) { if (iter_new_container_partial_init(newit)) { uint16_t value = 0; newit->container_it = @@ -20454,7 +20599,7 @@ static bool loadfirstvalue(roaring_uint32_iterator_t *newit) { * Positions the iterator at the last value of the current container that the * iterator points at, if available. */ -static bool loadlastvalue(roaring_uint32_iterator_t *newit) { +CROARING_WARN_UNUSED static bool loadlastvalue(roaring_uint32_iterator_t *newit) { if (iter_new_container_partial_init(newit)) { uint16_t value = 0; newit->container_it = @@ -20469,16 +20614,18 @@ static bool loadlastvalue(roaring_uint32_iterator_t *newit) { * `val` within the current container that the iterator points at. Assumes such * a value exists within the current container. */ -static bool loadfirstvalue_largeorequal(roaring_uint32_iterator_t *newit, +CROARING_WARN_UNUSED static bool loadfirstvalue_largeorequal(roaring_uint32_iterator_t *newit, uint32_t val) { bool partial_init = iter_new_container_partial_init(newit); assert(partial_init); + if(!partial_init) { return false; } uint16_t value = 0; newit->container_it = container_init_iterator(newit->container, newit->typecode, &value); bool found = container_iterator_lower_bound( newit->container, newit->typecode, &newit->container_it, &value, val & 0xFFFF); assert(found); + if(!found) { return false; } newit->current_value = newit->highbits | value; return true; } @@ -22129,6 +22276,9 @@ bool roaring_bitmap_to_bitset(const roaring_bitmap_t *r, bitset_t * bitset) { #include #include +// For serialization / deserialization +// containers.h last to avoid conflict with ROARING_CONTAINER_T. + #ifdef __cplusplus using namespace ::roaring::internal; @@ -22137,12 +22287,8 @@ namespace roaring { namespace api { #endif -// TODO: Iteration. -// * Need to create a container iterator which can be used across 32 and 64 bit -// bitmaps. -// * Iteration-based functions like roaring64_bitmap_intersect_with_range. +// TODO: Iteration-based functions like roaring64_bitmap_intersect_with_range. // TODO: Copy on write. -// TODO: Serialization. // TODO: Error on failed allocation. typedef struct roaring64_bitmap_s { @@ -22161,6 +22307,17 @@ typedef struct roaring64_leaf_s { // anyway. typedef struct roaring64_leaf_s leaf_t; +// Iterator struct to hold iteration state. +typedef struct roaring64_iterator_s { + const roaring64_bitmap_t *parent; + art_iterator_t art_it; + roaring_container_iterator_t container_it; + uint64_t high48; // Key that art_it points to. + + uint64_t value; + bool has_value; +} roaring64_iterator_t; + // Splits the given uint64 key into high 48 bit and low 16 bit components. // Expects high48_out to be of length ART_KEY_BYTES. static inline uint16_t split_key(uint64_t key, uint8_t high48_out[]) { @@ -22802,18 +22959,6 @@ bool roaring64_bitmap_run_optimize(roaring64_bitmap_t *r) { return has_run_container; } -size_t roaring64_bitmap_size_in_bytes(const roaring64_bitmap_t *r) { - size_t size = art_size_in_bytes(&r->art); - art_iterator_t it = art_init_iterator(&r->art, /*first=*/true); - while (it.value != NULL) { - leaf_t *leaf = (leaf_t *)it.value; - size += sizeof(leaf_t); - size += container_size_in_bytes(leaf->container, leaf->typecode); - art_iterator_next(&it); - } - return size; -} - bool roaring64_bitmap_equals(const roaring64_bitmap_t *r1, const roaring64_bitmap_t *r2) { art_iterator_t it1 = art_init_iterator(&r1->art, /*first=*/true); @@ -23404,13 +23549,18 @@ void roaring64_bitmap_andnot_inplace(roaring64_bitmap_t *r1, container2 = container_andnot( leaf1->container, leaf1->typecode, leaf2->container, leaf2->typecode, &typecode2); + if (container2 != container1) { + // We only free when doing container_andnot, not + // container_iandnot, as iandnot frees the original + // internally. + container_free(container1, typecode1); + } } else { container2 = container_iandnot( leaf1->container, leaf1->typecode, leaf2->container, leaf2->typecode, &typecode2); } if (container2 != container1) { - container_free(container1, typecode1); leaf1->container = container2; leaf1->typecode = typecode2; } @@ -23437,6 +23587,283 @@ void roaring64_bitmap_andnot_inplace(roaring64_bitmap_t *r1, } } +// Returns the number of distinct high 32-bit entries in the bitmap. +static inline uint64_t count_high32(const roaring64_bitmap_t *r) { + art_iterator_t it = art_init_iterator(&r->art, /*first=*/true); + uint64_t high32_count = 0; + uint32_t prev_high32; + while (it.value != NULL) { + uint32_t current_high32 = (uint32_t)(combine_key(it.key, 0) >> 32); + if (high32_count == 0 || prev_high32 != current_high32) { + high32_count++; + prev_high32 = current_high32; + } + art_iterator_next(&it); + } + return high32_count; +} + +// Frees the (32-bit!) bitmap without freeing the containers. +static inline void roaring_bitmap_free_without_containers(roaring_bitmap_t *r) { + ra_clear_without_containers(&r->high_low_container); + roaring_free(r); +} + +size_t roaring64_bitmap_portable_size_in_bytes(const roaring64_bitmap_t *r) { + // https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations + size_t size = 0; + + // Write as uint64 the distinct number of "buckets", where a bucket is + // defined as the most significant 32 bits of an element. + uint64_t high32_count; + size += sizeof(high32_count); + + art_iterator_t it = art_init_iterator(&r->art, /*first=*/true); + bool first = true; + uint32_t prev_high32; + roaring_bitmap_t *bitmap32 = NULL; + + // Iterate through buckets ordered by increasing keys. + while (it.value != NULL) { + uint32_t current_high32 = (uint32_t)(combine_key(it.key, 0) >> 32); + if (first || prev_high32 != current_high32) { + if (!first) { + // Write as uint32 the most significant 32 bits of the bucket. + size += sizeof(prev_high32); + + // Write the 32-bit Roaring bitmaps representing the least + // significant bits of a set of elements. + size += roaring_bitmap_portable_size_in_bytes(bitmap32); + roaring_bitmap_free_without_containers(bitmap32); + } + + // Start a new 32-bit bitmap with the current high 32 bits. + art_iterator_t it2 = it; + uint32_t containers_with_high32 = 0; + while (it2.value != NULL && + (uint32_t)combine_key(it2.key, 0) == current_high32) { + containers_with_high32++; + art_iterator_next(&it2); + } + bitmap32 = + roaring_bitmap_create_with_capacity(containers_with_high32); + + first = false; + prev_high32 = current_high32; + } + leaf_t *leaf = (leaf_t *)it.value; + ra_append(&bitmap32->high_low_container, + (uint16_t)(current_high32 >> 16), leaf->container, + leaf->typecode); + art_iterator_next(&it); + } + + if (bitmap32 != NULL) { + // Write as uint32 the most significant 32 bits of the bucket. + size += sizeof(prev_high32); + + // Write the 32-bit Roaring bitmaps representing the least + // significant bits of a set of elements. + size += roaring_bitmap_portable_size_in_bytes(bitmap32); + roaring_bitmap_free_without_containers(bitmap32); + } + + return size; +} + +size_t roaring64_bitmap_portable_serialize(const roaring64_bitmap_t *r, + char *buf) { + // https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations + if (buf == NULL) { + return 0; + } + char *initial_buf = buf; + + // Write as uint64 the distinct number of "buckets", where a bucket is + // defined as the most significant 32 bits of an element. + uint64_t high32_count = count_high32(r); + memcpy(buf, &high32_count, sizeof(high32_count)); + buf += sizeof(high32_count); + + art_iterator_t it = art_init_iterator(&r->art, /*first=*/true); + bool first = true; + uint32_t prev_high32; + roaring_bitmap_t *bitmap32 = NULL; + + // Iterate through buckets ordered by increasing keys. + while (it.value != NULL) { + uint64_t current_high48 = combine_key(it.key, 0); + uint32_t current_high32 = (uint32_t)(current_high48 >> 32); + if (first || prev_high32 != current_high32) { + if (!first) { + // Write as uint32 the most significant 32 bits of the bucket. + memcpy(buf, &prev_high32, sizeof(prev_high32)); + buf += sizeof(prev_high32); + + // Write the 32-bit Roaring bitmaps representing the least + // significant bits of a set of elements. + buf += roaring_bitmap_portable_serialize(bitmap32, buf); + roaring_bitmap_free_without_containers(bitmap32); + } + + // Start a new 32-bit bitmap with the current high 32 bits. + art_iterator_t it2 = it; + uint32_t containers_with_high32 = 0; + while (it2.value != NULL && + (uint32_t)combine_key(it2.key, 0) == current_high32) { + containers_with_high32++; + art_iterator_next(&it2); + } + bitmap32 = + roaring_bitmap_create_with_capacity(containers_with_high32); + + first = false; + prev_high32 = current_high32; + } + leaf_t *leaf = (leaf_t *)it.value; + ra_append(&bitmap32->high_low_container, + (uint16_t)(current_high48 >> 16), leaf->container, + leaf->typecode); + art_iterator_next(&it); + } + + if (bitmap32 != NULL) { + // Write as uint32 the most significant 32 bits of the bucket. + memcpy(buf, &prev_high32, sizeof(prev_high32)); + buf += sizeof(prev_high32); + + // Write the 32-bit Roaring bitmaps representing the least + // significant bits of a set of elements. + buf += roaring_bitmap_portable_serialize(bitmap32, buf); + roaring_bitmap_free_without_containers(bitmap32); + } + + return buf - initial_buf; +} + +size_t roaring64_bitmap_portable_deserialize_size(const char *buf, + size_t maxbytes) { + // https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations + if (buf == NULL) { + return 0; + } + const char *maxbuf; + if (UINTPTR_MAX - maxbytes < (uintptr_t)buf) { + maxbuf = (const char *)UINTPTR_MAX; + } else { + maxbuf = buf + maxbytes; + } + const char *initial_buf = buf; + + // Read as uint64 the distinct number of "buckets", where a bucket is + // defined as the most significant 32 bits of an element. + uint64_t buckets; + if (buf + sizeof(buckets) > maxbuf) { + return 0; + } + memcpy(&buckets, buf, sizeof(buckets)); + buf += sizeof(buckets); + + // Buckets should be 32 bits with 4 bits of zero padding. + if (buckets > UINT32_MAX) { + return 0; + } + + // Iterate through buckets ordered by increasing keys. + for (uint64_t bucket = 0; bucket < buckets; ++bucket) { + // Read as uint32 the most significant 32 bits of the bucket. + uint32_t high32; + if (buf + sizeof(high32) > maxbuf) { + return 0; + } + buf += sizeof(high32); + + // Read the 32-bit Roaring bitmaps representing the least significant + // bits of a set of elements. + size_t bitmap32_size = + roaring_bitmap_portable_deserialize_size(buf, maxbuf - buf); + if (bitmap32_size == 0) { + return 0; + } + buf += bitmap32_size; + } + return buf - initial_buf; +} + +roaring64_bitmap_t *roaring64_bitmap_portable_deserialize_safe( + const char *buf, size_t maxbytes) { + // https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations + if (buf == NULL) { + return NULL; + } + const char *maxbuf; + if (UINTPTR_MAX - maxbytes < (uintptr_t)buf) { + maxbuf = (const char *)UINTPTR_MAX; + } else { + maxbuf = buf + maxbytes; + } + + // Read as uint64 the distinct number of "buckets", where a bucket is + // defined as the most significant 32 bits of an element. + uint64_t buckets; + if (buf + sizeof(buckets) > maxbuf) { + return NULL; + } + memcpy(&buckets, buf, sizeof(buckets)); + buf += sizeof(buckets); + + // Buckets should be 32 bits with 4 bits of zero padding. + if (buckets > UINT32_MAX) { + return NULL; + } + + roaring64_bitmap_t *r = roaring64_bitmap_create(); + // Iterate through buckets ordered by increasing keys. + for (uint64_t bucket = 0; bucket < buckets; ++bucket) { + // Read as uint32 the most significant 32 bits of the bucket. + uint32_t high32; + if (buf + sizeof(high32) > maxbuf) { + return NULL; + } + memcpy(&high32, buf, sizeof(high32)); + buf += sizeof(high32); + + // Read the 32-bit Roaring bitmaps representing the least significant + // bits of a set of elements. + size_t bitmap32_size = + roaring_bitmap_portable_deserialize_size(buf, maxbuf - buf); + if (bitmap32_size == 0) { + return NULL; + } + + roaring_bitmap_t *bitmap32 = + roaring_bitmap_portable_deserialize_safe(buf, maxbuf - buf); + if (bitmap32 == NULL) { + return NULL; + } + buf += bitmap32_size; + + // Insert all containers of the 32-bit bitmap into the 64-bit bitmap. + uint32_t r32_size = ra_get_size(&bitmap32->high_low_container); + for (size_t i = 0; i < r32_size; ++i) { + uint16_t key16 = + ra_get_key_at_index(&bitmap32->high_low_container, (uint16_t)i); + uint8_t typecode; + container_t *container = ra_get_container_at_index( + &bitmap32->high_low_container, (uint16_t)i, &typecode); + + uint64_t high48_bits = + (((uint64_t)high32) << 32) | (((uint64_t)key16) << 16); + uint8_t high48[ART_KEY_BYTES]; + split_key(high48_bits, high48); + leaf_t *leaf = create_leaf(container, typecode); + art_insert(&r->art, high48, (art_val_t *)leaf); + } + roaring_bitmap_free_without_containers(bitmap32); + } + return r; +} + bool roaring64_bitmap_iterate(const roaring64_bitmap_t *r, roaring_iterator64 iterator, void *ptr) { art_iterator_t it = art_init_iterator(&r->art, /*first=*/true); @@ -23454,6 +23881,182 @@ bool roaring64_bitmap_iterate(const roaring64_bitmap_t *r, return true; } +static inline bool roaring64_iterator_init_at_leaf_first( + roaring64_iterator_t *it) { + it->high48 = combine_key(it->art_it.key, 0); + leaf_t *leaf = (leaf_t *)it->art_it.value; + uint16_t low16 = 0; + it->container_it = + container_init_iterator(leaf->container, leaf->typecode, &low16); + it->value = it->high48 | low16; + return (it->has_value = true); +} + +static inline bool roaring64_iterator_init_at_leaf_last( + roaring64_iterator_t *it) { + it->high48 = combine_key(it->art_it.key, 0); + leaf_t *leaf = (leaf_t *)it->art_it.value; + uint16_t low16 = 0; + it->container_it = + container_init_iterator_last(leaf->container, leaf->typecode, &low16); + it->value = it->high48 | low16; + return (it->has_value = true); +} + +static inline roaring64_iterator_t *roaring64_iterator_init_at( + const roaring64_bitmap_t *r, roaring64_iterator_t *it, bool first) { + it->parent = r; + it->art_it = art_init_iterator(&r->art, first); + it->has_value = it->art_it.value != NULL; + if (it->has_value) { + if (first) { + roaring64_iterator_init_at_leaf_first(it); + } else { + roaring64_iterator_init_at_leaf_last(it); + } + } + return it; +} + +roaring64_iterator_t *roaring64_iterator_create(const roaring64_bitmap_t *r) { + roaring64_iterator_t *it = + (roaring64_iterator_t *)roaring_malloc(sizeof(roaring64_iterator_t)); + return roaring64_iterator_init_at(r, it, /*first=*/true); +} + +roaring64_iterator_t *roaring64_iterator_create_last( + const roaring64_bitmap_t *r) { + roaring64_iterator_t *it = + (roaring64_iterator_t *)roaring_malloc(sizeof(roaring64_iterator_t)); + return roaring64_iterator_init_at(r, it, /*first=*/false); +} + +void roaring64_iterator_reinit(const roaring64_bitmap_t *r, + roaring64_iterator_t *it) { + roaring64_iterator_init_at(r, it, /*first=*/true); +} + +void roaring64_iterator_reinit_last(const roaring64_bitmap_t *r, + roaring64_iterator_t *it) { + roaring64_iterator_init_at(r, it, /*first=*/false); +} + +roaring64_iterator_t *roaring64_iterator_copy(const roaring64_iterator_t *it) { + roaring64_iterator_t *new_it = + (roaring64_iterator_t *)roaring_malloc(sizeof(roaring64_iterator_t)); + memcpy(new_it, it, sizeof(*it)); + return new_it; +} + +void roaring64_iterator_free(roaring64_iterator_t *it) { roaring_free(it); } + +bool roaring64_iterator_has_value(const roaring64_iterator_t *it) { + return it->has_value; +} + +uint64_t roaring64_iterator_value(const roaring64_iterator_t *it) { + return it->value; +} + +bool roaring64_iterator_advance(roaring64_iterator_t *it) { + if (it->art_it.value == NULL) { + return (it->has_value = false); + } + leaf_t *leaf = (leaf_t *)it->art_it.value; + uint16_t low16 = (uint16_t)it->value; + if (container_iterator_next(leaf->container, leaf->typecode, + &it->container_it, &low16)) { + it->value = it->high48 | low16; + return (it->has_value = true); + } + if (!art_iterator_next(&it->art_it)) { + return (it->has_value = false); + } + return roaring64_iterator_init_at_leaf_first(it); +} + +bool roaring64_iterator_previous(roaring64_iterator_t *it) { + if (it->art_it.value == NULL) { + return (it->has_value = false); + } + leaf_t *leaf = (leaf_t *)it->art_it.value; + uint16_t low16 = (uint16_t)it->value; + if (container_iterator_prev(leaf->container, leaf->typecode, + &it->container_it, &low16)) { + it->value = it->high48 | low16; + return (it->has_value = true); + } + if (!art_iterator_prev(&it->art_it)) { + return (it->has_value = false); + } + return roaring64_iterator_init_at_leaf_last(it); +} + +bool roaring64_iterator_move_equalorlarger(roaring64_iterator_t *it, + uint64_t val) { + if (it->art_it.value == NULL) { + return (it->has_value = false); + } + + uint8_t val_high48[ART_KEY_BYTES]; + uint16_t val_low16 = split_key(val, val_high48); + if (it->high48 < (val & 0xFFFFFFFFFFFF0000)) { + // The ART iterator is before the high48 bits of `val`, so we need to + // move to a leaf with a key equal or greater. + if (!art_iterator_lower_bound(&it->art_it, val_high48)) { + // Only smaller keys found. + return (it->has_value = false); + } + } + + if (it->high48 == (val & 0xFFFFFFFFFFFF0000)) { + // We're at equal high bits, check if a suitable value can be found in + // this container. + leaf_t *leaf = (leaf_t *)it->art_it.value; + uint16_t low16 = (uint16_t)it->value; + if (container_iterator_lower_bound(leaf->container, leaf->typecode, + &it->container_it, &low16, + val_low16)) { + it->value = it->high48 | low16; + return (it->has_value = true); + } + // Only smaller entries in this container, move to the next. + if (!art_iterator_next(&it->art_it)) { + return (it->has_value = false); + } + } + + // We're at a leaf with high bits greater than `val`, so the first entry in + // this container is our result. + return roaring64_iterator_init_at_leaf_first(it); +} + +uint64_t roaring64_iterator_read(roaring64_iterator_t *it, uint64_t *buf, + uint64_t count) { + uint64_t consumed = 0; + while (it->has_value && consumed < count) { + uint32_t container_consumed; + leaf_t *leaf = (leaf_t *)it->art_it.value; + uint16_t low16 = (uint16_t)it->value; + bool has_value = container_iterator_read_into_uint64( + leaf->container, leaf->typecode, &it->container_it, it->high48, buf, + count - consumed, &container_consumed, &low16); + consumed += container_consumed; + buf += container_consumed; + if (has_value) { + it->has_value = true; + it->value = it->high48 | low16; + assert(consumed == count); + return consumed; + } + it->has_value = art_iterator_next(&it->art_it); + if (it->has_value) { + roaring64_iterator_init_at_leaf_first(it); + } + } + return consumed; +} + #ifdef __cplusplus } // extern "C" } // namespace roaring diff --git a/croaring-sys/CRoaring/roaring.h b/croaring-sys/CRoaring/roaring.h index 87d6157..d87fcbb 100644 --- a/croaring-sys/CRoaring/roaring.h +++ b/croaring-sys/CRoaring/roaring.h @@ -1,5 +1,5 @@ // !!! DO NOT EDIT - THIS IS AN AUTO-GENERATED FILE !!! -// Created by amalgamation.sh on 2024-01-14T07:28:05Z +// Created by amalgamation.sh on 2024-01-20T09:04:13Z /* * The CRoaring project is under a dual license (Apache/MIT). @@ -423,9 +423,9 @@ inline int roaring_leading_zeroes(unsigned long long input_num) { return __built #endif #if defined(__GNUC__) || defined(__clang__) -#define WARN_UNUSED __attribute__((warn_unused_result)) +#define CROARING_WARN_UNUSED __attribute__((warn_unused_result)) #else -#define WARN_UNUSED +#define CROARING_WARN_UNUSED #endif #define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100) @@ -1600,6 +1600,10 @@ roaring_bitmap_t *roaring_bitmap_portable_deserialize(const char *buf); * run containers should be in sorted non-overlapping order. This is is guaranteed to * happen when serializing an existing bitmap, but not for random inputs. * + * You may use roaring_bitmap_internal_validate to check the validity of the bitmap prior + * to using it. You may also use other strategies to check for corrupted inputs (e.g., + * checksums). + * * This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x), * the data format is going to be big-endian and not compatible with little-endian systems. */ @@ -1888,6 +1892,9 @@ void roaring_bitmap_statistics(const roaring_bitmap_t *r, /** * Perform internal consistency checks. Returns true if the bitmap is consistent. + * It may be useful to call this after deserializing bitmaps from untrusted sources. + * If roaring_bitmap_internal_validate returns true, then the bitmap should be consistent + * and can be trusted not to cause crashes or memory corruption. * * Note that some operations intentionally leave bitmaps in an inconsistent state temporarily, * for example, `roaring_bitmap_lazy_*` functions, until `roaring_bitmap_repair_after_lazy` is called. @@ -2089,6 +2096,7 @@ namespace api { typedef struct roaring64_bitmap_s roaring64_bitmap_t; typedef struct roaring64_leaf_s roaring64_leaf_t; +typedef struct roaring64_iterator_s roaring64_iterator_t; /** * A bit of context usable with `roaring64_bitmap_*_bulk()` functions. @@ -2311,12 +2319,6 @@ uint64_t roaring64_bitmap_maximum(const roaring64_bitmap_t *r); */ bool roaring64_bitmap_run_optimize(roaring64_bitmap_t *r); -/** - * Returns the in-memory size of the bitmap. - * TODO: Return the serialized size. - */ -size_t roaring64_bitmap_size_in_bytes(const roaring64_bitmap_t *r); - /** * Return true if the two bitmaps contain the same elements. */ @@ -2438,6 +2440,64 @@ uint64_t roaring64_bitmap_andnot_cardinality(const roaring64_bitmap_t *r1, void roaring64_bitmap_andnot_inplace(roaring64_bitmap_t *r1, const roaring64_bitmap_t *r2); +/** + * How many bytes are required to serialize this bitmap. + * + * This is meant to be compatible with other languages: + * https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations + */ +size_t roaring64_bitmap_portable_size_in_bytes(const roaring64_bitmap_t *r); + +/** + * Write a bitmap to a buffer. The output buffer should refer to at least + * `roaring64_bitmap_portable_size_in_bytes(r)` bytes of allocated memory. + * + * Returns how many bytes were written, which should match + * `roaring64_bitmap_portable_size_in_bytes(r)`. + * + * This is meant to be compatible with other languages: + * https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations + * + * This function is endian-sensitive. If you have a big-endian system (e.g., a + * mainframe IBM s390x), the data format is going to be big-endian and not + * compatible with little-endian systems. + */ +size_t roaring64_bitmap_portable_serialize(const roaring64_bitmap_t *r, + char *buf); +/** + * Check how many bytes would be read (up to maxbytes) at this pointer if there + * is a valid bitmap, returns zero if there is no valid bitmap. + * + * This is meant to be compatible with other languages + * https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations + */ +size_t roaring64_bitmap_portable_deserialize_size(const char *buf, + size_t maxbytes); + +/** + * Read a bitmap from a serialized buffer safely (reading up to maxbytes). + * In case of failure, NULL is returned. + * + * This is meant to be compatible with other languages + * https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations + * + * The function itself is safe in the sense that it will not cause buffer + * overflows. However, for correct operations, it is assumed that the bitmap + * read was once serialized from a valid bitmap (i.e., it follows the format + * specification). If you provided an incorrect input (garbage), then the bitmap + * read may not be in a valid state and following operations may not lead to + * sensible results. In particular, the serialized array containers need to be + * in sorted order, and the run containers should be in sorted non-overlapping + * order. This is is guaranteed to happen when serializing an existing bitmap, + * but not for random inputs. + * + * This function is endian-sensitive. If you have a big-endian system (e.g., a + * mainframe IBM s390x), the data format is going to be big-endian and not + * compatible with little-endian systems. + */ +roaring64_bitmap_t *roaring64_bitmap_portable_deserialize_safe(const char *buf, + size_t maxbytes); + /** * Iterate over the bitmap elements. The function `iterator` is called once for * all the values with `ptr` (can be NULL) as the second parameter of each call. @@ -2454,6 +2514,106 @@ void roaring64_bitmap_andnot_inplace(roaring64_bitmap_t *r1, bool roaring64_bitmap_iterate(const roaring64_bitmap_t *r, roaring_iterator64 iterator, void *ptr); +/** + * Create an iterator object that can be used to iterate through the values. + * Caller is responsible for calling `roaring64_iterator_free()`. + * + * The iterator is initialized. If there is a value, then this iterator points + * to the first value and `roaring64_iterator_has_value()` returns true. The + * value can be retrieved with `roaring64_iterator_value()`. + */ +roaring64_iterator_t *roaring64_iterator_create(const roaring64_bitmap_t *r); + +/** + * Create an iterator object that can be used to iterate through the values. + * Caller is responsible for calling `roaring64_iterator_free()`. + * + * The iterator is initialized. If there is a value, then this iterator points + * to the last value and `roaring64_iterator_has_value()` returns true. The + * value can be retrieved with `roaring64_iterator_value()`. + */ +roaring64_iterator_t *roaring64_iterator_create_last( + const roaring64_bitmap_t *r); + +/** + * Re-initializes an existing iterator. Functionally the same as + * `roaring64_iterator_create` without a allocation. + */ +void roaring64_iterator_reinit(const roaring64_bitmap_t *r, + roaring64_iterator_t *it); + +/** + * Re-initializes an existing iterator. Functionally the same as + * `roaring64_iterator_create_last` without a allocation. + */ +void roaring64_iterator_reinit_last(const roaring64_bitmap_t *r, + roaring64_iterator_t *it); + +/** + * Creates a copy of the iterator. Caller is responsible for calling + * `roaring64_iterator_free()` on the resulting iterator. + */ +roaring64_iterator_t *roaring64_iterator_copy(const roaring64_iterator_t *it); + +/** + * Free the iterator. + */ +void roaring64_iterator_free(roaring64_iterator_t *it); + +/** + * Returns true if the iterator currently points to a value. If so, calling + * `roaring64_iterator_value()` returns the value. + */ +bool roaring64_iterator_has_value(const roaring64_iterator_t *it); + +/** + * Returns the value the iterator currently points to. Should only be called if + * `roaring64_iterator_has_value()` returns true. + */ +uint64_t roaring64_iterator_value(const roaring64_iterator_t *it); + +/** + * Advance the iterator. If there is a new value, then + * `roaring64_iterator_has_value()` returns true. Values are traversed in + * increasing order. For convenience, returns the result of + * `roaring64_iterator_has_value()`. + * + * Once this returns false, `roaring64_iterator_advance` should not be called on + * the iterator again. Calling `roaring64_iterator_previous` is allowed. + */ +bool roaring64_iterator_advance(roaring64_iterator_t *it); + +/** + * Decrement the iterator. If there is a new value, then + * `roaring64_iterator_has_value()` returns true. Values are traversed in + * decreasing order. For convenience, returns the result of + * `roaring64_iterator_has_value()`. + * + * Once this returns false, `roaring64_iterator_previous` should not be called + * on the iterator again. Calling `roaring64_iterator_advance` is allowed. + */ +bool roaring64_iterator_previous(roaring64_iterator_t *it); + +/** + * Move the iterator to the first value greater than or equal to `val`, if it + * exists at or after the current position of the iterator. If there is a new + * value, then `roaring64_iterator_has_value()` returns true. Values are + * traversed in increasing order. For convenience, returns the result of + * `roaring64_iterator_has_value()`. + */ +bool roaring64_iterator_move_equalorlarger(roaring64_iterator_t *it, + uint64_t val); + +/** + * Reads up to `count` values from the iterator into the given `buf`. Returns + * the number of elements read. The number of elements read can be smaller than + * `count`, which means that there are no more elements in the bitmap. + * + * This function can be used together with other iterator functions. + */ +uint64_t roaring64_iterator_read(roaring64_iterator_t *it, uint64_t *buf, + uint64_t count); + #ifdef __cplusplus } // extern "C" } // namespace roaring diff --git a/croaring-sys/CRoaring/roaring.hh b/croaring-sys/CRoaring/roaring.hh index 474c838..361d2ea 100644 --- a/croaring-sys/CRoaring/roaring.hh +++ b/croaring-sys/CRoaring/roaring.hh @@ -1,5 +1,5 @@ // !!! DO NOT EDIT - THIS IS AN AUTO-GENERATED FILE !!! -// Created by amalgamation.sh on 2024-01-14T07:28:05Z +// Created by amalgamation.sh on 2024-01-20T09:04:13Z /* * The CRoaring project is under a dual license (Apache/MIT).