Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cpp/src/parquet/bloom_filter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
namespace parquet {
constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock];

BlockSplitBloomFilter::BlockSplitBloomFilter()
: pool_(::arrow::default_memory_pool()),
BlockSplitBloomFilter::BlockSplitBloomFilter(::arrow::MemoryPool* pool)
: pool_(pool),
hash_strategy_(HashStrategy::XXHASH),
algorithm_(Algorithm::BLOCK),
compression_strategy_(CompressionStrategy::UNCOMPRESSED) {}
Expand Down
18 changes: 17 additions & 1 deletion cpp/src/parquet/bloom_filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,10 @@ class PARQUET_EXPORT BloomFilter {
class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter {
public:
/// The constructor of BlockSplitBloomFilter. It uses XXH64 as hash function.
BlockSplitBloomFilter();
///
/// \param pool memory pool to use.
explicit BlockSplitBloomFilter(
::arrow::MemoryPool* pool = ::arrow::default_memory_pool());

/// Initialize the BlockSplitBloomFilter. The range of num_bytes should be within
/// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes], it will be
Expand Down Expand Up @@ -152,6 +155,19 @@ class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter {
/// @param fpp The false positive probability.
/// @return it always returns a value between kMinimumBloomFilterBytes and
/// kMaximumBloomFilterBytes, and the return value is always a power of 2
static uint32_t OptimalNumOfBytes(uint32_t ndv, double fpp) {
  // Size the filter in bits first, then express that size in whole bytes.
  const uint32_t num_bits = OptimalNumOfBits(ndv, fpp);
  // Dividing by 8 below is only exact when the bit count is a multiple of 8.
  DCHECK(::arrow::bit_util::IsMultipleOf8(num_bits));
  return num_bits / 8;
}

/// Calculate optimal size according to the number of distinct values and false
/// positive probability.
///
/// @param ndv The number of distinct values.
/// @param fpp The false positive probability.
/// @return it always returns a value between kMinimumBloomFilterBytes * 8 and
/// kMaximumBloomFilterBytes * 8, and the return value is always a power of 2
static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) {
DCHECK(fpp > 0.0 && fpp < 1.0);
const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8));
Expand Down
61 changes: 32 additions & 29 deletions cpp/src/parquet/bloom_filter_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ TEST(FPPTest, TestBloomFilter) {

std::vector<std::string> members;
BlockSplitBloomFilter bloom_filter;
bloom_filter.Init(BlockSplitBloomFilter::OptimalNumOfBits(total_count, fpp));
bloom_filter.Init(BlockSplitBloomFilter::OptimalNumOfBytes(total_count, fpp));

// Insert elements into the Bloom filter
for (int i = 0; i < total_count; i++) {
Expand Down Expand Up @@ -249,36 +249,39 @@ TEST(CompatibilityTest, TestBloomFilter) {
// It is also used to test whether OptimalNumOfBits returns a value within
// [kMinimumBloomFilterBytes * 8, kMaximumBloomFilterBytes * 8].
TEST(OptimalValueTest, TestBloomFilter) {
EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBits(256, 0.01), UINT32_C(4096));
EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBits(512, 0.01), UINT32_C(8192));
EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBits(1024, 0.01), UINT32_C(16384));
EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBits(2048, 0.01), UINT32_C(32768));

EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBits(200, 0.01), UINT32_C(2048));
EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBits(300, 0.01), UINT32_C(4096));
EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBits(700, 0.01), UINT32_C(8192));
EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBits(1500, 0.01), UINT32_C(16384));

EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBits(200, 0.025), UINT32_C(2048));
EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBits(300, 0.025), UINT32_C(4096));
EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBits(700, 0.025), UINT32_C(8192));
EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBits(1500, 0.025), UINT32_C(16384));

EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBits(200, 0.05), UINT32_C(2048));
EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBits(300, 0.05), UINT32_C(4096));
EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBits(700, 0.05), UINT32_C(8192));
EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBits(1500, 0.05), UINT32_C(16384));
auto testOptimalNumEstimation = [](uint32_t ndv, double fpp, uint32_t num_bits) {
EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBits(ndv, fpp), num_bits);
EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBytes(ndv, fpp), num_bits / 8);
};

testOptimalNumEstimation(256, 0.01, UINT32_C(4096));
testOptimalNumEstimation(512, 0.01, UINT32_C(8192));
testOptimalNumEstimation(1024, 0.01, UINT32_C(16384));
testOptimalNumEstimation(2048, 0.01, UINT32_C(32768));

testOptimalNumEstimation(200, 0.01, UINT32_C(2048));
testOptimalNumEstimation(300, 0.01, UINT32_C(4096));
testOptimalNumEstimation(700, 0.01, UINT32_C(8192));
testOptimalNumEstimation(1500, 0.01, UINT32_C(16384));

testOptimalNumEstimation(200, 0.025, UINT32_C(2048));
testOptimalNumEstimation(300, 0.025, UINT32_C(4096));
testOptimalNumEstimation(700, 0.025, UINT32_C(8192));
testOptimalNumEstimation(1500, 0.025, UINT32_C(16384));

testOptimalNumEstimation(200, 0.05, UINT32_C(2048));
testOptimalNumEstimation(300, 0.05, UINT32_C(4096));
testOptimalNumEstimation(700, 0.05, UINT32_C(8192));
testOptimalNumEstimation(1500, 0.05, UINT32_C(16384));

// Boundary check
EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBits(4, 0.01), UINT32_C(256));
EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBits(4, 0.25), UINT32_C(256));

EXPECT_EQ(
BlockSplitBloomFilter::OptimalNumOfBits(std::numeric_limits<uint32_t>::max(), 0.01),
UINT32_C(1073741824));
EXPECT_EQ(
BlockSplitBloomFilter::OptimalNumOfBits(std::numeric_limits<uint32_t>::max(), 0.25),
UINT32_C(1073741824));
testOptimalNumEstimation(4, 0.01, BlockSplitBloomFilter::kMinimumBloomFilterBytes * 8);
testOptimalNumEstimation(4, 0.25, BlockSplitBloomFilter::kMinimumBloomFilterBytes * 8);

testOptimalNumEstimation(std::numeric_limits<uint32_t>::max(), 0.01,
BlockSplitBloomFilter::kMaximumBloomFilterBytes * 8);
testOptimalNumEstimation(std::numeric_limits<uint32_t>::max(), 0.25,
BlockSplitBloomFilter::kMaximumBloomFilterBytes * 8);
}

// The test below is plainly copied from parquet-mr and serves as a basic sanity
Expand Down