From 55e851629784ca633880e88427a605916d65c5a6 Mon Sep 17 00:00:00 2001 From: ForeverAngry <61765732+ForeverAngry@users.noreply.github.com> Date: Fri, 7 Nov 2025 12:58:41 -0500 Subject: [PATCH] feat: Add utilities for reading Parquet bloom filters Add utility functions to read and check bloom filters directly from Parquet files using PyArrow, without requiring Iceberg spec changes. - get_parquet_bloom_filter_for_column(): Extract bloom filter from Parquet row group - bloom_filter_might_contain(): Check if value might be in bloom filter This provides foundation for future bloom filter integration without modifying the Iceberg manifest specification. --- pyiceberg/table/bloom_filter.py | 79 ++++++++++++++++++++++++++++++++ tests/table/test_bloom_filter.py | 78 +++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 pyiceberg/table/bloom_filter.py create mode 100644 tests/table/test_bloom_filter.py diff --git a/pyiceberg/table/bloom_filter.py b/pyiceberg/table/bloom_filter.py new file mode 100644 index 0000000000..b8fb56c8a6 --- /dev/null +++ b/pyiceberg/table/bloom_filter.py @@ -0,0 +1,79 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
"""Bloom filter support for reading from Parquet files."""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    import pyarrow.parquet as pq

logger = logging.getLogger(__name__)


def get_parquet_bloom_filter_for_column(parquet_file: pq.ParquetFile, column_name: str, row_group_index: int) -> Any | None:
    """Extract bloom filter for a specific column from a Parquet row group.

    The lookup is best-effort: any failure while walking the metadata (for
    example an out-of-range row group index) is logged at debug level and
    reported as "no bloom filter" instead of being raised.

    Args:
        parquet_file: PyArrow ParquetFile object.
        column_name: Name of the column to get bloom filter for. It is compared
            against each column chunk's ``path_in_schema``.
        row_group_index: Index of the row group.

    Returns:
        The ``bloom_filter`` attribute of the first column chunk whose
        ``path_in_schema`` equals ``column_name``; None when no column matches,
        when the matching chunk has no such attribute, or when reading the
        metadata fails.
    """
    # NOTE(review): this relies on the column chunk metadata exposing a
    # ``bloom_filter`` attribute; whether the installed PyArrow version does so
    # is not visible from this module -- confirm against the PyArrow API.
    try:
        row_group = parquet_file.metadata.row_group(row_group_index)

        for i in range(row_group.num_columns):
            column = row_group.column(i)
            if column.path_in_schema == column_name:
                # Only the first matching column is considered, even when it
                # carries no bloom filter.
                return getattr(column, "bloom_filter", None)

        return None
    except Exception:
        # Deliberately broad: a bloom filter is only an optimisation, so a read
        # failure must never break the caller. Leave a trace for debugging.
        logger.debug(
            "Could not read bloom filter for column %r in row group %r",
            column_name,
            row_group_index,
            exc_info=True,
        )
        return None


def bloom_filter_might_contain(bloom_filter: Any, value: Any) -> bool:
    """Check if a Parquet bloom filter might contain a value.

    The filter object is duck-typed: a ``check(value)`` method is preferred,
    then the ``in`` operator when the object defines ``__contains__``. Every
    case that cannot be decided (no filter, no value, unsupported object, or an
    error while probing) answers True so that callers never skip data wrongly.

    Args:
        bloom_filter: Bloom filter object, or None when no filter is available.
        value: Value to check.

    Returns:
        True if value might be in the filter, False if definitely not.
    """
    if bloom_filter is None or value is None:
        return True  # Conservative: assume it might contain

    try:
        check = getattr(bloom_filter, "check", None)
        if check is not None:
            # Coerce so the declared ``bool`` return type holds even when the
            # filter reports membership as an int or another truthy value.
            return bool(check(value))
        if hasattr(bloom_filter, "__contains__"):
            return value in bloom_filter
        return True  # Conservative: assume it might contain
    except Exception:
        # On error, be conservative, but keep the failure discoverable.
        logger.debug("Bloom filter check failed for value %r", value, exc_info=True)
        return True


# ---------------------------------------------------------------------------
# diff --git a/tests/table/test_bloom_filter.py b/tests/table/test_bloom_filter.py
# new file mode 100644
# index 0000000000..18b32ab79a
# --- /dev/null
# +++ b/tests/table/test_bloom_filter.py
# @@ -0,0 +1,78 @@
# ---------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Tests for bloom filter utility functions."""

from unittest.mock import MagicMock

from pyiceberg.table.bloom_filter import bloom_filter_might_contain, get_parquet_bloom_filter_for_column


class TestBloomFilterUtilities:
    """Test cases for Parquet bloom filter reading utilities."""

    def test_get_parquet_bloom_filter_returns_none_when_not_available(self) -> None:
        """Test that getting a bloom filter returns None when not available."""
        # Mock a ParquetFile without bloom filters
        mock_parquet_file = MagicMock()
        mock_row_group = MagicMock()
        mock_column = MagicMock()
        mock_column.path_in_schema = "test_column"
        del mock_column.bloom_filter  # Ensure bloom_filter attribute doesn't exist

        mock_row_group.num_columns = 1
        mock_row_group.column.return_value = mock_column
        mock_parquet_file.metadata.row_group.return_value = mock_row_group

        result = get_parquet_bloom_filter_for_column(mock_parquet_file, "test_column", 0)
        assert result is None

    def test_get_parquet_bloom_filter_returns_filter_when_available(self) -> None:
        """Test that the column's bloom filter object is returned when present."""
        expected_filter = object()
        mock_parquet_file = MagicMock()
        mock_row_group = MagicMock()
        mock_column = MagicMock()
        mock_column.path_in_schema = "test_column"
        mock_column.bloom_filter = expected_filter

        mock_row_group.num_columns = 1
        mock_row_group.column.return_value = mock_column
        mock_parquet_file.metadata.row_group.return_value = mock_row_group

        result = get_parquet_bloom_filter_for_column(mock_parquet_file, "test_column", 0)
        assert result is expected_filter
        mock_parquet_file.metadata.row_group.assert_called_once_with(0)

    def test_get_parquet_bloom_filter_returns_none_when_column_missing(self) -> None:
        """Test that None is returned when no column matches the requested name."""
        mock_parquet_file = MagicMock()
        mock_row_group = MagicMock()
        mock_column = MagicMock()
        mock_column.path_in_schema = "other_column"

        mock_row_group.num_columns = 1
        mock_row_group.column.return_value = mock_column
        mock_parquet_file.metadata.row_group.return_value = mock_row_group

        result = get_parquet_bloom_filter_for_column(mock_parquet_file, "test_column", 0)
        assert result is None

    def test_get_parquet_bloom_filter_returns_none_on_exception(self) -> None:
        """Test that a failure while reading metadata is reported as None."""
        mock_parquet_file = MagicMock()
        mock_parquet_file.metadata.row_group.side_effect = Exception("Test error")

        result = get_parquet_bloom_filter_for_column(mock_parquet_file, "test_column", 0)
        assert result is None

    def test_bloom_filter_might_contain_returns_true_when_filter_is_none(self) -> None:
        """Test that might_contain returns True conservatively when filter is None."""
        result = bloom_filter_might_contain(None, "test_value")
        assert result is True

    def test_bloom_filter_might_contain_returns_true_when_value_is_none(self) -> None:
        """Test that might_contain returns True conservatively when value is None."""
        mock_filter = MagicMock()
        result = bloom_filter_might_contain(mock_filter, None)
        assert result is True

    def test_bloom_filter_might_contain_uses_check_method(self) -> None:
        """Test that might_contain uses the check method if available."""
        mock_filter = MagicMock()
        mock_filter.check.return_value = True

        result = bloom_filter_might_contain(mock_filter, "test_value")
        assert result is True
        mock_filter.check.assert_called_once_with("test_value")

    def test_bloom_filter_might_contain_returns_false_when_check_rejects(self) -> None:
        """Test that a negative answer from the filter is passed through as False."""
        mock_filter = MagicMock()
        mock_filter.check.return_value = False

        result = bloom_filter_might_contain(mock_filter, "test_value")
        assert result is False

    def test_bloom_filter_might_contain_uses_contains_method(self) -> None:
        """Test that might_contain uses __contains__ if check is not available."""
        mock_filter = MagicMock()
        del mock_filter.check  # Remove check method
        mock_filter.__contains__.return_value = True

        result = bloom_filter_might_contain(mock_filter, "test_value")
        assert result is True

    def test_bloom_filter_might_contain_returns_true_without_supported_interface(self) -> None:
        """Test that an object with neither check nor __contains__ yields True."""
        # A bare object() exposes neither of the two probed interfaces.
        result = bloom_filter_might_contain(object(), "test_value")
        assert result is True

    def test_bloom_filter_might_contain_returns_true_on_exception(self) -> None:
        """Test that might_contain returns True conservatively on exception."""
        mock_filter = MagicMock()
        mock_filter.check.side_effect = Exception("Test error")

        result = bloom_filter_might_contain(mock_filter, "test_value")
        assert result is True