
Commit 78ab66a

feat: ability to filter dataset when downloading (#762)

* draft
* add integ test
* use backward compatible type
* tidy up docs
* set numpy upper bound in semantic segmentation example due to issue with opencv

1 parent d2bc1c5 · commit 78ab66a

File tree: 6 files changed (+118 −2 lines)

docs/reference/dataset/index.md (+4)

@@ -11,3 +11,7 @@
     options:
       members: ["upload_dataset_embeddings"]
       show_root_heading: false
+::: kolena._api.v2.dataset
+    options:
+      members: ["Filters", "GeneralFieldFilter"]
+      show_root_heading: false

examples/dataset/semantic_segmentation/pyproject.toml (+1)

@@ -16,6 +16,7 @@ dependencies = [
     "boto3>=1.25,<2",
     "scikit-learn>=1.1.2,<2",
     "scikit-image>=0.19.3,<1",
+    "numpy<2",
 ]

 [tool.uv]

kolena/_api/v2/dataset.py (+35)

@@ -11,13 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from dataclasses import field
 from enum import Enum
 from typing import Dict
 from typing import List
 from typing import Optional
+from typing import Union
+
+from typing_extensions import Literal

 from kolena._api.v1.batched_load import BatchedLoad
 from kolena._utils.pydantic_v1 import conint
+from kolena._utils.pydantic_v1 import StrictBool
+from kolena._utils.pydantic_v1 import StrictStr
 from kolena._utils.pydantic_v1.dataclasses import dataclass


@@ -41,11 +47,40 @@ class RegisterRequest:
     description: Optional[str] = None


+@dataclass(frozen=True)
+class GeneralFieldFilter:
+    """
+    Generic representation of a filter on Kolena.
+    """
+
+    value_in: Optional[List[Union[StrictStr, StrictBool]]] = None
+    """A list of desired categorical values."""
+    null_value: Optional[Literal[True]] = None
+    """Whether to filter for cases where the field has a null value or the field name does not exist."""
+
+
+@dataclass(frozen=True)
+class Filters:
+    """
+    Filters to be applied on the dataset during the operation. Currently only used as an optional argument
+    in [`download_dataset`][kolena.dataset.download_dataset].
+    """
+
+    datapoint: Dict[str, GeneralFieldFilter] = field(default_factory=dict)
+    """
+    Dictionary of a field name of the datapoint to the [`GeneralFieldFilter`][kolena.dataset.GeneralFieldFilter] to be
+    applied on the field. In case of nested objects, use `.` as the delimiter to separate the keys. For example, if you
+    have a `ground_truth` column of [`Label`][kolena.annotation.Label] type, you can use `ground_truth.label` as the key
+    to query for the class label.
+    """
+
+
 @dataclass(frozen=True)
 class LoadDatapointsRequest(BatchedLoad.BaseInitDownloadRequest):
     name: str
     commit: Optional[str] = None
     include_extracted_properties: bool = False
+    filters: Optional[Filters] = None


 @dataclass(frozen=True)
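
For orientation, a minimal sketch of how these dataclasses compose; the field names `city` and `ground_truth.label` are hypothetical stand-ins for whatever columns a dataset actually carries:

from kolena.dataset import Filters, GeneralFieldFilter

# Hypothetical fields: keep rows whose "city" is one of two values, and whose
# nested ground_truth.label is "cat" or is null/absent (null_value=True).
filters = Filters(
    datapoint={
        "city": GeneralFieldFilter(value_in=["london", "paris"]),
        "ground_truth.label": GeneralFieldFilter(value_in=["cat"], null_value=True),
    },
)

Judging from the integration test further down, multiple entries in `datapoint` are combined with AND, while `value_in` and `null_value` within a single `GeneralFieldFilter` act as an OR.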

kolena/dataset/__init__.py (+4)

@@ -21,9 +21,13 @@
 from kolena.dataset.evaluation import ModelEntity
 from kolena.dataset.evaluation import get_models
 from kolena.dataset.embeddings import upload_dataset_embeddings
+from kolena._api.v2.dataset import Filters
+from kolena._api.v2.dataset import GeneralFieldFilter

 __all__ = [
     "upload_dataset",
+    "Filters",
+    "GeneralFieldFilter",
     "download_dataset",
     "upload_results",
     "download_results",

kolena/dataset/dataset.py (+8 −2)

@@ -31,6 +31,7 @@
 from kolena._api.v1.event import EventAPI
 from kolena._api.v2.dataset import CommitData
 from kolena._api.v2.dataset import EntityData
+from kolena._api.v2.dataset import Filters
 from kolena._api.v2.dataset import ListCommitHistoryRequest
 from kolena._api.v2.dataset import ListCommitHistoryResponse
 from kolena._api.v2.dataset import ListDatasetsResponse

@@ -368,13 +369,15 @@ def _iter_dataset_raw(
     commit: Optional[str] = None,
     batch_size: int = BatchSize.LOAD_SAMPLES.value,
     include_extracted_properties: bool = False,
+    filters: Optional[Filters] = None,
 ) -> Iterator[pd.DataFrame]:
     validate_batch_size(batch_size)
     init_request = LoadDatapointsRequest(
         name=name,
         commit=commit,
         batch_size=batch_size,
         include_extracted_properties=include_extracted_properties,
+        filters=filters,
     )
     yield from _BatchedLoader.iter_data(
         init_request=init_request,

@@ -389,11 +392,12 @@ def _iter_dataset(
     commit: Optional[str] = None,
     batch_size: int = BatchSize.LOAD_SAMPLES.value,
     include_extracted_properties: bool = False,
+    filters: Optional[Filters] = None,
 ) -> Iterator[pd.DataFrame]:
     """
     Get an iterator over datapoints in the dataset.
     """
-    for df_batch in _iter_dataset_raw(name, commit, batch_size, include_extracted_properties):
+    for df_batch in _iter_dataset_raw(name, commit, batch_size, include_extracted_properties, filters):
         yield _to_deserialized_dataframe(df_batch, column=COL_DATAPOINT)


@@ -403,6 +407,7 @@ def download_dataset(
     *,
     commit: Optional[str] = None,
     include_extracted_properties: bool = False,
+    filters: Optional[Filters] = None,
 ) -> pd.DataFrame:
     """
     Download an entire dataset given its name.

@@ -411,9 +416,10 @@ def download_dataset(
     :param commit: The commit hash for version control. Get the latest commit when this value is `None`.
     :param include_extracted_properties: If True, include kolena extracted properties from automated extractions
         in the dataset as separate columns
+    :param filters: [Experimental] Optional filter to specify which datapoints should be downloaded.
     :return: A DataFrame containing the specified dataset.
     """
-    df_batches = list(_iter_dataset(name, commit, BatchSize.LOAD_SAMPLES.value, include_extracted_properties))
+    df_batches = list(_iter_dataset(name, commit, BatchSize.LOAD_SAMPLES.value, include_extracted_properties, filters))
     log.info(f"downloaded dataset '{name}'")
     df_dataset = pd.concat(df_batches, ignore_index=True) if df_batches else pd.DataFrame()
     return df_dataset
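
Taken together, the new argument threads from `download_dataset` through `_iter_dataset` and `_iter_dataset_raw` into the `LoadDatapointsRequest` payload, so filtering happens server-side before batches are streamed back. A minimal caller-side sketch, assuming a registered dataset named "example-dataset" with a `split` column (both hypothetical):

from kolena.dataset import download_dataset, Filters, GeneralFieldFilter

# Download only the datapoints whose (hypothetical) "split" field is "test";
# omitting filters downloads the entire dataset, as before.
df_test = download_dataset(
    "example-dataset",
    filters=Filters(datapoint={"split": GeneralFieldFilter(value_in=["test"])}),
)
print(f"downloaded {len(df_test)} filtered datapoints")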

tests/integration/dataset/test_dataset.py (+66)

@@ -12,8 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import random
+from typing import Dict
 from typing import Iterator
 from typing import List
+from typing import Optional
 from typing import Tuple

 import numpy as np

@@ -25,6 +27,8 @@
 from kolena.annotation import BoundingBox
 from kolena.annotation import LabeledBoundingBox
 from kolena.dataset import download_dataset
+from kolena.dataset import Filters
+from kolena.dataset import GeneralFieldFilter
 from kolena.dataset import list_datasets
 from kolena.dataset import upload_dataset
 from kolena.dataset.dataset import _fetch_dataset_history

@@ -422,3 +426,65 @@ def test__upload_dataset__with_description() -> None:
     )
     dataset = _load_dataset_metadata(name)
     assert dataset.description == description_v2
+
+
+@pytest.fixture(scope="module")
+def download_datapoints_with_filters_data() -> Tuple[str, List[str], List[Dict]]:
+    name = with_test_prefix(f"{__file__}::test__download_dataset__with_filters")
+    id_fields = ["value"]
+    n_datapoints = 10
+    columns = ["value", "str", "nested"]
+    datapoints = [
+        dict(
+            value=i,
+            str=f"str-{i}",
+            nested={
+                "bool ean": i % 2 == 0,
+                "optional_col": str(i) if i % 5 > 0 else None,
+            },
+        )
+        for i in range(n_datapoints)
+    ]
+    df_datapoints = pd.DataFrame(datapoints, columns=["value", "str", "nested"])
+
+    upload_dataset(
+        name,
+        df_datapoints,
+        id_fields=id_fields,
+    )
+    return name, columns, datapoints
+
+
+@pytest.mark.parametrize(
+    "filters, expected_datapoint_inds",
+    [
+        (None, list(range(10))),
+        (Filters(datapoint={"str": GeneralFieldFilter(value_in=["str-0", "str-1"])}), [0, 1]),
+        (Filters(datapoint={"value": GeneralFieldFilter(value_in=["2", "3"])}), [2, 3]),
+        (Filters(datapoint={'nested."bool ean"': GeneralFieldFilter(value_in=[True])}), [0, 2, 4, 6, 8]),
+        (Filters(datapoint={"nested.optional_col": GeneralFieldFilter(value_in=["7"], null_value=True)}), [0, 5, 7]),
+        (
+            Filters(
+                datapoint={
+                    "str": GeneralFieldFilter(value_in=["str-0", "str-1", "str-2", "str-5"]),
+                    "nested.optional_col": GeneralFieldFilter(value_in=["0", "3"], null_value=True),
+                },
+            ),
+            [0, 5],
+        ),
+    ],
+)
+def test__download_dataset__with_filters(
+    download_datapoints_with_filters_data: Tuple[str, List[str], List[Dict]],
+    filters: Optional[Filters],
+    expected_datapoint_inds: List[int],
+) -> None:
+    name, columns, datapoints = download_datapoints_with_filters_data
+    expected_datapoints = (
+        pd.DataFrame([datapoints[ind] for ind in expected_datapoint_inds], columns=columns)
+        .sort_values(by="value")
+        .reset_index(drop=True)
+    )
+    loaded_datapoints = download_dataset(name, filters=filters)
+    loaded_datapoints = loaded_datapoints.sort_values(by="value").reset_index(drop=True)
+    assert_frame_equal(loaded_datapoints, expected_datapoints, columns)
