|
12 | 12 | # See the License for the specific language governing permissions and
|
13 | 13 | # limitations under the License.
|
14 | 14 | import random
|
| 15 | +from typing import Dict |
15 | 16 | from typing import Iterator
|
16 | 17 | from typing import List
|
| 18 | +from typing import Optional |
17 | 19 | from typing import Tuple
|
18 | 20 |
|
19 | 21 | import numpy as np
|
|
25 | 27 | from kolena.annotation import BoundingBox
|
26 | 28 | from kolena.annotation import LabeledBoundingBox
|
27 | 29 | from kolena.dataset import download_dataset
|
| 30 | +from kolena.dataset import Filters |
| 31 | +from kolena.dataset import GeneralFieldFilter |
28 | 32 | from kolena.dataset import list_datasets
|
29 | 33 | from kolena.dataset import upload_dataset
|
30 | 34 | from kolena.dataset.dataset import _fetch_dataset_history
|
@@ -422,3 +426,65 @@ def test__upload_dataset__with_description() -> None:
|
422 | 426 | )
|
423 | 427 | dataset = _load_dataset_metadata(name)
|
424 | 428 | assert dataset.description == description_v2
|
| 429 | + |
| 430 | + |
@pytest.fixture(scope="module")
def download_datapoints_with_filters_data() -> Tuple[str, List[str], List[Dict]]:
    """Upload a small dataset once per module for the filter-download tests.

    Ten datapoints are generated, keyed by the integer ``value`` column. Each
    one also carries a ``str`` column and a ``nested`` dict with two entries:

    * ``"bool ean"`` (key intentionally contains a space): ``True`` for even
      ``value``.
    * ``"optional_col"``: ``None`` when ``value`` is a multiple of five,
      otherwise ``str(value)``.

    Returns:
        A ``(dataset_name, column_names, records)`` tuple, where ``records``
        is the list of plain dicts that was uploaded.
    """
    dataset_name = with_test_prefix(f"{__file__}::test__download_dataset__with_filters")
    column_names = ["value", "str", "nested"]

    records: List[Dict] = []
    for idx in range(10):
        nested = {
            "bool ean": idx % 2 == 0,
            "optional_col": None if idx % 5 == 0 else str(idx),
        }
        records.append({"value": idx, "str": f"str-{idx}", "nested": nested})

    frame = pd.DataFrame(records, columns=column_names)
    upload_dataset(dataset_name, frame, id_fields=["value"])
    return dataset_name, column_names, records
| 456 | + |
| 457 | + |
@pytest.mark.parametrize(
    ("filters", "expected_datapoint_inds"),
    [
        # No filter: every uploaded datapoint is expected back.
        (None, list(range(10))),
        # Match on a top-level string column.
        (
            Filters(datapoint={"str": GeneralFieldFilter(value_in=["str-0", "str-1"])}),
            [0, 1],
        ),
        # The integer "value" column is matched using string candidates.
        (
            Filters(datapoint={"value": GeneralFieldFilter(value_in=["2", "3"])}),
            [2, 3],
        ),
        # Nested key containing a space is addressed with a quoted path segment.
        (
            Filters(datapoint={'nested."bool ean"': GeneralFieldFilter(value_in=[True])}),
            [0, 2, 4, 6, 8],
        ),
        # With null_value=True the expected rows include those whose field is None (0 and 5).
        (
            Filters(datapoint={"nested.optional_col": GeneralFieldFilter(value_in=["7"], null_value=True)}),
            [0, 5, 7],
        ),
        # Two fields at once: the expected rows are the ones satisfying both conditions.
        (
            Filters(
                datapoint={
                    "str": GeneralFieldFilter(value_in=["str-0", "str-1", "str-2", "str-5"]),
                    "nested.optional_col": GeneralFieldFilter(value_in=["0", "3"], null_value=True),
                },
            ),
            [0, 5],
        ),
    ],
)
def test__download_dataset__with_filters(
    download_datapoints_with_filters_data: Tuple[str, List[str], List[Dict]],
    filters: Optional[Filters],
    expected_datapoint_inds: List[int],
) -> None:
    """Download the fixture dataset with ``filters`` and compare against the expected rows.

    Both frames are sorted by ``value`` and re-indexed before comparison, so
    the order in which rows are returned does not affect the result.
    """
    dataset_name, column_names, records = download_datapoints_with_filters_data

    selected_records = [records[ind] for ind in expected_datapoint_inds]
    expected_frame = pd.DataFrame(selected_records, columns=column_names)
    expected_frame = expected_frame.sort_values(by="value").reset_index(drop=True)

    downloaded_frame = download_dataset(dataset_name, filters=filters).sort_values(by="value").reset_index(drop=True)

    assert_frame_equal(downloaded_frame, expected_frame, column_names)
0 commit comments