From 63de8d5de8c0c1cc329056808a83012033edade6 Mon Sep 17 00:00:00 2001 From: renato boemer Date: Mon, 2 Sep 2024 21:24:54 +0100 Subject: [PATCH 1/3] fixed min non-zero issue v1 --- src/evidently/calculations/stattests/utils.py | 23 ++++++++----------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/src/evidently/calculations/stattests/utils.py b/src/evidently/calculations/stattests/utils.py index 39a1bce86d..70538588c8 100644 --- a/src/evidently/calculations/stattests/utils.py +++ b/src/evidently/calculations/stattests/utils.py @@ -39,20 +39,15 @@ def get_binned_data( current_percents = np.array([current_feature_dict[key] / len(current_data) for key in keys]) if feel_zeroes: - np.place( - reference_percents, - reference_percents == 0, - min(reference_percents[reference_percents != 0]) / 10**6 - if min(reference_percents[reference_percents != 0]) <= 0.0001 - else 0.0001, - ) - np.place( - current_percents, - current_percents == 0, - min(current_percents[current_percents != 0]) / 10**6 - if min(current_percents[current_percents != 0]) <= 0.0001 - else 0.0001, - ) + min_non_zero_ref = np.min(reference_percents[reference_percents != 0]) + min_non_zero_cur = np.min(current_percents[current_percents != 0]) + + fill_value = min(min_non_zero_ref, min_non_zero_cur) / 10 + + fill_value = min(fill_value, min(min_non_zero_ref, min_non_zero_cur) / 2) + + np.place(reference_percents, reference_percents == 0, fill_value) + np.place(current_percents, current_percents == 0, fill_value) return reference_percents, current_percents From a9bac3f6a77f8e63591bdb8e37531fd717384c29 Mon Sep 17 00:00:00 2001 From: renato boemer Date: Mon, 2 Sep 2024 21:28:10 +0100 Subject: [PATCH 2/3] name correction feel -> fill --- src/evidently/calculations/stattests/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/evidently/calculations/stattests/utils.py b/src/evidently/calculations/stattests/utils.py index 70538588c8..33e0e06804 100644 --- a/src/evidently/calculations/stattests/utils.py +++ b/src/evidently/calculations/stattests/utils.py @@ -12,7 +12,7 @@ def get_unique_not_nan_values_list_from_series(current_data: pd.Series, referenc def get_binned_data( - reference_data: pd.Series, current_data: pd.Series, feature_type: ColumnType, n: int, feel_zeroes: bool = True + reference_data: pd.Series, current_data: pd.Series, feature_type: ColumnType, n: int, fill_zeroes: bool = True ): """Split variable into n buckets based on reference quantiles Args: @@ -38,7 +38,7 @@ def get_binned_data( reference_percents = np.array([ref_feature_dict[key] / len(reference_data) for key in keys]) current_percents = np.array([current_feature_dict[key] / len(current_data) for key in keys]) - if feel_zeroes: + if fill_zeroes: min_non_zero_ref = np.min(reference_percents[reference_percents != 0]) min_non_zero_cur = np.min(current_percents[current_percents != 0]) From bd0b12cb46061007e71fe152a3d0e3d99118c06a Mon Sep 17 00:00:00 2001 From: renato boemer Date: Thu, 5 Sep 2024 16:24:51 +0100 Subject: [PATCH 3/3] updated function with new parameters --- src/evidently/calculations/stattests/utils.py | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/evidently/calculations/stattests/utils.py b/src/evidently/calculations/stattests/utils.py index 33e0e06804..8a9f5c6692 100644 --- a/src/evidently/calculations/stattests/utils.py +++ b/src/evidently/calculations/stattests/utils.py @@ -12,7 +12,8 @@ def get_unique_not_nan_values_list_from_series(current_data: pd.Series, referenc def get_binned_data( - reference_data: pd.Series, current_data: pd.Series, feature_type: ColumnType, n: int, fill_zeroes: bool = True + reference_data: pd.Series, current_data: pd.Series, feature_type: ColumnType, n: int, fill_zeroes: bool=True, + fill_method: str='auto', dynamic_scale: bool=False ): """Split variable into n buckets based on reference quantiles Args: @@ -20,6 +21,9 @@ def get_binned_data( current_data: current data feature_type: feature type n: number of quantiles + fill_zeroes: whether to fill zero percentages + fill_method: method to calculate fill value ('auto', 'min', 'mean') + dynamic_scale: whether to use dynamic scaling for fill value Returns: reference_percents: % of records in each bucket for reference current_percents: % of records in each bucket for current @@ -42,13 +46,26 @@ def get_binned_data( min_non_zero_ref = np.min(reference_percents[reference_percents != 0]) min_non_zero_cur = np.min(current_percents[current_percents != 0]) - fill_value = min(min_non_zero_ref, min_non_zero_cur) / 10 + if fill_method == 'auto': + fill_value = min(min_non_zero_ref, min_non_zero_cur) / 10 + fill_value = min(fill_value, min(min_non_zero_ref, min_non_zero_cur) / 2) + elif fill_method == 'min': + fill_value = min(min_non_zero_ref, min_non_zero_cur) + elif fill_method == 'mean': + fill_value = (min_non_zero_ref + min_non_zero_cur) / 2 + else: + raise ValueError("Invalid fill_method. Choose 'auto', 'min', or 'mean'.") - fill_value = min(fill_value, min(min_non_zero_ref, min_non_zero_cur) / 2) + if dynamic_scale: + scale_factor = min(min_non_zero_ref, min_non_zero_cur) / max(min_non_zero_ref, min_non_zero_cur) + fill_value *= scale_factor np.place(reference_percents, reference_percents == 0, fill_value) np.place(current_percents, current_percents == 0, fill_value) + reference_percents = reference_percents / np.sum(reference_percents) + current_percents = current_percents / np.sum(current_percents) + return reference_percents, current_percents