From b4183f68e9f67e95ddd4788205b0eb3bca837d1d Mon Sep 17 00:00:00 2001 From: Gwen Au Date: Thu, 21 Nov 2024 15:21:04 +1100 Subject: [PATCH 1/2] Resolves evidentlyai/evidently#1023 --- src/evidently/calculations/data_quality.py | 2 ++ src/evidently/metrics/data_integrity/column_summary_metric.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/src/evidently/calculations/data_quality.py b/src/evidently/calculations/data_quality.py index a3620f9d40..bc375948a6 100644 --- a/src/evidently/calculations/data_quality.py +++ b/src/evidently/calculations/data_quality.py @@ -92,6 +92,7 @@ class FeatureQualityStats: most_common_not_null_value_percentage: Optional[float] = None new_in_current_values_count: Optional[int] = None unused_in_current_values_count: Optional[int] = None + skew: Optional[float] = None def is_datetime(self): """Checks that the object store stats for a datetime feature""" @@ -205,6 +206,7 @@ def get_percentage_from_all_values(value: Union[int, float]) -> float: result.percentile_25 = np.round(common_stats["25%"], 2) # type: ignore[assignment] result.percentile_50 = np.round(common_stats["50%"], 2) # type: ignore[assignment] result.percentile_75 = np.round(common_stats["75%"], 2) # type: ignore[assignment] + result.skew = feature.skew() if feature_type == ColumnType.Datetime: # cast datetime value to str for datetime features diff --git a/src/evidently/metrics/data_integrity/column_summary_metric.py b/src/evidently/metrics/data_integrity/column_summary_metric.py index 73b9ad8ad1..23add62004 100644 --- a/src/evidently/metrics/data_integrity/column_summary_metric.py +++ b/src/evidently/metrics/data_integrity/column_summary_metric.py @@ -80,6 +80,7 @@ class Config: infinite_percentage: Optional[float] most_common: Optional[Union[int, float]] most_common_percentage: Optional[float] + skew: Optional[float] class CategoricalCharacteristics(ColumnCharacteristics): @@ -501,6 +502,7 @@ def map_data(stats: FeatureQualityStats) -> ColumnCharacteristics: infinite_percentage=stats.infinite_percentage, most_common=stats.most_common_value, most_common_percentage=stats.most_common_value_percentage, + skew=stats.skew ) if stats.feature_type == "cat": return CategoricalCharacteristics( @@ -820,6 +822,7 @@ def _metrics_fot_table(self, column_type: str, data_quality_results: ColumnSumma ("50%", "p50", None), ("75%", "p75", None), ("max", "max", None), + ("skew", "skew", None), ("unique", "unique", "unique_percentage"), ("most common", "most_common", "most_common_percentage"), ("missing", "missing", "missing_percentage"), From a15124185ca81bdf4b854bc34e7ce4b1abb7bee3 Mon Sep 17 00:00:00 2001 From: Gwen Au Date: Fri, 22 Nov 2024 10:49:06 +1100 Subject: [PATCH 2/2] Updated tests for expected skew output --- tests/metrics/data_interity/test_column_summary_metric.py | 2 ++ tests/multitest/metrics/data_integrity.py | 1 + 2 files changed, 3 insertions(+) diff --git a/tests/metrics/data_interity/test_column_summary_metric.py b/tests/metrics/data_interity/test_column_summary_metric.py index d7669070eb..4ea7529b65 100644 --- a/tests/metrics/data_interity/test_column_summary_metric.py +++ b/tests/metrics/data_interity/test_column_summary_metric.py @@ -91,6 +91,7 @@ "p25": 1.5, "p50": 2.0, "p75": 2.5, + "skew": 0.0, "std": 1.0, "unique": 3, "unique_percentage": 100.0, @@ -110,6 +111,7 @@ "p25": 1.5, "p50": 2.0, "p75": 2.5, + "skew": 0.0, "std": 1.0, "unique": 3, "unique_percentage": 100.0, diff --git a/tests/multitest/metrics/data_integrity.py b/tests/multitest/metrics/data_integrity.py index d52ae6526a..4c8084acf3 100644 --- a/tests/multitest/metrics/data_integrity.py +++ b/tests/multitest/metrics/data_integrity.py @@ -86,6 +86,7 @@ def column_summary_metric_period(): p25=1.5, p50=2, p75=2.5, + skew=0.0, std=1, unique=3, unique_percentage=100,