diff --git a/examples/how_to_questions/metrics/data_integrity/dataset_rouge_summary_metric.ipynb b/examples/how_to_questions/metrics/data_integrity/dataset_rouge_summary_metric.ipynb new file mode 100644 index 0000000000..f0abc6298c --- /dev/null +++ b/examples/how_to_questions/metrics/data_integrity/dataset_rouge_summary_metric.ipynb @@ -0,0 +1,117 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evidently Dataset ROUGE Summary Metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from evidently.report import Report\n", + "from evidently.metrics import ROUGESummaryMetric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "current_data = {\n", + " \"summary\": [\"hello there\", \"general kenobi\"],\n", + "}\n", + "\n", + "current_df = pd.DataFrame(current_data)\n", + "\n", + "reference_data = {\n", + " \"summary\": [\"hello there\", \"no de\"]\n", + "}\n", + "\n", + "current_df = pd.DataFrame(current_data)\n", + "reference_df = pd.DataFrame(reference_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "report = Report(metrics=[\n", + " ROUGESummaryMetric(column_name=\"summary\", rouge_n=2)\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "report.run(current_data=current_df, reference_data=reference_df)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "report.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "report.as_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "report.as_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.19" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/how_to_questions/metrics/data_integrity/dataset_summary_metric.ipynb b/examples/how_to_questions/metrics/data_integrity/dataset_summary_metric.ipynb index 78a65bbee8..9b8be1b638 100644 --- a/examples/how_to_questions/metrics/data_integrity/dataset_summary_metric.ipynb +++ b/examples/how_to_questions/metrics/data_integrity/dataset_summary_metric.ipynb @@ -116,7 +116,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.13" + "version": "3.8.19" } }, "nbformat": 4, diff --git a/requirements.dev.txt b/requirements.dev.txt index dc0589f754..a4f38fdd44 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -17,6 +17,7 @@ pip-audit pyspark ruff==0.3.7 pre-commit==3.5.0 +evaluate==0.4.1 # service dependencies litestar>=2.7.1 diff --git a/requirements.min.txt b/requirements.min.txt index e7a5d12f28..cd8ee86f57 100644 --- a/requirements.min.txt +++ b/requirements.min.txt @@ -31,4 +31,5 @@ openai==1.16.2 evaluate==0.4.1 transformers[torch]==4.39.3 
sentence-transformers==2.7.0 +rouge-score==0.1.2 chromadb==0.4.0 diff --git a/setup.py b/setup.py index d5df8514e0..63e81651a5 100644 --- a/setup.py +++ b/setup.py @@ -76,6 +76,7 @@ "deprecation>=2.1.0", "uuid6>=2024.7.10", "cryptography>=43.0.1", + "evaluate>=0.4.1", ], extras_require={ "dev": [ @@ -93,16 +94,18 @@ "types-python-dateutil==2.8.19", "types-ujson>=5.4.0", "pillow==10.3.0", - "httpx==0.27.0", + "httpx==0.24.1", "ruff==0.3.7", "pre-commit==3.5.0", "pytest-asyncio==0.23.7", + "evaluate>=0.4.1", ], "llm": [ "openai>=1.16.2", "evaluate>=0.4.1", "transformers[torch]>=4.39.3", "sentence-transformers>=2.7.0", + "rouge-score>=0.1.2", "chromadb>=0.4.0", ], "spark": ["pyspark>=3.4.0"], diff --git a/src/evidently/metrics/__init__.py b/src/evidently/metrics/__init__.py index c88a28babf..773b77626c 100644 --- a/src/evidently/metrics/__init__.py +++ b/src/evidently/metrics/__init__.py @@ -32,6 +32,7 @@ from .data_integrity.column_summary_metric import ColumnSummaryMetric from .data_integrity.dataset_missing_values_metric import DatasetMissingValuesMetric from .data_integrity.dataset_summary_metric import DatasetSummaryMetric +from .data_integrity.rouge_summary_metric import ROUGESummaryMetric from .data_quality.column_category_metric import ColumnCategoryMetric from .data_quality.column_correlations_metric import ColumnCorrelationsMetric from .data_quality.column_distribution_metric import ColumnDistributionMetric @@ -99,6 +100,7 @@ "ColumnSummaryMetric", "DatasetMissingValuesMetric", "DatasetSummaryMetric", + "ROUGESummaryMetric", "ColumnCategoryMetric", "ColumnCorrelationsMetric", "ColumnDistributionMetric", diff --git a/src/evidently/metrics/_registry.py b/src/evidently/metrics/_registry.py index 1ed0ce8345..26f6e58a8a 100644 --- a/src/evidently/metrics/_registry.py +++ b/src/evidently/metrics/_registry.py @@ -138,6 +138,13 @@ "evidently.metrics.data_integrity.dataset_summary_metric.DatasetSummaryMetric", "evidently:metric:DatasetSummaryMetric", ) + +register_type_alias( + Metric, + "evidently.metrics.data_integrity.rouge_summary_metric.ROUGESummaryMetric", + "evidently:metric:ROUGESummaryMetric", +) + register_type_alias( Metric, "evidently.metrics.data_quality.column_category_metric.ColumnCategoryMetric", @@ -570,6 +577,11 @@ "evidently.metrics.data_integrity.dataset_summary_metric.DatasetSummaryMetricResult", "evidently:metric_result:DatasetSummaryMetricResult", ) +register_type_alias( + MetricResult, + "evidently.metrics.data_integrity.rouge_summary_metric.ROUGESummaryMetricResult", + "evidently:metric_result:ROUGESummaryMetricResult", +) register_type_alias( MetricResult, "evidently.metrics.data_quality.column_category_metric.CategoryStat", diff --git a/src/evidently/metrics/data_integrity/rouge_summary_metric.py b/src/evidently/metrics/data_integrity/rouge_summary_metric.py new file mode 100644 index 0000000000..c9c53aeb2b --- /dev/null +++ b/src/evidently/metrics/data_integrity/rouge_summary_metric.py @@ -0,0 +1,103 @@ +from typing import List + +import evaluate +import pandas as pd + +from evidently.base_metric import InputData +from evidently.base_metric import Metric +from evidently.base_metric import MetricResult +from evidently.core import IncludeTags +from evidently.model.widget import BaseWidgetInfo +from evidently.options.base import AnyOptions +from evidently.renderers.base_renderer import MetricRenderer +from evidently.renderers.base_renderer import default_renderer +from evidently.renderers.html_widgets import header_text +from evidently.renderers.html_widgets import 
table_data +from evidently.renderers.html_widgets import text_widget + + +class ROUGESummaryMetricResult(MetricResult): + class Config: + type_alias = "evidently:metric_result:ROUGESummaryMetricResult" + field_tags = { + "current": {IncludeTags.Current}, + "reference": {IncludeTags.Reference}, + "rouge_type": {IncludeTags.Parameter}, + "per_row_scores": {IncludeTags.Parameter}, + "summary_score": {IncludeTags.Parameter}, + } + + current: list + reference: list + rouge_type: str + per_row_scores: list + summary_score: float + + +class ROUGESummaryMetric(Metric[ROUGESummaryMetricResult]): + class Config: + type_alias = "evidently:metric:ROUGESummaryMetric" + arbitrary_types_allowed = True + + column_name: str + rouge_n: int + + def __init__(self, column_name: str, rouge_n: int, options: AnyOptions = None): + self.column_name = column_name + self.rouge_n = rouge_n + super().__init__(options=options) + + def _calculate_summary_rouge(self, current: pd.Series, reference: pd.Series): + rouge_evaluator = evaluate.load("rouge") + + current = current.astype(str).tolist() + reference = reference.astype(str).tolist() + + rouge_scores = rouge_evaluator.compute( + rouge_types=[f"rouge{self.rouge_n}"], predictions=current, references=reference, use_aggregator=False + ) + + per_row_rouge_scores = rouge_scores[f"rouge{self.rouge_n}"] + + summary_rouge_score = sum(per_row_rouge_scores) / len(per_row_rouge_scores) + + return per_row_rouge_scores, summary_rouge_score, current, reference + + def calculate(self, data: InputData) -> ROUGESummaryMetricResult: + if data.current_data is None or data.reference_data is None: + raise ValueError("The current data or the reference data is None.") + if len(data.current_data[self.column_name]) == 0 or len(data.reference_data[self.column_name]) == 0: + raise ValueError("The current data or the reference data is empty.") + + per_row_rouge_scores, summary_rouge_score, current, reference = self._calculate_summary_rouge( + data.current_data[self.column_name], data.reference_data[self.column_name] + ) + + result = ROUGESummaryMetricResult( + rouge_type=f"ROUGE-{self.rouge_n}", + per_row_scores=per_row_rouge_scores, + summary_score=summary_rouge_score, + current=current, + reference=reference, + ) + return result + + +@default_renderer(wrap_type=ROUGESummaryMetric) +class ROUGESummaryMetricRenderer(MetricRenderer): + @staticmethod + def _get_table(metric) -> BaseWidgetInfo: + column_names = ["Metric", "current", "reference", "score"] + rows = [] + for i in range(len(metric.current)): + rows.append([metric.rouge_type, metric.current[i], metric.reference[i], metric.per_row_scores[i]]) + # rows.append(["metric.rouge_type", 1, "metric.current[i]", "metric.reference[i]", 2.4]) + return table_data(title="", column_names=column_names, data=rows) + + def render_html(self, obj: ROUGESummaryMetric) -> List[BaseWidgetInfo]: + metric = obj.get_result() + return [ + header_text(label="ROUGE Metric"), + self._get_table(metric), + text_widget(text=f"{metric.summary_score}", title="Overall ROUGE score"), + ] diff --git a/tests/metrics/data_interity/test_dataset_rouge_summary_metric.py b/tests/metrics/data_interity/test_dataset_rouge_summary_metric.py new file mode 100644 index 0000000000..814bf39ec2 --- /dev/null +++ b/tests/metrics/data_interity/test_dataset_rouge_summary_metric.py @@ -0,0 +1,45 @@ +import json + +import pandas as pd +import pytest + +from evidently.metrics.data_integrity.rouge_summary_metric import ROUGESummaryMetric +from evidently.report.report import Report + + 
+@pytest.mark.parametrize( + "current_df, reference_df, metric, expected_json", + ( + ( + pd.DataFrame( + { + "summary": ["hello there", "general kenobi"], + } + ), + pd.DataFrame({"summary": ["hello there", "no de"]}), + ROUGESummaryMetric(column_name="summary", rouge_n=1), + { + "current": ["hello there", "general kenobi"], + "reference": ["hello there", "no de"], + "rouge_type": "ROUGE-1", + "per_row_scores": [1.0, 0.0], + "summary_score": 0.5, + }, + ), + ), +) +def test_rouge_summary_metric_with_report( + current_df: pd.DataFrame, + reference_df: pd.DataFrame, + metric, + expected_json: dict, +) -> None: + report = Report(metrics=[metric]) + + report.run(current_data=current_df, reference_data=reference_df) + + assert report.show() + json_result = report.json() + assert len(json_result) > 0 + result = json.loads(json_result) + assert result["metrics"][0]["result"] == expected_json diff --git a/tests/multitest/metrics/data_integrity.py b/tests/multitest/metrics/data_integrity.py index d52ae6526a..7973928f44 100644 --- a/tests/multitest/metrics/data_integrity.py +++ b/tests/multitest/metrics/data_integrity.py @@ -16,6 +16,7 @@ from evidently.metrics.data_integrity.column_summary_metric import NumericCharacteristics from evidently.metrics.data_integrity.dataset_missing_values_metric import DatasetMissingValuesMetric from evidently.metrics.data_integrity.dataset_summary_metric import DatasetSummaryMetric +from evidently.metrics.data_integrity.rouge_summary_metric import ROUGESummaryMetric from tests.multitest.conftest import AssertExpectedResult from tests.multitest.conftest import Error from tests.multitest.conftest import NoopOutcome @@ -206,6 +207,27 @@ def dataset_summary_metric(): ) +@metric +def rouge_summary_metric(): + return TestMetric( + name="rouge_summary_metric", + metric=ROUGESummaryMetric(column_name="summary", rouge_n=1), + fingerprint="bfc616f760b973d2cbfbf0540c7b2c71", + outcomes=NoopOutcome(), + datasets=[ + TestDataset( + "rouge_summary_metric_data", + current=pd.DataFrame( + { + "summary": ["hello there", "general kenobi"], + } + ), + reference=pd.DataFrame({"summary": ["hello there", "no de"]}), + ), + ], + ) + + @metric def column_reg_exp_metric(): return TestMetric(
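Reviewer note: the new notebook boils down to the short script below. This is a minimal sketch of the intended usage, assuming the `llm` extras touched in setup.py (`evaluate`, `rouge-score`) are installed; the data and parameters mirror the notebook cells.

```python
import pandas as pd

from evidently.metrics import ROUGESummaryMetric
from evidently.report import Report

# One summary per row; "summary" is the column the metric is pointed at.
current_df = pd.DataFrame({"summary": ["hello there", "general kenobi"]})
reference_df = pd.DataFrame({"summary": ["hello there", "no de"]})

# rouge_n=2 computes ROUGE-2 (bigram overlap), as in the notebook example.
report = Report(metrics=[ROUGESummaryMetric(column_name="summary", rouge_n=2)])
report.run(current_data=current_df, reference_data=reference_df)

print(report.as_dict())  # includes per_row_scores and summary_score
```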
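For context on `_calculate_summary_rouge`: the metric delegates scoring to the Hugging Face `evaluate` wrapper around `rouge-score`. A standalone sketch of that call, under the assumption that `use_aggregator=False` returns one f-measure per prediction/reference pair (which is what the averaging inside the metric relies on):

```python
import evaluate

rouge = evaluate.load("rouge")
scores = rouge.compute(
    predictions=["hello there", "general kenobi"],
    references=["hello there", "no de"],
    rouge_types=["rouge1"],
    use_aggregator=False,
)

# With use_aggregator=False the result is a list of per-row scores rather than
# a single aggregated value; the metric averages this list itself.
per_row = scores["rouge1"]             # [1.0, 0.0] for this toy pair
summary = sum(per_row) / len(per_row)  # 0.5, as asserted in the new test
```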
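The fields exposed by `ROUGESummaryMetricResult` can also be read programmatically instead of through the HTML renderer. A sketch using the same toy data as the unit test, with the expected values taken from its `expected_json`:

```python
import pandas as pd

from evidently.metrics import ROUGESummaryMetric
from evidently.report import Report

report = Report(metrics=[ROUGESummaryMetric(column_name="summary", rouge_n=1)])
report.run(
    current_data=pd.DataFrame({"summary": ["hello there", "general kenobi"]}),
    reference_data=pd.DataFrame({"summary": ["hello there", "no de"]}),
)

# Field names come from ROUGESummaryMetricResult; values match the new unit test.
result = report.as_dict()["metrics"][0]["result"]
print(result["rouge_type"])      # "ROUGE-1"
print(result["per_row_scores"])  # [1.0, 0.0] -- one score per row
print(result["summary_score"])   # 0.5 -- mean of the per-row scores
```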
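The expected values in the parametrized test can be sanity-checked directly against `rouge-score`, the backend package pinned in this PR. A quick check; the argument order and `use_stemmer=False` are assumptions based on the library's documented defaults, and the f-measure is symmetric for these examples anyway:

```python
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=False)

# score(target, prediction): identical unigrams give 1.0, disjoint unigrams give 0.0,
# so the per-row scores are [1.0, 0.0] and their mean is the 0.5 summary_score.
print(scorer.score("hello there", "hello there")["rouge1"].fmeasure)   # 1.0
print(scorer.score("no de", "general kenobi")["rouge1"].fmeasure)      # 0.0
```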