From 5d566f50d3983305d30b4990eed8620b03e358ce Mon Sep 17 00:00:00 2001
From: Jaimin Godhani <112328542+Jai0401@users.noreply.github.com>
Date: Sat, 15 Feb 2025 02:09:06 +0530
Subject: [PATCH] Add BLEU metric to Evidently

Fixes #1319

Add a BLEU metric implementation that computes a sentence-level BLEU score
for each row and an average BLEU score for the dataset.

* **New BLEU Metric Implementation**
  - Create `src/evidently/metrics/bleu_metric.py` implementing the metric.
  - Define `BLEUMetricResult` and `BLEUMetric` classes (fields routed through
    `super().__init__` so the pydantic-based `Metric` base class accepts them).
  - Implement `BLEUMetricRenderer` for HTML rendering of BLEU scores.

* **Integration with Existing Code**
  - Update `src/evidently/metric_preset/text_evals.py`: the BLEU metric is
    added to the preset only when the dataset defines a "reference" column,
    so existing users of `TextEvals` are unaffected.
  - Update `src/evidently/metrics/__init__.py` to import and export the metric.

* **Testing**
  - Add `tests/metrics/test_bleu_metric.py` covering the scored rows, the
    dataset average, and the empty-dataset edge case.

NOTE: this introduces a runtime dependency on `nltk` (sentence_bleu,
SmoothingFunction) — it must be added to the package requirements.
---
 src/evidently/metric_preset/text_evals.py |  9 +++-
 src/evidently/metrics/__init__.py         |  2 +
 src/evidently/metrics/bleu_metric.py      | 79 +++++++++++++++++++++++
 tests/metrics/test_bleu_metric.py         | 30 +++++++++
 4 files changed, 119 insertions(+), 1 deletion(-)
 create mode 100644 src/evidently/metrics/bleu_metric.py
 create mode 100644 tests/metrics/test_bleu_metric.py

diff --git a/src/evidently/metric_preset/text_evals.py b/src/evidently/metric_preset/text_evals.py
index 8750baadc4..2748eada4f 100644
--- a/src/evidently/metric_preset/text_evals.py
+++ b/src/evidently/metric_preset/text_evals.py
@@ -12,6 +12,7 @@
 from evidently.metric_preset.metric_preset import AnyMetric
 from evidently.metric_preset.metric_preset import MetricPreset
 from evidently.metrics import ColumnSummaryMetric
+from evidently.metrics.bleu_metric import BLEUMetric
 from evidently.utils.data_preprocessing import DataDefinition
@@ -37,4 +38,11 @@ def generate_metrics(
             OOV(),
             NonLetterCharacterPercentage(),
         ]
-        return [ColumnSummaryMetric(desc.on(self.column_name)) for desc in descriptors]
+        metrics = [ColumnSummaryMetric(desc.on(self.column_name)) for desc in descriptors]
+        # BLEU requires a reference column; add the metric only when one is
+        # present so datasets without it keep working unchanged.
+        # NOTE(review): "reference" is a naming convention here — confirm the
+        # intended column name / DataDefinition API before merging.
+        if any(column.column_name == "reference" for column in data_definition.get_columns()):
+            metrics.append(BLEUMetric(reference_column="reference", hypothesis_column=self.column_name))
+        return metrics
diff --git a/src/evidently/metrics/__init__.py b/src/evidently/metrics/__init__.py
index c88a28babf..553a58d00f 100644
--- a/src/evidently/metrics/__init__.py
+++ b/src/evidently/metrics/__init__.py
@@ -72,6 +72,7 @@
 from .regression_performance.regression_performance_metrics import RegressionPerformanceMetrics
 from .regression_performance.regression_quality import RegressionQualityMetric
 from .regression_performance.top_error import RegressionTopErrorMetric
+from .bleu_metric import BLEUMetric
 
 __all__ = [
     "ClassificationClassBalance",
@@ -141,5 +142,6 @@
     "ScoreDistribution",
     "MRRKMetric",
     "RecCasesTable",
+    "BLEUMetric",
     "_registry",
 ]
diff --git a/src/evidently/metrics/bleu_metric.py b/src/evidently/metrics/bleu_metric.py
new file mode 100644
index 0000000000..9b46887725
--- /dev/null
+++ b/src/evidently/metrics/bleu_metric.py
@@ -0,0 +1,79 @@
+from typing import List
+from typing import Optional
+
+import numpy as np
+from nltk.translate.bleu_score import SmoothingFunction
+from nltk.translate.bleu_score import sentence_bleu
+
+from evidently.base_metric import InputData
+from evidently.base_metric import Metric
+from evidently.base_metric import MetricResult
+from evidently.core import IncludeTags
+from evidently.model.widget import BaseWidgetInfo
+from evidently.renderers.base_renderer import MetricRenderer
+from evidently.renderers.base_renderer import default_renderer
+from evidently.renderers.html_widgets import header_text
+from evidently.renderers.html_widgets import table_data
+
+
+class BLEUMetricResult(MetricResult):
+    """Per-row BLEU scores plus their dataset-level average."""
+
+    class Config:
+        type_alias = "evidently:metric_result:BLEUMetricResult"
+        field_tags = {
+            "bleu_scores": {IncludeTags.Current},
+            "average_bleu_score": {IncludeTags.Current},
+        }
+
+    # One sentence-level BLEU score per row of the current data.
+    bleu_scores: List[float]
+    # Mean of bleu_scores; 0.0 when the current data is empty.
+    average_bleu_score: float
+
+
+class BLEUMetric(Metric[BLEUMetricResult]):
+    """Compute sentence-level BLEU between a reference and a hypothesis column.
+
+    Texts are whitespace-tokenized; NLTK's method1 smoothing keeps short
+    sentences with missing n-gram orders from collapsing to zero.
+    """
+
+    class Config:
+        type_alias = "evidently:metric:BLEUMetric"
+
+    reference_column: str
+    hypothesis_column: str
+
+    def __init__(self, reference_column: str, hypothesis_column: str, options: Optional[dict] = None):
+        # Fields must go through the pydantic-based Metric constructor;
+        # assigning them before super().__init__ fails on pydantic models.
+        super().__init__(reference_column=reference_column, hypothesis_column=hypothesis_column, options=options)
+
+    def calculate(self, data: InputData) -> BLEUMetricResult:
+        reference_texts = data.current_data[self.reference_column]
+        hypothesis_texts = data.current_data[self.hypothesis_column]
+
+        # One smoothing instance for all rows; str() guards NaN/None cells.
+        smoothing = SmoothingFunction().method1
+        bleu_scores = [
+            float(sentence_bleu([str(ref).split()], str(hyp).split(), smoothing_function=smoothing))
+            for ref, hyp in zip(reference_texts, hypothesis_texts)
+        ]
+        # np.mean([]) yields NaN with a RuntimeWarning; report 0.0 instead.
+        average_bleu_score = float(np.mean(bleu_scores)) if bleu_scores else 0.0
+
+        return BLEUMetricResult(bleu_scores=bleu_scores, average_bleu_score=average_bleu_score)
+
+
+@default_renderer(wrap_type=BLEUMetric)
+class BLEUMetricRenderer(MetricRenderer):
+    """Renders per-row BLEU scores and their average as an HTML table."""
+
+    def render_html(self, obj: BLEUMetric) -> List[BaseWidgetInfo]:
+        result = obj.get_result()
+        headers = ["Row", "BLEU Score"]
+        rows = [[i, score] for i, score in enumerate(result.bleu_scores)]
+        rows.append(["Average", result.average_bleu_score])
+        return [
+            header_text(label="BLEU Scores"),
+            table_data(column_names=headers, data=rows, title="BLEU Scores per Row"),
+        ]
diff --git a/tests/metrics/test_bleu_metric.py b/tests/metrics/test_bleu_metric.py
new file mode 100644
index 0000000000..2366f8e8aa
--- /dev/null
+++ b/tests/metrics/test_bleu_metric.py
@@ -0,0 +1,30 @@
+from types import SimpleNamespace
+
+import pandas as pd
+import pytest
+
+from evidently.metrics.bleu_metric import BLEUMetric
+
+
+def test_bleu_metric():
+    data = pd.DataFrame(
+        {
+            "reference": ["the cat is on the mat", "there is a cat on the mat"],
+            "hypothesis": ["the cat is on the mat", "there is cat on mat"],
+        }
+    )
+    metric = BLEUMetric(reference_column="reference", hypothesis_column="hypothesis")
+    # calculate() only reads current_data, so a light stand-in is enough here.
+    result = metric.calculate(SimpleNamespace(current_data=data))
+    assert len(result.bleu_scores) == 2
+    assert result.bleu_scores[0] == pytest.approx(1.0, rel=1e-2)
+    assert result.bleu_scores[1] == pytest.approx(0.519671371303185, rel=1e-2)
+    assert result.average_bleu_score == pytest.approx(0.7598356856515925, rel=1e-2)
+
+
+def test_bleu_metric_empty_data():
+    data = pd.DataFrame({"reference": [], "hypothesis": []})
+    metric = BLEUMetric(reference_column="reference", hypothesis_column="hypothesis")
+    result = metric.calculate(SimpleNamespace(current_data=data))
+    assert result.bleu_scores == []
+    assert result.average_bleu_score == 0.0