diff --git a/src/evidently/metric_preset/text_evals.py b/src/evidently/metric_preset/text_evals.py
index 8750baadc4..2748eada4f 100644
--- a/src/evidently/metric_preset/text_evals.py
+++ b/src/evidently/metric_preset/text_evals.py
@@ -12,6 +12,7 @@
 from evidently.metric_preset.metric_preset import AnyMetric
 from evidently.metric_preset.metric_preset import MetricPreset
 from evidently.metrics import ColumnSummaryMetric
+from evidently.metrics.bleu_metric import BLEUMetric
 from evidently.utils.data_preprocessing import DataDefinition
 
 
@@ -37,4 +38,4 @@ def generate_metrics(
             OOV(),
             NonLetterCharacterPercentage(),
         ]
-        return [ColumnSummaryMetric(desc.on(self.column_name)) for desc in descriptors]
+        return [ColumnSummaryMetric(desc.on(self.column_name)) for desc in descriptors] + [BLEUMetric(reference_column="reference", hypothesis_column=self.column_name)]
diff --git a/src/evidently/metrics/__init__.py b/src/evidently/metrics/__init__.py
index c88a28babf..553a58d00f 100644
--- a/src/evidently/metrics/__init__.py
+++ b/src/evidently/metrics/__init__.py
@@ -72,6 +72,7 @@
 from .regression_performance.regression_performance_metrics import RegressionPerformanceMetrics
 from .regression_performance.regression_quality import RegressionQualityMetric
 from .regression_performance.top_error import RegressionTopErrorMetric
+from .bleu_metric import BLEUMetric
 
 __all__ = [
     "ClassificationClassBalance",
@@ -141,5 +142,6 @@
     "ScoreDistribution",
     "MRRKMetric",
     "RecCasesTable",
+    "BLEUMetric",
     "_registry",
 ]
diff --git a/src/evidently/metrics/bleu_metric.py b/src/evidently/metrics/bleu_metric.py
new file mode 100644
index 0000000000..9b46887725
--- /dev/null
+++ b/src/evidently/metrics/bleu_metric.py
@@ -0,0 +1,64 @@
+import numpy as np
+from typing import List, Optional
+from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
+
+from evidently.base_metric import InputData, Metric, MetricResult
+from evidently.core import IncludeTags
+from evidently.model.widget import BaseWidgetInfo
+from evidently.renderers.base_renderer import MetricRenderer
+from evidently.renderers.base_renderer import default_renderer
+from evidently.renderers.html_widgets import header_text, table_data
+
+
+class BLEUMetricResult(MetricResult):
+    class Config:
+        type_alias = "evidently:metric_result:BLEUMetricResult"
+        field_tags = {
+            "bleu_scores": {IncludeTags.Current},
+            "average_bleu_score": {IncludeTags.Current},
+        }
+
+    bleu_scores: List[float]
+    average_bleu_score: float
+
+
+class BLEUMetric(Metric[BLEUMetricResult]):
+    class Config:
+        type_alias = "evidently:metric:BLEUMetric"
+
+    reference_column: str
+    hypothesis_column: str
+
+    def __init__(self, reference_column: str, hypothesis_column: str, options: Optional[dict] = None):
+        self.reference_column = reference_column
+        self.hypothesis_column = hypothesis_column
+        super().__init__(options=options)
+
+    def calculate(self, data: InputData) -> BLEUMetricResult:
+        reference_texts = data.current_data[self.reference_column]
+        hypothesis_texts = data.current_data[self.hypothesis_column]
+
+        # Sentence-level BLEU on whitespace tokens; method1 smoothing keeps
+        # scores non-zero when a higher-order n-gram has no match.
+        bleu_scores = [
+            sentence_bleu([str(ref).split()], str(hyp).split(), smoothing_function=SmoothingFunction().method1)
+            for ref, hyp in zip(reference_texts, hypothesis_texts)
+        ]
+        average_bleu_score = float(np.mean(bleu_scores))
+
+        return BLEUMetricResult(bleu_scores=bleu_scores, average_bleu_score=average_bleu_score)
+
+
+@default_renderer(wrap_type=BLEUMetric)
+class BLEUMetricRenderer(MetricRenderer):
+    def render_html(self, obj: BLEUMetric) -> List[BaseWidgetInfo]:
+        result = obj.get_result()
+        headers = ["Row", "BLEU Score"]
+        # One table row per scored pair, plus a trailing row with the average.
+        data = [[i, score] for i, score in enumerate(result.bleu_scores)]
+        data.append(["Average", result.average_bleu_score])
+
+        return [
+            header_text(label="BLEU Scores"),
+            table_data(column_names=headers, data=data, title="BLEU Scores per Row"),
+        ]
diff --git a/tests/metrics/test_bleu_metric.py b/tests/metrics/test_bleu_metric.py
new file mode 100644
index 0000000000..2366f8e8aa
--- /dev/null
+++ b/tests/metrics/test_bleu_metric.py
@@ -0,0 +1,26 @@
+import pandas as pd
+import pytest
+
+from evidently.metrics.bleu_metric import BLEUMetric
+from evidently.report import Report
+
+
+def test_bleu_metric():
+    data = pd.DataFrame(
+        {
+            "reference": ["the cat is on the mat", "there is a cat on the mat"],
+            "hypothesis": ["the cat is on the mat", "there is cat on mat"],
+        }
+    )
+    metric = BLEUMetric(reference_column="reference", hypothesis_column="hypothesis")
+    # Metric.calculate expects InputData rather than a raw DataFrame, so run it through a Report.
+    report = Report(metrics=[metric])
+    report.run(current_data=data, reference_data=None)
+    result = metric.get_result()
+
+    assert len(result.bleu_scores) == 2
+    # An identical hypothesis/reference pair scores a perfect 1.0.
+    assert result.bleu_scores[0] == pytest.approx(1.0)
+    # The imperfect pair lands strictly between 0 and 1; the exact value depends on nltk's smoothing.
+    assert 0.0 < result.bleu_scores[1] < 1.0
+    assert result.average_bleu_score == pytest.approx(sum(result.bleu_scores) / len(result.bleu_scores))
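
A minimal usage sketch of the new metric, assuming the standard Evidently Report flow; the toy data and the "response" column name are illustrative, not part of this change:

    import pandas as pd

    from evidently.metrics import BLEUMetric
    from evidently.report import Report

    # Toy inputs: "reference" holds ground-truth texts, "response" the generated ones.
    data = pd.DataFrame(
        {
            "reference": ["the cat is on the mat"],
            "response": ["the cat sat on the mat"],
        }
    )

    report = Report(metrics=[BLEUMetric(reference_column="reference", hypothesis_column="response")])
    report.run(reference_data=None, current_data=data)
    report.save_html("bleu_report.html")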