3 changes: 2 additions & 1 deletion src/evidently/metric_preset/text_evals.py
@@ -12,6 +12,7 @@
 from evidently.metric_preset.metric_preset import AnyMetric
 from evidently.metric_preset.metric_preset import MetricPreset
+from evidently.metrics import BLEUMetric
 from evidently.metrics import ColumnSummaryMetric
 from evidently.utils.data_preprocessing import DataDefinition


@@ -37,4 +38,4 @@ def generate_metrics(
             OOV(),
             NonLetterCharacterPercentage(),
         ]
-        return [ColumnSummaryMetric(desc.on(self.column_name)) for desc in descriptors]
+        return [ColumnSummaryMetric(desc.on(self.column_name)) for desc in descriptors] + [BLEUMetric(reference_column="reference", hypothesis_column=self.column_name)]
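
For context, a minimal sketch of how the updated preset would be exercised. It assumes the preset class in this file is TextEvals, that Report is the usual entry point, and that the current data really does carry the hardcoded "reference" column; the data values are illustrative:

import pandas as pd

from evidently.metric_preset import TextEvals
from evidently.report import Report

current = pd.DataFrame(
    {
        "response": ["the cat is on the mat"],
        "reference": ["the cat sat on the mat"],
    }
)

# The preset now emits one ColumnSummaryMetric per descriptor plus a
# BLEUMetric that compares the hardcoded "reference" column against
# the preset's target column.
report = Report(metrics=[TextEvals(column_name="response")])
report.run(reference_data=None, current_data=current)

Because "reference" is hardcoded, the preset would raise a KeyError on any dataset without such a column; promoting it to a constructor parameter is worth considering.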
2 changes: 2 additions & 0 deletions src/evidently/metrics/__init__.py
@@ -72,6 +72,7 @@
 from .regression_performance.regression_performance_metrics import RegressionPerformanceMetrics
 from .regression_performance.regression_quality import RegressionQualityMetric
 from .regression_performance.top_error import RegressionTopErrorMetric
+from .bleu_metric import BLEUMetric

 __all__ = [
     "ClassificationClassBalance",
@@ -141,5 +142,6 @@
     "ScoreDistribution",
     "MRRKMetric",
     "RecCasesTable",
+    "BLEUMetric",
     "_registry",
 ]
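
With this export in place, BLEUMetric can be imported from evidently.metrics directly, which is what the preset import above relies on.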
70 changes: 70 additions & 0 deletions src/evidently/metrics/bleu_metric.py
@@ -0,0 +1,70 @@
from typing import List
from typing import Optional

import numpy as np
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.bleu_score import sentence_bleu

from evidently.base_metric import InputData
from evidently.base_metric import Metric
from evidently.base_metric import MetricResult
from evidently.core import IncludeTags
from evidently.model.widget import BaseWidgetInfo
from evidently.renderers.base_renderer import MetricRenderer
from evidently.renderers.base_renderer import default_renderer
from evidently.renderers.html_widgets import header_text
from evidently.renderers.html_widgets import table_data


class BLEUMetricResult(MetricResult):
    class Config:
        type_alias = "evidently:metric_result:BLEUMetricResult"
        field_tags = {
            "bleu_scores": {IncludeTags.Current},
            "average_bleu_score": {IncludeTags.Current},
        }

    bleu_scores: List[float]
    average_bleu_score: float


class BLEUMetric(Metric[BLEUMetricResult]):
    class Config:
        type_alias = "evidently:metric:BLEUMetric"

    reference_column: str
    hypothesis_column: str

    def __init__(self, reference_column: str, hypothesis_column: str, options: Optional[dict] = None):
        self.reference_column = reference_column
        self.hypothesis_column = hypothesis_column
        super().__init__(options=options)

    def calculate(self, data: InputData) -> BLEUMetricResult:
        # Coerce both columns to strings so missing values do not break tokenization.
        reference_texts = data.current_data[self.reference_column].fillna("").astype(str)
        hypothesis_texts = data.current_data[self.hypothesis_column].fillna("").astype(str)

        # Whitespace tokenization; smoothing method1 keeps the score nonzero
        # when a higher-order n-gram has no match in the reference.
        bleu_scores = [
            sentence_bleu([ref.split()], hyp.split(), smoothing_function=SmoothingFunction().method1)
            for ref, hyp in zip(reference_texts, hypothesis_texts)
        ]
        average_bleu_score = float(np.mean(bleu_scores)) if bleu_scores else 0.0

        return BLEUMetricResult(bleu_scores=bleu_scores, average_bleu_score=average_bleu_score)


@default_renderer(wrap_type=BLEUMetric)
class BLEUMetricRenderer(MetricRenderer):
    def render_html(self, obj: BLEUMetric) -> List[BaseWidgetInfo]:
        result = obj.get_result()
        headers = ["Row", "BLEU Score"]
        data = [[i, round(score, 4)] for i, score in enumerate(result.bleu_scores)]
        data.append(["Average", round(result.average_bleu_score, 4)])

        return [
            header_text(label="BLEU Scores"),
            table_data(column_names=headers, data=data, title="BLEU Scores per Row"),
        ]
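
A minimal standalone usage sketch for the new metric, again assuming the standard Report entry point; the column names and data are illustrative:

import pandas as pd

from evidently.metrics import BLEUMetric
from evidently.report import Report

current = pd.DataFrame(
    {
        "reference": ["the cat is on the mat"],
        "hypothesis": ["the cat is on mat"],
    }
)

report = Report(
    metrics=[BLEUMetric(reference_column="reference", hypothesis_column="hypothesis")]
)
# The metric reads only current_data, so no reference dataset is required.
report.run(reference_data=None, current_data=current)
report.show()

Note that sentence_bleu is applied to whitespace-split tokens, so scores are case- and punctuation-sensitive; normalizing both columns beforehand may help on noisy text.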
27 changes: 27 additions & 0 deletions tests/metrics/test_bleu_metric.py
@@ -0,0 +1,27 @@
from types import SimpleNamespace

import pandas as pd
import pytest

from evidently.metrics.bleu_metric import BLEUMetric


def test_bleu_metric():
    current = pd.DataFrame(
        {
            "reference": ["the cat is on the mat", "there is a cat on the mat"],
            "hypothesis": ["the cat is on the mat", "there is cat on mat"],
        }
    )
    metric = BLEUMetric(reference_column="reference", hypothesis_column="hypothesis")
    # calculate() only reads data.current_data, so a simple namespace stands
    # in for the full InputData object here.
    result = metric.calculate(SimpleNamespace(current_data=current))

    assert len(result.bleu_scores) == 2
    # An identical reference/hypothesis pair scores a perfect 1.0.
    assert result.bleu_scores[0] == pytest.approx(1.0)
    # The second pair's exact score depends on NLTK's smoothing internals,
    # so only sanity bounds are asserted.
    assert 0.0 < result.bleu_scores[1] < 1.0
    assert result.average_bleu_score == pytest.approx(sum(result.bleu_scores) / 2)
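
The test only touches calculate(), so it runs without building a full report; locally it can be executed with python -m pytest tests/metrics/test_bleu_metric.py -v (nltk must be installed, since bleu_metric imports it at module load).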