
Commit

refactor: Move metrics to a different module, rename function (#37)
carlosgjs authored Feb 6, 2024
1 parent 7891902 commit 2a26151
Showing 4 changed files with 91 additions and 85 deletions.
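For callers, the refactor is an import-path and name change only: the metric computation itself moves verbatim into a new module. A minimal before/after sketch of a call site, based on the imports and calls shown in the diff below (the predictions and labels variables are illustrative):

# Before this commit: the helper lived in the pipeline module
# from autora.doc.pipelines.main import evaluate_documentation
# bleu, meteor = evaluate_documentation(predictions, labels)

# After this commit: the helper is moved to its own module and renamed
from autora.doc.pipelines.metrics import eval_bleu_meteor

bleu, meteor = eval_bleu_meteor(predictions, labels)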
33 changes: 2 additions & 31 deletions src/autora/doc/pipelines/main.py
@@ -3,13 +3,11 @@
from timeit import default_timer as timer
from typing import Dict, List, Tuple

import nltk
import torch
import typer
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
from nltk.translate.meteor_score import single_meteor_score

from autora.doc.classes.EvalResult import EvalResult
from autora.doc.pipelines.metrics import eval_bleu_meteor
from autora.doc.runtime.predict_hf import Predictor
from autora.doc.runtime.prompts import PROMPTS, PromptIds
from autora.doc.util import get_prompts_from_file
@@ -22,33 +20,6 @@
logger = logging.getLogger(__name__)


def evaluate_documentation(predictions: List[str], references: List[str]) -> Tuple[float, float]:
    nltk.download("wordnet")

    # Tokenize references
    tokenized_references = [ref.split() for ref in references]
    # Currently there is only one prediction per reference; averaging may be needed in the future
    tokenized_predictions = [pred.split() if pred else [] for pred in predictions]

    # Calculate BLEU score with a smoothing function
    # SmoothingFunction().method1 avoids zero scores for n-grams not found in the reference.
    bleu = corpus_bleu(
        # Wrap each reference list in another list
        [[tokenized_ref] for tokenized_ref in tokenized_references],
        tokenized_predictions,
        smoothing_function=SmoothingFunction().method1,
    )

    # Calculate METEOR scores
    meteor_scores = [
        single_meteor_score(tokenized_ref, tokenized_pred)
        for tokenized_ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
    ]
    meteor = sum(meteor_scores) / len(predictions) if predictions else 0

    return (bleu, meteor)


@app.command(help="Evaluate a model for code-to-documentation generation for all prompts in the prompts_file")
def eval_prompts(
    data_file: str = typer.Argument(..., help="JSONL Data file to evaluate on"),
@@ -143,7 +114,7 @@ def eval_prompt(
    timer_start = timer()
    predictions = pred.predict(prompt, inputs, **param_dict)
    timer_end = timer()
    bleu, meteor = evaluate_documentation(predictions, labels)
    bleu, meteor = eval_bleu_meteor(predictions, labels)
    pred_time = timer_end - timer_start
    mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs)))
    for i in range(len(inputs)):
32 changes: 32 additions & 0 deletions src/autora/doc/pipelines/metrics.py
@@ -0,0 +1,32 @@
from typing import List, Tuple

import nltk
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
from nltk.translate.meteor_score import single_meteor_score


def eval_bleu_meteor(predictions: List[str], references: List[str]) -> Tuple[float, float]:
    nltk.download("wordnet")

    # Tokenize references
    tokenized_references = [ref.split() for ref in references]
    # Currently there is only one prediction per reference; averaging may be needed in the future
    tokenized_predictions = [pred.split() if pred else [] for pred in predictions]

    # Calculate BLEU score with a smoothing function
    # SmoothingFunction().method1 avoids zero scores for n-grams not found in the reference.
    bleu = corpus_bleu(
        # Wrap each reference list in another list
        [[tokenized_ref] for tokenized_ref in tokenized_references],
        tokenized_predictions,
        smoothing_function=SmoothingFunction().method1,
    )

    # Calculate METEOR scores
    meteor_scores = [
        single_meteor_score(tokenized_ref, tokenized_pred)
        for tokenized_ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
    ]
    meteor = sum(meteor_scores) / len(predictions) if predictions else 0

    return (bleu, meteor)
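The new helper can be smoke-tested on toy data. A self-contained sketch, assuming the package is installed; the example strings are made up, and the required nltk "wordnet" resource is downloaded inside the function itself:

from autora.doc.pipelines.metrics import eval_bleu_meteor

# Toy ground-truth docs and model outputs; one prediction per reference
references = ["this function adds two numbers", "returns the mean of a list"]
predictions = ["this function adds two numbers", "returns the average of a list"]

# Returns (BLEU, METEOR): corpus BLEU with smoothing method1, and METEOR averaged
# over the prediction/reference pairs
bleu, meteor = eval_bleu_meteor(predictions, references)
print(f"BLEU={bleu:.3f}, METEOR={meteor:.3f}")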
56 changes: 2 additions & 54 deletions tests/test_main.py
@@ -1,11 +1,8 @@
from pathlib import Path
from typing import Dict, List

import jsonlines
import pytest
from typing import List

from autora.doc.classes.EvalResult import EvalResult
from autora.doc.pipelines.main import eval, eval_prompts, evaluate_documentation, generate, import_data
from autora.doc.pipelines.main import eval, eval_prompts, generate, import_data
from autora.doc.runtime.prompts import PromptIds

# dummy HF model for testing
@@ -20,55 +17,6 @@ def test_predict() -> None:
    assert len(output) > 0, "Expected non-empty output"


def test_evaluation() -> None:
    # Test case: METEOR and BLEU scores are close to 1 when predictions match the references exactly
    data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
    with jsonlines.open(data) as reader:
        items = [item for item in reader]
        labels = [item["output"] for item in items]
        predictions = [item["output"] for item in items]

    bleu, meteor = evaluate_documentation(predictions, labels)
    assert bleu == pytest.approx(1, 0.01), f"BLEU Score is {bleu}"
    assert meteor == pytest.approx(1, 0.01), f"METEOR Score is {meteor}"


def test_extra_token_in_prediction() -> None:
    # An extra token lowers BLEU via n-gram precision; METEOR is robust to small mistakes
    labels = ["this is a test"]
    predictions = ["this is a test extra"]
    bleu, meteor = evaluate_documentation(predictions, labels)
    assert 0.6 <= bleu <= 0.8, f"BLEU Score is {bleu}"
    assert 0.8 <= meteor <= 1, f"METEOR Score is {meteor}"


def test_missing_token_in_prediction() -> None:
    # A missing token lowers BLEU (brevity penalty applies); METEOR remains higher
    labels = ["this is a test"]
    predictions = ["this is a"]
    bleu, meteor = evaluate_documentation(predictions, labels)
    assert 0.4 <= bleu <= 0.6, f"BLEU Score is {bleu}"
    assert 0.6 <= meteor <= 0.8, f"METEOR Score is {meteor}"


def test_completely_different_tokens() -> None:
    # Both scores are low since the prediction shares no tokens with the reference
    labels = ["this is a test"]
    predictions = ["completely different sentence"]
    bleu, meteor = evaluate_documentation(predictions, labels)
    assert bleu <= 0.1, f"BLEU Score is {bleu}"
    assert meteor <= 0.1, f"METEOR Score is {meteor}"


def test_partially_matching_tokens() -> None:
    # Higher-order n-grams don't match because of the inserted token, so BLEU drops sharply; METEOR still scores well
    labels = ["this is a test"]
    predictions = ["this is a different test"]
    bleu, meteor = evaluate_documentation(predictions, labels)
    assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}"
    assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"


def test_generate() -> None:
    python_file = __file__
    output = Path("output.txt")
55 changes: 55 additions & 0 deletions tests/test_metrics.py
@@ -0,0 +1,55 @@
from pathlib import Path

import jsonlines
import pytest

from autora.doc.pipelines.metrics import eval_bleu_meteor


def test_evaluation() -> None:
    # Test case: METEOR and BLEU scores are close to 1 when predictions match the references exactly
    data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
    with jsonlines.open(data) as reader:
        items = [item for item in reader]
        labels = [item["output"] for item in items]
        predictions = [item["output"] for item in items]

    bleu, meteor = eval_bleu_meteor(predictions, labels)
    assert bleu == pytest.approx(1, 0.01), f"BLEU Score is {bleu}"
    assert meteor == pytest.approx(1, 0.01), f"METEOR Score is {meteor}"


def test_extra_token_in_prediction() -> None:
    # An extra token lowers BLEU via n-gram precision; METEOR is robust to small mistakes
    labels = ["this is a test"]
    predictions = ["this is a test extra"]
    bleu, meteor = eval_bleu_meteor(predictions, labels)
    assert 0.6 <= bleu <= 0.8, f"BLEU Score is {bleu}"
    assert 0.8 <= meteor <= 1, f"METEOR Score is {meteor}"


def test_missing_token_in_prediction() -> None:
    # A missing token lowers BLEU (brevity penalty applies); METEOR remains higher
    labels = ["this is a test"]
    predictions = ["this is a"]
    bleu, meteor = eval_bleu_meteor(predictions, labels)
    assert 0.4 <= bleu <= 0.6, f"BLEU Score is {bleu}"
    assert 0.6 <= meteor <= 0.8, f"METEOR Score is {meteor}"


def test_completely_different_tokens() -> None:
    # Both scores are low since the prediction shares no tokens with the reference
    labels = ["this is a test"]
    predictions = ["completely different sentence"]
    bleu, meteor = eval_bleu_meteor(predictions, labels)
    assert bleu <= 0.1, f"BLEU Score is {bleu}"
    assert meteor <= 0.1, f"METEOR Score is {meteor}"


def test_partially_matching_tokens() -> None:
    # Higher-order n-grams don't match because of the inserted token, so BLEU drops sharply; METEOR still scores well
    labels = ["this is a test"]
    predictions = ["this is a different test"]
    bleu, meteor = eval_bleu_meteor(predictions, labels)
    assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}"
    assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"
