feat: Implement SemScore metric (#38)
carlosgjs authored Feb 6, 2024
1 parent 2a26151 commit fb36435
Showing 11 changed files with 70 additions and 31 deletions.
2 changes: 1 addition & 1 deletion .github/actions/deps/action.yaml
@@ -18,4 +18,4 @@ runs:
python3 -m pip install --upgrade pip
- name: Install project
shell: sh
-run: pip install ".[dev,train,cuda]"
+run: pip install ".[dev,pipelines,cuda]"
3 changes: 3 additions & 0 deletions .mypy.ini
@@ -11,3 +11,6 @@ ignore_missing_imports = True

[mypy-nltk.*]
ignore_missing_imports = True

+[mypy-sentence_transformers.*]
+ignore_missing_imports = True
4 changes: 2 additions & 2 deletions README.md
@@ -31,14 +31,14 @@ Once you have created a new environment, you can install this project for local
development using the following commands:

```
->> pip install -e .'[dev,train]'
+>> pip install -e .'[dev,pipelines]'
>> pre-commit install
>> conda install pandoc
```

Notes:
1) The single quotes around `'[dev]'` may not be required for your operating system.
-3) Look at `pyproject.toml` for other optional dependencies, e.g. you can do `pip install -e ."[dev,train,cuda]"` if you want to use CUDA.
+3) Look at `pyproject.toml` for other optional dependencies, e.g. you can do `pip install -e ."[dev,pipelines,cuda]"` if you want to use CUDA.
2) `pre-commit install` will initialize pre-commit for this local repository, so
that a set of tests will be run prior to completing a local commit. For more
information, see the Python Project Template documentation on
1 change: 1 addition & 0 deletions azureml/conda.yml
@@ -17,3 +17,4 @@ dependencies:
- nltk
# This works, while installing from pytorch and cuda from conda does not
- torch==2.0.1
+- sentence_transformers>=2.3.1
18 changes: 12 additions & 6 deletions notebooks/generate.ipynb
@@ -25,7 +25,7 @@
"# Uncomment to clone and install autodoc from GitHub\n",
"# !pip uninstall -y autora-doc\n",
"# !git clone https://github.com/AutoResearch/autodoc.git\n",
"# !pip install \"./autodoc[cuda,train]\"\n",
"# !pip install \"./autodoc[cuda,pipelines]\"\n",
"\n",
"# IMPORTANT: Please restart the runtime after running the above commands"
]
@@ -42,7 +42,7 @@
"%autoreload 2\n",
"from autora.doc.runtime.predict_hf import Predictor, preprocess_code\n",
"from autora.doc.runtime.prompts import PROMPTS, PromptIds, PromptBuilder, SYS_GUIDES\n",
"from autora.doc.pipelines.main import evaluate_documentation\n",
"from autora.doc.pipelines.metrics import eval_bleu_meteor, eval_semscore\n",
"from autora.doc.pipelines.main import eval_prompt, load_data"
]
},
@@ -111,9 +111,13 @@
" top_k=10,\n",
" num_ret_seq=1,\n",
" )\n",
" bleu, meteor = evaluate_documentation(output, [label])\n",
" bleu, meteor = eval_bleu_meteor(output, [label])\n",
" sem_score = eval_semscore(output, [label])\n",
"\n",
" for i, o in enumerate(output):\n",
" print(f\"{promptid}\\n******* Output {i} ********. bleu={bleu}, meteor={meteor}\\n{o}\\n*************\\n\")"
" print(\n",
" f\"{promptid}\\n******* Output {i} ********. bleu={bleu}, meteor={meteor}, sem_score={sem_score}\\n{o}\\n*************\\n\"\n",
" )"
]
},
{
@@ -176,8 +180,10 @@
},
"outputs": [],
"source": [
"out, bleu, meteor = eval_prompt(data_file, pred, prompt, {\"max_new_tokens\": 800.0})\n",
"print(f\"bleu={bleu}, meteor={meteor}\\n{out[0]}\\n*************\\n\")"
"eval_result = eval_prompt(data_file, pred, prompt, {\"max_new_tokens\": 800.0})\n",
"print(\n",
" f\"bleu={eval_result.bleu_score}, meteor={eval_result.meteor_score}, sem_score={eval_result.sem_score}\\n{eval_result.predictions[0]}\\n*************\\n\"\n",
")"
]
},
{
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -19,7 +19,6 @@ dependencies = [
# This works, while installing from pytorch and cuda from conda does not",
"torch==2.0.1",
"transformers>=4.37.2",
"nltk",
]

# On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes)
@@ -44,7 +43,7 @@ dev = [
"ipykernel",
"hf_transfer",
]
train = ["jsonlines", "mlflow"]
pipelines = ["jsonlines", "mlflow", "nltk", "sentence-transformers>=2.3.1"]
azure = ["azureml-core", "azureml-mlflow"]
cuda = ["bitsandbytes>=0.42.0", "accelerate>=0.24.1", "xformers"]

3 changes: 2 additions & 1 deletion src/autora/doc/classes/EvalResult.py
@@ -6,7 +6,8 @@
class EvalResult:
"""Class for storing LLM evaluation results"""

-prediction: List[str]
+predictions: List[str]
prompt: str
bleu_score: Optional[float] = None
meteor_score: Optional[float] = None
+sem_score: Optional[float] = None
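
With this change, `EvalResult` carries the generated texts (renamed from `prediction` to `predictions`) together with all three metric scores, including the new `sem_score`. A minimal sketch of constructing and reading the dataclass; the strings and score values below are hypothetical and only illustrate the fields defined in the diff above:

```python
from autora.doc.classes.EvalResult import EvalResult

# Hypothetical values for illustration only.
result = EvalResult(
    predictions=["Runs a Stroop experiment with counterbalanced conditions."],
    prompt="Generate documentation for the following code ...",
    bleu_score=0.31,
    meteor_score=0.78,
    sem_score=0.91,
)

# The three scores default to None, so consumers should guard against
# metrics that were not computed.
if result.sem_score is not None:
    print(f"SemScore: {result.sem_score:.3f}")
```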
20 changes: 7 additions & 13 deletions src/autora/doc/pipelines/main.py
@@ -7,7 +7,7 @@
import typer

from autora.doc.classes.EvalResult import EvalResult
-from autora.doc.pipelines.metrics import eval_bleu_meteor
+from autora.doc.pipelines.metrics import eval_bleu_meteor, eval_semscore
from autora.doc.runtime.predict_hf import Predictor
from autora.doc.runtime.prompts import PROMPTS, PromptIds
from autora.doc.util import get_prompts_from_file
@@ -52,14 +52,8 @@ def eval_prompts(
predictor = Predictor(model_path)
for i in range(len(prompts_list)):
logger.info(f"Starting to run model on prompt {i}")
-prediction_with_scores = eval_prompt(data_file, predictor, prompts_list[i], param_dict)
+eval_result = eval_prompt(data_file, predictor, prompts_list[i], param_dict)
logger.info(f"Model run completed on prompt {i}: {prompts_list[i]}")
-eval_result = EvalResult(
-prediction_with_scores[0],
-prompts_list[i],
-prediction_with_scores[1],
-prediction_with_scores[2],
-)
results_list.append(eval_result)
return results_list

@@ -72,7 +66,7 @@ def eval(
param: List[str] = typer.Option(
[], help="Additional float parameters to pass to the model as name=float pairs"
),
-) -> Tuple[List[str], float, float]:
+) -> EvalResult:
import mlflow

mlflow.autolog()
@@ -104,9 +98,7 @@ def load_data(data_file: str) -> Tuple[List[str], List[str]]:
return inputs, labels


-def eval_prompt(
-data_file: str, pred: Predictor, prompt: str, param_dict: Dict[str, float]
-) -> Tuple[List[str], float, float]:
+def eval_prompt(data_file: str, pred: Predictor, prompt: str, param_dict: Dict[str, float]) -> EvalResult:
import mlflow

inputs, labels = load_data(data_file)
@@ -115,6 +107,7 @@ def eval_prompt(
predictions = pred.predict(prompt, inputs, **param_dict)
timer_end = timer()
bleu, meteor = eval_bleu_meteor(predictions, labels)
+semscore = eval_semscore(predictions, labels)
pred_time = timer_end - timer_start
mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs)))
for i in range(len(inputs)):
@@ -133,7 +126,8 @@
mlflow.log_metric("tokens/sec", total_tokens / pred_time)
mlflow.log_metric("bleu_score", round(bleu, 5))
mlflow.log_metric("meteor_score", round(meteor, 5))
-return predictions, bleu, meteor
+mlflow.log_metric("semscore", round(semscore, 5))
+return EvalResult(predictions, prompt, bleu, meteor, semscore)


@app.command()
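`eval_prompt` (and the `eval` command) now return a single `EvalResult` instead of a `(predictions, bleu, meteor)` tuple, so callers read named fields rather than unpacking by position. A short sketch of the new calling pattern, mirroring the notebook cell shown earlier; the model id, prompt text, and data path are placeholders, not values taken from the repository:

```python
from autora.doc.pipelines.main import eval_prompt
from autora.doc.runtime.predict_hf import Predictor

# Hypothetical model id and prompt text; substitute a real HF model and a
# local .jsonl data file such as the ones under data/.
pred = Predictor("meta-llama/Llama-2-7b-chat-hf")
prompt = "You are a technical writer. Document the following experiment code."

eval_result = eval_prompt("data/sweetpea/data.jsonl", pred, prompt, {"max_new_tokens": 800.0})

# Named fields replace positional unpacking of the old tuple return.
print(f"bleu={eval_result.bleu_score}, meteor={eval_result.meteor_score}, sem_score={eval_result.sem_score}")
print(eval_result.predictions[0])
```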
23 changes: 22 additions & 1 deletion src/autora/doc/pipelines/metrics.py
@@ -3,6 +3,9 @@
import nltk
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
from nltk.translate.meteor_score import single_meteor_score
+from numpy import dot, mean, nan_to_num
+from numpy.linalg import norm
+from sentence_transformers import SentenceTransformer


def eval_bleu_meteor(predictions: List[str], references: List[str]) -> Tuple[float, float]:
@@ -27,6 +30,24 @@ def eval_bleu_meteor(predictions: List[str], references: List[str]) -> Tuple[flo
single_meteor_score(tokenized_ref, tokenized_pred)
for tokenized_ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
]
-meteor = sum(meteor_scores) / len(predictions) if predictions else 0
+meteor: float = nan_to_num(mean(meteor_scores), nan=0)

return (bleu, meteor)


+def eval_semscore(predictions: List[str], references: List[str]) -> float:
+    """
+    Calculate sentence embedding similarity score.
+    https://arxiv.org/pdf/2401.17072.pdf
+    """
+    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
+
+    def score(pred: str, ref: str) -> float:
+        encodings = model.encode([pred, ref])
+        assert len(encodings) == 2
+        cos_dist: float = dot(encodings[0], encodings[1]) / (norm(encodings[0]) * norm(encodings[1]))
+        return cos_dist
+
+    scores = [score(pred, ref) for pred, ref in zip(predictions, references)]
+    semscore: float = nan_to_num(mean(scores), nan=0)
+    return semscore
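
SemScore here is the average cosine similarity between the sentence embeddings of each prediction/reference pair, computed with the all-mpnet-base-v2 SentenceTransformer. A minimal usage sketch, assuming the new `pipelines` extra (which pulls in `sentence-transformers`) is installed; the example sentences are made up, and the first call downloads the embedding model:

```python
from autora.doc.pipelines.metrics import eval_semscore

# Made-up prediction/reference pair for illustration only.
predictions = ["The experiment counterbalances word and color congruency across trials."]
references = ["Word/color congruency is counterbalanced over the trial sequence."]

score = eval_semscore(predictions, references)
# Semantically equivalent texts score close to 1.0; unrelated texts score much lower.
print(f"SemScore: {score:.3f}")
```

Dividing by the product of the two embedding norms keeps the score in [-1, 1] regardless of how the embedding model scales its outputs.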
8 changes: 4 additions & 4 deletions tests/test_main.py
@@ -11,9 +11,9 @@

def test_predict() -> None:
data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
-outputs, _, _ = eval(str(data), TEST_HF_MODEL, PromptIds.SWEETP_1, [])
-assert len(outputs) == 3, "Expected 3 outputs"
-for output in outputs:
+eval_result = eval(str(data), TEST_HF_MODEL, PromptIds.SWEETP_1, [])
+assert len(eval_result.predictions) == 3, "Expected 3 outputs"
+for output in eval_result.predictions:
assert len(output) > 0, "Expected non-empty output"


@@ -42,5 +42,5 @@ def test_eval_prompts() -> None:
results: List[EvalResult] = eval_prompts(str(data_file), TEST_HF_MODEL, str(prompts_file), [])
assert len(results) == 3, "Expected 3 outputs"
for result in results:
-assert result.prediction is not None, "The prediction should not be None"
+assert result.predictions is not None, "The prediction should not be None"
assert result.prompt is not None, "The prompt should not be None"
16 changes: 15 additions & 1 deletion tests/test_metrics.py
@@ -3,7 +3,7 @@
import jsonlines
import pytest

-from autora.doc.pipelines.metrics import eval_bleu_meteor
+from autora.doc.pipelines.metrics import eval_bleu_meteor, eval_semscore


def test_evaluation() -> None:
@@ -53,3 +53,17 @@ def test_partially_matching_tokens() -> None:
bleu, meteor = eval_bleu_meteor(predictions, labels)
assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}"
assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"


+def test_semscore() -> None:
+    # Test Case: SemScore is close to 1
+    labels = ["this is really good"]
+    predictions = ["this is great"]
+    semscore = eval_semscore(predictions, labels)
+    assert semscore >= 0.6, f"SemScore is {semscore}"
+
+    semscore = eval_semscore(labels, labels)
+    assert semscore == pytest.approx(1.0), f"SemScore is {semscore}"
+
+    semscore = eval_semscore([], [])
+    assert semscore == 0, f"SemScore is {semscore}"
