feat: Implement SemScore metric (#38)
carlosgjs authored Feb 6, 2024
1 parent 2a26151 commit fb36435
Showing 11 changed files with 70 additions and 31 deletions.
2 changes: 1 addition & 1 deletion .github/actions/deps/action.yaml
@@ -18,4 +18,4 @@ runs:
python3 -m pip install --upgrade pip
- name: Install project
shell: sh
-run: pip install ".[dev,train,cuda]"
+run: pip install ".[dev,pipelines,cuda]"
3 changes: 3 additions & 0 deletions .mypy.ini
@@ -11,3 +11,6 @@ ignore_missing_imports = True

[mypy-nltk.*]
ignore_missing_imports = True

+[mypy-sentence_transformers.*]
+ignore_missing_imports = True
4 changes: 2 additions & 2 deletions README.md
@@ -31,14 +31,14 @@ Once you have created a new environment, you can install this project for local
development using the following commands:

```
->> pip install -e .'[dev,train]'
+>> pip install -e .'[dev,pipelines]'
>> pre-commit install
>> conda install pandoc
```

Notes:
1) The single quotes around `'[dev]'` may not be required for your operating system.
-3) Look at `pyproject.toml` for other optional dependencies, e.g. you can do `pip install -e ."[dev,train,cuda]"` if you want to use CUDA.
+3) Look at `pyproject.toml` for other optional dependencies, e.g. you can do `pip install -e ."[dev,pipelines,cuda]"` if you want to use CUDA.
2) `pre-commit install` will initialize pre-commit for this local repository, so
that a set of tests will be run prior to completing a local commit. For more
information, see the Python Project Template documentation on
1 change: 1 addition & 0 deletions azureml/conda.yml
@@ -17,3 +17,4 @@ dependencies:
- nltk
# This works, while installing from pytorch and cuda from conda does not
- torch==2.0.1
+- sentence_transformers>=2.3.1
18 changes: 12 additions & 6 deletions notebooks/generate.ipynb
@@ -25,7 +25,7 @@
"# Uncomment to clone and install autodoc from GitHub\n",
"# !pip uninstall -y autora-doc\n",
"# !git clone https://github.com/AutoResearch/autodoc.git\n",
"# !pip install \"./autodoc[cuda,train]\"\n",
"# !pip install \"./autodoc[cuda,pipelines]\"\n",
"\n",
"# IMPORTANT: Please restart the runtime after running the above commands"
]
@@ -42,7 +42,7 @@
"%autoreload 2\n",
"from autora.doc.runtime.predict_hf import Predictor, preprocess_code\n",
"from autora.doc.runtime.prompts import PROMPTS, PromptIds, PromptBuilder, SYS_GUIDES\n",
"from autora.doc.pipelines.main import evaluate_documentation\n",
"from autora.doc.pipelines.metrics import eval_bleu_meteor, eval_semscore\n",
"from autora.doc.pipelines.main import eval_prompt, load_data"
]
},
@@ -111,9 +111,13 @@
" top_k=10,\n",
" num_ret_seq=1,\n",
" )\n",
" bleu, meteor = evaluate_documentation(output, [label])\n",
" bleu, meteor = eval_bleu_meteor(output, [label])\n",
" sem_score = eval_semscore(output, [label])\n",
"\n",
" for i, o in enumerate(output):\n",
" print(f\"{promptid}\\n******* Output {i} ********. bleu={bleu}, meteor={meteor}\\n{o}\\n*************\\n\")"
" print(\n",
" f\"{promptid}\\n******* Output {i} ********. bleu={bleu}, meteor={meteor}, sem_score={sem_score}\\n{o}\\n*************\\n\"\n",
" )"
]
},
{
@@ -176,8 +180,10 @@
},
"outputs": [],
"source": [
"out, bleu, meteor = eval_prompt(data_file, pred, prompt, {\"max_new_tokens\": 800.0})\n",
"print(f\"bleu={bleu}, meteor={meteor}\\n{out[0]}\\n*************\\n\")"
"eval_result = eval_prompt(data_file, pred, prompt, {\"max_new_tokens\": 800.0})\n",
"print(\n",
" f\"bleu={eval_result.bleu_score}, meteor={eval_result.meteor_score}, sem_score={eval_result.sem_score}\\n{eval_result.predictions[0]}\\n*************\\n\"\n",
")"
]
},
{
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -19,7 +19,6 @@ dependencies = [
# This works, while installing from pytorch and cuda from conda does not",
"torch==2.0.1",
"transformers>=4.37.2",
"nltk",
]

# On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes)
@@ -44,7 +43,7 @@ dev = [
"ipykernel",
"hf_transfer",
]
train = ["jsonlines", "mlflow"]
pipelines = ["jsonlines", "mlflow", "nltk", "sentence-transformers>=2.3.1"]
azure = ["azureml-core", "azureml-mlflow"]
cuda = ["bitsandbytes>=0.42.0", "accelerate>=0.24.1", "xformers"]

3 changes: 2 additions & 1 deletion src/autora/doc/classes/EvalResult.py
@@ -6,7 +6,8 @@
class EvalResult:
"""Class for storing LLM evaluation results"""

-prediction: List[str]
+predictions: List[str]
prompt: str
bleu_score: Optional[float] = None
meteor_score: Optional[float] = None
+sem_score: Optional[float] = None
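
With this change, `EvalResult` carries the generated texts (renamed from `prediction` to `predictions`) together with all three metric scores, including the new `sem_score`. A minimal sketch of constructing and reading the dataclass; the strings and score values below are hypothetical and only illustrate the fields defined in the diff above:

```python
from autora.doc.classes.EvalResult import EvalResult

# Hypothetical values for illustration only.
result = EvalResult(
    predictions=["Runs a Stroop experiment with counterbalanced conditions."],
    prompt="Generate documentation for the following code ...",
    bleu_score=0.31,
    meteor_score=0.78,
    sem_score=0.91,
)

# The three scores default to None, so consumers should guard against
# metrics that were not computed.
if result.sem_score is not None:
    print(f"SemScore: {result.sem_score:.3f}")
```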
20 changes: 7 additions & 13 deletions src/autora/doc/pipelines/main.py
@@ -7,7 +7,7 @@
import typer

from autora.doc.classes.EvalResult import EvalResult
-from autora.doc.pipelines.metrics import eval_bleu_meteor
+from autora.doc.pipelines.metrics import eval_bleu_meteor, eval_semscore
from autora.doc.runtime.predict_hf import Predictor
from autora.doc.runtime.prompts import PROMPTS, PromptIds
from autora.doc.util import get_prompts_from_file
@@ -52,14 +52,8 @@ def eval_prompts(
predictor = Predictor(model_path)
for i in range(len(prompts_list)):
logger.info(f"Starting to run model on prompt {i}")
-prediction_with_scores = eval_prompt(data_file, predictor, prompts_list[i], param_dict)
+eval_result = eval_prompt(data_file, predictor, prompts_list[i], param_dict)
logger.info(f"Model run completed on prompt {i}: {prompts_list[i]}")
-eval_result = EvalResult(
-prediction_with_scores[0],
-prompts_list[i],
-prediction_with_scores[1],
-prediction_with_scores[2],
-)
results_list.append(eval_result)
return results_list

@@ -72,7 +66,7 @@ def eval(
param: List[str] = typer.Option(
[], help="Additional float parameters to pass to the model as name=float pairs"
),
-) -> Tuple[List[str], float, float]:
+) -> EvalResult:
import mlflow

mlflow.autolog()
@@ -104,9 +98,7 @@ def load_data(data_file: str) -> Tuple[List[str], List[str]]:
return inputs, labels


-def eval_prompt(
-data_file: str, pred: Predictor, prompt: str, param_dict: Dict[str, float]
-) -> Tuple[List[str], float, float]:
+def eval_prompt(data_file: str, pred: Predictor, prompt: str, param_dict: Dict[str, float]) -> EvalResult:
import mlflow

inputs, labels = load_data(data_file)
@@ -115,6 +107,7 @@ def eval_prompt(
predictions = pred.predict(prompt, inputs, **param_dict)
timer_end = timer()
bleu, meteor = eval_bleu_meteor(predictions, labels)
+semscore = eval_semscore(predictions, labels)
pred_time = timer_end - timer_start
mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs)))
for i in range(len(inputs)):
@@ -133,7 +126,8 @@
mlflow.log_metric("tokens/sec", total_tokens / pred_time)
mlflow.log_metric("bleu_score", round(bleu, 5))
mlflow.log_metric("meteor_score", round(meteor, 5))
-return predictions, bleu, meteor
+mlflow.log_metric("semscore", round(semscore, 5))
+return EvalResult(predictions, prompt, bleu, meteor, semscore)


@app.command()
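`eval_prompt` (and the `eval` command) now return a single `EvalResult` instead of a `(predictions, bleu, meteor)` tuple, so callers read named fields rather than unpacking by position. A short sketch of the new calling pattern, mirroring the notebook cell shown earlier; the model id, prompt text, and data path are placeholders, not values taken from the repository:

```python
from autora.doc.pipelines.main import eval_prompt
from autora.doc.runtime.predict_hf import Predictor

# Hypothetical model id and prompt text; substitute a real HF model and a
# local .jsonl data file such as the ones under data/.
pred = Predictor("meta-llama/Llama-2-7b-chat-hf")
prompt = "You are a technical writer. Document the following experiment code."

eval_result = eval_prompt("data/sweetpea/data.jsonl", pred, prompt, {"max_new_tokens": 800.0})

# Named fields replace positional unpacking of the old tuple return.
print(f"bleu={eval_result.bleu_score}, meteor={eval_result.meteor_score}, sem_score={eval_result.sem_score}")
print(eval_result.predictions[0])
```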
23 changes: 22 additions & 1 deletion src/autora/doc/pipelines/metrics.py
@@ -3,6 +3,9 @@
import nltk
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
from nltk.translate.meteor_score import single_meteor_score
+from numpy import dot, mean, nan_to_num
+from numpy.linalg import norm
+from sentence_transformers import SentenceTransformer


def eval_bleu_meteor(predictions: List[str], references: List[str]) -> Tuple[float, float]:
@@ -27,6 +30,24 @@ def eval_bleu_meteor(predictions: List[str], references: List[str]) -> Tuple[flo
single_meteor_score(tokenized_ref, tokenized_pred)
for tokenized_ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
]
-meteor = sum(meteor_scores) / len(predictions) if predictions else 0
+meteor: float = nan_to_num(mean(meteor_scores), nan=0)

return (bleu, meteor)


+def eval_semscore(predictions: List[str], references: List[str]) -> float:
+    """
+    Calculate sentence embedding similarity score.
+    https://arxiv.org/pdf/2401.17072.pdf
+    """
+    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
+
+    def score(pred: str, ref: str) -> float:
+        encodings = model.encode([pred, ref])
+        assert len(encodings) == 2
+        cos_dist: float = dot(encodings[0], encodings[1]) / (norm(encodings[0]) * norm(encodings[1]))
+        return cos_dist
+
+    scores = [score(pred, ref) for pred, ref in zip(predictions, references)]
+    semscore: float = nan_to_num(mean(scores), nan=0)
+    return semscore
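
SemScore here is the average cosine similarity between the sentence embeddings of each prediction/reference pair, computed with the all-mpnet-base-v2 SentenceTransformer. A minimal usage sketch, assuming the new `pipelines` extra (which pulls in `sentence-transformers`) is installed; the example sentences are made up, and the first call downloads the embedding model:

```python
from autora.doc.pipelines.metrics import eval_semscore

# Made-up prediction/reference pair for illustration only.
predictions = ["The experiment counterbalances word and color congruency across trials."]
references = ["Word/color congruency is counterbalanced over the trial sequence."]

score = eval_semscore(predictions, references)
# Semantically equivalent texts score close to 1.0; unrelated texts score much lower.
print(f"SemScore: {score:.3f}")
```

Dividing by the product of the two embedding norms keeps the score in [-1, 1] regardless of how the embedding model scales its outputs.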
8 changes: 4 additions & 4 deletions tests/test_main.py
@@ -11,9 +11,9 @@

def test_predict() -> None:
data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
-outputs, _, _ = eval(str(data), TEST_HF_MODEL, PromptIds.SWEETP_1, [])
-assert len(outputs) == 3, "Expected 3 outputs"
-for output in outputs:
+eval_result = eval(str(data), TEST_HF_MODEL, PromptIds.SWEETP_1, [])
+assert len(eval_result.predictions) == 3, "Expected 3 outputs"
+for output in eval_result.predictions:
assert len(output) > 0, "Expected non-empty output"


@@ -42,5 +42,5 @@ def test_eval_prompts() -> None:
results: List[EvalResult] = eval_prompts(str(data_file), TEST_HF_MODEL, str(prompts_file), [])
assert len(results) == 3, "Expected 3 outputs"
for result in results:
-assert result.prediction is not None, "The prediction should not be None"
+assert result.predictions is not None, "The prediction should not be None"
assert result.prompt is not None, "The prompt should not be None"
16 changes: 15 additions & 1 deletion tests/test_metrics.py
@@ -3,7 +3,7 @@
import jsonlines
import pytest

-from autora.doc.pipelines.metrics import eval_bleu_meteor
+from autora.doc.pipelines.metrics import eval_bleu_meteor, eval_semscore


def test_evaluation() -> None:
@@ -53,3 +53,17 @@ def test_partially_matching_tokens() -> None:
bleu, meteor = eval_bleu_meteor(predictions, labels)
assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}"
assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"


+def test_semscore() -> None:
+    # Test Case: SemScore is close to 1
+    labels = ["this is really good"]
+    predictions = ["this is great"]
+    semscore = eval_semscore(predictions, labels)
+    assert semscore >= 0.6, f"SemScore is {semscore}"
+
+    semscore = eval_semscore(labels, labels)
+    assert semscore == pytest.approx(1.0), f"SemScore is {semscore}"
+
+    semscore = eval_semscore([], [])
+    assert semscore == 0, f"SemScore is {semscore}"
