@@ -51,7 +51,7 @@ def huggingface_api_completions(
         token=constants.HUGGINGFACEHUB_API_TOKEN,
     )

-    default_kwargs = dict(do_sample=do_sample, options=dict(wait_for_model=True), return_full_text=False)
+    default_kwargs = dict(do_sample=do_sample, return_full_text=False)
     default_kwargs.update(kwargs)
     logging.info(f"Kwargs to completion: {default_kwargs}")

@@ -72,32 +72,33 @@ def huggingface_api_completions(
     )
     logging.info(f"Time for {n_examples} completions: {t}")

-    completions = [completion["generated_text"] for completion in completions]
-
     # unclear pricing
     price = [np.nan] * len(completions)
     avg_time = [t.duration / n_examples] * len(completions)

     return dict(completions=completions, price_per_example=price, time_per_example=avg_time)


-def inference_helper(prompt: str, inference, params, n_retries=100, waiting_time=2) -> dict:
+def inference_helper(prompt: str, inference, params, n_retries=100, waiting_time=2) -> str:
     for _ in range(n_retries):
-        output = inference(inputs=prompt, params=params)
-        if "error" in output and n_retries > 0:
-            error = output["error"]
-            if "Rate limit reached" in output["error"]:
-                logging.warning(f"Rate limit reached... Trying again in {waiting_time} seconds. Full error: {error}")
-                time.sleep(waiting_time)
-            elif "Input validation error" in error and "max_new_tokens" in error:
-                params["max_new_tokens"] = int(params["max_new_tokens"] * 0.8)
-                logging.warning(
-                    f"`max_new_tokens` too large. Reducing target length to {params['max_new_tokens']}, " f"Retrying..."
-                )
-                if params["max_new_tokens"] == 0:
+        try:
+            # TODO: check why doesn't stop after </s>
+            output = inference(prompt=prompt, **params)
+        except Exception as error:
+            if n_retries > 0:
+                if "Rate limit reached" in error:
+                    logging.warning(f"Rate limit reached... Trying again in {waiting_time} seconds.")
+                    time.sleep(waiting_time)
+                elif "Input validation error" in error and "max_new_tokens" in error:
+                    params["max_new_tokens"] = int(params["max_new_tokens"] * 0.8)
+                    logging.warning(
+                        f"`max_new_tokens` too large. Reducing target length to {params['max_new_tokens']}, "
+                        f"Retrying..."
+                    )
+                    if params["max_new_tokens"] == 0:
+                        raise ValueError(f"Error in inference. Full error: {error}")
+                else:
                     raise ValueError(f"Error in inference. Full error: {error}")
             else:
-                raise ValueError(f"Error in inference. Full error: {error}")
-        else:
-            return output[0]
-    raise ValueError(f"Error in inference. We tried {n_retries} times and failed.")
+                raise ValueError(f"Error in inference. We tried {n_retries} times and failed. Full error: {error}")
+    return output
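
Usage note (not part of the diff above): the reworked inference_helper raises on persistent failures and returns the generated string directly, so it presumably expects `inference` to be a callable that accepts `prompt=` plus generation kwargs. The pairing with huggingface_hub.InferenceClient.text_generation below, and the model id and token placeholders, are illustrative assumptions rather than anything this commit pins down.

from huggingface_hub import InferenceClient

# Placeholders only: substitute a real model id and token.
client = InferenceClient(model="<model-id>", token="<hf-token>")

completion = inference_helper(
    "Write one sentence about retry logic.",
    inference=client.text_generation,  # accepts prompt=... plus generation kwargs
    params=dict(max_new_tokens=64, do_sample=False),
)
print(completion)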