
Commit 7fcaab3

Updated tgi_model and added parameters for endpoint_model (#208)
* Added image url parameter
* Fixed up tgi model config
* Undid tgi available check
* Adjust tgi parameter names, and checked for attr existence
* Fixed task Id in argparse
* Removed obfuscation from private functions, to allow inheritance to override
* Updated tgi model to inherit from endpoint and just modify client calls
* Added option to specify model id in config for tgi model
* Added option to specify custom env vars
* Updated env vars
* Applied ruff format
* Added docs + readme
* Ruff format
1 parent a3d1eea commit 7fcaab3

File tree

9 files changed (+109 / -111 lines)

README.md

Lines changed: 20 additions & 2 deletions
@@ -139,7 +139,7 @@ accelerate launch --multi_gpu --num_processes=<num_gpus> run_evals_accelerate.py
 --output_dir output_dir
 ```
 
-Examples of possible configuration files are provided in `examples/model_configs`.
+You can find the template of the expected model configuration in [examples/model_configs/base_model.yaml_](./examples/model_configs/base_model.yaml).
 
 ### Evaluating a large model with pipeline parallelism
 
@@ -182,6 +182,25 @@ python run_evals_accelerate.py \
 --output_dir output_dir
 ```
 
+### Evaluate the model on a server/container.
+
+An alternative to launching the evaluation locally is to serve the model on a TGI-compatible server/container and then run the evaluation by sending requests to the server. The command is the same as before, except you specify a path to a yaml config file (detailed below):
+
+```shell
+python run_evals_accelerate.py \
+    --model_config_path="/path/to/config/file"\
+    --tasks <task parameters> \
+    --output_dir output_dir
+```
+
+There are two types of configuration files that can be provided for running on the server:
+
+1. [endpoint_model.yaml](./examples/model_configs/endpoint_model.yaml): This configuration allows you to launch the model using [HuggingFace's Inference Endpoints](https://huggingface.co/inference-endpoints/dedicated). You can specify in the configuration file all the relevant parameters, and then `lighteval` will automatically deploy the endpoint, run the evaluation, and finally delete the endpoint (unless you specify an endpoint that was already launched, in which case the endpoint won't be deleted afterwards).
+
+2. [tgi_model.yaml](./examples/model_configs/tgi_model.yaml): This configuration lets you specify the URL of a model running in a TGI container, such as one deployed on HuggingFace's serverless inference.
+
+Templates for these configurations can be found in [examples/model_configs](./examples/model_configs/).
+
 ### Evaluate a model on extended, community, or custom tasks.
 
 Independently of the default tasks provided in `lighteval` that you will find in the `tasks_table.jsonl` file, you can use `lighteval` to evaluate models on tasks that require special processing (or have been added by the community). These tasks have their own evaluation suites and are defined as follows:
@@ -190,7 +209,6 @@ Independently of the default tasks provided in `lighteval` that you will find in
 * `community`: tasks that have been added by the community. See the [`community_tasks`](./community_tasks) folder for examples.
 * `custom`: tasks that are defined locally and not present in the core library. Use this suite if you want to experiment with designing a special metric or task.
 
-
 For example, to run an extended task like `ifeval`, you can run:
 ```shell
 python run_evals_accelerate.py \
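For orientation, here is a minimal sketch (not part of this diff) of how such a yaml config file is consumed on the lighteval side; it mirrors the `create_model_config` changes to `model_config.py` further down, and the file path is a placeholder:

```python
# Sketch only: mirrors the type-based dispatch added in src/lighteval/models/model_config.py below.
import yaml

from lighteval.models.model_config import TGIModelConfig

with open("examples/model_configs/tgi_model.yaml", "r") as f:  # placeholder path
    config = yaml.safe_load(f)["model"]

if config["type"] == "tgi":
    model_config = TGIModelConfig(
        inference_server_address=config["instance"]["inference_server_address"],
        inference_server_auth=config["instance"]["inference_server_auth"],
        model_id=config["instance"]["model_id"],
    )
# config["type"] == "endpoint" builds an InferenceEndpointModelConfig instead (see model_config.py below).
```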

examples/model_configs/endpoint_model.yaml

Lines changed: 4 additions & 1 deletion
@@ -5,7 +5,7 @@ model:
     model: "meta-llama/Llama-2-7b-hf"
     revision: "main"
     dtype: "float16" # can be any of "awq", "eetq", "gptq", "4bit' or "8bit" (will use bitsandbytes), "bfloat16" or "float16"
-    reuse_existing: false # if true, ignore all params in instance
+    reuse_existing: false # if true, ignore all params in instance, and don't delete the endpoint after evaluation
   instance:
     accelerator: "gpu"
     region: "eu-west-1"
@@ -15,5 +15,8 @@ model:
     framework: "pytorch"
     endpoint_type: "protected"
     namespace: null # The namespace under which to launch the endopint. Defaults to the current user's namespace
+    image_url: null # Optionally specify the docker image to use when launching the endpoint model. E.g., launching models with later releases of the TGI container with support for newer models.
+    env_vars:
+      null # Optional environment variables to include when launching the endpoint. e.g., `MAX_INPUT_LENGTH: 2048`
   generation:
     add_special_tokens: true
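The two new keys above feed into the endpoint's container definition. As a rough illustration (the values are made up; it mirrors `get_custom_env_vars()` in `model_config.py` and the custom image dict built in `endpoint_model.py` below):

```python
# Illustrative only: shows how env_vars / image_url from the yaml end up in the
# endpoint's custom image definition. The variable values here are placeholders.
env_vars = {"MAX_INPUT_LENGTH": 2048}  # as it would be parsed from the yaml
image_url = None  # None falls back to the default TGI image

# env_vars values are stringified before being passed to the container environment
custom_env_vars = {k: str(v) for k, v in env_vars.items()} if env_vars else {}

custom_image = {
    "env": {
        "MAX_TOTAL_TOKENS": "2048",
        "MODEL_ID": "/repository",
        **custom_env_vars,  # user-supplied variables are merged in last
    },
    "url": (image_url or "ghcr.io/huggingface/text-generation-inference:1.1.0"),
}
```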

examples/model_configs/tgi_model.yaml

Lines changed: 1 addition & 0 deletions
@@ -3,3 +3,4 @@ model:
   instance:
     inference_server_address: ""
     inference_server_auth: null
+    model_id: null # Optional, only required if the TGI container was launched with model_id pointing to a local directory

run_evals_accelerate.py

Lines changed: 3 additions & 2 deletions
@@ -20,10 +20,11 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-""" Example run command:
+"""Example run command:
 accelerate config
 accelerate launch run_evals_accelerate.py --tasks="leaderboard|hellaswag|5|1" --output_dir "/scratch/evals" --model_args "pretrained=gpt2"
 """
+
 import argparse
 
 from lighteval.main_accelerate import CACHE_DIR, main
@@ -70,7 +71,7 @@ def get_parser():
         "--tasks",
         type=str,
         default=None,
-        help="Id of a task, e.g. 'original|mmlu:abstract_algebra|5' or path to a texte file with a list of tasks",
+        help="Id of a task, e.g. 'original|mmlu:abstract_algebra|5|0' or path to a texte file with a list of tasks",
     )
     parser.add_argument("--num_fewshot_seeds", type=int, default=1, help="Number of trials the few shots")
     return parser

src/lighteval/models/endpoint_model.py

Lines changed: 18 additions & 17 deletions
@@ -92,8 +92,9 @@ def __init__(
                     "MAX_TOTAL_TOKENS": "2048",
                     "MODEL_ID": "/repository",
                     **config.get_dtype_args(),
+                    **config.get_custom_env_vars(),
                 },
-                "url": "ghcr.io/huggingface/text-generation-inference:1.1.0",
+                "url": (config.image_url or "ghcr.io/huggingface/text-generation-inference:1.1.0"),
             },
         )
         hlog("Deploying your endpoint. Please wait.")
@@ -149,7 +150,7 @@ def max_length(self):
             self._max_length = 2048
         return self._max_length
 
-    def __async_process_request(
+    def _async_process_request(
         self, context: str, stop_tokens: list[str], max_tokens: int
     ) -> Coroutine[None, list[TextGenerationOutput], str]:
         # Todo: add an option to launch with conversational instead for chat prompts
@@ -165,7 +166,7 @@ def __async_process_request(
 
         return generated_text
 
-    def __process_request(self, context: str, stop_tokens: list[str], max_tokens: int) -> TextGenerationOutput:
+    def _process_request(self, context: str, stop_tokens: list[str], max_tokens: int) -> TextGenerationOutput:
         # Todo: add an option to launch with conversational instead for chat prompts
         # https://huggingface.co/docs/huggingface_hub/v0.20.3/en/package_reference/inference_client#huggingface_hub.AsyncInferenceClient.conversational
         generated_text = self.client.text_generation(
@@ -179,13 +180,13 @@ def __process_request(self, context: str, stop_tokens: list[str], max_tokens: in
 
         return generated_text
 
-    async def __async_process_batch_generate(
+    async def _async_process_batch_generate(
         self,
         requests: list[GreedyUntilRequest],
     ) -> list[TextGenerationOutput]:
         return await asyncio.gather(
             *[
-                self.__async_process_request(
+                self._async_process_request(
                     context=request.context,
                     stop_tokens=as_list(request.stop_sequence),
                     max_tokens=request.generation_size,
@@ -194,25 +195,25 @@ async def __async_process_batch_generate(
             ]
         )
 
-    def __process_batch_generate(
+    def _process_batch_generate(
         self,
         requests: list[GreedyUntilRequest],
     ) -> list[TextGenerationOutput]:
         return [
-            self.__process_request(
+            self._process_request(
                 context=request.context,
                 stop_tokens=as_list(request.stop_sequence),
                 max_tokens=request.generation_size,
             )
             for request in requests
         ]
 
-    async def __async_process_batch_logprob(
+    async def _async_process_batch_logprob(
         self, requests: list[LoglikelihoodRequest], rolling: bool = False
     ) -> list[TextGenerationOutput]:
         return await asyncio.gather(
             *[
-                self.__async_process_request(
+                self._async_process_request(
                     context=request.context if rolling else request.context + request.choice,
                     stop_tokens=[],
                     max_tokens=1,
@@ -221,11 +222,11 @@ async def __async_process_batch_logprob(
             ]
         )
 
-    def __process_batch_logprob(
+    def _process_batch_logprob(
         self, requests: list[LoglikelihoodRequest], rolling: bool = False
     ) -> list[TextGenerationOutput]:
         return [
-            self.__process_request(
+            self._process_request(
                 context=request.context if rolling else request.context + request.choice,
                 stop_tokens=[],
                 max_tokens=1,
@@ -267,9 +268,9 @@ def greedy_until(
             )
 
             if self.use_async:
-                responses = asyncio.run(self.__async_process_batch_generate(batch))
+                responses = asyncio.run(self._async_process_batch_generate(batch))
             else:
-                responses = self.__process_batch_generate(batch)
+                responses = self._process_batch_generate(batch)
             for response in responses:
                 results.append(
                     GenerateReturn(
@@ -303,9 +304,9 @@ def loglikelihood(
 
         for batch in tqdm(dataloader, desc="Loglikelihoods", position=1, leave=False, disable=self.disable_tqdm):
             if self.use_async:
-                responses = asyncio.run(self.__async_process_batch_logprob(batch))
+                responses = asyncio.run(self._async_process_batch_logprob(batch))
             else:
-                responses = self.__process_batch_logprob(batch)
+                responses = self._process_batch_logprob(batch)
             for cur_request, response in zip(batch, responses):
                 cont_toks = torch.tensor(cur_request.tokenized_continuation)
                 len_choice = len(cont_toks)
@@ -351,9 +352,9 @@ def loglikelihood_rolling(
             dataloader, desc="Loglikelihoods, rolling", position=1, leave=False, disable=self.disable_tqdm
         ):
             if self.use_async:
-                responses = asyncio.run(self.__async_process_batch_logprob(batch, rolling=True))
+                responses = asyncio.run(self._async_process_batch_logprob(batch, rolling=True))
             else:
-                responses = self.__process_batch_logprob(batch, rolling=True)
+                responses = self._process_batch_logprob(batch, rolling=True)
             for response in responses:
                 logits = [t.logprob for t in response.details.tokens[:-1]]
 
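The renames above (double leading underscore to single underscore) correspond to the commit message's "Removed obfuscation from private functions, to allow inheritance to override". A small, generic illustration of why this matters, since Python name-mangles double-underscore attributes per class (this is standard Python behaviour, not code from this repository):

```python
class Parent:
    def __helper(self):        # mangled to _Parent__helper
        return "parent"

    def _helper_plain(self):   # single underscore: ordinary, overridable method
        return "parent"

    def run(self):
        # self.__helper is resolved to _Parent__helper at compile time,
        # so a subclass redefining __helper is never reached from here.
        return self.__helper(), self._helper_plain()


class Child(Parent):
    def __helper(self):        # becomes _Child__helper; does NOT override the parent
        return "child"

    def _helper_plain(self):   # overrides as expected
        return "child"


print(Child().run())  # ('parent', 'child')
```

With single underscores, the TGI model can inherit from the endpoint model and override only the client-call helpers, as the commit message describes.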
src/lighteval/models/model_config.py

Lines changed: 13 additions & 4 deletions
@@ -200,6 +200,7 @@ def init_configs(self, env_config: EnvConfig):
 class TGIModelConfig:
     inference_server_address: str
     inference_server_auth: str
+    model_id: str
 
 
 @dataclass
@@ -224,6 +225,8 @@ class InferenceEndpointModelConfig:
     add_special_tokens: bool = True
     revision: str = "main"
     namespace: str = None # The namespace under which to launch the endopint. Defaults to the current user's namespace
+    image_url: str = None
+    env_vars: dict = None
 
     def get_dtype_args(self) -> Dict[str, str]:
         model_dtype = self.model_dtype.lower()
@@ -237,14 +240,17 @@ def get_dtype_args(self) -> Dict[str, str]:
             return {"DTYPE": model_dtype}
         return {}
 
+    def get_custom_env_vars(self) -> Dict[str, str]:
+        return {k: str(v) for k, v in self.env_vars.items()} if self.env_vars else {}
+
     @staticmethod
     def nullable_keys() -> list[str]:
         """
         Returns the list of optional keys in an endpoint model configuration. By default, the code requires that all the
         keys be specified in the configuration in order to launch the endpoint. This function returns the list of keys
         that are not required and can remain None.
         """
-        return ["namespace"]
+        return ["namespace", "env_vars", "image_url"]
 
 
 def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]) -> BaseModelConfig: # noqa: C901
@@ -271,16 +277,17 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]
 
         return BaseModelConfig(**args_dict)
 
-    if args.model_config:
+    if hasattr(args, "model_config") and args.model_config:
         config = args.model_config["model"]
     else:
         with open(args.model_config_path, "r") as f:
             config = yaml.safe_load(f)["model"]
 
     if config["type"] == "tgi":
         return TGIModelConfig(
-            inference_server_address=args["instance"]["inference_server_address"],
-            inference_server_auth=args["instance"]["inference_server_auth"],
+            inference_server_address=config["instance"]["inference_server_address"],
+            inference_server_auth=config["instance"]["inference_server_auth"],
+            model_id=config["instance"]["model_id"],
         )
 
     if config["type"] == "endpoint":
@@ -303,6 +310,8 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]
             instance_size=config["instance"]["instance_size"],
             instance_type=config["instance"]["instance_type"],
             namespace=config["instance"]["namespace"],
+            image_url=config["instance"].get("image_url", None),
+            env_vars=config["instance"].get("env_vars", None),
         )
     return InferenceModelConfig(model=config["base_params"]["endpoint_name"])
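One behavioural note on the `hasattr` guard added above: the previous `if args.model_config:` raised `AttributeError` whenever the `Namespace` passed in did not define a `model_config` attribute at all (for instance when the calling script's parser never registers that option). A minimal sketch of the difference, using a hypothetical Namespace:

```python
from argparse import Namespace

# Hypothetical args: only a config path is present, no `model_config` attribute at all.
args = Namespace(model_config_path="examples/model_configs/tgi_model.yaml")

# Old check: `if args.model_config:` raises AttributeError here, because the attribute is missing.
# New check: falls through safely to reading the yaml file from disk.
if hasattr(args, "model_config") and args.model_config:
    config = args.model_config["model"]
else:
    print(f"would load {args.model_config_path} with yaml.safe_load")
```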

src/lighteval/models/model_loader.py

Lines changed: 4 additions & 2 deletions
@@ -88,10 +88,12 @@ def load_model_with_tgi(config: TGIModelConfig):
         raise ImportError(NO_TGI_ERROR_MSG)
 
     hlog(f"Load model from inference server: {config.inference_server_address}")
-    model = ModelClient(address=config.inference_server_address, auth_token=config.inference_server_auth)
+    model = ModelClient(
+        address=config.inference_server_address, auth_token=config.inference_server_auth, model_id=config.model_id
+    )
     model_name = str(model.model_info["model_id"])
     model_sha = model.model_info["model_sha"]
-    model_precision = model.model_info["dtype"]
+    model_precision = model.model_info["model_dtype"]
     model_size = -1
     model_info = ModelInfo(
         model_name=model_name,
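A hedged usage sketch of the updated loader path (not part of the diff): the server address and model id below are placeholders, and it assumes a TGI container is already serving a model.

```python
# Sketch under the assumption that a TGI container is already running and reachable.
from lighteval.models.model_config import TGIModelConfig
from lighteval.models.model_loader import load_model_with_tgi

config = TGIModelConfig(
    inference_server_address="http://localhost:8080",  # placeholder address
    inference_server_auth=None,
    model_id="meta-llama/Llama-2-7b-hf",  # only needed if the server was launched on a local directory
)

load_model_with_tgi(config)  # return value handling omitted; see model_loader.py above
```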
