4 changes: 2 additions & 2 deletions README.md
@@ -42,7 +42,7 @@ Install the dependencies. For the default installation, you just need:
pip install .
```

If you want to evaluate models with frameworks like `accelerate` or `peft`, you will need to specify the optional dependencies group that fits your use case (`accelerate`,`tgi`,`optimum`,`quantization`,`adapters`,`nanotron`):
If you want to evaluate models with frameworks like `accelerate` or `peft`, you will need to specify the optional dependencies group that fits your use case (`accelerate`,`optimum`,`quantization`,`adapters`,`nanotron`):

```bash
pip install '.[optional1,optional2]'
@@ -237,7 +237,7 @@ However, we are very grateful to the Harness and HELM teams for their continued
- [main_accelerate.py](https://github.com/huggingface/lighteval/blob/main/src/lighteval/main_accelerate.py) and [main_nanotron.py](https://github.com/huggingface/lighteval/blob/main/src/lighteval/main_nanotron.py) are our entry points to run evaluation
- [logging](https://github.com/huggingface/lighteval/tree/main/src/lighteval/logging): Our loggers, to display experiment information and push it to the hub after a run
- [metrics](https://github.com/huggingface/lighteval/tree/main/src/lighteval/metrics): All the available metrics you can use. They are described in metrics, and divided between sample metrics (applied at the sample level, such as a prediction accuracy) and corpus metrics (applied over the whole corpus). You'll also find available normalisation functions.
- [models](https://github.com/huggingface/lighteval/tree/main/src/lighteval/models): Possible models to use. We cover transformers (base_model), with adapter or delta weights, as well as TGI models locally deployed (it's likely the code here is out of date though), and brrr/nanotron models.
- [models](https://github.com/huggingface/lighteval/tree/main/src/lighteval/models): Possible models to use. We cover transformers (base_model), with adapter or delta weights, as well as inference endpoint models, and brrr/nanotron models.
- [tasks](https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks): Available tasks. The complete list is in `tasks_table.jsonl`, and you'll find all the prompts in `tasks_prompt_formatting.py`. Popular tasks requiring custom logic are exceptionally added in the [extended tasks](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/extended).
- [examples/tasks](https://github.com/huggingface/lighteval/tree/main/examples/tasks) contains a list of available tasks you can launch. We advise using tasks in the `recommended_set`, as it's possible that some of the other tasks need double checking.
- [tests](https://github.com/huggingface/lighteval/tree/main/tests) contains our test suite, that we run at each PR to prevent regressions in metrics/prompts/tasks, for a subset of important tasks.
2 changes: 1 addition & 1 deletion examples/model_configs/base_model.yaml
@@ -1,5 +1,5 @@
model:
type: "base" # can be base, tgi, or endpoint
type: "base" # can be base or endpoint
base_params:
model_args: "pretrained=HuggingFaceH4/zephyr-7b-beta,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ...
dtype: "bfloat16"
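
The `model_args` field above packs keyword arguments into a single comma-separated string. A minimal sketch of how such a string can be split into key/value pairs (lighteval's actual parsing may differ):

```python
def parse_model_args(model_args: str) -> dict:
    # Split a "key=value,key=value" string such as the one in base_model.yaml.
    # Values are kept as strings here; real parsing may cast booleans/ints.
    return dict(pair.split("=", 1) for pair in model_args.split(","))


print(parse_model_args("pretrained=HuggingFaceH4/zephyr-7b-beta,revision=main"))
# -> {'pretrained': 'HuggingFaceH4/zephyr-7b-beta', 'revision': 'main'}
```
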
2 changes: 1 addition & 1 deletion examples/model_configs/endpoint_model.yaml
@@ -1,5 +1,5 @@
model:
type: "endpoint" # can be base, tgi, or endpoint
type: "endpoint" # can be base or endpoint
base_params:
endpoint_name: "llama-2-7B-lighteval" # needs to be lower case without special characters
model: "meta-llama/Llama-2-7b-hf"
5 changes: 0 additions & 5 deletions examples/model_configs/tgi_model.yaml

This file was deleted.

1 change: 0 additions & 1 deletion pyproject.toml
@@ -78,7 +78,6 @@ dependencies = [

[project.optional-dependencies]
accelerate = ["accelerate"]
tgi = ["text-generation==0.6.0"]
optimum = ["optimum==1.12.0"]
quantization = ["bitsandbytes>=0.41.0", "auto-gptq>=0.4.2"]
adapters = ["peft==0.3.0"]
5 changes: 2 additions & 3 deletions src/lighteval/evaluator.py
@@ -25,20 +25,19 @@

import collections
import copy
from typing import Dict, Union
from typing import Dict

from pytablewriter import LatexTableWriter, MarkdownTableWriter

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.logging.hierarchical_logger import hlog
from lighteval.models.base_model import BaseModel
from lighteval.models.tgi_model import ModelClient
from lighteval.tasks.lighteval_task import LightevalTask
from lighteval.tasks.requests import Doc, Request, RequestType, TaskExampleId


def evaluate( # noqa: C901
lm: Union[BaseModel, ModelClient],
lm: BaseModel,
requests_dict: Dict[RequestType, list[Request]],
docs: Dict[TaskExampleId, Doc],
task_dict: Dict[str, LightevalTask],
6 changes: 3 additions & 3 deletions src/lighteval/main_accelerate.py
@@ -35,12 +35,12 @@
from lighteval.models.model_loader import load_model
from lighteval.tasks.lighteval_task import LightevalTask, create_requests_from_tasks
from lighteval.tasks.registry import Registry, taskinfo_selector
from lighteval.utils import is_accelerate_available, is_tgi_available
from lighteval.utils import is_accelerate_available
from lighteval.utils_parallelism import test_all_gather


if not is_accelerate_available() and not is_tgi_available():
hlog_warn("Using either accelerate or text-generation to run this script is advised.")
if not is_accelerate_available():
hlog_warn("Using accelerate to run this script is advised.")

TOKEN = os.getenv("HF_TOKEN")
CACHE_DIR = os.getenv("HF_HOME")
6 changes: 3 additions & 3 deletions src/lighteval/models/base_model.py
@@ -640,9 +640,9 @@ def _generate(

decoded_generations.append(decoded_generation)

if num_samples == 1: # We only return one item
result_generations = result_generations[0]
decoded_generations = decoded_generations[0]
# if num_samples == 1: # We only return one item
# result_generations = result_generations[0]
# decoded_generations = decoded_generations[0]

cur_response = GenerateReturn(
result=decoded_generations,
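
With the unwrapping above commented out, the generations stay lists even when a single sample is requested. A minimal sketch of the shape callers can now rely on, using a simplified stand-in for `GenerateReturn` (the field set is an assumption for illustration, not the real class):

```python
from dataclasses import dataclass, field


@dataclass
class GenerateReturn:  # simplified stand-in, not the real lighteval class
    result: list[str] = field(default_factory=list)


def first_generation(response: GenerateReturn) -> str:
    # Even for num_samples == 1 the result is a list, so callers index it.
    return response.result[0] if response.result else ""


print(first_generation(GenerateReturn(result=["a single sampled answer"])))
```
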
12 changes: 0 additions & 12 deletions src/lighteval/models/model_config.py
@@ -195,12 +195,6 @@ def init_configs(self, env_config: EnvConfig):
return self._init_configs(self.base_model, env_config)


@dataclass
class TGIModelConfig:
inference_server_address: str
inference_server_auth: str


@dataclass
class InferenceModelConfig:
model: str
@@ -275,12 +269,6 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]
with open(args.model_config_path, "r") as f:
config = yaml.safe_load(f)["model"]

if config["type"] == "tgi":
return TGIModelConfig(
inference_server_address=args["instance"]["inference_server_address"],
inference_server_auth=args["instance"]["inference_server_auth"],
)

if config["type"] == "endpoint":
reuse_existing_endpoint = config["base_params"]["reuse_existing"]
complete_config_endpoint = all(
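
With the `tgi` branch gone, `create_model_config` only needs to recognise the remaining YAML types. A rough sketch of the narrowed dispatch, assuming the config layout from the example files above (error handling and the real return objects are omitted):

```python
import yaml


def read_model_type(model_config_path: str) -> str:
    # Mirrors the yaml.safe_load(...)["model"] access shown in the diff;
    # only "base" and "endpoint" remain as accepted values.
    with open(model_config_path, "r") as f:
        config = yaml.safe_load(f)["model"]

    if config["type"] not in ("base", "endpoint"):
        raise ValueError(f"Unsupported model type: {config['type']}")
    return config["type"]
```
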
33 changes: 4 additions & 29 deletions src/lighteval/models/model_loader.py
@@ -35,10 +35,8 @@
EnvConfig,
InferenceEndpointModelConfig,
InferenceModelConfig,
TGIModelConfig,
)
from lighteval.models.tgi_model import ModelClient
from lighteval.utils import NO_TGI_ERROR_MSG, is_accelerate_available, is_tgi_available
from lighteval.utils import is_accelerate_available


if is_accelerate_available():
@@ -54,9 +52,9 @@ class ModelInfo:


def load_model( # noqa: C901
config: Union[BaseModelConfig, AdapterModelConfig, DeltaModelConfig, TGIModelConfig, InferenceEndpointModelConfig],
config: Union[BaseModelConfig, AdapterModelConfig, DeltaModelConfig, InferenceEndpointModelConfig],
env_config: EnvConfig,
) -> Tuple[Union[BaseModel, AdapterModel, DeltaModel, ModelClient], ModelInfo]:
) -> Tuple[Union[BaseModel, AdapterModel, DeltaModel], ModelInfo]:
"""Will load either a model from an inference server or a model from a checkpoint. depending
on the arguments passed to the program.

@@ -70,38 +68,15 @@ def load_model( # noqa: C901
ValueError: If you did not specify a base model when using delta weights or adapter weights

Returns:
Union[BaseModel, AdapterModel, DeltaModel, ModelClient]: The model that will be evaluated
Union[BaseModel, AdapterModel, DeltaModel]: The model that will be evaluated
"""
# Inference server loading
if isinstance(config, TGIModelConfig):
return load_model_with_tgi(config)

if isinstance(config, InferenceEndpointModelConfig) or isinstance(config, InferenceModelConfig):
return load_model_with_inference_endpoints(config, env_config=env_config)

if isinstance(config, BaseModelConfig):
return load_model_with_accelerate_or_default(config=config, env_config=env_config)


def load_model_with_tgi(config: TGIModelConfig):
if not is_tgi_available():
raise ImportError(NO_TGI_ERROR_MSG)

hlog(f"Load model from inference server: {config.inference_server_address}")
model = ModelClient(address=config.inference_server_address, auth_token=config.inference_server_auth)
model_name = str(model.model_info["model_id"])
model_sha = model.model_info["model_sha"]
model_precision = model.model_info["dtype"]
model_size = -1
model_info = ModelInfo(
model_name=model_name,
model_sha=model_sha,
model_dtype=model_precision,
model_size=model_size,
)
return model, model_info


def load_model_with_inference_endpoints(config: InferenceEndpointModelConfig, env_config: EnvConfig):
hlog("Spin up model using inference endpoint.")
model = InferenceEndpointModel(config=config, env_config=env_config)
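
After this change `load_model` dispatches only between inference endpoints and local checkpoints. A hedged usage sketch; the `BaseModelConfig` and `EnvConfig` constructor arguments shown here are assumptions, not verified signatures:

```python
from lighteval.models.model_config import BaseModelConfig, EnvConfig
from lighteval.models.model_loader import load_model

# Illustrative arguments only; check the config dataclasses for the exact fields.
config = BaseModelConfig(pretrained="HuggingFaceH4/zephyr-7b-beta")
env_config = EnvConfig(token=None, cache_dir=None)

# Returns Tuple[Union[BaseModel, AdapterModel, DeltaModel], ModelInfo];
# the ModelClient/TGI branch no longer exists.
model, model_info = load_model(config=config, env_config=env_config)
print(model_info.model_name, model_info.model_dtype)
```
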
143 changes: 0 additions & 143 deletions src/lighteval/models/tgi_model.py

This file was deleted.

7 changes: 0 additions & 7 deletions src/lighteval/utils.py
@@ -152,13 +152,6 @@ def is_accelerate_available() -> bool:
NO_ACCELERATE_ERROR_MSG = "You requested the use of accelerate for this evaluation, but it is not available in your current environement. Please install it using pip."


def is_tgi_available() -> bool:
return importlib.util.find_spec("text-generation") is not None


NO_TGI_ERROR_MSG = "You are trying to start a text generation inference endpoint, but text-generation is not present in your local environement. Please install it using pip."


def is_nanotron_available() -> bool:
return importlib.util.find_spec("nanotron") is not None

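
The deleted `is_tgi_available` followed the same availability-check pattern that the remaining helpers still use. A minimal sketch of that pattern (the generic helper name is ours; only the structure mirrors `utils.py`):

```python
import importlib.util


def is_package_available(module_name: str) -> bool:
    # Same idiom as is_accelerate_available / is_nanotron_available:
    # a backend counts as available if its module can be found on the path.
    return importlib.util.find_spec(module_name) is not None


print(is_package_available("accelerate"), is_package_available("nanotron"))
```
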