diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 9ad55466a..bdca88841 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -9,6 +9,8 @@
 - sections:
   - local: saving-and-reading-results
     title: Save and read results
+  - local: use-litellm-as-backend
+    title: Use LiteLLM as backend
   - local: using-the-python-api
     title: Use the Python API
   - local: adding-a-custom-task
diff --git a/docs/source/use-litellm-as-backend.mdx b/docs/source/use-litellm-as-backend.mdx
new file mode 100644
index 000000000..96f264c54
--- /dev/null
+++ b/docs/source/use-litellm-as-backend.mdx
@@ -0,0 +1,81 @@
+# LiteLLM as backend
+
+Lighteval allows you to use LiteLLM, a backend that lets you call all LLM APIs
+using the OpenAI format (Bedrock, Hugging Face, VertexAI, TogetherAI, Azure,
+OpenAI, Groq, etc.).
+
+Documentation for available APIs and compatible endpoints can be found [here](https://docs.litellm.ai/docs/).
+
+## Quick use
+
+```bash
+lighteval endpoint litellm \
+    "gpt-3.5-turbo" \
+    "lighteval|gsm8k|0|0"
+```
+
+## Using a config file
+
+LiteLLM allows generation with any OpenAI-compatible endpoint; for example, you
+can evaluate a model running on a local vllm server.
+
+To do so, you will need a config file like the following:
+
+```yaml
+model:
+  base_params:
+    model_name: "openai/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
+    base_url: "URL OF THE ENDPOINT YOU WANT TO USE"
+    api_key: "" # remove or keep empty as needed
+  generation:
+    temperature: 0.5
+    max_new_tokens: 256
+    stop_tokens: [""]
+    top_p: 0.9
+    seed: 0
+    repetition_penalty: 1.0
+    frequency_penalty: 0.0
+```
+
+## Use Hugging Face Inference Providers
+
+You can also use this backend to reach Hugging Face Inference Providers; let's look at how to evaluate DeepSeek-R1-Distill-Qwen-32B.
+
+First, let's see how to access the model; this information is available on [the model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B).
+
+Step 1:
+
+![Step 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lighteval/litellm-guide-2.png)
+
+Step 2:
+
+![Step 2](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lighteval/litellm-guide-1.png)
+
+Great! Now we can simply copy and paste the `base_url` and our API key into the config to evaluate our model.
+
+> [!WARNING]
+> Do not forget to prepend the provider to the `model_name`. Here we use an
+> OpenAI-compatible endpoint, so the provider is `openai`.
+
+```yaml
+model:
+  base_params:
+    model_name: "openai/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
+    base_url: "https://router.huggingface.co/hf-inference/v1"
+    api_key: "YOUR KEY" # remove or keep empty as needed
+  generation:
+    temperature: 0.5
+    max_new_tokens: 256 # This will override the default from the task's config
+    top_p: 0.9
+    seed: 0
+    repetition_penalty: 1.0
+    frequency_penalty: 0.0
+```
+
+We can then evaluate our model on any task available in Lighteval:
+
+```bash
+lighteval endpoint litellm \
+    "examples/model_configs/litellm_model.yaml" \
+    "lighteval|gsm8k|0|0"
+```
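For readers who would rather build this configuration in Python than in YAML, here is a minimal sketch; it assumes the `LiteLLMModelConfig` and `GenerationParameters` classes added further down in this diff are importable, the model name and URL mirror the example above, and the API key is a placeholder:

```python
# Sketch only: the same configuration built in Python instead of YAML.
# Assumes lighteval with the classes added in this PR; "YOUR KEY" is a placeholder.
from lighteval.models.litellm_model import LiteLLMModelConfig
from lighteval.models.model_input import GenerationParameters

config = LiteLLMModelConfig(
    model="openai/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
    base_url="https://router.huggingface.co/hf-inference/v1",
    api_key="YOUR KEY",
    generation_parameters=GenerationParameters(
        temperature=0.5,
        max_new_tokens=256,
        top_p=0.9,
        seed=0,
    ),
)

# The YAML route shown above goes through the same class:
config_from_yaml = LiteLLMModelConfig.from_path("examples/model_configs/litellm_model.yaml")
```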
diff --git a/examples/model_configs/litellm_model.yaml b/examples/model_configs/litellm_model.yaml
new file mode 100644
index 000000000..6d2b7a30f
--- /dev/null
+++ b/examples/model_configs/litellm_model.yaml
@@ -0,0 +1,12 @@
+model:
+  base_params:
+    model_name: "openai/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
+    base_url: "https://router.huggingface.co/hf-inference/v1"
+  generation:
+    temperature: 0.5
+    max_new_tokens: 256
+    stop_tokens: [""]
+    top_p: 0.9
+    seed: 0
+    repetition_penalty: 1.0
+    frequency_penalty: 0.0
diff --git a/src/lighteval/main_endpoint.py b/src/lighteval/main_endpoint.py
index 858cdcde3..510442610 100644
--- a/src/lighteval/main_endpoint.py
+++ b/src/lighteval/main_endpoint.py
@@ -385,8 +385,11 @@ def tgi(
 @app.command(rich_help_panel="Evaluation Backends")
 def litellm(
     # === general ===
-    model_name: Annotated[
-        str, Argument(help="The model name to evaluate (has to be available through the litellm API.")
+    model_args: Annotated[
+        str,
+        Argument(
+            help="Config file path for the litellm model, or a comma-separated string of model args (model_name={},base_url={},provider={})"
+        ),
     ],
     tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")],
     # === Common parameters ===
@@ -462,7 +465,11 @@ def litellm(
     # TODO (nathan): better handling of model_args
     parallelism_manager = ParallelismManager.NONE

-    model_config = LiteLLMModelConfig(model=model_name)
+    if model_args.endswith(".yaml"):
+        model_config = LiteLLMModelConfig.from_path(model_args)
+    else:
+        model_name = model_args.split(",")[0].strip()
+        model_config = LiteLLMModelConfig(model=model_name)

     pipeline_params = PipelineParameters(
         launcher_type=parallelism_manager,
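To make the new `model_args` dispatch above concrete, here is a small illustrative sketch of how the two accepted forms resolve; `resolve_litellm_config` is a hypothetical helper name, not a function in this PR, and the branch bodies simply mirror the code added above:

```python
# Hypothetical helper mirroring the dispatch added in `litellm()` above.
from lighteval.models.litellm_model import LiteLLMModelConfig


def resolve_litellm_config(model_args: str) -> LiteLLMModelConfig:
    if model_args.endswith(".yaml"):
        # Config file path: model name, base_url, api_key and generation
        # parameters are all read from the YAML file.
        return LiteLLMModelConfig.from_path(model_args)
    # Comma-separated form: only the first item is used, verbatim, as the model name.
    model_name = model_args.split(",")[0].strip()
    return LiteLLMModelConfig(model=model_name)


resolve_litellm_config("examples/model_configs/litellm_model.yaml")  # from_path branch
resolve_litellm_config("gpt-3.5-turbo")  # plain model name branch
```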
diff --git a/src/lighteval/models/litellm_model.py b/src/lighteval/models/litellm_model.py
index 840061788..5c4235601 100644
--- a/src/lighteval/models/litellm_model.py
+++ b/src/lighteval/models/litellm_model.py
@@ -21,17 +21,18 @@
 # SOFTWARE.

 import logging
-import os
 import time
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
 from typing import Optional

+import yaml
 from tqdm import tqdm

 from lighteval.data import GenerativeTaskDataset
 from lighteval.models.abstract_model import LightevalModel
 from lighteval.models.endpoints.endpoint_model import ModelInfo
+from lighteval.models.model_input import GenerationParameters
 from lighteval.models.model_output import (
     GenerativeResponse,
     LoglikelihoodResponse,
@@ -63,6 +64,32 @@
 @dataclass
 class LiteLLMModelConfig:
     model: str
+    provider: Optional[str] = None
+    base_url: Optional[str] = None
+    api_key: Optional[str] = None
+    generation_parameters: GenerationParameters = None
+
+    def __post_init__(self):
+        if self.generation_parameters is None:
+            self.generation_parameters = GenerationParameters()
+
+    @classmethod
+    def from_path(cls, path):
+        with open(path, "r") as f:
+            config = yaml.safe_load(f)["model"]
+
+        model = config["base_params"]["model_name"]
+        provider = config["base_params"].get("provider", None)
+        base_url = config["base_params"].get("base_url", None)
+        api_key = config["base_params"].get("api_key", None)
+        generation_parameters = GenerationParameters.from_dict(config)
+        return cls(
+            model=model,
+            provider=provider,
+            base_url=base_url,
+            generation_parameters=generation_parameters,
+            api_key=api_key,
+        )


 class LiteLLMClient(LightevalModel):
@@ -79,15 +106,17 @@ def __init__(self, config, env_config) -> None:
             model_dtype=None,
             model_size="",
         )
-        self.provider = config.model.split("/")[0]
-        self.base_url = os.getenv(f"{self.provider.upper()}_BASE_URL", None)
+        self.model = config.model
+        self.provider = config.provider or config.model.split("/")[0]
+        self.base_url = config.base_url
+        self.api_key = config.api_key
+        self.generation_parameters = config.generation_parameters
+
         self.API_MAX_RETRY = 5
         self.API_RETRY_SLEEP = 3
         self.API_RETRY_MULTIPLIER = 2
         self.CONCURENT_CALLS = 20  # 100 leads to hitting Anthropic rate limits
-        self.TEMPERATURE = 0.3
-        self.TOP_P = 0.95
-        self.model = config.model
+
         self._tokenizer = encode
         self.pairwise_tokenization = False
         litellm.drop_params = True
@@ -126,18 +155,19 @@ def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_se
         kwargs = {
             "model": self.model,
             "messages": prompt,
-            "max_completion_tokens": max_new_tokens,
             "logprobs": return_logits if self.provider == "openai" else None,
             "base_url": self.base_url,
             "n": num_samples,
             "caching": True,
+            "api_key": self.api_key,
         }
         if "o1" in self.model:
             logger.warning("O1 models do not support temperature, top_p, stop sequence. Disabling.")
         else:
-            kwargs["temperature"] = self.TEMPERATURE
-            kwargs["top_p"] = self.TOP_P
-            kwargs["stop"] = stop_sequence
+            kwargs.update(self.generation_parameters.to_litellm_dict())
+
+        if kwargs.get("max_completion_tokens", None) is None:
+            kwargs["max_completion_tokens"] = max_new_tokens

         response = litellm.completion(**kwargs)
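As a quick check of how `from_path` and the provider fallback in `LiteLLMClient` interact, a sketch follows; it assumes the example YAML added earlier in this diff is present at that relative path, and the expected values in the comments are read off the diff rather than from a test run:

```python
# Sketch: inspecting the config produced from the example YAML added in this PR.
from lighteval.models.litellm_model import LiteLLMModelConfig

config = LiteLLMModelConfig.from_path("examples/model_configs/litellm_model.yaml")

print(config.model)     # "openai/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
print(config.base_url)  # "https://router.huggingface.co/hf-inference/v1"
print(config.provider)  # None: no provider key in the YAML, so LiteLLMClient falls
                        # back to the "openai" prefix of the model name
print(config.generation_parameters.temperature)  # 0.5
```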
diff --git a/src/lighteval/models/model_input.py b/src/lighteval/models/model_input.py
index 30288363d..b6859232e 100644
--- a/src/lighteval/models/model_input.py
+++ b/src/lighteval/models/model_input.py
@@ -32,15 +32,15 @@ class GenerationParameters:
     length_penalty: Optional[float] = None  # vllm, transformers
     presence_penalty: Optional[float] = None  # vllm
-    max_new_tokens: Optional[int] = None  # vllm, transformers, tgi
+    max_new_tokens: Optional[int] = None  # vllm, transformers, tgi, litellm
     min_new_tokens: Optional[int] = None  # vllm, transformers
-    seed: Optional[int] = None  # vllm, tgi
-    stop_tokens: Optional[list[str]] = None  # vllm, transformers, tgi
-    temperature: Optional[float] = None  # vllm, transformers, tgi
+    seed: Optional[int] = None  # vllm, tgi, litellm
+    stop_tokens: Optional[list[str]] = None  # vllm, transformers, tgi, litellm
+    temperature: Optional[float] = None  # vllm, transformers, tgi, litellm
     top_k: Optional[int] = None  # vllm, transformers, tgi
     min_p: Optional[float] = None  # vllm, transformers
-    top_p: Optional[int] = None  # vllm, transformers, tgi
+    top_p: Optional[int] = None  # vllm, transformers, tgi, litellm
     truncate_prompt: Optional[bool] = None  # vllm, tgi

     @classmethod
@@ -59,6 +59,24 @@ def from_dict(cls, config_dict: dict):
         """
         return GenerationParameters(**config_dict.get("generation", {}))

+    def to_litellm_dict(self) -> dict:
+        """Selects relevant generation and sampling parameters for litellm models.
+        Doc: https://docs.litellm.ai/docs/completion/input#input-params-1
+
+        Returns:
+            dict: The generation parameters to pass to `litellm.completion`.
+        """
+        args = {
+            "max_completion_tokens": self.max_new_tokens,
+            "stop": self.stop_tokens,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "seed": self.seed,
+            "repetition_penalty": self.repetition_penalty,
+            "frequency_penalty": self.frequency_penalty,
+        }
+        return {k: v for k, v in args.items() if v is not None}
+
     def to_vllm_dict(self) -> dict:
         """Selects relevant generation and sampling parameters for vllm models.
         Doc: https://docs.vllm.ai/en/v0.5.5/dev/sampling_params.html
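Finally, the filtering in `to_litellm_dict` is what lets the per-task `max_new_tokens` value survive when the config leaves it unset (see the `max_completion_tokens` fallback in `__call_api` above). A minimal sketch of the mapping, assuming a lighteval install that includes this patch:

```python
# Unset fields are dropped, and max_new_tokens is renamed to litellm's max_completion_tokens.
from lighteval.models.model_input import GenerationParameters

params = GenerationParameters(temperature=0.5, max_new_tokens=256, top_p=0.9)
print(params.to_litellm_dict())
# {'max_completion_tokens': 256, 'temperature': 0.5, 'top_p': 0.9}
```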