[Docs, Bugfix] Update Logo and Tokenizer Bug Fix (#11)
* logo

* readme

* Tokenizer bug fix

* logo: update logo

* make format

* revert make format for config.py
anmolagarwalcp810 authored Aug 30, 2024
1 parent 09d935e commit d5f18de
Showing 12 changed files with 47 additions and 19 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -1,9 +1,9 @@
<!-- <p align="center">
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="docs/_static/logo/dark.png">
<img alt="vLLM" src="docs/_static/logo/light.png" width=50%>
</picture>
</p> -->
</p>

<h3 align="center">
Tool to benchmark LLM Inference Systems
Binary file modified docs/_static/logo/dark.png
Binary file modified docs/_static/logo/light.png
14 changes: 7 additions & 7 deletions etalon/capacity_search/config/config.py
@@ -33,20 +33,24 @@ def to_config_dict(self):
class ModelConfig:
name: str
identifier: str
tokenizer: str = None
parallel_specs: List[str] = field(default_factory=list)
traces: List[str] = field(default_factory=list)

def get_key(self):
return f"{self.name}"

def get_human_readable_name(self):
return f"Model: {self.name}"
return f"Model: {self.name}, Tokenizer: {self.tokenizer}"

def to_config_dict(self):
return {"model_name": self.identifier}
return {"model_name": self.identifier, "tokenizer_name": self.tokenizer}

def to_args(self):
return f"--model {self.identifier}"
command = f"--model {self.identifier}"
if self.tokenizer:
command += f" --tokenizer {self.tokenizer}"
return command

def is_parallel_spec_valid(self, spec_name: str) -> bool:
return not self.parallel_specs or spec_name in self.parallel_specs
@@ -312,10 +316,6 @@ def generate_job_configs(cls, config: dict):
and server_config.openai_server_engine
in ["vllm", "lightllm", "fastertransformers", "sarathi-serve"]
)
or (
model_config.name != "gpt-3.5-turbo"
and server_config.openai_server_engine == "default"
)
or (
request_generator_config.trace_file_name == "sharegpt"
and request_config.request_generator_max_tokens == 16384
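The `ModelConfig` changes above add an optional `tokenizer` field and thread it into the generated CLI arguments. Below is a minimal, self-contained sketch of the new `to_args()` behaviour reconstructed from the hunk (the dataclass is trimmed to the relevant parts, and the model/tokenizer names are placeholders):

```python
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class ModelConfig:
    name: str
    identifier: str
    tokenizer: Optional[str] = None
    parallel_specs: List[str] = field(default_factory=list)
    traces: List[str] = field(default_factory=list)

    def to_args(self) -> str:
        command = f"--model {self.identifier}"
        if self.tokenizer:
            # --tokenizer is only emitted when a tokenizer is explicitly set.
            command += f" --tokenizer {self.tokenizer}"
        return command


# With an explicit tokenizer (placeholder names):
cfg = ModelConfig(
    name="llama-70b",
    identifier="meta-llama/Llama-2-70b-hf",
    tokenizer="hf-internal-testing/llama-tokenizer",
)
print(cfg.to_args())
# --model meta-llama/Llama-2-70b-hf --tokenizer hf-internal-testing/llama-tokenizer

# Without one, the output matches the pre-change behaviour:
print(ModelConfig(name="llama-70b", identifier="meta-llama/Llama-2-70b-hf").to_args())
# --model meta-llama/Llama-2-70b-hf
```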
2 changes: 1 addition & 1 deletion etalon/core/hf_utils.py
@@ -26,7 +26,7 @@ def get_tokenizer(
)
except TypeError as e:
# The LLaMA tokenizer causes a protobuf error in some environments.
err_msg = "Failed to load the tokenizer."
err_msg = "Failed to load the tokenizer. If model name is correct, consider setting --tokenizer CLI arg to equivalent model on HuggingFace."
raise RuntimeError(err_msg) from e
except ValueError as e:
# If the error pertains to the tokenizer class not existing or not
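The improved error message points users at the new `--tokenizer` flag when tokenizer loading fails. For context, a hedged sketch of the surrounding error-handling shape, assuming `get_tokenizer` wraps `AutoTokenizer.from_pretrained` (only the `except TypeError` branch is visible in this diff; the real function has a richer signature and handles more cases):

```python
from transformers import AutoTokenizer


def get_tokenizer(tokenizer_name: str, **kwargs):
    # Sketch only: etalon.core.hf_utils.get_tokenizer takes more parameters.
    try:
        return AutoTokenizer.from_pretrained(tokenizer_name, **kwargs)
    except TypeError as e:
        # The LLaMA tokenizer causes a protobuf error in some environments.
        err_msg = (
            "Failed to load the tokenizer. If model name is correct, consider "
            "setting --tokenizer CLI arg to equivalent model on HuggingFace."
        )
        raise RuntimeError(err_msg) from e
```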
10 changes: 7 additions & 3 deletions etalon/core/llm_clients/__init__.py
@@ -10,7 +10,11 @@


def construct_clients(
model_name: str, llm_api: str, num_clients: int, use_ray: bool = True
model_name: str,
tokenizer_name: str,
llm_api: str,
num_clients: int,
use_ray: bool = True,
) -> List[BaseLLMClient]:
"""Construct LLMClients that will be used to make requests to the LLM API.
@@ -36,8 +40,8 @@ def construct_clients(
)

if use_ray:
clients = [impl.remote(model_name) for _ in range(num_clients)]
clients = [impl.remote(model_name, tokenizer_name) for _ in range(num_clients)]
else:
clients = [impl(model_name) for _ in range(num_clients)]
clients = [impl(model_name, tokenizer_name) for _ in range(num_clients)]

return clients
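A usage sketch of the updated `construct_clients` signature (import path taken from this file's location; the model, tokenizer, and `llm_api` values are placeholders):

```python
from etalon.core.llm_clients import construct_clients

# Each client now loads the explicitly named tokenizer instead of deriving it
# from the model identifier, which helps when the served model has no
# corresponding HuggingFace tokenizer repo.
clients = construct_clients(
    model_name="meta-llama/Llama-2-70b-hf",                 # placeholder
    tokenizer_name="hf-internal-testing/llama-tokenizer",   # placeholder
    llm_api="openai",                                       # placeholder API name
    num_clients=2,
    use_ray=True,
)
```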
5 changes: 3 additions & 2 deletions etalon/core/llm_clients/base_llm_client.py
@@ -9,9 +9,10 @@
class BaseLLMClient:
"""A client for making requests to a LLM API e.g Anyscale Endpoints."""

def __init__(self, model_name: str) -> None:
def __init__(self, model_name: str, tokenizer_name: str) -> None:
self.model_name = model_name
self.tokenizer = get_tokenizer(
model_name,
tokenizer_name,
trust_remote_code=True,
)

4 changes: 2 additions & 2 deletions etalon/core/llm_clients/openai_chat_completions_client.py
@@ -19,8 +19,8 @@
class OpenAIChatCompletionsClient(BaseLLMClient):
"""Client for OpenAI Chat Completions API."""

def __init__(self, model_name: str) -> None:
super().__init__(model_name)
def __init__(self, model_name: str, tokenizer_name: str) -> None:
super().__init__(model_name, tokenizer_name)
self.client = httpx.AsyncClient()

def total_tokens(self, response_list: List[str]) -> int:
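Direct construction of a client now takes both names as well; a short sketch under the assumption that the class is importable from the path shown above (values are placeholders, e.g. a proprietary model paired with an equivalent HuggingFace tokenizer):

```python
from etalon.core.llm_clients.openai_chat_completions_client import (
    OpenAIChatCompletionsClient,
)

# Decoupling the tokenizer from the model name lets a served model without a
# HuggingFace tokenizer reuse an equivalent open one for token counting.
client = OpenAIChatCompletionsClient(
    model_name="gpt-3.5-turbo",                             # placeholder
    tokenizer_name="hf-internal-testing/llama-tokenizer",   # placeholder
)
```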
2 changes: 2 additions & 0 deletions etalon/core/requests_launcher.py
@@ -13,6 +13,7 @@ class RequestsLauncher:
def __init__(
self,
model: str,
tokenizer_name: str,
llm_api: str,
num_ray_clients: int,
num_concurrent_requests_per_client: int,
@@ -23,6 +24,7 @@ def __init__(
AsyncRequestsManager.remote(
client_id=client_id,
model=model,
tokenizer_name=tokenizer_name,
llm_api=llm_api,
max_concurrent_requests=num_concurrent_requests_per_client,
)
8 changes: 7 additions & 1 deletion etalon/core/requests_manager.py
@@ -15,14 +15,20 @@ class AsyncRequestsManager:
"""Manages requests for single LLM API client."""

def __init__(
self, client_id: int, model: str, llm_api: str, max_concurrent_requests: int
self,
client_id: int,
model: str,
tokenizer_name: str,
llm_api: str,
max_concurrent_requests: int,
):
self.max_concurrent_requests = max_concurrent_requests
self.requests_queue = asyncio.Queue(maxsize=max_concurrent_requests)
self.results = []
# just create a single client per manager
self.llm_client = construct_clients(
model_name=model,
tokenizer_name=tokenizer_name,
llm_api=llm_api,
num_clients=1,
use_ray=False,
1 change: 1 addition & 0 deletions etalon/prefill_profiler.py
@@ -81,6 +81,7 @@ def run(self):
os.makedirs(run_dir, exist_ok=True)
run_benchmark(
model=self.args.model,
tokenizer_name=self.args.tokenizer,
output_dir=run_dir,
additional_sampling_params=self.args.additional_sampling_params,
num_ray_clients=PREFILL_NUM_RAY_CLIENTS,
16 changes: 15 additions & 1 deletion etalon/run_benchmark.py
@@ -107,6 +107,7 @@ async def collect_results(

async def run_main_loop(
model: str,
tokenizer_name: str,
llm_api: str,
tokenizer: Any,
additional_sampling_params: Optional[Dict[str, Any]] = None,
@@ -123,6 +124,7 @@
):
req_launcher = RequestsLauncher(
model=model,
tokenizer_name=tokenizer_name,
llm_api=llm_api,
num_ray_clients=num_ray_clients,
num_concurrent_requests_per_client=num_concurrent_requests_per_client,
@@ -185,6 +187,7 @@ async def run_main_loop(

def run_benchmark(
model: str,
tokenizer_name: str,
output_dir: str,
additional_sampling_params: Optional[Dict[str, Any]] = None,
num_ray_clients: int = 2,
@@ -239,7 +242,7 @@ def run_benchmark(
)

tokenizer = get_tokenizer(
model,
tokenizer_name=tokenizer_name,
trust_remote_code=True,
)

@@ -265,6 +268,7 @@
asyncio.run(
run_main_loop(
model=model,
tokenizer_name=tokenizer_name,
llm_api=llm_api,
tokenizer=tokenizer,
additional_sampling_params=additional_sampling_params,
@@ -300,6 +304,12 @@ def parse_args():
args.add_argument(
"--model", type=str, required=True, help="The model to use for this load test."
)
args.add_argument(
"--tokenizer",
type=str,
required=False,
help="The tokenizer to use for this load test. By default, the tokenizer is inferred from the model.",
)
args.add_argument(
"--num-ray-clients",
type=int,
@@ -591,6 +601,9 @@ def parse_args():

args = args.parse_args()

if args.tokenizer is None:
args.tokenizer = args.model

if not args.should_use_given_dir:
benchmark_identifier = f"{args.model}_{args.request_interval_generator_provider}_{args.request_length_generator_provider}"
benchmark_identifier = re.sub(r"[^\w\d-]+", "-", benchmark_identifier)
@@ -629,6 +642,7 @@ def parse_args():
llm_api=args.llm_api,
output_dir=args.output_dir,
model=args.model,
tokenizer_name=args.tokenizer,
timeout=args.timeout,
max_num_completed_requests=args.max_num_completed_requests,
num_ray_clients=args.num_ray_clients,
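Taken together, `--tokenizer` is a new optional flag that falls back to `--model` when omitted, so existing invocations keep working unchanged. A self-contained sketch of that fallback, mirrored with a plain `argparse` parser (the model name is a placeholder):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model", type=str, required=True, help="The model to use for this load test."
)
parser.add_argument(
    "--tokenizer",
    type=str,
    required=False,
    help="The tokenizer to use for this load test. "
    "By default, the tokenizer is inferred from the model.",
)

args = parser.parse_args(["--model", "meta-llama/Llama-2-70b-hf"])  # placeholder

# Mirrors the fallback added after parse_args(): an unset tokenizer defaults
# to the model identifier.
if args.tokenizer is None:
    args.tokenizer = args.model

print(args.tokenizer)  # meta-llama/Llama-2-70b-hf
```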
