diff --git a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/actuator.py b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/actuator.py
index a9e5dc30..ec38f1fe 100644
--- a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/actuator.py
+++ b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/actuator.py
@@ -3,8 +3,8 @@
 import json
 import logging
-import os
 import uuid
+from pathlib import Path

 import ray
 import yaml
@@ -54,14 +54,27 @@ def catalog(
     ) -> ExperimentCatalog:
         """Returns the Experiments your actuator provides"""

-        # The catalog be formed in code here or read from a file containing the Experiments models
-        # This shows reading from a file
+        # Load experiment definitions from the YAML files contained in the `experiments` directory.
+        # NOTE: only plain files may be placed in the experiments directory (subdirectories are skipped),
+        # but each file can contain multiple experiment definitions
+        curr_path = Path(__file__)
+        exp_dir = curr_path.parent / Path("experiments")
+        logger.debug(f"Experiments dir {exp_dir.absolute()}")
+        experiments = []
+        for exp_file in exp_dir.iterdir():
+            if exp_file.is_dir():
+                continue
+
+            logger.debug(f"Loading experiments from {exp_file.name}")
+            try:
+                file_data = exp_file.read_text()
+                data = yaml.safe_load(file_data)
+            except yaml.YAMLError:
+                error_message = f"File {exp_file.name} contains malformed YAML"
+                logger.error(error_message)
+                raise ValueError(error_message)
-        path = os.path.abspath(__file__)
-        path = os.path.split(path)[0]
-        with open(os.path.join(path, "experiments.yaml")) as f:
-            data = yaml.safe_load(f)
-        experiments = [Experiment(**data[e]) for e in data]
+            experiments.extend([Experiment.model_validate(data[e]) for e in data])

         return ExperimentCatalog(
             catalogIdentifier=cls.identifier,
@@ -176,7 +189,11 @@ async def submit(
         if experiment.deprecated is True:
             raise DeprecatedExperimentError(f"Experiment {experiment} is deprecated")

-        if experiment.identifier == "performance-testing-full":
+        if experiment.identifier in [
+            "performance-testing-full",
+            "performance-testing-geospatial-full",
+            "performance-testing-geospatial-full-custom-dataset",
+        ]:
             if not self.env_manager:
                 raise MissingConfigurationForExperimentError(
                     f"Actuator configuration did not contain sufficient information for a kubernetes environment manager to be created. 
" @@ -197,7 +214,7 @@ async def submit( ) # Execute experiment - # Note: Here the experiment instance is just past for convenience since we retrieved it above + # Note: Here the experiment instance is just passed for convenience since we retrieved it above run_resource_and_workload_experiment.remote( request=request, experiment=experiment, diff --git a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/datasets/india_url_in_b64_out.jsonl b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/datasets/india_url_in_b64_out.jsonl new file mode 100644 index 00000000..693bbc09 --- /dev/null +++ b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/datasets/india_url_in_b64_out.jsonl @@ -0,0 +1 @@ +{"prompt":{"data": {"data": "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif","data_format": "url","out_data_format": "b64_json","indices": [1, 2, 3, 8, 11, 12]},"priority": 0,"softmax": false}} diff --git a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/geospatial_valencia.jsonl b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/datasets/valencia_url_in_b64_out.jsonl similarity index 100% rename from plugins/actuators/vllm_performance/ado_actuators/vllm_performance/geospatial_valencia.jsonl rename to plugins/actuators/vllm_performance/ado_actuators/vllm_performance/datasets/valencia_url_in_b64_out.jsonl diff --git a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/experiment_executor.py b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/experiment_executor.py index 3da664a8..d588a884 100644 --- a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/experiment_executor.py +++ b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/experiment_executor.py @@ -7,6 +7,7 @@ import subprocess import sys import time +import traceback import ray from ado_actuators.vllm_performance.actuator_parameters import ( @@ -22,6 +23,7 @@ VLLMDtype, ) from ado_actuators.vllm_performance.vllm_performance_test.execute_benchmark import ( + execute_geospatial_benchmark, execute_random_benchmark, ) from ray.actor import ActorHandle @@ -142,6 +144,9 @@ def _create_environment( reuse_deployment=False, pvc_name=actuator.pvc_template, namespace=actuator.namespace, + skip_tokenizer_init=values.get("skip_tokenizer_init"), + enforce_eager=values.get("enforce_eager"), + io_processor_plugin=values.get("io_processor_plugin"), ) # Update manager env_manager.done_creating.remote(definition=definition) @@ -151,6 +156,7 @@ def _create_environment( logger.error( f"Attempt {attempt}. 
Failed to create test environment {e}" ) + logger.error(traceback.format_exception(e)) error = f"Failed to create test environment {e}" time.sleep(tmout) tmout *= 2 @@ -279,23 +285,42 @@ def run_resource_and_workload_experiment( start = time.time() result = None try: - result = execute_random_benchmark( - base_url=base_url, - model=values.get("model"), - interpreter=actuator_parameters.interpreter, - num_prompts=int(values.get("num_prompts")), - request_rate=request_rate, - max_concurrency=max_concurrency, - hf_token=actuator_parameters.hf_token, - benchmark_retries=actuator_parameters.benchmark_retries, - retries_timeout=actuator_parameters.retries_timeout, - number_input_tokens=int(values.get("number_input_tokens")), - max_output_tokens=int(values.get("max_output_tokens")), - burstiness=float(values.get("burstiness")), - ) + if experiment.identifier in [ + "performance-testing-geospatial-full", + "performance-testing-geospatial-full-custom-dataset", + ]: + result = execute_geospatial_benchmark( + base_url=base_url, + model=values.get("model"), + interpreter=actuator_parameters.interpreter, + num_prompts=int(values.get("num_prompts")), + request_rate=request_rate, + max_concurrency=max_concurrency, + hf_token=actuator_parameters.hf_token, + benchmark_retries=actuator_parameters.benchmark_retries, + retries_timeout=actuator_parameters.retries_timeout, + burstiness=float(values.get("burstiness")), + dataset=values.get("dataset"), + ) + else: + result = execute_random_benchmark( + base_url=base_url, + model=values.get("model"), + interpreter=actuator_parameters.interpreter, + num_prompts=int(values.get("num_prompts")), + request_rate=request_rate, + max_concurrency=max_concurrency, + hf_token=actuator_parameters.hf_token, + benchmark_retries=actuator_parameters.benchmark_retries, + retries_timeout=actuator_parameters.retries_timeout, + number_input_tokens=int(values.get("number_input_tokens")), + max_output_tokens=int(values.get("max_output_tokens")), + burstiness=float(values.get("burstiness")), + dataset=values.get("dataset"), + ) logger.debug(f"benchmark executed in {time.time() - start} sec") except Exception as e: - logger.error(f"Failed to execute VLLM performance test {e}") + logger.error(traceback.format_exception(e)) error = f"Failed to execute VLLM performance test {e}" finally: if pf is not None: @@ -379,20 +404,36 @@ def run_workload_experiment( error = None measured_values = [] try: - result = execute_random_benchmark( - base_url=values.get("endpoint"), - model=values.get("model"), - interpreter=actuator_parameters.interpreter, - num_prompts=int(values.get("num_prompts")), - request_rate=request_rate, - max_concurrency=max_concurrency, - hf_token=actuator_parameters.hf_token, - benchmark_retries=actuator_parameters.benchmark_retries, - retries_timeout=actuator_parameters.retries_timeout, - number_input_tokens=int(values.get("number_input_tokens")), - max_output_tokens=int(values.get("max_output_tokens")), - burstiness=float(values.get("burstiness")), - ) + if experiment.identifier == "performance-testing-geospatial-endpoint": + result = execute_geospatial_benchmark( + base_url=values.get("endpoint"), + model=values.get("model"), + interpreter=actuator_parameters.interpreter, + num_prompts=int(values.get("num_prompts")), + request_rate=request_rate, + max_concurrency=max_concurrency, + hf_token=actuator_parameters.hf_token, + benchmark_retries=actuator_parameters.benchmark_retries, + retries_timeout=actuator_parameters.retries_timeout, + 
burstiness=float(values.get("burstiness")), + dataset=values.get("dataset"), + ) + else: + result = execute_random_benchmark( + base_url=values.get("endpoint"), + model=values.get("model"), + interpreter=actuator_parameters.interpreter, + num_prompts=int(values.get("num_prompts")), + request_rate=request_rate, + max_concurrency=max_concurrency, + hf_token=actuator_parameters.hf_token, + benchmark_retries=actuator_parameters.benchmark_retries, + retries_timeout=actuator_parameters.retries_timeout, + number_input_tokens=int(values.get("number_input_tokens")), + max_output_tokens=int(values.get("max_output_tokens")), + burstiness=float(values.get("burstiness")), + dataset=values.get("dataset"), + ) except Exception as e: logger.error(f"Failed to execute VLLM performance test {e}") error = f"Failed to execute VLLM performance test {e}" diff --git a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/experiments.yaml b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/experiments/performance_testing.yaml similarity index 90% rename from plugins/actuators/vllm_performance/ado_actuators/vllm_performance/experiments.yaml rename to plugins/actuators/vllm_performance/ado_actuators/vllm_performance/experiments/performance_testing.yaml index 1d03b13a..a60a17d4 100644 --- a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/experiments.yaml +++ b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/experiments/performance_testing.yaml @@ -56,6 +56,12 @@ performance_testing-full: variableType: 'DISCRETE_VARIABLE_TYPE' domainRange: [ 1, 10000 ] interval: 1 + - identifier: 'dataset' + metadata: + description: "(benchmark) The dataset to be used for the experiment" + propertyDomain: + variableType: "CATEGORICAL_VARIABLE_TYPE" + values: [ 'random' ] - identifier: image metadata: description: "(deployment) Docker image to use to create vllm deployments" @@ -120,6 +126,18 @@ performance_testing-full: propertyDomain: variableType: "CATEGORICAL_VARIABLE_TYPE" values: [ 'NVIDIA-A100-80GB-PCIe', 'NVIDIA-A100-SXM4-80GB' ] + - identifier: 'skip_tokenizer_init' + metadata: + description: "(deployment) skip tokenizer initialization" + propertyDomain: + variableType: BINARY_VARIABLE_TYPE + values: [True, False] + - identifier: 'enforce_eager' + metadata: + description: "(deployment) enforce PyTorch eager mode" + propertyDomain: + variableType: BINARY_VARIABLE_TYPE + values: [True, False] defaultParameterization: - property: identifier: 'image' @@ -149,6 +167,9 @@ performance_testing-full: - property: identifier: 'max_output_tokens' value: 128 + - property: + identifier: 'dataset' + value: 'random' - property: identifier: 'gpu_memory_utilization' value: .9 @@ -167,6 +188,12 @@ performance_testing-full: - property: identifier: 'gpu_type' value: 'NVIDIA-A100-80GB-PCIe' + - property: + identifier: 'skip_tokenizer_init' + value: False + - property: + identifier: 'enforce_eager' + value: False # measurements targetProperties: - identifier: "duration" @@ -221,6 +248,7 @@ performance_testing-endpoint: description: 'The endpoint(s) to test' propertyDomain: variableType: "OPEN_CATEGORICAL_VARIABLE_TYPE" + values: ["http://localhost:8000"] - identifier: 'request_rate' metadata: description: "The number of requests to send per second" @@ -264,6 +292,12 @@ performance_testing-endpoint: variableType: 'DISCRETE_VARIABLE_TYPE' domainRange: [ -1, 500 ] # -1 means no concurrency control interval: 1 + - identifier: 'dataset' + metadata: + description: "(benchmark) The dataset to 
be used for the experiment"
+      propertyDomain:
+        variableType: "CATEGORICAL_VARIABLE_TYPE"
+        values: [ 'random' ]
   defaultParameterization:
     - value: 1000
       property:
         identifier: 'num_prompts'
@@ -280,6 +314,9 @@ performance_testing-endpoint:
     - value: 128
       property:
         identifier: 'max_output_tokens'
+    - property:
+        identifier: 'dataset'
+      value: 'random'
   # measurements
   targetProperties:
     - identifier: "duration"
@@ -318,4 +355,4 @@ performance_testing-endpoint:
     - identifier: "p75_e2el_ms"
     - identifier: "p99_e2el_ms"
   metadata:
-    description: 'Test inference performance of a model served by vLLM endpoint across inference workload configurations'
+    description: 'Test inference performance of a model served by vLLM endpoint across inference workload configurations'
\ No newline at end of file
diff --git a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/experiments/performance_testing_geospatial.yaml b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/experiments/performance_testing_geospatial.yaml
new file mode 100644
index 00000000..65ee2733
--- /dev/null
+++ b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/experiments/performance_testing_geospatial.yaml
@@ -0,0 +1,474 @@
+# Copyright (c) IBM Corporation
+# SPDX-License-Identifier: MIT
+
+# The input to an experiment is an Entity. For the Entity to be a valid input,
+# its properties must match what is defined here
+performance_testing-geospatial-endpoint:
+  identifier: performance-testing-geospatial-endpoint
+  actuatorIdentifier: "vllm_performance"
+  requiredProperties: # Any entity passed to this experiment must have constitutive properties with these values
+    - identifier: 'model'
+      metadata:
+        description: 'model to use for testing. Assumed to be served by all endpoints tested. Required to obtain correct tokenizer for benchmarking metrics calculation'
+      propertyDomain:
+        variableType: "CATEGORICAL_VARIABLE_TYPE"
+        values: ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
+    - identifier: 'endpoint'
+      metadata:
+        description: 'The endpoint(s) to test'
+      propertyDomain:
+        variableType: "OPEN_CATEGORICAL_VARIABLE_TYPE"
+        values: ["http://localhost:8000"]
+    - identifier: 'request_rate'
+      metadata:
+        description: "The number of requests to send per second"
+      propertyDomain:
+        variableType: 'DISCRETE_VARIABLE_TYPE'
+        domainRange: [-1,1000]
+        interval: 1 # -1 means send all requests at time 0
+  optionalProperties:
+    - identifier: 'num_prompts'
+      metadata:
+        description: "The number of prompts to send (total number of requests)"
+      propertyDomain:
+        variableType: 'DISCRETE_VARIABLE_TYPE'
+        domainRange: [1,10001]
+        interval: 1
+    - identifier: 'burstiness'
+      metadata:
+        description: "The burstiness of the requests - 1.0 is a Poisson distribution with rate = request_rate. Others are gamma distributions with lambda = request_rate and shape = burstiness." 
+ propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + domainRange: [ 0, 10 ] + interval: 1 + - identifier: 'max_concurrency' + metadata: + description: "The maximum number of concurrent requests to send" + propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + domainRange: [ -1, 500 ] # -1 means no concurrency control + interval: 1 + - identifier: 'dataset' + metadata: + description: "The dataset to be used for the experiment" + propertyDomain: + variableType: "CATEGORICAL_VARIABLE_TYPE" + values: [ 'india_url_in_b64_out', 'valencia_url_in_b64_out' ] + defaultParameterization: + - value: 100 + property: + identifier: 'num_prompts' + - value: -1 + property: + identifier: 'max_concurrency' + - value: 1.0 + property: + identifier: 'burstiness' + - property: + identifier: 'dataset' + value: 'india_url_in_b64_out' + # measurements + targetProperties: + - identifier: "duration" + - identifier: "completed" + - identifier: "total_input_tokens" + - identifier: "total_output_tokens" + - identifier: "request_throughput" + - identifier: "mean_e2el_ms" + - identifier: "median_e2el_ms" + - identifier: "std_e2el_ms" + - identifier: "p25_e2el_ms" + - identifier: "p50_e2el_ms" + - identifier: "p75_e2el_ms" + - identifier: "p99_e2el_ms" + metadata: + description: 'Test inference performance of a geospatial model served by vLLM endpoint across inference workload configurations' +performance_testing-geospatial-full: + identifier: performance-testing-geospatial-full + actuatorIdentifier: "vllm_performance" + requiredProperties: # Any entity passed to this experiment must have constitutive properties with these values + - identifier: 'model' + metadata: + description: 'model to use for testing. Assumed to be served by all endpoints tested. Required to obtain correct tokenizer for benchmarking metrics calculation' + propertyDomain: + variableType: "OPEN_CATEGORICAL_VARIABLE_TYPE" + values: [ "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11" ] + - identifier: 'request_rate' + metadata: + description: "(benchmark) The number of requests to send per second" + propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + domainRange: [-1,1000] + interval: 1 # -1 means send all requests at time 0 + optionalProperties: + - identifier: 'num_prompts' + metadata: + description: "(benchmark) The number of prompts to send (total number of requests)" + propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + domainRange: [1,10001] + interval: 1 + - identifier: 'max_concurrency' + metadata: + description: "(benchmark) The maximum number of concurrent requests to send" + propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + domainRange: [ -1, 500 ] # -1 means no concurrency control + interval: 1 + - identifier: 'burstiness' + metadata: + description: "(benchmark) The burstiness of the requests - 1.0 is a Poisson distribution with rate = request_rate. Others are gamma distributions with lambda = request_rate and shape = burstiness." 
+ propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + domainRange: [ 0, 10 ] + interval: 1 + - identifier: 'dataset' + metadata: + description: "(benchmark) The dataset to be used for the experiment" + propertyDomain: + variableType: "CATEGORICAL_VARIABLE_TYPE" + values: [ 'india_url_in_b64_out', 'valencia_url_in_b64_out' ] + - identifier: image + metadata: + description: "(deployment) Docker image to use to create vllm deployments" + propertyDomain: + variableType: "OPEN_CATEGORICAL_VARIABLE_TYPE" + values: [ "quay.io/dataprep1/data-prep-kit/vllm_image:0.1" ] + - identifier: n_cpus + metadata: + description: "(deployment) the number of CPUs to use" + propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + domainRange: [ 1,17 ] + interval: 1 + - identifier: memory + metadata: + description: "(deployment) the amount of memory to allocate to vLLM pod" + propertyDomain: + variableType: "CATEGORICAL_VARIABLE_TYPE" + values: [ "64Gi", "128Gi", "256Gi" ] + - identifier: dtype + metadata: + description: "(deployment) data type for model weights and activations. “auto” will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models." + propertyDomain: + variableType: "CATEGORICAL_VARIABLE_TYPE" + values: [ "auto", "half", "float16", "bfloat16", "float", "float32" ] + - identifier: 'gpu_memory_utilization' + metadata: + description: "(deployment) The fraction of GPU memory to be used for the model executor," + propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + values: [ .5, .75, .9 ] + - identifier: 'cpu_offload' + metadata: + description: "(deployment) The amount of model weights in GB to offload to the CPU per GPU. 0 means all weights are on GPU," + propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + values: [ 0, 8, 16, 24, 32 ] + - identifier: 'max_num_seq' + metadata: + description: "(deployment) Maximum number of sequences per iteration" + propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + domainRange: [32,2049] + interval: 32 + - identifier: 'max_batch_tokens' + metadata: + description: "(deployment) maximum number of batched tokens per iteration" + propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + domainRange: [ 8192, 32769] + interval: 1024 + - identifier: 'n_gpus' + metadata: + description: "(deployment) Number of GPUs to use" + propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + domainRange: [1,9] + interval: 1 + - identifier: 'gpu_type' + metadata: + description: "(deployment) The GPU type to use" + propertyDomain: + variableType: "CATEGORICAL_VARIABLE_TYPE" + values: [ 'NVIDIA-A100-80GB-PCIe', 'NVIDIA-A100-SXM4-80GB' ] + - identifier: 'skip_tokenizer_init' + metadata: + description: "(deployment) skip tokenizer initialization" + propertyDomain: + variableType: BINARY_VARIABLE_TYPE + values: [True, False] + - identifier: 'enforce_eager' + metadata: + description: "(deployment) enforce pytorch eager mode" + propertyDomain: + variableType: BINARY_VARIABLE_TYPE + values: [True, False] + - identifier: 'io_processor_plugin' + metadata: + description: 'IO Processor plugin to load for the model' + propertyDomain: + variableType: "OPEN_CATEGORICAL_VARIABLE_TYPE" + values: [ None, "terratorch_segmentation" ] + defaultParameterization: + - property: + identifier: 'image' + value: "quay.io/dataprep1/data-prep-kit/vllm_image:0.1" + - property: + identifier: n_cpus + value: 8 + - property: + identifier: + memory + value: "128Gi" + - property: + identifier: dtype + value: "auto" + - property: + identifier: 'num_prompts' + value: 500 
+ - property: + identifier: 'max_concurrency' + value: -1 + - property: + identifier: 'burstiness' + value: 1.0 + - property: + identifier: 'gpu_memory_utilization' + value: .9 + - property: + identifier: 'cpu_offload' + value: 0 + - property: + identifier: 'max_num_seq' + value: 256 + - property: + identifier: 'max_batch_tokens' + value: 16384 + - property: + identifier: 'n_gpus' + value: 1 + - property: + identifier: 'gpu_type' + value: 'NVIDIA-A100-80GB-PCIe' + - property: + identifier: 'skip_tokenizer_init' + value: True + - property: + identifier: 'enforce_eager' + value: True + - property: + identifier: 'io_processor_plugin' + value: "terratorch_segmentation" + - property: + identifier: 'dataset' + value: 'india_url_in_b64_out' + # measurements + targetProperties: + - identifier: "duration" + - identifier: "completed" + - identifier: "total_input_tokens" + - identifier: "total_output_tokens" + - identifier: "request_throughput" + - identifier: "mean_e2el_ms" + - identifier: "median_e2el_ms" + - identifier: "std_e2el_ms" + - identifier: "p25_e2el_ms" + - identifier: "p50_e2el_ms" + - identifier: "p75_e2el_ms" + - identifier: "p99_e2el_ms" + metadata: + description: 'VLLM performance testing across compute resource and workload configuration' +performance_testing-geospatial-full-custom-dataset: + identifier: performance-testing-geospatial-full-custom-dataset + actuatorIdentifier: "vllm_performance" + requiredProperties: # Any entity passed to this experiment must have constitutive properties with these values + - identifier: 'model' + metadata: + description: 'model to use for testing. Assumed to be served by all endpoints tested. Required to obtain correct tokenizer for benchmarking metrics calculation' + propertyDomain: + variableType: "OPEN_CATEGORICAL_VARIABLE_TYPE" + values: [ "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11" ] + - identifier: 'request_rate' + metadata: + description: "(benchmark) The number of requests to send per second" + propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + domainRange: [-1,1000] + interval: 1 # -1 means send all requests at time 0 + - identifier: 'dataset' + metadata: + description: "(benchmark) The dataset to be used for the experiment" + propertyDomain: + variableType: "OPEN_CATEGORICAL_VARIABLE_TYPE" + values: ["custom_dataset.jsonl"] + optionalProperties: + - identifier: 'num_prompts' + metadata: + description: "(benchmark) The number of prompts to send (total number of requests)" + propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + domainRange: [1,10001] + interval: 1 + - identifier: 'max_concurrency' + metadata: + description: "(benchmark) The maximum number of concurrent requests to send" + propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + domainRange: [ -1, 500 ] # -1 means no concurrency control + interval: 1 + - identifier: 'burstiness' + metadata: + description: "(benchmark) The burstiness of the requests - 1.0 is a Poisson distribution with rate = request_rate. Others are gamma distributions with lambda = request_rate and shape = burstiness." 
+ propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + domainRange: [ 0, 10 ] + interval: 1 + - identifier: image + metadata: + description: "(deployment) Docker image to use to create vllm deployments" + propertyDomain: + variableType: "OPEN_CATEGORICAL_VARIABLE_TYPE" + values: [ "quay.io/dataprep1/data-prep-kit/vllm_image:0.1" ] + - identifier: n_cpus + metadata: + description: "(deployment) the number of CPUs to use" + propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + domainRange: [ 1,17 ] + interval: 1 + - identifier: memory + metadata: + description: "(deployment) the amount of memory to allocate to vLLM pod" + propertyDomain: + variableType: "CATEGORICAL_VARIABLE_TYPE" + values: [ "64Gi", "128Gi", "256Gi" ] + - identifier: dtype + metadata: + description: "(deployment) data type for model weights and activations. “auto” will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models." + propertyDomain: + variableType: "CATEGORICAL_VARIABLE_TYPE" + values: [ "auto", "half", "float16", "bfloat16", "float", "float32" ] + - identifier: 'gpu_memory_utilization' + metadata: + description: "(deployment) The fraction of GPU memory to be used for the model executor," + propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + values: [ .5, .75, .9 ] + - identifier: 'cpu_offload' + metadata: + description: "(deployment) The amount of model weights in GB to offload to the CPU per GPU. 0 means all weights are on GPU," + propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + values: [ 0, 8, 16, 24, 32 ] + - identifier: 'max_num_seq' + metadata: + description: "(deployment) Maximum number of sequences per iteration" + propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + domainRange: [32,2049] + interval: 32 + - identifier: 'max_batch_tokens' + metadata: + description: "(deployment) maximum number of batched tokens per iteration" + propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + domainRange: [ 8192, 32769] + interval: 1024 + - identifier: 'n_gpus' + metadata: + description: "(deployment) Number of GPUs to use" + propertyDomain: + variableType: 'DISCRETE_VARIABLE_TYPE' + domainRange: [1,9] + interval: 1 + - identifier: 'gpu_type' + metadata: + description: "(deployment) The GPU type to use" + propertyDomain: + variableType: "CATEGORICAL_VARIABLE_TYPE" + values: [ 'NVIDIA-A100-80GB-PCIe', 'NVIDIA-A100-SXM4-80GB' ] + - identifier: 'skip_tokenizer_init' + metadata: + description: "(deployment) skip tokenizer initialization" + propertyDomain: + variableType: BINARY_VARIABLE_TYPE + values: [True, False] + - identifier: 'enforce_eager' + metadata: + description: "(deployment) enforce PyTorch eager mode" + propertyDomain: + variableType: BINARY_VARIABLE_TYPE + values: [True, False] + - identifier: 'io_processor_plugin' + metadata: + description: 'IO Processor plugin to load for the model' + propertyDomain: + variableType: "OPEN_CATEGORICAL_VARIABLE_TYPE" + values: [ "terratorch_segmentation" ] + defaultParameterization: + - property: + identifier: 'image' + value: "quay.io/dataprep1/data-prep-kit/vllm_image:0.1" + - property: + identifier: n_cpus + value: 8 + - property: + identifier: + memory + value: "128Gi" + - property: + identifier: dtype + value: "auto" + - property: + identifier: 'num_prompts' + value: 500 + - property: + identifier: 'max_concurrency' + value: -1 + - property: + identifier: 'burstiness' + value: 1.0 + - property: + identifier: 'gpu_memory_utilization' + value: .9 + - property: + identifier: 'cpu_offload' + value: 0 + - 
property: + identifier: 'max_num_seq' + value: 256 + - property: + identifier: 'max_batch_tokens' + value: 16384 + - property: + identifier: 'n_gpus' + value: 1 + - property: + identifier: 'gpu_type' + value: 'NVIDIA-A100-80GB-PCIe' + - property: + identifier: 'skip_tokenizer_init' + value: True + - property: + identifier: 'enforce_eager' + value: True + - property: + identifier: 'io_processor_plugin' + value: "terratorch_segmentation" + # measurements + targetProperties: + - identifier: "duration" + - identifier: "completed" + - identifier: "total_input_tokens" + - identifier: "total_output_tokens" + - identifier: "request_throughput" + - identifier: "mean_e2el_ms" + - identifier: "median_e2el_ms" + - identifier: "std_e2el_ms" + - identifier: "p25_e2el_ms" + - identifier: "p50_e2el_ms" + - identifier: "p75_e2el_ms" + - identifier: "p99_e2el_ms" + metadata: + description: 'VLLM performance testing across compute resource and workload configuration' \ No newline at end of file diff --git a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/k8/create_environment.py b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/k8/create_environment.py index 87ee719d..c3a2a2b1 100644 --- a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/k8/create_environment.py +++ b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/k8/create_environment.py @@ -40,6 +40,9 @@ def create_test_environment( reuse_pvc: bool = True, pvc_name: str = "vllm-support", namespace: str = "vllm-testing", + enforce_eager: bool = False, + skip_tokenizer_init: bool = False, + io_processor_plugin: str | None = None, ) -> None: """ Create test deployment @@ -113,15 +116,13 @@ def create_test_environment( n_gpus=n_gpus, n_cpus=n_cpus, memory=memory, - max_batch_tokens=max_batch_tokens, - gpu_memory_utilization=gpu_memory_utilization, - dtype=dtype, - cpu_offload=cpu_offload, - max_num_seq=max_num_seq, template=deployment_template, claim_name=pvc_name, hf_token=hf_token, reuse=reuse_deployment, + enforce_eager=enforce_eager, + skip_tokenizer_init=skip_tokenizer_init, + io_processor_plugin=io_processor_plugin, ) logger.debug("deployment created") c_manager.wait_deployment_ready(k8_name=k8_name) diff --git a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/k8/manage_components.py b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/k8/manage_components.py index dfef4725..9fddc978 100644 --- a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/k8/manage_components.py +++ b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/k8/manage_components.py @@ -231,6 +231,9 @@ def create_deployment( claim_name: str | None = None, hf_token: str | None = None, reuse: bool = False, + enforce_eager: bool = False, + skip_tokenizer_init: bool = False, + io_processor_plugin: str | None = None, ) -> None: """ create deployment for model @@ -293,6 +296,9 @@ def create_deployment( template=template, claim_name=claim_name, hf_token=hf_token, + enforce_eager=enforce_eager, + skip_tokenizer_init=skip_tokenizer_init, + io_processor_plugin=io_processor_plugin, ), ) except ApiException as e: diff --git a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/k8/yaml_support/build_components.py b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/k8/yaml_support/build_components.py index 7fb29b17..36ab4fe0 100644 --- a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/k8/yaml_support/build_components.py 
+++ b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/k8/yaml_support/build_components.py @@ -1,6 +1,7 @@ # Copyright (c) IBM Corporation # SPDX-License-Identifier: MIT +import json import logging import os import sys @@ -77,6 +78,9 @@ def deployment_yaml( template: str = "deployment.yaml", claim_name: str | None = None, hf_token: str | None = None, + enforce_eager: bool = False, + skip_tokenizer_init: bool = False, + io_processor_plugin: str | None = None, ) -> dict[str, Any]: """ Generate deployment yaml @@ -138,6 +142,30 @@ def deployment_yaml( [{"name": PVC_NAME, "persistentVolumeClaim": {"claimName": claim_name}}] ) + vllm_serve_args = [ + model, + "--max-num-batched-tokens", + f"{max_batch_tokens}", + "--gpu-memory-utilization", + f"{gpu_memory_utilization}", + "--cpu-offload-gb", + f"{cpu_offload}", + "--max-num-seq", + f"{max_num_seq}", + "--tensor-parallel-size", + f"{n_gpus}", + "--dtype", + dtype.value, + ] + + if enforce_eager: + vllm_serve_args.append("--enforce-eager") + if skip_tokenizer_init: + vllm_serve_args.append("--skip-tokenizer-init") + if io_processor_plugin is not None: + vllm_serve_args.append("--io-processor-plugin") + vllm_serve_args.append(io_processor_plugin) + # container container = spec["containers"][0] # image @@ -151,19 +179,16 @@ def deployment_yaml( limits["cpu"] = str(n_cpus) limits["memory"] = memory limits["nvidia.com/gpu"] = str(n_gpus) - # env variables to to set parameters for docker execution - container["env"] = [ - {"name": "MODEL", "value": model}, - {"name": "GPU_MEMORY_UTILIZATION", "value": str(gpu_memory_utilization)}, - {"name": "DTYPE", "value": dtype.value}, - {"name": "CPU_OFFLOAD_GB", "value": str(cpu_offload)}, - {"name": "MAX_NUM_BATCHED_TOKENS", "value": str(max_batch_tokens)}, - {"name": "MAX_NUM_SEQ", "value": str(max_num_seq)}, - {"name": "TENSOR_PARALLEL_SIZE", "value": str(n_gpus)}, - ] + + # command + container["command"] = ["vllm", "serve"] + container["args"] = vllm_serve_args + if hf_token is not None: - container["env"].extend([{"name": "HF_TOKEN", "value": hf_token}]) + container["env"] = [{"name": "HF_TOKEN", "value": hf_token}] if claim_name is not None: + if "env" not in container: + container["env"] = [] container["env"].extend( [ { @@ -180,7 +205,7 @@ def deployment_yaml( ] ) - # return + logger.debug(json.dumps(deployment_yaml, indent=2)) return deployment_yaml @staticmethod diff --git a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/k8/yaml_support/deployment.yaml b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/k8/yaml_support/deployment.yaml index 2b90302a..2659550d 100644 --- a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/k8/yaml_support/deployment.yaml +++ b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/k8/yaml_support/deployment.yaml @@ -68,6 +68,4 @@ spec: emptyDir: medium: Memory nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-80GB-PCIe - #nvidia.com/gpu.product: Tesla-V100-PCIE-16GB - #kubernetes.io/hostname: cpu15 \ No newline at end of file + nvidia.com/gpu.product: NVIDIA-A100-80GB-PCIe \ No newline at end of file diff --git a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/vllm_performance_test/execute_benchmark.py b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/vllm_performance_test/execute_benchmark.py index 12a05754..839aa528 100644 --- a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/vllm_performance_test/execute_benchmark.py +++ 
b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/vllm_performance_test/execute_benchmark.py
@@ -12,11 +12,19 @@
     get_results,
 )

+logger = logging.getLogger("vllm-bench")
+
+default_geospatial_datasets_filenames = {
+    "india_url_in_b64_out": "india_url_in_b64_out.jsonl",
+    "valencia_url_in_b64_out": "valencia_url_in_b64_out.jsonl",
+}
+

 def execute_benchmark(
     base_url: str,
     model: str,
-    data_set: str,
+    dataset: str,
+    backend: str = "openai",
     interpreter: str = "python",
     num_prompts: int = 500,
     request_rate: int | None = None,
@@ -24,7 +32,7 @@
     hf_token: str | None = None,
     benchmark_retries: int = 3,
     retries_timeout: int = 5,
-    data_set_path: str | None = None,
+    dataset_path: str | None = None,
     custom_args: dict[str, Any] | None = None,
     burstiness: float = 1,
 ) -> dict[str, Any]:
@@ -32,57 +40,51 @@
     Execute benchmark
     :param base_url: url for vllm endpoint
     :param model: model
-    :param data_set: data set name ["sharegpt", "sonnet", "random", "hf"]
-    :param interpreter - name of Python interpreter
+    :param dataset: dataset name ["sharegpt", "sonnet", "random", "hf", "custom"]
+    :param backend: name of the vLLM benchmark backend to use ["vllm", "openai", "openai-chat", "openai-audio", "openai-embeddings"]
+    :param interpreter: name of Python interpreter
     :param num_prompts: number of prompts
     :param request_rate: request rate
-    :param max_concurrency: max concurrency
+    :param max_concurrency: maximum number of concurrent requests
     :param hf_token: huggingface token
     :param benchmark_retries: number of benchmark execution retries
     :param retries_timeout: timeout between initial retry
-    :param data_set_path: path to the dataset
+    :param dataset_path: path to the dataset
+    :param burstiness: burstiness factor of the request generation; 1.0 gives a Poisson process, other positive values a gamma distribution
     :param custom_args: custom arguments to pass to the benchmark.
     keys are vllm benchmark arguments. values are the values to pass to the arguments
+    :return: results dictionary
     """
-    logger = logging.getLogger("vllm-bench")
     logger.debug(
         f"executing benchmark, invoking service at {base_url} with the parameters: "
     )
     logger.debug(
-        f"model {model}, data set {data_set}, python {interpreter}, num prompts {num_prompts}"
+        f"model {model}, data set {dataset}, python {interpreter}, num prompts {num_prompts}"
     )
     logger.debug(
         f"request_rate {request_rate}, max_concurrency {max_concurrency}, benchmark retries {benchmark_retries}"
     )
-    # The code below is commented as we are switching from a script invocation to command line
-    # invocation. If we want to bring back script execution for any reason, this code must be
-    # uncommented
-    # parameters
-    # code = os.path.abspath(
-    #     os.path.join(os.path.dirname(__file__), "benchmark_serving.py")
-    # )
+
     request = f"export HF_TOKEN={hf_token} && " if hf_token is not None else ""
     f_name = f"{uuid.uuid4().hex}.json"
     request += (
-        # changing from script invocation to cli invocation
-        # f"{interpreter} {code} --backend openai --base-url {base_url} --dataset-name {data_set} "
-        f"vllm bench serve --backend openai --base-url {base_url} --dataset-name {data_set} "
-        f"--model {model} --seed 12345 --num-prompts {num_prompts!s} --save-result --metric-percentiles "
+        f"vllm bench serve --backend {backend} --base-url {base_url} --dataset-name {dataset} "
+        f"--model {model} --seed 12345 --num-prompts {num_prompts!s} --save-result --metric-percentiles "
         f'"25,75,99" --percentile-metrics "ttft,tpot,itl,e2el" --result-dir . 
--result-filename {f_name} '
         f"--burstiness {burstiness} "
     )
-    if data_set_path is not None:
-        request += f"--dataset-path {data_set_path} "
+    if dataset_path is not None:
+        request += f" --dataset-path {dataset_path} "
     if request_rate is not None:
-        request += f"--request-rate {request_rate!s} "
+        request += f" --request-rate {request_rate!s} "
     if max_concurrency is not None:
         request += f"--max-concurrency {max_concurrency!s} "
     if custom_args is not None:
         for key, value in custom_args.items():
-            request += f"{key} {value!s} "
+            request += f" {key} {value!s} "
     timeout = retries_timeout
     logger.debug(f"Command line: {request}")
@@ -106,6 +108,7 @@
 def execute_random_benchmark(
     base_url: str,
     model: str,
+    dataset: str,
     num_prompts: int = 500,
     request_rate: int | None = None,
     max_concurrency: int | None = None,
@@ -121,19 +124,25 @@
     Execute benchmark with random dataset
     :param base_url: url for vllm endpoint
     :param model: model
-    :param data_set: data set name ["sharegpt", "sonnet", "random", "hf"]
+    :param dataset: dataset name ["sharegpt", "sonnet", "random", "hf"]
+    :param num_prompts: number of prompts
+    :param request_rate: request rate
+    :param max_concurrency: maximum number of concurrent requests
     :param hf_token: huggingface token
     :param benchmark_retries: number of benchmark execution retries
    :param retries_timeout: timeout between initial retry
-    :param input_token_length: length of input tokens
-    :param output_token_length: length of output tokens
+    :param burstiness: burstiness factor of the request generation; 1.0 gives a Poisson process, other positive values a gamma distribution
+    :param number_input_tokens: maximum number of input tokens for each request
+    :param max_output_tokens: maximum number of output tokens for each request
+    :param interpreter: name of Python interpreter
+
+    :return: results dictionary
     """
     # Call execute_benchmark with the appropriate arguments
     return execute_benchmark(
         base_url=base_url,
         model=model,
-        data_set="random",
+        dataset=dataset,
         interpreter=interpreter,
         num_prompts=num_prompts,
         request_rate=request_rate,
@@ -149,14 +158,86 @@
     )

+def execute_geospatial_benchmark(
+    base_url: str,
+    model: str,
+    dataset: str,
+    num_prompts: int = 500,
+    request_rate: int | None = None,
+    max_concurrency: int | None = None,
+    hf_token: str | None = None,
+    benchmark_retries: int = 3,
+    retries_timeout: int = 5,
+    burstiness: float = 1,
+    interpreter: str = "python",
+) -> dict[str, Any]:
+    """
+    Execute benchmark with a geospatial dataset
+    :param base_url: url for vllm endpoint
+    :param model: model
+    :param dataset: name of a bundled geospatial dataset ["india_url_in_b64_out", "valencia_url_in_b64_out"] or the filename of a custom dataset placed in the working directory
+    :param num_prompts: number of prompts
+    :param request_rate: request rate
+    :param max_concurrency: maximum number of concurrent requests
+    :param hf_token: huggingface token
+    :param benchmark_retries: number of benchmark execution retries
+    :param retries_timeout: timeout between initial retry
+    :param burstiness: burstiness factor of the request generation; 1.0 gives a Poisson process, other positive values a gamma distribution
+    :param interpreter: Python interpreter to use
+
+    :return: results dictionary
+    """
+    from pathlib import Path
+
+    if dataset in default_geospatial_datasets_filenames:
+        dataset_filename = default_geospatial_datasets_filenames[dataset]
+        parent_path = Path(__file__).parents[1]
+        dataset_path = parent_path / "datasets" / dataset_filename
+    else:
+        # This can only happen with the performance-testing-geospatial-full-custom-dataset
+        # experiment; otherwise the dataset name is always one of the 
allowed ones. + # Here the assumption is that the dataset file is placed in the process working directory. + ray_working_dir = Path.cwd() + dataset_path = ray_working_dir / dataset + + if not dataset_path.is_file(): + error_string = ( + "The dataset filename provided does not exist or " + f"does not point to a valid file: {dataset_path}" + ) + logger.warning(error_string) + raise ValueError(error_string) + + logger.debug(f"Dataset path {dataset_path}") + + return execute_benchmark( + base_url=base_url, + backend="io-processor-plugin", + model=model, + dataset="custom", + interpreter=interpreter, + num_prompts=num_prompts, + request_rate=request_rate, + max_concurrency=max_concurrency, + hf_token=hf_token, + benchmark_retries=benchmark_retries, + retries_timeout=retries_timeout, + burstiness=burstiness, + custom_args={ + "--dataset-path": f"{dataset_path.resolve()}", + "--endpoint": "/pooling", + "--skip-tokenizer-init": True, + }, + ) + + if __name__ == "__main__": - results = execute_benchmark( + results = execute_geospatial_benchmark( interpreter="python3.10", - base_url="http://localhost:28015", - data_set="random", - model="openai/gpt-oss-20b", - request_rate=None, - max_concurrency=None, + base_url="http://localhost:8000", + model="ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", + request_rate=2, + max_concurrency=10, hf_token=os.getenv("HF_TOKEN"), num_prompts=100, )
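
Note (not part of the patch): the performance-testing-geospatial-full-custom-dataset experiment expects a dataset file, named by its `dataset` property (default `custom_dataset.jsonl`), to be present in the process working directory, which is where execute_geospatial_benchmark resolves unrecognised dataset names. Below is a minimal sketch of producing such a file; the record shape mirrors datasets/india_url_in_b64_out.jsonl added above, and the GeoTIFF URL and band indices are copied from that bundled dataset purely as placeholders; substitute your own scene.

import json

# One JSON object per line; each record has the same shape as the bundled
# india_url_in_b64_out.jsonl dataset (URL input, base64 JSON output, band indices).
record = {
    "prompt": {
        "data": {
            "data": "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif",  # placeholder GeoTIFF URL
            "data_format": "url",
            "out_data_format": "b64_json",
            "indices": [1, 2, 3, 8, 11, 12],  # placeholder band indices
        },
        "priority": 0,
        "softmax": False,
    }
}

# The filename must match the experiment's `dataset` value and sit in the
# process working directory, where execute_geospatial_benchmark looks for it.
with open("custom_dataset.jsonl", "w") as f:
    f.write(json.dumps(record) + "\n")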