diff --git a/tests/README.md b/tests/README.md index 0ceb6b949b..b79a3b7d46 100644 --- a/tests/README.md +++ b/tests/README.md @@ -8,11 +8,10 @@ This document outlines the testing framework for the Dynamo runtime system, incl ```bash tests/ -├── serve/ # E2E tests using dynamo serve -│ ├── conftest.py # test fixtures as needed for specific test area -├── run/ # E2E tests using dynamo run +├── serve/ # E2E tests │ ├── conftest.py # test fixtures as needed for specific test area ├── conftest.py # Shared fixtures and configuration +├── utils # Common utils across tests └── README.md # This file ``` diff --git a/tests/fault_tolerance/README.md b/tests/fault_tolerance/README.md deleted file mode 100644 index 450ee101f9..0000000000 --- a/tests/fault_tolerance/README.md +++ /dev/null @@ -1,581 +0,0 @@ - - -# Fault Tolerance Test Suite - -As a large scale distributed inference serving framework in addition -to providing high throughput and low latency, Dynamo needs to -provide fault detection, resilency, and quick recovery in the face of -unforseen failures. In order to test Dynamo we are developing a test -suite to inject and measure the impact of different types of failure -conditions. - -## Test Architecture - -The fault tolerance test suite is designed as a set of pytest -configurations that launch typical dynamo serve graph deployments and -then inject failures by terminating processes in the graph. To test -the recovery time and impact of failures a set number of clients are -launched in parallel. Each client sends a set number of synchronous -requests. Log files are stored for each dynamo process as well as for -each client and inspected using a post-processing script. - -> [!NOTE] -> Test pass / failure is not an indication of SLA for recovery or resilience -> It only indicates is the test was executed and data was collected - -> [!NOTE] -> The test suite currently targets single node Dynamo Serve. -> Support for Dynamo Deploy is a work in progress. 
- -### Test Sequence Diagram - -```mermaid -sequenceDiagram - participant Tester as Test Runner - participant Dynamo as DynamoServeProcess - participant Circus as CircusController - participant Client as Test Clients - participant Metrics as Metrics Collector - - Tester->>Dynamo: Launch deployment graph - Dynamo-->>Tester: Signal ready - Tester->>Metrics: Start metrics collection - Tester->>Client: Spawn multiple clients - loop During Test - Client->>Dynamo: Send chat completion requests - Dynamo-->>Client: Return responses - Metrics->>Dynamo: Collect runtime metrics - end - Tester->>Dynamo: Inject failures (terminate components) - Dynamo-->>Tester: Recover/respawn as configured - Client-->>Tester: Log request results - Metrics-->>Tester: Log metrics data - Tester->>Dynamo: Shutdown deployment - Tester->>Metrics: Stop metrics collection - Tester->>Tester: Parse logs and summarize results -``` - -### Failure Scenarios - -The test suite includes several predefined fault injection scenarios designed to validate system resilience under various failure conditions. These scenarios are configured in `scenarios.py` and can be selected via pytest parameters. 
Below is a description of the available scenarios: - -| Scenario Name | Description | Affected Components | Timing Example | -|------------------------|-----------------------------------------------------------------------------|-------------------------------------------------|--------------------| -| **decode_worker** | Terminates decoder worker processes | `dynamo_vllmworker` | 30 seconds | -| **prefill_worker** | Terminates prefill worker processes | `dynamo_prefillworker` | 30 seconds | -| **frontend** | Terminates frontend processes handling client requests | `dynamo_frontend` | 30 seconds | -| **processor** | Terminates processor nodes responsible for task orchestration | `dynamo_processor` | 30 seconds | -| **vllm_worker** | Terminates low-level VLLM worker processes | `vllm_worker` (external to Dynamo) | 30 seconds | -| **none** | Baseline scenario with no failures | N/A | N/A | - - -#### Key Characteristics: -1. **Timing**: Failures are injected at predefined intervals (e.g., 30 seconds after test start) -2. **Severity**: The number of terminated processes can be configured (default: 1) -3. **Scope**: Failures target specific components while leaving others operational - -#### Configuration: -- **Injection Timing**: Defined in `failure_scenarios` dictionary in `scenarios.py` -- **Process Count**: Adjustable via tuple values (e.g., `("dynamo_vllmworker", 1)` terminates 1 process) -- **Component Mapping**: - - `dynamo_*`: Internal Dynamo services - - `vllm_worker`: External VLLM model workers - -#### Example Scenario Execution: - -Run all graph configurations injecting a decode_worker failure. - -```bash -cd tests/fault_tolerance -pytest test_runner.py -k decode_worker -``` - -### Test Results Directory - -For each test scenario a directory of log files is created and post processed to summarize the test. - -``` -test_worker_failure[agg-tp-2-dp-4-none] - -. 
-├── client_0.log.txt -├── client_1.log.txt -├── client_2.log.txt -├── client_3.log.txt -├── client_4.log.txt -├── client_5.log.txt -├── client_6.log.txt -├── client_7.log.txt -├── dynamo_Frontend -│   ├── error.log -│   └── output.log -├── dynamo.log.txt -├── dynamo_Planner -│   ├── error.log -│   └── output.log -├── dynamo_Processor -│   ├── error.log -│   └── output.log -├── dynamo_VllmWorker -│   ├── error.log -│   └── output.log -├── etcd.log.txt -├── nats-server.log.txt -├── nvidia-smi.log.txt -├── test.log.txt -└── watcher.log.txt - -``` - -| File/Directory Name | Description | -|------------------------------------|------------------------------------------------------------------------------------------------| -| **client_*.log.txt** | Request/response logs for each client instance (contains JSON-formatted request details) | -| **dynamo_*/error.log** | Error logs for specific Dynamo components (e.g., Frontend, Processor, VllmWorker) | -| **dynamo_*/output.log** | Standard output logs for Dynamo components (service startup/shutdown messages) | -| **dynamo.log.txt** | Aggregate logs for Dynamo services (orchestration and initialization) | -| **etcd.log.txt** | Logs for etcd, the distributed key-value store used for service coordination | -| **nats-server.log.txt** | Logs for NATS message broker, handling inter-service communication | -| **nvidia-smi.log.txt** | GPU monitoring logs (records utilization statistics during test execution) | -| **test.log.txt** | Primary test execution log (contains fault injection timing, process management, and test status)| -| **watcher.log.txt** | Metrics collected by the watcher service (e.g., pending requests, active workers) | - -### Summary Results - -Results are presented in table format after each test providing summary statistics. 
- -**Test Group:** agg-tp-2-dp-1 - -**Test Command:** dynamo serve graphs.agg:Frontend -f /workspace/tests/fault_tolerance/configs/agg_tp_2_dp_1.yaml --Frontend.port 8000 in /workspace/examples/llm - -| Failure | Startup Time | Success | Failed | Latency Before | Latency After | Pending Before | Pending After | Violations Before | Violations After | Recovery Time | -|:-------------:|---------------:|----------:|---------:|-----------------:|----------------:|-----------------:|----------------:|--------------------:|-------------------:|----------------:| -| none | 56.00 | 800.00 | 0.00 | 1.97 | N/A | 0.00 | N/A | 8.00 | N/A | N/A | -| frontend | 56.00 | 656.00 | 144.00 | 1.96 | 1.96 | 0.00 | 0.00 | 0.00 | 0.00 | 17.53 | -| processor | 57.00 | 584.00 | 216.00 | 1.96 | 1.96 | 0.00 | 0.00 | 0.00 | 0.00 | 25.96 | -| decode_worker | 80.00 | 520.00 | 280.00 | 2.01 | 1.98 | 0.00 | 0.00 | 8.00 | 8.00 | 37.99 | -| vllm_worker | 58.00 | 120.00 | 680.00 | 1.98 | nan | 0.00 | 0.00 | 0.00 | 0.00 | N/A | - - -| Column Name | Description | -|-----------------------|-----------------------------------------------------------------------------| -| **Failure** | Type of fault injection applied during the test (or 'none' for baseline) | -| **Startup Time** | Time (seconds) taken for the service to become ready after initialization | -| **Success** | Number of client requests that succeeded | -| **Failed** | Number of client requests that failed or were invalid | -| **Latency Before** | Average request latency (seconds) for successful requests before fault injection | -| **Latency After** | Average request latency (seconds) for successful requests after fault injection (N/A if no fault) | -| **Pending Before** | Average number of pending requests observed before fault injection | -| **Pending After** | Average number of pending requests observed after fault injection (N/A if no fault) | -| **Violations Before** | Number of successful requests exceeding SLA latency before fault 
injection | -| **Violations After** | Number of successful requests exceeding SLA latency after fault injection (N/A if no fault) | -| **Recovery Time** | Time (seconds) taken for failed components to recover after fault injection | - -## Example Results - -The following results were obtained running on a single node with 8 -L40 GPUs using "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" with 8 -concurrent clients each sending 100 requests. - -### Aggregated Workers - -#### No Redundancy - -To demonstrate the failure and recovery time in the case that there is -a single instance of each process we ran a simmple "agg-tp-2-dp-1" configuration. - -```mermaid -graph LR - Client["Client"] - Frontend["Frontend"] - Processor["Processor"] - - Client --> Frontend - Frontend --> Processor - Processor --> DecodePool - - %% Decode Worker Pool (vertical layout) - subgraph DecodePool["Decode Worker Pool"] - direction TB - subgraph Decode1["Decode 1"] - direction TB - D1GPU0["GPU 0"] - D1GPU1["GPU 1"] - end - end - - %% Styling - style DecodePool stroke:#000,stroke-width:2px -``` - -#### Results: - -**Test Group: agg-tp-2-dp-1** - -**Test Command:** dynamo serve graphs.agg:Frontend -f /workspace/tests/fault_tolerance/configs/agg_tp_2_dp_1.yaml --Frontend.port 8000 in /workspace/examples/llm - -| Failure | Startup Time | Success | Failed | Latency Before | Latency After | Pending Before | Pending After | Violations Before | Violations After | Recovery Time | -|:-------------:|---------------:|----------:|---------:|-----------------:|----------------:|-----------------:|----------------:|--------------------:|-------------------:|----------------:| -| none | 56.00 | 800.00 | 0.00 | 1.97 | N/A | 0.00 | N/A | 8.00 | N/A | N/A | -| frontend | 56.00 | 656.00 | 144.00 | 1.96 | 1.96 | 0.00 | 0.00 | 0.00 | 0.00 | 17.53 | -| processor | 57.00 | 584.00 | 216.00 | 1.96 | 1.96 | 0.00 | 0.00 | 0.00 | 0.00 | 25.96 | -| decode_worker | 80.00 | 520.00 | 280.00 | 2.01 | 1.98 | 0.00 | 0.00 | 8.00 | 8.00 
| 37.99 | -| vllm_worker | 58.00 | 120.00 | 680.00 | 1.98 | nan | 0.00 | 0.00 | 0.00 | 0.00 | N/A | - - -#### Summary: - -1. Dynamo does not currently detect and recover from direct vllm worker sub process failure. (WIP) -2. Recovery time for the decode worker itself is the largest and a decode worker failure has the largest impact (as expected) -3. Overall failure count is roughly equal to recovery time multiplied by number of clients (as expected). - - -#### Redundant Workers (Over Provisoned) - -To demonstrate the failure and recovery time in the case that there -are multiple instances of each process (except for the frontend) we -ran a simple "agg-tp-2-dp-4" configuration. - -In this case we also consider the system to be "over provisioned" for -the workload as multiple workers are not needed to maintain SLA for -the 8 clients. - -```mermaid -graph LR - Client["Client"] - Frontend["Frontend"] - Processor_1["Processor 1"] - Processor_2["Processor 2"] - - Client --> Frontend - Frontend --> Processor_1 - Frontend --> Processor_2 - - subgraph DecodePool["Decode Worker Pool"] - direction LR - subgraph Decode1["Decode 1"] - direction TB - D1GPU0["GPU 0"] - D1GPU1["GPU 1"] - end - subgraph Decode2["Decode 2"] - direction TB - D2GPU0["GPU 0"] - D2GPU1["GPU 1"] - end - subgraph Decode3["Decode 3"] - direction TB - D3GPU0["GPU 0"] - D3GPU1["GPU 1"] - end - subgraph Decode4["Decode 4"] - direction TB - D4GPU0["GPU 0"] - D4GPU1["GPU 1"] - end - end - - Processor_1 --> DecodePool - Processor_2 --> DecodePool - - style DecodePool stroke:#000,stroke-width:2px -``` - -#### Results: - -**Test Group:** agg-tp-2-dp-4 - -**Test Command:** dynamo serve graphs.agg:Frontend -f /workspace/tests/fault_tolerance/configs/agg_tp_2_dp_4.yaml --Frontend.port 8000 in /workspace/examples/llm - -| Failure | Startup Time | Success | Failed | Latency Before | Latency After | Pending Before | Pending After | Violations Before | Violations After | Recovery Time | 
-|:-------------:|---------------:|----------:|---------:|-----------------:|----------------:|-----------------:|----------------:|--------------------:|-------------------:|----------------:| -| none | 57.00 | 800.00 | 0.00 | 1.76 | N/A | 0.00 | N/A | 0.00 | N/A | N/A | -| frontend | 57.00 | 672.00 | 128.00 | 1.77 | 1.74 | 0.00 | 0.00 | 0.00 | 0.00 | 16.65 | -| processor | 52.00 | 680.00 | 120.00 | 1.79 | 1.78 | 0.00 | 0.00 | 0.00 | 0.00 | 21.25 | -| decode_worker | 56.00 | 796.00 | 4.00 | 1.82 | 1.78 | 0.00 | 0.00 | 0.00 | 0.00 | 44.88 | -| vllm_worker | 52.00 | 634.00 | 166.00 | 1.78 | 1.78 | 0.00 | 0.00 | 0.00 | 0.00 | N/A | - -#### Summary: - -1. Dynamo does not currently detect and recover from direct vllm - worker sub process failure. In the case of redundant workers this - results in roughtly 1/4 the requests failing after the initial 30 - seconds. (WIP) -2. By immediately detecting a decode worker failure, Dynamo can limit - the failures and reroute requests to healthy workers with minimal - impact. -3. While the processor was configured with redundancy - the system was - unable to instantiate two processors successfully leading to - failure when the processor was terminated. (WIP) - - -#### Redundant Workers (Exact Provisioning) - -To demonstrate the failure and recovery time in the case that there -are multiple instances of each process (except for the frontend) we -ran a simple "agg-tp-2-dp-4" configuration. - -In this case we also consider the system to be "exact provisioned" for -the workload as we limit the max-num-seqs for each decode worker to -exactly 2. This artificially creates a scenario that results in queing -when a failur occurs before a worker is recovered. 
- - -```mermaid -graph LR - Client["Client"] - Frontend["Frontend"] - Processor_1["Processor 1"] - Processor_2["Processor 2"] - - Client --> Frontend - Frontend --> Processor_1 - Frontend --> Processor_2 - - subgraph DecodePool["Decode Worker Pool"] - direction LR - subgraph Decode1["Decode 1 (max 2 seq)"] - direction TB - D1GPU0["GPU 0"] - D1GPU1["GPU 1"] - end - subgraph Decode2["Decode 2 (max 2 seq)"] - direction TB - D2GPU0["GPU 0"] - D2GPU1["GPU 1"] - end - subgraph Decode3["Decode 3 (max 2 seq)"] - direction TB - D3GPU0["GPU 0"] - D3GPU1["GPU 1"] - end - subgraph Decode4["Decode 4 (max 2 seq)"] - direction TB - D4GPU0["GPU 0"] - D4GPU1["GPU 1"] - end - end - - Processor_1 --> DecodePool - Processor_2 --> DecodePool - - style DecodePool stroke:#000,stroke-width:2px -``` - -#### Results: - -**Test Group:** agg-tp-2-dp-4 - -**Test Command:** dynamo serve graphs.agg:Frontend -f /workspace/tests/fault_tolerance/configs/agg_tp_2_dp_4.yaml --Frontend.port 8000 --VllmWorker.max_num_seqs 2 in /workspace/examples/llm - -| Failure | Startup Time | Success | Failed | Latency Before | Latency After | Pending Before | Pending After | Violations Before | Violations After | Recovery Time | -|:-------------:|---------------:|----------:|---------:|-----------------:|----------------:|-----------------:|----------------:|--------------------:|-------------------:|----------------:| -| none | 57.00 | 800.00 | 0.00 | 1.77 | N/A | 0.01 | N/A | 0.00 | N/A | N/A | -| frontend | 56.00 | 664.00 | 136.00 | 1.80 | 1.77 | 0.00 | 0.00 | 0.00 | 0.00 | 17.22 | -| processor | 56.00 | 649.00 | 151.00 | 1.76 | 1.77 | 0.01 | 0.00 | 0.00 | 0.00 | 25.79 | -| decode_worker | 56.00 | 798.00 | 2.00 | 1.77 | 1.89 | 0.00 | 0.13 | 0.00 | 84.00 | 44.57 | -| vllm_worker | 56.00 | 632.00 | 168.00 | 1.80 | 2.23 | 0.00 | 0.38 | 0.00 | 232.00 | N/A | - -#### Summary: - -1. Dynamo does not currently detect and recover from direct vllm - worker sub process failure. 
In the case of redundant workers this - results in roughtly 1/4 the requests failing after the initial 30 - seconds. All requests after the initial 30 seconds would also be - subject to queing as a result and we see increased SLA - violations. (WIP) -2. By immediately detecting a decode worker failure, Dynamo can limit - the failures and reroute requests to healthy workers with minimal - impact. However during the recovery period requests are subject to - queing and as a results we see increased SLA violations. -3. While the processor was configured with redundancy - the system was - unable to instantiate two processors successfully leading to - failure when the processor was terminated. (WIP) - -### Disaggregated Workers - -#### No Redunancy - -To demonstrate the failure and recovery time in the case of a -disaaggregated deployment with a single instance for each process in -the graph we ran a simple `disagg-p-tp-2-dp-1-d-tp-4-dp-1` configuration. - -```mermaid -graph LR - Client["Client"] - Frontend["Frontend"] - Processor["Processor"] - PrefillQueue["Remote Prefill Queue"] - - Client --> Frontend - Frontend --> Processor - Processor <--> DecodePool - - %% Prefill Worker Pool (horizontal layout) - subgraph PrefillPool["Prefill Worker Pool"] - direction LR - subgraph Prefill1["Prefill 1"] - direction TB - P1GPU0["GPU 0"] - P1GPU1["GPU 1"] - end - end - - %% Decode Worker Pool (vertical layout) - subgraph DecodePool["Decode Worker Pool"] - direction TB - subgraph Decode1["Decode 1"] - direction TB - D1GPU0["GPU 0"] - D1GPU1["GPU 1"] - D1GPU2["GPU 2"] - D1GPU3["GPU 3"] - end - end - - - PrefillQueue --> PrefillPool - DecodePool --> PrefillQueue - PrefillPool -.-> DecodePool - - %% Styling - style PrefillPool stroke:#0066cc,stroke-width:2px - style DecodePool stroke:#000,stroke-width:2px -``` - -#### Results: - -**Test Group:** disagg-p-tp-2-dp-1-d-tp-4-dp-1 - -**Test Command:** dynamo serve graphs.disagg:Frontend -f 
/workspace/tests/fault_tolerance/configs/disagg_p_tp_2_dp_1_d_tp_4_dp_1.yaml --Frontend.port 8000 in /workspace/examples/llm -| Failure | Startup Time | Success | Failed | Latency Before | Latency After | Pending Before | Pending After | Violations Before | Violations After | Recovery Time | -|:--------------:|---------------:|----------:|---------:|-----------------:|----------------:|-----------------:|----------------:|--------------------:|-------------------:|----------------:| -| none | 83.00 | 800.00 | 0.00 | 1.19 | N/A | 0.01 | N/A | 0.00 | N/A | N/A | -| frontend | 78.00 | 664.00 | 136.00 | 1.19 | 1.19 | 0.07 | 0.02 | 0.00 | 0.00 | 17.24 | -| processor | 77.00 | 576.00 | 224.00 | 1.19 | 1.19 | 0.00 | 0.00 | 0.00 | 0.00 | 26.90 | -| decode_worker | 72.00 | 200.00 | 600.00 | 1.20 | 1.28 | 0.03 | N/A | 0.00 | 0.00 | N/A | -| prefill_worker | 81.00 | 798.00 | 2.00 | 1.19 | 1.22 | 0.05 | 0.05 | 0.00 | 0.00 | 42.31 | -| vllm_worker | 83.00 | 797.00 | 3.00 | 1.19 | 1.22 | 0.00 | 0.03 | 0.00 | 8.00 | N/A | - -#### Summary: - - -1. Dynamo does not currently detect and recover from direct vllm - worker sub process failure. In this example the vllm sub process - failure targets a prefill worker and has the same overall impact. - (WIP) - -2. Prefill worker failure causes request timeout (30 sec) and in - addition during recovery time prefill requests are queued in the - prefill queue. - -3. Decode worker failure is currently permanent in the disaggregated - case as the prefill worker holds references to memory and which are - not freed. This leads to total failure after fault injection. - - -#### Redundant Workers - -To demonstrate the failure and recovery time in the case that there -are multiple instances of each process (except for the frontend and -decode worker) we ran a simple "disagg-p-tp-2-dp-2-d-tp-4-dp-1" -configuration. 
- - -```mermaid -graph LR - Client["Client"] - Frontend["Frontend"] - Processor_1["Processor 1"] - Processor_2["Processor 2"] - PrefillQueue["Remote Prefill Queue"] - - Client --> Frontend - Frontend --> Processor_1 - Frontend --> Processor_2 - - Processor_1 <--> DecodePool - Processor_2 <--> DecodePool - - %% Prefill Worker Pool (horizontal layout) - subgraph PrefillPool["Prefill Worker Pool"] - direction LR - subgraph Prefill1["Prefill 1"] - direction TB - P1GPU0["GPU 0"] - P1GPU1["GPU 1"] - end - subgraph Prefill2["Prefill 2"] - direction TB - P2GPU0["GPU 0"] - P2GPU1["GPU 1"] - end - - end - - %% Decode Worker Pool (vertical layout) - subgraph DecodePool["Decode Worker Pool"] - direction TB - subgraph Decode1["Decode 1"] - direction TB - D1GPU0["GPU 0"] - D1GPU1["GPU 1"] - D1GPU2["GPU 2"] - D1GPU3["GPU 3"] - end - end - - - PrefillQueue --> PrefillPool - DecodePool --> PrefillQueue - PrefillPool -.-> DecodePool - - %% Styling - style PrefillPool stroke:#0066cc,stroke-width:2px - style DecodePool stroke:#000,stroke-width:2px -``` - -#### Results: - -**Test Group:** disagg-p-tp-2-dp-2-d-tp-4-dp-1 - -**Test Command:** dynamo serve graphs.disagg:Frontend -f /workspace/tests/fault_tolerance/configs/disagg_p_tp_2_dp_2_d_tp_4_dp_1.yaml --Frontend.port 8000 in /workspace/examples/llm - -| Failure | Startup Time | Success | Failed | Latency Before | Latency After | Pending Before | Pending After | Violations Before | Violations After | Recovery Time | -|:--------------:|---------------:|----------:|---------:|-----------------:|----------------:|-----------------:|----------------:|--------------------:|-------------------:|----------------:| -| none | 83.00 | 800.00 | 0.00 | 1.19 | N/A | 0.00 | N/A | 1.00 | N/A | N/A | -| frontend | 82.00 | 704.00 | 96.00 | 1.19 | 1.17 | 0.00 | 0.01 | 1.00 | 0.00 | 12.95 | -| processor | 78.00 | 795.00 | 5.00 | 1.20 | 1.18 | 0.02 | 0.01 | 1.00 | 0.00 | 25.91 | -| decode_worker | 78.00 | 199.00 | 601.00 | 1.21 | nan | 0.00 | N/A | 1.00 
| 0.00 | N/A | -| prefill_worker | 77.00 | 800.00 | 0.00 | 1.22 | 1.18 | 0.00 | 0.01 | 1.00 | 1.00 | 45.14 | -| vllm_worker | 77.00 | 799.00 | 1.00 | 1.20 | 1.16 | 0.02 | 0.00 | 1.00 | 1.00 | N/A | - -#### Summary: - -1. Dynamo does not currently detect and recover from direct vllm - worker sub process failure. In this example the vllm sub process - failure targets a prefill worker and has the same overall impact. - Since the prefill workers are redundant - a failure has low impact. - -2. Redundant prefill workers are able to absorb the load and no - additional queing is needed. - -3. Decode worker failure is currently permanent in the disaggregated - case as the prefill worker holds references to memory and which are - not freed. This leads to total failure after fault injection. - -4. Redundant processors work in this case. diff --git a/tests/fault_tolerance/configs/agg_tp_1_dp_1.yaml b/tests/fault_tolerance/configs/agg_tp_1_dp_1.yaml deleted file mode 100644 index 462ae7c661..0000000000 --- a/tests/fault_tolerance/configs/agg_tp_1_dp_1.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-Common: - model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - block-size: 64 - max-model-len: 16384 - -Frontend: - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - endpoint: dynamo.Processor.chat/completions - port: 8000 - -Processor: - router: round-robin - router-num-threads: 4 - common-configs: [model, block-size, max-model-len] - -VllmWorker: - enforce-eager: true - max-num-batched-tokens: 16384 - enable-prefix-caching: true - ServiceArgs: - workers: 1 - resources: - gpu: '1' - common-configs: [model, block-size, max-model-len] - -Planner: - environment: local - no-operation: true \ No newline at end of file diff --git a/tests/fault_tolerance/configs/agg_tp_1_dp_4.yaml b/tests/fault_tolerance/configs/agg_tp_1_dp_4.yaml deleted file mode 100644 index fd0df677c3..0000000000 --- a/tests/fault_tolerance/configs/agg_tp_1_dp_4.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-Common: - model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - block-size: 64 - max-model-len: 16384 - -Frontend: - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - endpoint: dynamo.Processor.chat/completions - port: 8000 - -Processor: - router: round-robin - router-num-threads: 4 - common-configs: [model, block-size, max-model-len] - -VllmWorker: - enforce-eager: true - max-num-batched-tokens: 16384 - enable-prefix-caching: true - ServiceArgs: - workers: 4 - resources: - gpu: '1' - common-configs: [model, block-size, max-model-len] - -Planner: - environment: local - no-operation: true diff --git a/tests/fault_tolerance/configs/agg_tp_1_dp_8.yaml b/tests/fault_tolerance/configs/agg_tp_1_dp_8.yaml deleted file mode 100644 index 2862b36fc6..0000000000 --- a/tests/fault_tolerance/configs/agg_tp_1_dp_8.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-Common: - model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - block-size: 64 - max-model-len: 16384 - -Frontend: - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - endpoint: dynamo.Processor.chat/completions - port: 8000 - -Processor: - router: round-robin - router-num-threads: 4 - common-configs: [model, block-size, max-model-len] - -VllmWorker: - enforce-eager: true - max-num-batched-tokens: 16384 - enable-prefix-caching: true - ServiceArgs: - workers: 8 - resources: - gpu: '1' - common-configs: [model, block-size, max-model-len] - -Planner: - environment: local - no-operation: true diff --git a/tests/fault_tolerance/configs/agg_tp_2_dp_1.yaml b/tests/fault_tolerance/configs/agg_tp_2_dp_1.yaml deleted file mode 100644 index 160a2afbad..0000000000 --- a/tests/fault_tolerance/configs/agg_tp_2_dp_1.yaml +++ /dev/null @@ -1,43 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-Common: - model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - block-size: 64 - max-model-len: 16384 - -Frontend: - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - endpoint: dynamo.Processor.chat/completions - port: 8000 - -Processor: - router: round-robin - router-num-threads: 4 - common-configs: [model, block-size, max-model-len] - -VllmWorker: - enforce-eager: true - max-num-batched-tokens: 16384 - enable-prefix-caching: true - tensor-parallel-size: 2 - ServiceArgs: - workers: 1 - resources: - gpu: '2' - common-configs: [model, block-size, max-model-len] - -Planner: - environment: local - no-operation: true diff --git a/tests/fault_tolerance/configs/agg_tp_2_dp_2.yaml b/tests/fault_tolerance/configs/agg_tp_2_dp_2.yaml deleted file mode 100644 index b1b560c779..0000000000 --- a/tests/fault_tolerance/configs/agg_tp_2_dp_2.yaml +++ /dev/null @@ -1,54 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-Common: - model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - block-size: 64 - max-model-len: 16384 - -Frontend: - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - endpoint: dynamo.Processor.chat/completions - port: 8000 - ServiceArgs: - workers: 1 - resources: - cpu: "10" - memory: "20Gi" - - -Processor: - router: round-robin - router-num-threads: 4 - common-configs: [model, block-size, max-model-len] - ServiceArgs: - workers: 2 - resources: - cpu: "10" - memory: "20Gi" - -VllmWorker: - enforce-eager: true - max-num-batched-tokens: 16384 - enable-prefix-caching: true - tensor-parallel-size: 2 - ServiceArgs: - workers: 2 - resources: - gpu: '2' - common-configs: [model, block-size, max-model-len] - -Planner: - environment: local - no-operation: true diff --git a/tests/fault_tolerance/configs/agg_tp_2_dp_4.yaml b/tests/fault_tolerance/configs/agg_tp_2_dp_4.yaml deleted file mode 100644 index 3ab65d65e3..0000000000 --- a/tests/fault_tolerance/configs/agg_tp_2_dp_4.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-Common: - model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - block-size: 64 - max-model-len: 16384 - -Frontend: - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - endpoint: dynamo.Processor.chat/completions - port: 8000 - -Processor: - router: round-robin - router-num-threads: 4 - common-configs: [model, block-size, max-model-len] - ServiceArgs: - workers: 2 - resources: - cpu: "10" - memory: "20Gi" -VllmWorker: - enforce-eager: true - max-num-batched-tokens: 16384 - enable-prefix-caching: true - tensor-parallel-size: 2 - ServiceArgs: - workers: 4 - resources: - gpu: '2' - common-configs: [model, block-size, max-model-len] - -Planner: - environment: local - no-operation: true diff --git a/tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_1_dp_1.yaml b/tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_1_dp_1.yaml deleted file mode 100644 index 77f405a9f9..0000000000 --- a/tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_1_dp_1.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-Common: - model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - block-size: 64 - max-model-len: 16384 - kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}' - -Frontend: - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - endpoint: dynamo.Processor.chat/completions - port: 8000 - -Processor: - router: round-robin - common-configs: [model, block-size] - -VllmWorker: - remote-prefill: true - conditional-disagg: true - max-local-prefill-length: 10 - max-prefill-queue-size: 2 - ServiceArgs: - workers: 1 - resources: - gpu: '1' - common-configs: [model, block-size, max-model-len, kv-transfer-config] - -PrefillWorker: - max-num-batched-tokens: 16384 - ServiceArgs: - workers: 1 - resources: - gpu: '1' - common-configs: [model, block-size, max-model-len, kv-transfer-config] - -Planner: - environment: local - no-operation: true diff --git a/tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_2_dp_1.yaml b/tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_2_dp_1.yaml deleted file mode 100644 index cae3c53c37..0000000000 --- a/tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_2_dp_1.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-Common: - model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - block-size: 64 - max-model-len: 16384 - kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}' - -Frontend: - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - endpoint: dynamo.Processor.chat/completions - port: 8000 - -Processor: - router: round-robin - common-configs: [model, block-size] - -VllmWorker: - remote-prefill: true - conditional-disagg: true - max-local-prefill-length: 10 - max-prefill-queue-size: 2 - tensor-parallel-size: 2 - ServiceArgs: - workers: 1 - resources: - gpu: '2' - common-configs: [model, block-size, max-model-len, kv-transfer-config] - -PrefillWorker: - max-num-batched-tokens: 16384 - tensor-parallel-size: 1 - ServiceArgs: - workers: 1 - resources: - gpu: '1' - common-configs: [model, block-size, max-model-len, kv-transfer-config] - -Planner: - environment: local - no-operation: true diff --git a/tests/fault_tolerance/configs/disagg_p_tp_1_dp_2_d_tp_2_dp_1.yaml b/tests/fault_tolerance/configs/disagg_p_tp_1_dp_2_d_tp_2_dp_1.yaml deleted file mode 100644 index d123b80bc0..0000000000 --- a/tests/fault_tolerance/configs/disagg_p_tp_1_dp_2_d_tp_2_dp_1.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-Common: - model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - block-size: 64 - max-model-len: 16384 - kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}' - -Frontend: - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - endpoint: dynamo.Processor.chat/completions - port: 8000 - -Processor: - router: round-robin - common-configs: [model, block-size] - ServiceArgs: - workers: 2 - resources: - cpu: "10" - memory: "20Gi" - -VllmWorker: - remote-prefill: true - conditional-disagg: true - max-local-prefill-length: 10 - max-prefill-queue-size: 2 - tensor-parallel-size: 2 - ServiceArgs: - workers: 1 - resources: - gpu: '2' - common-configs: [model, block-size, max-model-len, kv-transfer-config] - -PrefillWorker: - max-num-batched-tokens: 16384 - tensor-parallel-size: 1 - ServiceArgs: - workers: 2 - resources: - gpu: '1' - common-configs: [model, block-size, max-model-len, kv-transfer-config] - -Planner: - environment: local - no-operation: true diff --git a/tests/fault_tolerance/configs/disagg_p_tp_1_dp_4_d_tp_4_dp_1.yaml b/tests/fault_tolerance/configs/disagg_p_tp_1_dp_4_d_tp_4_dp_1.yaml deleted file mode 100644 index bd0d88024c..0000000000 --- a/tests/fault_tolerance/configs/disagg_p_tp_1_dp_4_d_tp_4_dp_1.yaml +++ /dev/null @@ -1,52 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-Common: - model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - block-size: 64 - max-model-len: 16384 - kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}' - -Frontend: - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - endpoint: dynamo.Processor.chat/completions - port: 8000 - -Processor: - router: round-robin - common-configs: [model, block-size] - -VllmWorker: - remote-prefill: true - conditional-disagg: true - max-local-prefill-length: 10 - max-prefill-queue-size: 2 - tensor-parallel-size: 4 - ServiceArgs: - workers: 1 - resources: - gpu: '4' - common-configs: [model, block-size, max-model-len, kv-transfer-config] - -PrefillWorker: - max-num-batched-tokens: 16384 - ServiceArgs: - workers: 4 - resources: - gpu: '1' - common-configs: [model, block-size, max-model-len, kv-transfer-config] - -Planner: - environment: local - no-operation: true diff --git a/tests/fault_tolerance/configs/disagg_p_tp_2_dp_1_d_tp_4_dp_1.yaml b/tests/fault_tolerance/configs/disagg_p_tp_2_dp_1_d_tp_4_dp_1.yaml deleted file mode 100644 index 58adea6a05..0000000000 --- a/tests/fault_tolerance/configs/disagg_p_tp_2_dp_1_d_tp_4_dp_1.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-Common: - model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - block-size: 64 - max-model-len: 16384 - kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}' - -Frontend: - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - endpoint: dynamo.Processor.chat/completions - port: 8000 - -Processor: - router: round-robin - common-configs: [model, block-size] - -VllmWorker: - remote-prefill: true - conditional-disagg: true - max-local-prefill-length: 10 - max-prefill-queue-size: 2 - tensor-parallel-size: 4 - ServiceArgs: - workers: 1 - resources: - gpu: '4' - common-configs: [model, block-size, max-model-len, kv-transfer-config] - -PrefillWorker: - max-num-batched-tokens: 16384 - tensor-parallel-size: 2 - ServiceArgs: - workers: 1 - resources: - gpu: '2' - common-configs: [model, block-size, max-model-len, kv-transfer-config] - -Planner: - environment: local - no-operation: true diff --git a/tests/fault_tolerance/configs/disagg_p_tp_2_dp_2_d_tp_4_dp_1.yaml b/tests/fault_tolerance/configs/disagg_p_tp_2_dp_2_d_tp_4_dp_1.yaml deleted file mode 100644 index f6a08d3762..0000000000 --- a/tests/fault_tolerance/configs/disagg_p_tp_2_dp_2_d_tp_4_dp_1.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-Common: - model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - block-size: 64 - max-model-len: 16384 - kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}' - -Frontend: - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - endpoint: dynamo.Processor.chat/completions - port: 8000 - -Processor: - router: round-robin - common-configs: [model, block-size] - ServiceArgs: - workers: 2 - resources: - cpu: "10" - memory: "20Gi" - -VllmWorker: - remote-prefill: true - conditional-disagg: true - max-local-prefill-length: 10 - max-prefill-queue-size: 2 - tensor-parallel-size: 4 - ServiceArgs: - workers: 1 - resources: - gpu: '4' - common-configs: [model, block-size, max-model-len, kv-transfer-config] - -PrefillWorker: - max-num-batched-tokens: 16384 - tensor-parallel-size: 2 - ServiceArgs: - workers: 2 - resources: - gpu: '2' - common-configs: [model, block-size, max-model-len, kv-transfer-config] - -Planner: - environment: local - no-operation: true diff --git a/tests/fault_tolerance/conftest.py b/tests/fault_tolerance/conftest.py deleted file mode 100644 index 1b5dabfe3e..0000000000 --- a/tests/fault_tolerance/conftest.py +++ /dev/null @@ -1,79 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - - -def pytest_addoption(parser): - parser.addoption("--requests-per-client", type=int, default=100) - parser.addoption("--clients", type=int, default=10) - parser.addoption("--no-respawn", action="store_true", default=False) - parser.addoption("--input-token-length", type=int, default=100) - parser.addoption("--output-token-length", type=int, default=100) - parser.addoption("--max-num-seqs", type=int, default=None) - parser.addoption("--max-retries", type=int, default=1) - parser.addoption("--display-dynamo-output", action="store_true", default=False) - parser.addoption("--combine-process-logs", action="store_true", default=False) - parser.addoption("--hf-hub-offline", action="store_true", default=False) - - -@pytest.fixture -def display_dynamo_output(request): - return request.config.getoption("--display-dynamo-output") - - -@pytest.fixture -def max_retries(request): - return request.config.getoption("--max-retries") - - -@pytest.fixture -def max_num_seqs(request): - return request.config.getoption("--max-num-seqs") - - -@pytest.fixture -def num_clients(request): - return request.config.getoption("--clients") - - -@pytest.fixture -def input_token_length(request): - return request.config.getoption("--input-token-length") - - -@pytest.fixture -def output_token_length(request): - return request.config.getoption("--output-token-length") - - -@pytest.fixture -def requests_per_client(request): - return request.config.getoption("--requests-per-client") - - -@pytest.fixture -def respawn(request): - return not request.config.getoption("--no-respawn") - - -@pytest.fixture -def separate_process_logs(request): - return not request.config.getoption("--combine-process-logs") - - -@pytest.fixture -def hf_hub_offline(request): - return request.config.getoption("--hf-hub-offline") diff --git a/tests/fault_tolerance/parse_results.py b/tests/fault_tolerance/parse_results.py deleted file mode 100644 index f1132bc804..0000000000 --- 
a/tests/fault_tolerance/parse_results.py +++ /dev/null @@ -1,433 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json -import os -import re -from datetime import datetime -from typing import Any - -import pandas as pd -from tabulate import tabulate - - -def parse_test_log(file_path): - start_time = None - ready_time = None - fault_time = None - start_cmd = None - if not os.path.isfile(file_path): - return None, None, None - with open(file_path, "r") as f: - for line in f: - line = line.strip() - if "Running command: dynamo serve" in line: - start_time = datetime.fromisoformat( - line.split(" ")[1].replace("T", " ") - ) - start_cmd = line.split("Running command:")[1] - elif "Deployment Ready" in line: - ready_time = datetime.fromisoformat( - line.split(" ")[1].replace("T", " ") - ) - elif "Injecting failure for:" in line: - fault_time = datetime.fromisoformat( - line.split(" ")[1].replace("T", " ") - ) - startup_time = ( - (ready_time - start_time).total_seconds() if start_time and ready_time else None - ) - return startup_time, fault_time, start_cmd - - -def parse_client_logs(test_dir, expected_length=100): - all_logs = [] - for file in os.listdir(test_dir): - if file.startswith("client_") and file.endswith(".log.txt"): - with open(os.path.join(test_dir, file), "r") as f: - request_number = 0 - for 
line in f: - request_number += 1 - data = json.loads(line.strip()) - for result in data["results"]: - log_entry = { - "time": datetime.fromisoformat( - data["time"].replace("T", " ") - ), - "status": result["status"], - "request_elapsed_time": result["request_elapsed_time"], - "request_number": request_number - 1, - "client": file.split("_")[1].split(".")[0], - } - if ( - "result" in result - and result["result"] - and "choices" in result["result"] - and result["result"]["choices"] - ): - log_entry["success"] = True - content = result["result"]["choices"][0]["message"][ - "content" - ] - if not content or len(content) < expected_length: - log_entry["success"] = False - else: - log_entry["success"] = False - all_logs.append(log_entry) - if len(all_logs): - df = pd.DataFrame(all_logs) - df.sort_values("time", inplace=True) - return df - - return None - - -def calculate_metrics(df, fault_time, sla=2.1): - success = df["success"].sum() - failure = len(df) - success - - if fault_time: - before_fault = df[df["time"] <= fault_time] - after_fault = df[df["time"] > fault_time] - else: - before_fault = df - after_fault = None - - # Existing latency metrics (only successful requests) - successful_before = before_fault[before_fault["success"]] - avg_before = successful_before["request_elapsed_time"].mean() - std_before = successful_before["request_elapsed_time"].std() - - avg_after, std_after = None, None - if after_fault is not None and not after_fault.empty: - successful_after = after_fault[after_fault["success"]] - avg_after = successful_after["request_elapsed_time"].mean() - std_after = successful_after["request_elapsed_time"].std() - - # SLA violations (only successful requests exceeding the SLA) - violations_before = (successful_before["request_elapsed_time"] > sla).sum() - violations_after = ( - (successful_after["request_elapsed_time"] > sla).sum() - if after_fault is not None and not after_fault.empty - else None - ) - - return ( - success, - failure, - avg_before, - 
std_before, - avg_after, - std_after, - violations_before, - violations_after, - ) - - -def parse_process_log(log_dir, process_name): - process_ready_line = { - "dynamo_Frontend": "added model", - "dynamo_VllmWorker": "Starting VllmWorker instance with all registered endpoints", - "dynamo_Processor": "Starting Processor instance with all registered endpoints", - "dynamo_PrefillWorker": "Starting PrefillWorker instance with all registered endpoints", - } - process_shutdown_line = { - "dynamo_Frontend": "SIGTERM received, starting graceful shutdown", - "dynamo_VllmWorker": "Received shutdown signal, shutting down DistributedRuntime", - "dynamo_Processor": "Received signal 15, initiating graceful shutdown", - "dynamo_PrefillWorker": "Shutdown hooks completed successfully", - } - process_log_path = os.path.join(log_dir, "error.log") - - if not os.path.isfile(process_log_path): - return None, None - - process_ready = [] - process_shutdown = [] - - process_start_time = None - - with open(process_log_path, "r") as f: - for line in f: - clean_line = re.sub(r"\x1b\[.*?m", "", line.strip()) # Remove ANSI codes - if not clean_line: - continue - - parts = clean_line.split() - if len(parts) < 2: - continue - - try: - # Parse timestamp (remove 'Z' for naive datetime) - timestamp = datetime.fromisoformat(parts[0].replace("Z", "")) - except ValueError: - continue - - if not process_start_time: - process_start_time = timestamp - - log_message = " ".join(parts[1:]) - - relative_time = (timestamp - process_start_time).total_seconds() - - # Check for process start lines - if process_name in process_ready_line: - if process_ready_line[process_name] in log_message: - process_ready.append((timestamp, log_message, relative_time)) - - # Check for process end lines - if process_name in process_shutdown_line: - if process_shutdown_line[process_name] in log_message: - process_shutdown.append((timestamp, log_message, relative_time)) - - return process_ready, process_shutdown - - -def 
parse_watcher_log(test_dir, fault_time): - before_requests = [] - after_requests = [] - watcher_log_path = os.path.join(test_dir, "watcher.log.txt") - if not os.path.isfile(watcher_log_path): - return None, None - with open(watcher_log_path, "r") as f: - for line in f: - try: - data = json.loads(line.strip()) - except json.JSONDecodeError: - continue - if "metrics" not in data: - continue - entry_time = datetime.fromisoformat(data["time"].replace("T", " ")) - for metric in data["metrics"]: - if len(metric) != 2: - continue - _, metric_data = metric - if ( - "num_requests_waiting" in metric_data - and "request_active_slots" in metric_data - and metric_data["request_active_slots"] > 0 - ): - if fault_time is None or entry_time <= fault_time: - before_requests.append(metric_data["num_requests_waiting"]) - else: - after_requests.append(metric_data["num_requests_waiting"]) - - avg_before = ( - sum(before_requests) / len(before_requests) if before_requests else None - ) - avg_after = sum(after_requests) / len(after_requests) if after_requests else None - return avg_before, avg_after - - -def calculate_recovery_time(test_dir, failure_type, fault_time): - processes = [ - "dynamo_Frontend", - "dynamo_Processor", - "dynamo_VllmWorker", - "dynamo_PrefillWorker", - ] - - process_start_ends = {} - start_time = None - - for process in processes: - starts, ends = parse_process_log(os.path.join(test_dir, process), process) - if starts: - process_start_ends[process] = (starts, ends) - - if failure_type == "processor": - start_time = process_start_ends["dynamo_Processor"][0][-1][0] - elif failure_type == "frontend": - start_time = process_start_ends["dynamo_Frontend"][0][-1][0] - elif failure_type == "decode_worker": - start_times = [ - x - for x in process_start_ends["dynamo_VllmWorker"][0] - if "VllmWorker:1" in x[1] - ] - if not start_times: - return None - start_time = start_times[-1][0] - - elif failure_type == "prefill_worker": - if "dynamo_PrefillWorker" not in 
process_start_ends: - return None - start_times = [ - x - for x in process_start_ends["dynamo_PrefillWorker"][0] - if "PrefillWorker:1" in x[1] - ] - start_time = start_times[-1][0] - - if not start_time: - return None - - if fault_time > start_time: - return None - - return (start_time - fault_time).total_seconds() - - -def process_test_directory(test_dir): - test_name = test_dir.split("test_worker_failure[", 1)[1].rstrip("]") - failure_type = test_name.split("-")[-1] - test_prefix = "-".join(test_name.split("-")[:-1]) - - startup_time, fault_time, start_cmd = parse_test_log( - os.path.join(test_dir, "test.log.txt") - ) - df = parse_client_logs(test_dir) - - if df is None or df.empty: - return None - pending_requests_before, pending_requests_after = parse_watcher_log( - test_dir, fault_time - ) - ( - success, - failure, - avg_before, - std_before, - avg_after, - std_after, - violations_before, - violations_after, - ) = calculate_metrics(df, fault_time) - - recovery_time = calculate_recovery_time(test_dir, failure_type, fault_time) - - return { - "test": test_prefix, - "cmd": start_cmd, - "failure": failure_type, - "start_time": startup_time, - "success_requests": success, - "failed_requests": failure, - "avg_latency_before": avg_before, - "std_latency_before": std_before, - "avg_latency_after": avg_after, - "std_latency_after": std_after, - "pending_requests_before": pending_requests_before, - "pending_requests_after": pending_requests_after, - "violations_before": violations_before, - "violations_after": violations_after, - "recovery_time": recovery_time, - } - - -def main(logs_dir, tablefmt, log_paths=[]): - results = [] - if log_paths: - for log_path in log_paths: - result = process_test_directory(log_path) - if result: - results.append(result) - elif logs_dir: - for entry in os.listdir(logs_dir): - if entry.startswith("test_worker_failure[") and os.path.isdir( - os.path.join(logs_dir, entry) - ): - result = process_test_directory(os.path.join(logs_dir, entry)) 
- if result: - results.append(result) - - # Group results by test prefix - grouped: dict[str, list[dict[str, Any]]] = {} - commands = {} - for res in results: - test_prefix = res["test"] - if test_prefix not in grouped: - grouped[test_prefix] = [] - commands[test_prefix] = res["cmd"] - grouped[test_prefix].append(res) - - order = [ - "none", - "frontend", - "processor", - "decode_worker", - "prefill_worker", - "vllm_worker", - ] - - # Print grouped tables - for test_prefix, group in grouped.items(): - new_group = [] - for failure in order: - for res in group: - if failure == res["failure"]: - new_group.append(res) - group = new_group - headers = [ - "Failure", - "Startup Time", - "Success", - "Failed", - "Latency Before", - "Latency After", - "Pending Before", - "Pending After", - "Violations Before", - "Violations After", - "Recovery Time", - ] - rows = [] - for res in group: - row = [ - res["failure"], - res["start_time"], # if res["start_time"] is not None else "N/A", - res["success_requests"], - res["failed_requests"], - res["avg_latency_before"], - res["avg_latency_after"], - res["pending_requests_before"], - res["pending_requests_after"], - res["violations_before"], - res["violations_after"], - res["recovery_time"], - ] - rows.append(row) - - print(f"\nTest Group: {test_prefix}") - print(f"\nTest Command: {commands[test_prefix]}") - print( - tabulate( - rows, - headers, - tablefmt=tablefmt, - floatfmt=".2f", - missingval="N/A", - numalign="right", - stralign="center", - ) - ) - print("\n" + "=" * 80) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Parse test results") - parser.add_argument("--log-dir", default=".", help="Path to the logs directory") - parser.add_argument( - "--format", choices=["fancy", "markdown"], default="fancy", help="Table format" - ) - args = parser.parse_args() - - # Map format choices to tabulate formats - tablefmt = ( - "fancy_grid" if args.format == "fancy" else "pipe" - ) # Using pipe for markdown 
compatibility - - main(args.log_dir, tablefmt) diff --git a/tests/fault_tolerance/scenarios.py b/tests/fault_tolerance/scenarios.py deleted file mode 100644 index 0b19273560..0000000000 --- a/tests/fault_tolerance/scenarios.py +++ /dev/null @@ -1,219 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from tests.utils.deployment_graph import ( - DeploymentGraph, - Payload, - chat_completions_response_handler, -) - -# Initial payload used for testing -# initial deployment readiness. - -text_prompt = "Tell me a short joke about AI." 
- -text_payload = Payload( - payload_chat={ - "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", - "messages": [ - { - "role": "user", - "content": text_prompt, # Shorter prompt - } - ], - "max_tokens": 150, - "temperature": 0.1, - # "seed": 10, - "ignore_eos": True, - "min_tokens": 150, - "stream": False, - }, - expected_log=[], - expected_response=["AI"], -) - -# Each Deployment Graph contains -# the dynamo serve module and configuration as well -# as the endpoint for interaction - -deployment_graphs = { - "agg-tp-1-dp-1": ( - DeploymentGraph( - module="graphs.agg:Frontend", - config="/workspace/tests/fault_tolerance/configs/agg_tp_1_dp_1.yaml", - directory="/workspace/examples/llm", - endpoints=["v1/chat/completions"], - response_handlers=[chat_completions_response_handler], - marks=[pytest.mark.gpu_1, pytest.mark.vllm], - ), - text_payload, - ), - "agg-tp-1-dp-8": ( - DeploymentGraph( - module="graphs.agg:Frontend", - config="/workspace/tests/fault_tolerance/configs/agg_tp_1_dp_8.yaml", - directory="/workspace/examples/llm", - endpoints=["v1/chat/completions"], - response_handlers=[chat_completions_response_handler], - marks=[pytest.mark.gpu_8, pytest.mark.vllm], - ), - text_payload, - ), - "agg-tp-1-dp-4": ( - DeploymentGraph( - module="graphs.agg:Frontend", - config="/workspace/tests/fault_tolerance/configs/agg_tp_1_dp_4.yaml", - directory="/workspace/examples/llm", - endpoints=["v1/chat/completions"], - response_handlers=[chat_completions_response_handler], - marks=[pytest.mark.gpu_4, pytest.mark.vllm], - ), - text_payload, - ), - "agg-tp-2-dp-1": ( - DeploymentGraph( - module="graphs.agg:Frontend", - config="/workspace/tests/fault_tolerance/configs/agg_tp_2_dp_1.yaml", - directory="/workspace/examples/llm", - endpoints=["v1/chat/completions"], - response_handlers=[chat_completions_response_handler], - marks=[pytest.mark.gpu_2, pytest.mark.vllm], - ), - text_payload, - ), - "agg-tp-2-dp-2": ( - DeploymentGraph( - module="graphs.agg:Frontend", - 
config="/workspace/tests/fault_tolerance/configs/agg_tp_2_dp_2.yaml", - directory="/workspace/examples/llm", - endpoints=["v1/chat/completions"], - response_handlers=[chat_completions_response_handler], - marks=[pytest.mark.gpu_4, pytest.mark.vllm], - ), - text_payload, - ), - "agg-tp-2-dp-4": ( - DeploymentGraph( - module="graphs.agg:Frontend", - config="/workspace/tests/fault_tolerance/configs/agg_tp_2_dp_4.yaml", - directory="/workspace/examples/llm", - endpoints=["v1/chat/completions"], - response_handlers=[chat_completions_response_handler], - marks=[pytest.mark.gpu_8, pytest.mark.vllm], - ), - text_payload, - ), - "disagg-p-tp-1-dp-1-d-tp-1-dp-1": ( - DeploymentGraph( - module="graphs.disagg:Frontend", - config="/workspace/tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_1_dp_1.yaml", - directory="/workspace/examples/llm", - endpoints=["v1/chat/completions"], - response_handlers=[chat_completions_response_handler], - marks=[pytest.mark.gpu_2, pytest.mark.vllm], - ), - text_payload, - ), - "disagg-p-tp-1-dp-4-d-tp-4-dp-1": ( - DeploymentGraph( - module="graphs.disagg:Frontend", - config="/workspace/tests/fault_tolerance/configs/disagg_p_tp_1_dp_4_d_tp_4_dp_1.yaml", - directory="/workspace/examples/llm", - endpoints=["v1/chat/completions"], - response_handlers=[chat_completions_response_handler], - marks=[pytest.mark.gpu_8, pytest.mark.vllm], - ), - text_payload, - ), - "disagg-p-tp-2-dp-2-d-tp-4-dp-1": ( - DeploymentGraph( - module="graphs.disagg:Frontend", - config="/workspace/tests/fault_tolerance/configs/disagg_p_tp_2_dp_2_d_tp_4_dp_1.yaml", - directory="/workspace/examples/llm", - endpoints=["v1/chat/completions"], - response_handlers=[chat_completions_response_handler], - marks=[pytest.mark.gpu_8, pytest.mark.vllm], - ), - text_payload, - ), - "disagg-p-tp-2-dp-1-d-tp-4-dp-1": ( - DeploymentGraph( - module="graphs.disagg:Frontend", - config="/workspace/tests/fault_tolerance/configs/disagg_p_tp_2_dp_1_d_tp_4_dp_1.yaml", - 
directory="/workspace/examples/llm", - endpoints=["v1/chat/completions"], - response_handlers=[chat_completions_response_handler], - marks=[pytest.mark.gpu_8, pytest.mark.vllm], - ), - text_payload, - ), - "disagg-p-tp-1-dp-2-d-tp-2-dp-1": ( - DeploymentGraph( - module="graphs.disagg:Frontend", - config="/workspace/tests/fault_tolerance/configs/disagg_p_tp_1_dp_2_d_tp_2_dp_1.yaml", - directory="/workspace/examples/llm", - endpoints=["v1/chat/completions"], - response_handlers=[chat_completions_response_handler], - marks=[pytest.mark.gpu_4, pytest.mark.vllm], - ), - text_payload, - ), - "disagg-p-tp-1-dp-1-d-tp-2-dp-1": ( - DeploymentGraph( - module="graphs.disagg:Frontend", - config="/workspace/tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_2_dp_1.yaml", - directory="/workspace/examples/llm", - endpoints=["v1/chat/completions"], - response_handlers=[chat_completions_response_handler], - marks=[pytest.mark.gpu_4, pytest.mark.vllm], - ), - text_payload, - ), -} - -# Each failure scenaro contains a list of failure injections -# Each failure injection has a time in seconds after the pervious injection and -# a list of failures to inject including the number of failures for each type. -# Failures are currently process termination. -# -# Example: -# -# "prefill_worker": [[30, [("dynamo_prefillworker", 1)]]], -# -# terminates 1 prefill worker after 30 seconds - -failure_scenarios = { - "decode_worker": [[30, [("dynamo_vllmworker", 1)]]], - "prefill_worker": [[30, [("dynamo_prefillworker", 1)]]], - "frontend": [[30, [("dynamo_frontend", 1)]]], - "processor": [[30, [("dynamo_processor", 1)]]], - "vllm_worker": [[30, [("vllm_worker", 1)]]], - "none": [], -} - - -@pytest.fixture(params=list(failure_scenarios.keys())) -def failures(request): - return failure_scenarios[request.param] - - -@pytest.fixture(params=list(deployment_graphs.keys())) -def deployment_graph_test(request): - """ - Fixture that provides different deployment graph test configurations. 
- """ - return deployment_graphs[request.param] diff --git a/tests/fault_tolerance/test_runner.py b/tests/fault_tolerance/test_runner.py deleted file mode 100644 index 461f690979..0000000000 --- a/tests/fault_tolerance/test_runner.py +++ /dev/null @@ -1,218 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import time -from contextlib import contextmanager -from multiprocessing import Process - -import psutil -import pytest - -from tests.fault_tolerance.client import client -from tests.fault_tolerance.parse_results import main as parse_results -from tests.fault_tolerance.scenarios import ( # noqa: F401 - deployment_graph_test, - failures, -) -from tests.fault_tolerance.utils.circus_controller import CircusController -from tests.fault_tolerance.utils.metrics import nvidia_smi # noqa: F401 -from tests.fault_tolerance.utils.metrics import worker_metrics # noqa: F401 -from tests.serve.test_dynamo_serve import DynamoServeProcess -from tests.utils.managed_process import terminate_process_tree - - -def _set_deployment_args(request, max_num_seqs): - decode_worker_name = "VllmWorker" - args = {} - - if max_num_seqs is not None: - args[f"--{decode_worker_name}.max_num_seqs"] = max_num_seqs - - return args - - -def _list_vllm_worker_processes(): - processes = [] - for ps_process in 
psutil.process_iter(["name", "cmdline"]): - try: - if "from multiprocessing.spawn import spawn_main;" in " ".join( - ps_process.cmdline() - ): - processes.append(ps_process.pid) - except Exception: - pass - return processes - - -@contextmanager -def _clients( - logger, - num_clients, - request, - deployment_graph, - server_process, - payload, - requests_per_client, - input_token_length, - output_token_length, - max_retries, -): - procs = [] - for i in range(num_clients): - procs.append( - Process( - target=client, - args=( - deployment_graph, - server_process, - payload, - request.node.name, - i, - requests_per_client, - input_token_length, - output_token_length, - max_retries, - ), - ) - ) - procs[-1].start() - yield procs - - for proc in procs: - logger.debug(f"{proc} waiting for join") - proc.join() - logger.debug(f"{proc} joined") - - -def _inject_failures(failures, logger): # noqa: F811 - circus_controller = CircusController.from_state_file("dynamo") - - for failure_time, component in failures: - time.sleep(failure_time) - for component_name, number in component: - logger.info(f"Injecting failure for: {component_name}") - - if "dynamo" in component_name: - result = circus_controller.client.call( - {"command": "list", "properties": {"name": f"{component_name}"}} - ) - if result["status"] == "error": - logger.warning(f"component {component_name} not found {result}") - continue - - num_processes = len(result["pids"]) - if number is None: - number = num_processes - for x in range(number): - pid = result["pids"][x % num_processes] - logger.info(f"Terminating {component_name} Pid {pid}") - terminate_process_tree(pid, logger, immediate_kill=True) - elif "vllm" in component_name: - vllm_processes = _list_vllm_worker_processes() - num_processes = len(vllm_processes) - if number is None: - number = len(vllm_processes) - for x in range(number): - pid = vllm_processes[x % num_processes] - terminate_process_tree(pid, logger, immediate_kill=True) - - 
circus_controller.close() - - -global_result_list = [] - - -@pytest.fixture(autouse=True) -def results_table(request): - yield - parse_results(logs_dir=None, log_paths=[request.node.name], tablefmt="fancy") - global_result_list.append(request.node.name) - - -@pytest.fixture(autouse=True, scope="session") -def results_summary(): - yield - parse_results(logs_dir=None, log_paths=global_result_list, tablefmt="fancy") - - -@pytest.mark.e2e -@pytest.mark.slow -def test_worker_failure( - deployment_graph_test, # noqa: F811 - request, - runtime_services, - num_clients, - requests_per_client, - worker_metrics, # noqa: F811 - respawn, - failures, # noqa: F811 - input_token_length, - output_token_length, - max_num_seqs, - max_retries, - display_dynamo_output, - nvidia_smi, # noqa: F811 - separate_process_logs, - hf_hub_offline, -): - """ - Test dynamo serve deployments with injected failures - """ - - # runtime_services is used to start nats and etcd - - logger = logging.getLogger(request.node.name) - logger.info("Starting test_deployment") - deployment_graph, payload = deployment_graph_test - - if hf_hub_offline: - os.environ["HF_HUB_OFFLINE"] = "1" - else: - if "HF_HUB_OFFLINE" in os.environ: - del os.environ["HF_HUB_OFFLINE"] - if respawn: - os.environ["DYN_CIRCUS_RESPAWN"] = "1" - else: - if "DYN_CIRCUS_RESPAWN" in os.environ: - del os.environ["DYN_CIRCUS_RESPAWN"] - - if separate_process_logs: - os.environ["DYN_CIRCUS_LOG_DIR"] = os.path.abspath(request.node.name) - - deployment_args = _set_deployment_args(request, max_num_seqs) - - with DynamoServeProcess( - deployment_graph, - request, - display_output=display_dynamo_output, - args=deployment_args, - ) as server_process: - server_process.wait_for_ready(payload) - - with _clients( - logger, - num_clients, - request, - deployment_graph, - server_process, - payload, - requests_per_client, - input_token_length, - output_token_length, - max_retries, - ): - _inject_failures(failures, logger) diff --git 
a/tests/fault_tolerance/utils/circus_controller.py b/tests/fault_tolerance/utils/circus_controller.py deleted file mode 100644 index 477c0f1c87..0000000000 --- a/tests/fault_tolerance/utils/circus_controller.py +++ /dev/null @@ -1,107 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import logging -import os -from pathlib import Path -from typing import List, Optional - -from circus.client import CircusClient -from circus.exc import CallError - -logger = logging.getLogger(__name__) - - -class CircusController: - """A circus client implementation for Dynamo""" - - def __init__(self, endpoint: str): - """Initialize connection to arbiter. - - Args: - endpoint: The circus endpoint (e.g., tcp://127.0.0.1:54927) - """ - self.endpoint = endpoint - self.client = CircusClient(endpoint=endpoint, timeout=15.0) - - @classmethod - def from_state_file(cls, namespace: str) -> "CircusController": - """ - Create a CircusController from a Dynamo state file. 
- - Args: - namespace: The Dynamo namespace - - Returns: - CircusController instance - - Raises: - FileNotFoundError: If state file doesn't exist - ValueError: If no endpoint found in state file - """ - state_file = ( - Path( - os.environ.get("DYN_LOCAL_STATE_DIR", Path.home() / ".dynamo" / "state") - ) - / f"{namespace}.json" - ) - if not state_file.exists(): - raise FileNotFoundError(f"State file not found: {state_file}") - - with open(state_file, "r") as f: - state = json.load(f) - - endpoint = state.get("circus_endpoint") - if not endpoint: - raise ValueError(f"No endpoint found in state file: {state_file}") - - return cls(endpoint) - - async def _get_watcher_processes(self, name: str) -> Optional[int]: - """ - Get number of processes for a watcher. - - Args: - name: The name of the watcher - - Returns: - Number of processes for the watcher. Returns None operation fails. - """ - try: - response = self.client.send_message("numprocesses", name=name) - return int(response.get("numprocesses", 0)) - except (CallError, Exception) as e: - logger.error(f"Failed to get process count for {name}: {e}") - return None - - async def _list_watchers(self) -> List[str]: - """ - List all watchers managed by circus. - - Returns: - List of watcher names. Returns None if the list operation fails. - """ - try: - response = self.client.send_message("list") - return response.get("watchers", []) - except (CallError, Exception) as e: - logger.error(f"Failed to list watchers: {e}") - return [] - - def close(self) -> None: - """Close the connection to the arbiter.""" - if hasattr(self, "client"): - self.client.stop() diff --git a/tests/fault_tolerance/utils/metrics.py b/tests/fault_tolerance/utils/metrics.py deleted file mode 100644 index c1635fb2cd..0000000000 --- a/tests/fault_tolerance/utils/metrics.py +++ /dev/null @@ -1,127 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import asyncio -import json -import os -from datetime import datetime -from multiprocessing import Process - -import psutil -import pytest - -from dynamo.runtime import dynamo_worker -from tests.fault_tolerance.utils.circus_controller import CircusController -from tests.utils.managed_process import ManagedProcess - - -def run_metrics_process(log_dir): - asyncio.run(get_metrics(log_dir)) - - -@dynamo_worker() -async def get_metrics(runtime, log_dir): - # Log # processes - # Log # metrics per vllm worker - circus_controller = None - pipeline = None - log_path = os.path.join(log_dir, "watcher.log.txt") - with open(log_path, "w") as log: - while True: - try: - await asyncio.sleep(0.5) - - if not circus_controller: - circus_controller = CircusController.from_state_file("dynamo") - if not pipeline: - pipeline = ( - await runtime.namespace("dynamo") - .component("VllmWorker") - .endpoint("load_metrics") - .client() - ) - - watchers = [] - for x in await circus_controller._list_watchers(): - result = circus_controller.client.call( - {"command": "list", "properties": {"name": f"{x}"}} - ) - watchers.append((x, result)) - - metrics = [] - for x in pipeline.instance_ids(): - async for worker_metric in await pipeline.direct(None, x): - metrics.append((x, worker_metric.data())) - - vllm_processes = [] - - for ps_process in psutil.process_iter(["name", "cmdline"]): - try: - if "from 
multiprocessing.spawn import spawn_main;" in " ".join( - ps_process.cmdline() - ): - vllm_processes.append(ps_process.pid) - except (psutil.NoSuchProcess, psutil.AccessDenied): - # Process may have terminated or become inaccessible during iteration - pass - - record = { - "time": datetime.now().strftime("%Y-%m-%dT%H:%M:%S"), - "watchers": watchers, - "metrics": metrics, - "vllm_processes": vllm_processes, - } - log.write(json.dumps(record) + "\n") - log.flush() - except Exception as e: - record = { - "time": datetime.now().strftime("%Y-%m-%dT%H:%M:%S"), - "watchers": [], - "metrics": [], - "vllm_processes": [], - "error": str(e), - } - log.write(json.dumps(record) + "\n") - log.flush() - - -@pytest.fixture -def worker_metrics(request): - process = Process(target=run_metrics_process, args=(request.node.name,)) - process.start() - yield - process.kill() - - -class NvidiaSMI(ManagedProcess): - def __init__(self, request): - super().__init__( - command=[ - "nvidia-smi", - "dmon", - "--select=puc", - ], - health_check_ports=[], - terminate_existing=True, - display_output=False, - data_dir=None, - log_dir=request.node.name, - ) - - -@pytest.fixture -def nvidia_smi(request): - with NvidiaSMI(request) as nvidia_smi_process: - yield nvidia_smi_process diff --git a/tests/serve/test_dynamo_serve.py b/tests/serve/test_dynamo_serve.py deleted file mode 100644 index f7b9c5947b..0000000000 --- a/tests/serve/test_dynamo_serve.py +++ /dev/null @@ -1,269 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import logging -import os -import time - -import pytest -import requests - -from tests.utils.deployment_graph import ( - DeploymentGraph, - Payload, - chat_completions_response_handler, -) -from tests.utils.managed_process import ManagedProcess - -text_prompt = "Tell me a short joke about AI." - -multimodal_payload = Payload( - payload_chat={ - "model": "llava-hf/llava-1.5-7b-hf", - "messages": [ - { - "role": "user", - "content": [ - {"type": "text", "text": "What is in this image?"}, - { - "type": "image_url", - "image_url": { - "url": "http://images.cocodataset.org/test2017/000000155781.jpg" - }, - }, - ], - } - ], - "max_tokens": 300, # Reduced from 500 - "stream": False, - }, - repeat_count=1, - expected_log=[], - expected_response=["bus"], -) - -deployment_graphs = { - "multimodal_agg": ( - DeploymentGraph( - module="graphs.agg:Frontend", - config="configs/agg-llava.yaml", - directory="/workspace/examples/multimodal", - endpoints=["v1/chat/completions"], - response_handlers=[ - chat_completions_response_handler, - ], - marks=[pytest.mark.gpu_2, pytest.mark.vllm], - ), - multimodal_payload, - ), -} - - -class DynamoServeProcess(ManagedProcess): - def __init__( - self, - graph: DeploymentGraph, - request, - port=8000, - timeout=900, - display_output=True, - args=None, - ): - command = ["dynamo", "serve", graph.module] - - if graph.config: - command.extend(["-f", os.path.join(graph.directory, graph.config)]) - - if args: - for k, v in args.items(): - command.extend([f"{k}", f"{v}"]) - - health_check_urls = [] - health_check_ports = [] - env = None - 
- # Handle multimodal deployments differently - if "multimodal" in graph.directory: - env = os.environ.copy() - env["DYNAMO_PORT"] = str(port) - else: - # Regular LLM deployments - command.extend(["--Frontend.port", str(port)]) - health_check_urls = [ - (f"http://localhost:{port}/v1/models", self._check_model) - ] - health_check_ports = [port] - - self.port = port - self.graph = graph - - super().__init__( - command=command, - timeout=timeout, - display_output=display_output, - working_dir=graph.directory, - health_check_ports=health_check_ports, - health_check_urls=health_check_urls, - delayed_start=graph.delayed_start, - stragglers=["http"], - straggler_commands=[ - "dynamo.sdk.cli.serve_dynamo", - "from multiprocessing.resource_tracker", - "from multiprocessing.spawn", - ], - log_dir=request.node.name, - env=env, - ) - - def _check_model(self, response): - try: - data = response.json() - except ValueError: - return False - if data.get("data") and len(data["data"]) > 0: - return True - return False - - def check_response( - self, payload, response, response_handler, logger=logging.getLogger() - ): - assert response.status_code == 200, "Response Error" - content = response_handler(response) - logger.info("Received Content: %s", content) - # Check for expected responses - assert content, "Empty response content" - for expected in payload.expected_response: - assert expected in content, "Expected '%s' not found in response" % expected - - def wait_for_ready(self, payload, logger=logging.getLogger()): - url = f"http://localhost:{self.port}/{self.graph.endpoints[0]}" - start_time = time.time() - retry_delay = 5 - elapsed = 0.0 - logger.info("Waiting for Deployment Ready") - json_payload = ( - payload.payload_chat - if self.graph.endpoints[0] == "v1/chat/completions" - else payload.payload_completions - ) - - while time.time() - start_time < self.graph.timeout: - elapsed = time.time() - start_time - try: - response = requests.post( - url, - json=json_payload, - 
timeout=self.graph.timeout - elapsed, - ) - except (requests.RequestException, requests.Timeout) as e: - logger.warning("Retrying due to Request failed: %s", e) - time.sleep(retry_delay) - continue - logger.info("Response%r", response) - if response.status_code == 500: - error = response.json().get("error", "") - if "no instances" in error: - logger.warning("Retrying due to no instances available") - time.sleep(retry_delay) - continue - if response.status_code == 404: - error = response.json().get("error", "") - if "Model not found" in error: - logger.warning("Retrying due to model not found") - time.sleep(retry_delay) - continue - # Process the response - if response.status_code != 200: - logger.error( - "Service returned status code %s: %s", - response.status_code, - response.text, - ) - pytest.fail( - "Service returned status code %s: %s" - % (response.status_code, response.text) - ) - else: - break - else: - logger.error( - "Service did not return a successful response within %s s", - self.graph.timeout, - ) - pytest.fail( - "Service did not return a successful response within %s s" - % self.graph.timeout - ) - - self.check_response(payload, response, self.graph.response_handlers[0], logger) - - logger.info("Deployment Ready") - - -@pytest.fixture( - params=[ - pytest.param("multimodal_agg", marks=[pytest.mark.vllm, pytest.mark.gpu_2]), - ] -) -def deployment_graph_test(request): - """ - Fixture that provides different deployment graph test configurations. - """ - return deployment_graphs[request.param] - - -@pytest.mark.e2e -@pytest.mark.slow -@pytest.mark.skip(reason="Multi-Modal currently failing CI, turning off for now.") -def test_serve_deployment(deployment_graph_test, request, runtime_services): - """ - Test dynamo serve deployments with different graph configurations. 
- """ - - # runtime_services is used to start nats and etcd - - logger = logging.getLogger(request.node.name) - logger.info("Starting test_deployment") - - deployment_graph, payload = deployment_graph_test - - with DynamoServeProcess(deployment_graph, request) as server_process: - server_process.wait_for_ready(payload, logger) - - for endpoint, response_handler in zip( - deployment_graph.endpoints, deployment_graph.response_handlers - ): - url = f"http://localhost:{server_process.port}/{endpoint}" - start_time = time.time() - elapsed = 0.0 - - request_body = ( - payload.payload_chat - if endpoint == "v1/chat/completions" - else payload.payload_completions - ) - - for _ in range(payload.repeat_count): - elapsed = time.time() - start_time - - response = requests.post( - url, - json=request_body, - timeout=deployment_graph.timeout - elapsed, - ) - server_process.check_response( - payload, response, response_handler, logger - ) diff --git a/tests/utils/deployment_graph.py b/tests/utils/deployment_graph.py index 441cbd8861..bc83aa7914 100644 --- a/tests/utils/deployment_graph.py +++ b/tests/utils/deployment_graph.py @@ -13,24 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Optional - - -@dataclass -class DeploymentGraph: - """ - Represents a deployment graph configuration for testing. - """ - - module: str - config: str - directory: str - endpoints: List[str] - response_handlers: List[Callable[[Any], str]] - timeout: int = 900 - delayed_start: int = 0 - marks: Optional[List[Any]] = field(default_factory=list) +from dataclasses import dataclass +from typing import Any, Dict, List, Optional @dataclass