Skip to content

Commit 52c7536

Browse files
tzulingk and nnshah1
authored
test: fault injection tests for k8s (#3194)
Signed-off-by: nnshah1 <[email protected]> Signed-off-by: [email protected] <[email protected]> Co-authored-by: nnshah1 <[email protected]>
1 parent 116b9b4 commit 52c7536

File tree

8 files changed

+2331
-0
lines changed

8 files changed

+2331
-0
lines changed

tests/fault_tolerance/deploy/README.md

Lines changed: 528 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
import json
17+
import logging
18+
import os
19+
import random
20+
import time
21+
from copy import deepcopy
22+
from datetime import datetime
23+
from typing import Any, Dict
24+
25+
import requests
26+
27+
from tests.utils.managed_deployment import ManagedDeployment
28+
29+
# Record layout handed to logging.basicConfig; every line is tagged "[TEST]".
LOG_FORMAT = "[TEST] %(asctime)s %(levelname)s %(name)s: %(message)s"
# strftime pattern for %(asctime)s: second resolution, no timezone suffix.
DATE_FORMAT = "%Y-%m-%dT%H:%M:%S"


# Template for the chat-completion request body.  It is never sent as-is:
# _single_request deep-copies it and fills in "model", the user message
# content, "max_tokens" and "min_tokens" for each request.
# NOTE: a "seed" entry (value 10) is currently left out of the template.
payload = {
    "model": "",
    "messages": [{"role": "user", "content": ""}],
    "max_tokens": 0,
    "temperature": 0.1,
    "ignore_eos": True,
    "min_tokens": 0,
    "stream": False,
}
48+
49+
50+
# Configure the root logger once, at import time.
# NOTE: DATE_FORMAT is an ISO 8601-style pattern without a zone suffix, and
# the logging module renders %(asctime)s in local time unless
# logging.Formatter.converter is overridden -- so these timestamps are not
# guaranteed to be UTC.
logging.basicConfig(
    format=LOG_FORMAT,
    datefmt=DATE_FORMAT,
    level=logging.INFO,
)
56+
57+
58+
def _get_random_prompt(length):
    """Return ``length`` random decimal digits joined by single spaces.

    Every "word" of the prompt is one character from ``"0"``..``"9"``,
    sampled with replacement via ``random.choices``.  ``length == 0``
    yields the empty string.
    """
    digits = [str(n) for n in range(10)]
    sampled = random.choices(digits, k=length)
    return " ".join(sampled)
61+
62+
63+
def _single_request(
    url,
    pod,
    payload,
    model,
    logger,
    retry_attempts=1,
    input_token_length=100,
    output_token_length=100,
    timeout=30,
    retry_delay=1,
):
    """POST one chat-completion request, retrying until it returns HTTP 200.

    Args:
        url: Full URL the request body is POSTed to.
        pod: Name of the pod behind ``url``; only copied into the result.
        payload: Request template.  It is deep-copied, so the caller's
            dict is never modified.
        model: Value written to the ``"model"`` field of the request.
        logger: Accepted for interface compatibility; not used here.
        retry_attempts: Maximum number of attempts (not *extra* retries).
            ``0``, ``None`` or a negative value performs no request at all.
        input_token_length: Number of random words in the prompt.
        output_token_length: Written to both ``max_tokens`` and
            ``min_tokens`` of the request.
        timeout: Per-attempt timeout in seconds, passed to ``requests.post``.
        retry_delay: Seconds slept after every attempt that did not return
            HTTP 200 (including the last one).

    Returns:
        A dict with keys ``"time"`` (local timestamp string), ``"results"``
        (one entry per attempt, in order), ``"total_time"`` (seconds spent
        in this call, sleeps included), ``"url"`` and ``"pod"``.  Each
        ``"results"`` entry holds ``"status"`` (the integer HTTP status, or
        the exception text when the request itself failed), ``"result"``
        (decoded JSON body, or ``None``), ``"request_elapsed_time"``,
        ``"url"`` and ``"pod"``.
    """
    prompt = _get_random_prompt(input_token_length)
    request_body = deepcopy(payload)
    request_body["messages"][0]["content"] = prompt
    request_body["max_tokens"] = output_token_length
    request_body["min_tokens"] = output_token_length
    request_body["model"] = model

    results = []
    start_time = time.time()

    # A plain truthiness test on the counter would spin forever for a
    # negative budget when requests keep failing, so compare against zero.
    remaining_attempts = retry_attempts or 0

    while remaining_attempts > 0:
        remaining_attempts -= 1
        start_request_time = time.time()
        try:
            response = requests.post(
                url,
                json=request_body,
                timeout=timeout,
            )
            request_elapsed_time = time.time() - start_request_time

            # A non-JSON body (e.g. a plain-text error page) is recorded as
            # None rather than treated as a transport failure.
            try:
                content = response.json()
            except ValueError:
                content = None
            status = response.status_code
        except requests.RequestException as e:
            # requests.Timeout is a RequestException subclass, so this one
            # clause covers timeouts and connection errors alike.
            status = str(e)
            content = None
            request_elapsed_time = time.time() - start_request_time

        results.append(
            {
                "status": status,
                "result": content,
                "request_elapsed_time": request_elapsed_time,
                "url": url,
                "pod": pod,
            }
        )

        if status == 200:
            break
        time.sleep(retry_delay)

    return {
        "time": datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
        "results": results,
        "total_time": time.time() - start_time,
        "url": url,
        "pod": pod,
    }
142+
143+
144+
def client(
    deployment_spec,
    namespace,
    model,
    log_dir,
    index,
    requests_per_client,
    input_token_length,
    output_token_length,
    max_retries,
    max_request_rate,
    retry_delay=1,
):
    """Drive ``requests_per_client`` requests against the frontend pods.

    Before every request the frontend pods are listed again; ready pods are
    used in round-robin order (by request number) through a local
    port-forward that is created on first use and cached per pod name.
    Port-forwards of pods that report not-ready are stopped and dropped.
    When no ready pod (or no port-forward) is available the request is still
    issued against local port 0 so that the failure is recorded.

    One JSON document per request is appended to
    ``<log_dir>/client_<index>.log.txt``.

    Args:
        deployment_spec: Deployment description; ``port`` and ``endpoint``
            are read here and the object is passed to ``ManagedDeployment``.
        namespace: Namespace passed to ``ManagedDeployment``.
        model: Model name placed in every request.
        log_dir: Directory for the per-client log file (created if missing).
        index: Client number; used in the logger name and log file name.
        requests_per_client: Number of requests to issue.
        input_token_length: Prompt length passed to ``_single_request``.
        output_token_length: Output length passed to ``_single_request``.
        max_retries: Passed to ``_single_request`` as its attempt budget.
        max_request_rate: Upper bound on requests per second; ``<= 0``
            disables rate limiting.
        retry_delay: Seconds to wait between attempts of one request.

    Any exception is logged (with traceback) and ends the client early
    instead of propagating to the caller.
    """
    logger = logging.getLogger(f"CLIENT: {index}")
    logging.getLogger("httpx").setLevel(logging.WARNING)

    managed_deployment = ManagedDeployment(log_dir, deployment_spec, namespace)
    pod_ports: Dict[str, Any] = {}

    min_elapsed_time = (1 / max_request_rate) if max_request_rate > 0 else 0.0

    try:
        os.makedirs(log_dir, exist_ok=True)
        log_path = os.path.join(log_dir, f"client_{index}.log.txt")
        with open(log_path, "w") as log:
            for i in range(requests_per_client):
                pods = managed_deployment.get_pods(
                    managed_deployment.frontend_service_name
                )
                port = 0
                pod_name = None

                pods_ready = []

                for pod in pods[managed_deployment.frontend_service_name]:
                    if pod.ready():
                        pods_ready.append(pod)
                    elif pod.name in pod_ports:
                        # The pod went away or is restarting; its forward is
                        # no longer usable.
                        pod_ports[pod.name].stop()
                        del pod_ports[pod.name]

                if pods_ready:
                    pod = pods_ready[i % len(pods_ready)]
                    if pod.name not in pod_ports:
                        port_forward = managed_deployment.port_forward(
                            pod, deployment_spec.port
                        )
                        if port_forward:
                            pod_ports[pod.name] = port_forward
                    if pod.name in pod_ports:
                        port = pod_ports[pod.name].local_port
                        pod_name = pod.name

                url = f"http://localhost:{port}/{deployment_spec.endpoint}"

                result = _single_request(
                    url,
                    pod_name,
                    payload,
                    model,
                    logger,
                    max_retries,
                    input_token_length=input_token_length,
                    output_token_length=output_token_length,
                    retry_delay=retry_delay,
                )

                # With an attempt budget of zero there is no attempt to
                # report; indexing [-1] would abort the whole client.
                attempts = result["results"]
                last_status = attempts[-1]["status"] if attempts else None
                last_latency = (
                    attempts[-1]["request_elapsed_time"] if attempts else None
                )
                logger.info(
                    f"Request: {i} Pod {pod_name} Local Port {port} Status: {last_status} Latency: {last_latency}"
                )

                log.write(json.dumps(result) + "\n")
                log.flush()

                # Pad short requests so the configured rate is not exceeded.
                if result["total_time"] < min_elapsed_time:
                    time.sleep(min_elapsed_time - result["total_time"])

    except Exception as e:
        # Same message as before, plus the traceback for diagnosis.
        logger.error(str(e), exc_info=True)
    finally:
        # Release every port-forward still open so none outlives the client.
        for forwarded_pod, port_forward in list(pod_ports.items()):
            try:
                port_forward.stop()
            except Exception:
                logger.warning(
                    "Failed to stop port-forward for pod %s",
                    forwarded_pod,
                    exc_info=True,
                )
        pod_ports.clear()
    logger.info("Exiting")
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
import pytest
17+
18+
19+
def pytest_addoption(parser):
    """Register the ``--image`` and ``--namespace`` command-line options.

    Both options are strings; ``--image`` defaults to ``None`` and
    ``--namespace`` defaults to ``"fault-tolerance-test"``.
    """
    string_options = (
        ("--image", None),
        ("--namespace", "fault-tolerance-test"),
    )
    for flag, default_value in string_options:
        parser.addoption(flag, type=str, default=default_value)
22+
23+
24+
@pytest.fixture
def image(request):
    """Provide the value given for the ``--image`` command-line option."""
    option_value = request.config.getoption("--image")
    return option_value
27+
28+
29+
@pytest.fixture
def namespace(request):
    """Provide the value given for the ``--namespace`` command-line option."""
    option_value = request.config.getoption("--namespace")
    return option_value

0 commit comments

Comments
 (0)