Skip to content

Commit 179f993

Browse files
fix: profiler bug fixes and doc improvements (#3530)
Signed-off-by: alec-flowers <[email protected]> Signed-off-by: hongkuanz <[email protected]> Co-authored-by: hongkuanz <[email protected]>
1 parent bfbcae7 commit 179f993

File tree

5 files changed

+53
-21
lines changed

5 files changed

+53
-21
lines changed

benchmarks/profiler/utils/config.py

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -145,11 +145,6 @@ def remove_valued_arguments(args: list[str], key: str) -> list[str]:
145145
return args
146146

147147

148-
def join_arguments(args: list[str]) -> list[str]:
149-
# Use shlex.join to properly quote arguments that contain spaces or special characters
150-
return [shlex.join(args)]
151-
152-
153148
def append_argument(args: list[str], to_append) -> list[str]:
154149
idx = find_arg_index(args)
155150
if isinstance(to_append, list):
@@ -469,7 +464,7 @@ def convert_config(
469464
if "--no-enable-prefix-caching" not in args:
470465
args = append_argument(args, "--no-enable-prefix-caching")
471466

472-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
467+
worker_service.extraPodSpec.mainContainer.args = args
473468

474469
elif target == "decode":
475470
# Get service names by inferring from subComponentType first
@@ -500,7 +495,7 @@ def convert_config(
500495
if "--no-enable-prefix-caching" in args:
501496
args.remove("--no-enable-prefix-caching")
502497

503-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
498+
worker_service.extraPodSpec.mainContainer.args = args
504499

505500
# set num workers to 1
506501
# Use the inferred decode service name
@@ -537,7 +532,7 @@ def set_config_tp_size(
537532
except ValueError:
538533
args = append_argument(args, ["--tensor-parallel-size", str(tp_size)])
539534

540-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
535+
worker_service.extraPodSpec.mainContainer.args = args
541536

542537
return cfg.model_dump()
543538

@@ -695,7 +690,7 @@ def convert_config(
695690
if "--disable-radix-cache" not in args:
696691
args = append_argument(args, "--disable-radix-cache")
697692

698-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
693+
worker_service.extraPodSpec.mainContainer.args = args
699694

700695
elif target == "decode":
701696
# Get service names by inferring from subComponentType first
@@ -739,7 +734,7 @@ def convert_config(
739734
args, ["--load-balance-method", "round_robin"]
740735
)
741736

742-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
737+
worker_service.extraPodSpec.mainContainer.args = args
743738

744739
# set num workers to 1
745740
# Use the inferred decode service name
@@ -772,7 +767,7 @@ def set_config_tp_size(
772767
# Set --tp argument
773768
args = set_argument_value(args, "--tp", str(tp_size))
774769

775-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
770+
worker_service.extraPodSpec.mainContainer.args = args
776771
return cfg.model_dump()
777772

778773
@classmethod
@@ -807,7 +802,7 @@ def set_config_tep_size(
807802
if "--enable-dp-attention" in args:
808803
args.remove("--enable-dp-attention")
809804

810-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
805+
worker_service.extraPodSpec.mainContainer.args = args
811806
return cfg.model_dump()
812807

813808
@classmethod
@@ -842,7 +837,7 @@ def set_config_dep_size(
842837
# 4. Set --ep-size=dep_size (expert parallelism size)
843838
args = set_argument_value(args, "--ep-size", str(dep_size))
844839

845-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
840+
worker_service.extraPodSpec.mainContainer.args = args
846841
return cfg.model_dump()
847842

848843
@classmethod
@@ -989,7 +984,7 @@ def convert_config(
989984
override_str = json.dumps(override_dict)
990985
args = append_argument(args, ["--override-engine-args", override_str])
991986

992-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
987+
worker_service.extraPodSpec.mainContainer.args = args
993988

994989
elif target == "decode":
995990
# Get service names by inferring from subComponentType first
@@ -1037,7 +1032,7 @@ def convert_config(
10371032
override_str = json.dumps(override_dict)
10381033
args = append_argument(args, ["--override-engine-args", override_str])
10391034

1040-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
1035+
worker_service.extraPodSpec.mainContainer.args = args
10411036

10421037
# Set num workers to 1
10431038
# Use the inferred decode service name
@@ -1082,7 +1077,7 @@ def set_config_tp_size(
10821077
override_str = json.dumps(override_dict)
10831078
args = append_argument(args, ["--override-engine-args", override_str])
10841079

1085-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
1080+
worker_service.extraPodSpec.mainContainer.args = args
10861081

10871082
return cfg.model_dump()
10881083

deploy/utils/README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,26 @@ This includes:
3434

3535
After setting up Dynamo Cloud, use this script to prepare your namespace with the additional resources needed for benchmarking and profiling workflows:
3636

37+
The setup script creates a `dynamo-pvc` with `ReadWriteMany` (RWX). If your cluster's default `storageClassName` does not support RWX, set `storageClassName` in `deploy/utils/manifests/pvc.yaml` to an RWX-capable class before running the script.
38+
39+
Example (add under `spec` in `deploy/utils/manifests/pvc.yaml`):
40+
```yaml
41+
...
42+
spec:
43+
accessModes:
44+
- ReadWriteMany
45+
storageClassName: <your-rwx-storageclass>
46+
...
47+
```
48+
49+
> [!TIP]
50+
> **Check your cluster's storage classes**
51+
>
52+
> - List storage classes and provisioners:
53+
> ```bash
54+
> kubectl get sc -o wide
55+
> ```
56+
3757
```bash
3858
export NAMESPACE=your-dynamo-namespace
3959
export HF_TOKEN=<HF_TOKEN> # Optional: for HuggingFace model access

docs/benchmarks/pre_deployment_profiling.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,14 @@ If you see `ErrImagePull` or `ImagePullBackOff` errors with 401 unauthorized mes
224224

225225
3. The service account should show `imagePullSecrets` containing `nvcr-imagepullsecret`.
226226

227+
If it doesn't, create the secret:
228+
229+
```bash
230+
export NGC_API_KEY=<your-ngc-api-key-here>
231+
kubectl create secret docker-registry nvcr-imagepullsecret --docker-server=nvcr.io --docker-username='$oauthtoken' --docker-password=$NGC_API_KEY
232+
233+
```
234+
227235

228236
## Running the Profiling Script with AI Configurator
229237

docs/kubernetes/installation_guide.md

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -144,16 +144,24 @@ kubectl create secret docker-registry docker-imagepullsecret \
144144
--docker-password=${DOCKER_PASSWORD} \
145145
--namespace=${NAMESPACE}
146146

147+
cd deploy/cloud/helm
148+
147149
# 4. Install CRDs
148150
helm upgrade --install dynamo-crds ./crds/ --namespace default
149151

150152
# 5. Install Platform
151153
helm dep build ./platform/
154+
155+
# To install cluster-wide instead, set NS_RESTRICT_FLAGS="" (empty) or omit that line entirely.
156+
157+
NS_RESTRICT_FLAGS="--set dynamo-operator.namespaceRestriction.enabled=true"
152158
helm install dynamo-platform ./platform/ \
153-
--namespace ${NAMESPACE} \
159+
--namespace "${NAMESPACE}" \
154160
--set "dynamo-operator.controllerManager.manager.image.repository=${DOCKER_SERVER}/dynamo-operator" \
155161
--set "dynamo-operator.controllerManager.manager.image.tag=${IMAGE_TAG}" \
156-
--set "dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret"
162+
--set "dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret" \
163+
${NS_RESTRICT_FLAGS}
164+
157165
```
158166

159167
[Verify Installation](#verify-installation)
@@ -166,7 +174,7 @@ kubectl get crd | grep dynamo
166174

167175
# Check operator and platform pods
168176
kubectl get pods -n ${NAMESPACE}
169-
# Expected: dynamo-operator-* and etcd-* pods Running
177+
# Expected: dynamo-operator-*, etcd-*, and nats-* pods Running
170178
```
171179

172180
## Next Steps

docs/kubernetes/sla_planner_quickstart.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ flowchart TD
3939
Before deploying the SLA planner, ensure:
4040
- **Dynamo platform installed** (see [Installation Guide](/docs/kubernetes/installation_guide.md))
4141
- **[kube-prometheus-stack](/docs/kubernetes/metrics.md) installed and running.** By default, the Prometheus server is expected to be deployed in the `monitoring` namespace. If it is deployed to a different namespace, set `dynamo-operator.dynamo.metrics.prometheusEndpoint="http://prometheus-kube-prometheus-prometheus.<namespace>.svc.cluster.local:9090"`.
42-
- **Benchmarking resources setup** (see [Kubernetes utilities for Dynamo Benchmarking and Profiling](../../deploy/utils/README.md)) The script will create a `dynamo-pvc` with `ReadWriteMany` access, if your cluster's default storageClassName does not allow `ReadWriteMany`, you need to specify a different storageClassName in `pvc.yaml`.
42+
- **Benchmarking resources setup** (see [Kubernetes utilities for Dynamo Benchmarking and Profiling](../../deploy/utils/README.md)). The script creates a `dynamo-pvc` with `ReadWriteMany` access. If your cluster's default `storageClassName` does not support `ReadWriteMany`, specify a `storageClassName` that does in `deploy/utils/manifests/pvc.yaml`.
43+
4344

4445
## Pre-Deployment Profiling
4546

@@ -260,4 +261,4 @@ This is because the `subComponentType` field has only been added in newer versio
260261
---
261262

262263
> [!TIP]
263-
> **Need Help?** If you encounter issues, check the [troubleshooting section](#troubleshooting) or refer to the detailed guides linked in [Next Steps](#next-steps).
264+
> **Need Help?** If you encounter issues, check the [troubleshooting section](#troubleshooting) or refer to the detailed guides linked in [Next Steps](#next-steps).

0 commit comments

Comments
 (0)