Skip to content

Commit 604abc9

Browse files
author
Dan Gil
committed
Merge main into feature/dependency-extraction-DYN-1235
2 parents f1dc404 + 0a2a820 commit 604abc9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

63 files changed

+328
-482
lines changed

README.md

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ High-throughput, low-latency inference framework designed for serving generative
3030

3131
## Latest News
3232

33-
- [08/05] Deploy `openai/gpt-oss-120b` with disaggregated serving on NVIDIA Blackwell GPUs using Dynamo [➡️ link](./components/backends/trtllm/gpt-oss.md)
33+
- [08/05] Deploy `openai/gpt-oss-120b` with disaggregated serving on NVIDIA Blackwell GPUs using Dynamo [➡️ link](./docs/backends/trtllm/gpt-oss.md)
3434

3535
## The Era of Multi-GPU, Multi-Node
3636

@@ -65,9 +65,9 @@ Dynamo is designed to be inference engine agnostic (supports TRT-LLM, vLLM, SGLa
6565

6666
To learn more about each framework and their capabilities, check out each framework's README!
6767

68-
- **[vLLM](components/backends/vllm/README.md)**
69-
- **[SGLang](components/backends/sglang/README.md)**
70-
- **[TensorRT-LLM](components/backends/trtllm/README.md)**
68+
- **[vLLM](docs/backends/vllm/README.md)**
69+
- **[SGLang](docs/backends/sglang/README.md)**
70+
- **[TensorRT-LLM](docs/backends/trtllm/README.md)**
7171

7272
Built in Rust for performance and in Python for extensibility, Dynamo is fully open-source and driven by a transparent, OSS (Open Source Software) first development approach.
7373

benchmarks/profiler/utils/config.py

Lines changed: 11 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -145,11 +145,6 @@ def remove_valued_arguments(args: list[str], key: str) -> list[str]:
145145
return args
146146

147147

148-
def join_arguments(args: list[str]) -> list[str]:
149-
# Use shlex.join to properly quote arguments that contain spaces or special characters
150-
return [shlex.join(args)]
151-
152-
153148
def append_argument(args: list[str], to_append) -> list[str]:
154149
idx = find_arg_index(args)
155150
if isinstance(to_append, list):
@@ -469,7 +464,7 @@ def convert_config(
469464
if "--no-enable-prefix-caching" not in args:
470465
args = append_argument(args, "--no-enable-prefix-caching")
471466

472-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
467+
worker_service.extraPodSpec.mainContainer.args = args
473468

474469
elif target == "decode":
475470
# Get service names by inferring from subComponentType first
@@ -500,7 +495,7 @@ def convert_config(
500495
if "--no-enable-prefix-caching" in args:
501496
args.remove("--no-enable-prefix-caching")
502497

503-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
498+
worker_service.extraPodSpec.mainContainer.args = args
504499

505500
# set num workers to 1
506501
# Use the inferred decode service name
@@ -537,7 +532,7 @@ def set_config_tp_size(
537532
except ValueError:
538533
args = append_argument(args, ["--tensor-parallel-size", str(tp_size)])
539534

540-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
535+
worker_service.extraPodSpec.mainContainer.args = args
541536

542537
return cfg.model_dump()
543538

@@ -695,7 +690,7 @@ def convert_config(
695690
if "--disable-radix-cache" not in args:
696691
args = append_argument(args, "--disable-radix-cache")
697692

698-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
693+
worker_service.extraPodSpec.mainContainer.args = args
699694

700695
elif target == "decode":
701696
# Get service names by inferring from subComponentType first
@@ -739,7 +734,7 @@ def convert_config(
739734
args, ["--load-balance-method", "round_robin"]
740735
)
741736

742-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
737+
worker_service.extraPodSpec.mainContainer.args = args
743738

744739
# set num workers to 1
745740
# Use the inferred decode service name
@@ -772,7 +767,7 @@ def set_config_tp_size(
772767
# Set --tp argument
773768
args = set_argument_value(args, "--tp", str(tp_size))
774769

775-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
770+
worker_service.extraPodSpec.mainContainer.args = args
776771
return cfg.model_dump()
777772

778773
@classmethod
@@ -807,7 +802,7 @@ def set_config_tep_size(
807802
if "--enable-dp-attention" in args:
808803
args.remove("--enable-dp-attention")
809804

810-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
805+
worker_service.extraPodSpec.mainContainer.args = args
811806
return cfg.model_dump()
812807

813808
@classmethod
@@ -842,7 +837,7 @@ def set_config_dep_size(
842837
# 4. Set --ep-size=dep_size (expert parallelism size)
843838
args = set_argument_value(args, "--ep-size", str(dep_size))
844839

845-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
840+
worker_service.extraPodSpec.mainContainer.args = args
846841
return cfg.model_dump()
847842

848843
@classmethod
@@ -989,7 +984,7 @@ def convert_config(
989984
override_str = json.dumps(override_dict)
990985
args = append_argument(args, ["--override-engine-args", override_str])
991986

992-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
987+
worker_service.extraPodSpec.mainContainer.args = args
993988

994989
elif target == "decode":
995990
# Get service names by inferring from subComponentType first
@@ -1037,7 +1032,7 @@ def convert_config(
10371032
override_str = json.dumps(override_dict)
10381033
args = append_argument(args, ["--override-engine-args", override_str])
10391034

1040-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
1035+
worker_service.extraPodSpec.mainContainer.args = args
10411036

10421037
# Set num workers to 1
10431038
# Use the inferred decode service name
@@ -1082,7 +1077,7 @@ def set_config_tp_size(
10821077
override_str = json.dumps(override_dict)
10831078
args = append_argument(args, ["--override-engine-args", override_str])
10841079

1085-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
1080+
worker_service.extraPodSpec.mainContainer.args = args
10861081

10871082
return cfg.model_dump()
10881083

components/README.md

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -23,9 +23,9 @@ This directory contains the core components that make up the Dynamo inference fr
2323

2424
Dynamo supports multiple inference engines (with a focus on SGLang, vLLM, and TensorRT-LLM), each with their own deployment configurations and capabilities:
2525

26-
- **[vLLM](backends/vllm/README.md)** - High-performance LLM inference with native KV cache events and NIXL-based transfer mechanisms
27-
- **[SGLang](backends/sglang/README.md)** - Structured generation language framework with ZMQ-based communication
28-
- **[TensorRT-LLM](backends/trtllm/README.md)** - NVIDIA's optimized LLM inference engine with TensorRT acceleration
26+
- **[vLLM](/docs/backends/vllm/README.md)** - High-performance LLM inference with native KV cache events and NIXL-based transfer mechanisms
27+
- **[SGLang](/docs/backends/sglang/README.md)** - Structured generation language framework with ZMQ-based communication
28+
- **[TensorRT-LLM](/docs/backends/trtllm/README.md)** - NVIDIA's optimized LLM inference engine with TensorRT acceleration
2929

3030
Each engine provides launch scripts for different deployment patterns in their respective `/launch` & `/deploy` directories.
3131

components/backends/sglang/benchmarks/bench.sh

Lines changed: 0 additions & 175 deletions
This file was deleted.

0 commit comments

Comments (0)