Skip to content

Commit 604abc9

Browse files
author
Dan Gil
committed
Merge main into feature/dependency-extraction-DYN-1235
2 parents f1dc404 + 0a2a820 commit 604abc9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

63 files changed

+328
-482
lines changed

README.md

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ High-throughput, low-latency inference framework designed for serving generative
3030

3131
## Latest News
3232

33-
- [08/05] Deploy `openai/gpt-oss-120b` with disaggregated serving on NVIDIA Blackwell GPUs using Dynamo [➡️ link](./components/backends/trtllm/gpt-oss.md)
33+
- [08/05] Deploy `openai/gpt-oss-120b` with disaggregated serving on NVIDIA Blackwell GPUs using Dynamo [➡️ link](./docs/backends/trtllm/gpt-oss.md)
3434

3535
## The Era of Multi-GPU, Multi-Node
3636

@@ -65,9 +65,9 @@ Dynamo is designed to be inference engine agnostic (supports TRT-LLM, vLLM, SGLa
6565

6666
To learn more about each framework and their capabilities, check out each framework's README!
6767

68-
- **[vLLM](components/backends/vllm/README.md)**
69-
- **[SGLang](components/backends/sglang/README.md)**
70-
- **[TensorRT-LLM](components/backends/trtllm/README.md)**
68+
- **[vLLM](docs/backends/vllm/README.md)**
69+
- **[SGLang](docs/backends/sglang/README.md)**
70+
- **[TensorRT-LLM](docs/backends/trtllm/README.md)**
7171

7272
Built in Rust for performance and in Python for extensibility, Dynamo is fully open-source and driven by a transparent, OSS (Open Source Software) first development approach.
7373

benchmarks/profiler/utils/config.py

Lines changed: 11 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -145,11 +145,6 @@ def remove_valued_arguments(args: list[str], key: str) -> list[str]:
145145
return args
146146

147147

148-
def join_arguments(args: list[str]) -> list[str]:
149-
# Use shlex.join to properly quote arguments that contain spaces or special characters
150-
return [shlex.join(args)]
151-
152-
153148
def append_argument(args: list[str], to_append) -> list[str]:
154149
idx = find_arg_index(args)
155150
if isinstance(to_append, list):
@@ -469,7 +464,7 @@ def convert_config(
469464
if "--no-enable-prefix-caching" not in args:
470465
args = append_argument(args, "--no-enable-prefix-caching")
471466

472-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
467+
worker_service.extraPodSpec.mainContainer.args = args
473468

474469
elif target == "decode":
475470
# Get service names by inferring from subComponentType first
@@ -500,7 +495,7 @@ def convert_config(
500495
if "--no-enable-prefix-caching" in args:
501496
args.remove("--no-enable-prefix-caching")
502497

503-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
498+
worker_service.extraPodSpec.mainContainer.args = args
504499

505500
# set num workers to 1
506501
# Use the inferred decode service name
@@ -537,7 +532,7 @@ def set_config_tp_size(
537532
except ValueError:
538533
args = append_argument(args, ["--tensor-parallel-size", str(tp_size)])
539534

540-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
535+
worker_service.extraPodSpec.mainContainer.args = args
541536

542537
return cfg.model_dump()
543538

@@ -695,7 +690,7 @@ def convert_config(
695690
if "--disable-radix-cache" not in args:
696691
args = append_argument(args, "--disable-radix-cache")
697692

698-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
693+
worker_service.extraPodSpec.mainContainer.args = args
699694

700695
elif target == "decode":
701696
# Get service names by inferring from subComponentType first
@@ -739,7 +734,7 @@ def convert_config(
739734
args, ["--load-balance-method", "round_robin"]
740735
)
741736

742-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
737+
worker_service.extraPodSpec.mainContainer.args = args
743738

744739
# set num workers to 1
745740
# Use the inferred decode service name
@@ -772,7 +767,7 @@ def set_config_tp_size(
772767
# Set --tp argument
773768
args = set_argument_value(args, "--tp", str(tp_size))
774769

775-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
770+
worker_service.extraPodSpec.mainContainer.args = args
776771
return cfg.model_dump()
777772

778773
@classmethod
@@ -807,7 +802,7 @@ def set_config_tep_size(
807802
if "--enable-dp-attention" in args:
808803
args.remove("--enable-dp-attention")
809804

810-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
805+
worker_service.extraPodSpec.mainContainer.args = args
811806
return cfg.model_dump()
812807

813808
@classmethod
@@ -842,7 +837,7 @@ def set_config_dep_size(
842837
# 4. Set --ep-size=dep_size (expert parallelism size)
843838
args = set_argument_value(args, "--ep-size", str(dep_size))
844839

845-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
840+
worker_service.extraPodSpec.mainContainer.args = args
846841
return cfg.model_dump()
847842

848843
@classmethod
@@ -989,7 +984,7 @@ def convert_config(
989984
override_str = json.dumps(override_dict)
990985
args = append_argument(args, ["--override-engine-args", override_str])
991986

992-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
987+
worker_service.extraPodSpec.mainContainer.args = args
993988

994989
elif target == "decode":
995990
# Get service names by inferring from subComponentType first
@@ -1037,7 +1032,7 @@ def convert_config(
10371032
override_str = json.dumps(override_dict)
10381033
args = append_argument(args, ["--override-engine-args", override_str])
10391034

1040-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
1035+
worker_service.extraPodSpec.mainContainer.args = args
10411036

10421037
# Set num workers to 1
10431038
# Use the inferred decode service name
@@ -1082,7 +1077,7 @@ def set_config_tp_size(
10821077
override_str = json.dumps(override_dict)
10831078
args = append_argument(args, ["--override-engine-args", override_str])
10841079

1085-
worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
1080+
worker_service.extraPodSpec.mainContainer.args = args
10861081

10871082
return cfg.model_dump()
10881083

components/README.md

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -23,9 +23,9 @@ This directory contains the core components that make up the Dynamo inference fr
2323

2424
Dynamo supports multiple inference engines (with a focus on SGLang, vLLM, and TensorRT-LLM), each with their own deployment configurations and capabilities:
2525

26-
- **[vLLM](backends/vllm/README.md)** - High-performance LLM inference with native KV cache events and NIXL-based transfer mechanisms
27-
- **[SGLang](backends/sglang/README.md)** - Structured generation language framework with ZMQ-based communication
28-
- **[TensorRT-LLM](backends/trtllm/README.md)** - NVIDIA's optimized LLM inference engine with TensorRT acceleration
26+
- **[vLLM](/docs/backends/vllm/README.md)** - High-performance LLM inference with native KV cache events and NIXL-based transfer mechanisms
27+
- **[SGLang](/docs/backends/sglang/README.md)** - Structured generation language framework with ZMQ-based communication
28+
- **[TensorRT-LLM](/docs/backends/trtllm/README.md)** - NVIDIA's optimized LLM inference engine with TensorRT acceleration
2929

3030
Each engine provides launch scripts for different deployment patterns in their respective `/launch` & `/deploy` directories.
3131

components/backends/sglang/benchmarks/bench.sh

Lines changed: 0 additions & 175 deletions
This file was deleted.

0 commit comments

Comments (0)