ai-dynamo · dagil-nvidia · Oct 14, 2025 · Oct 14, 2025 · Oct 14, 2025 · Oct 14, 2025
diff --git a/DGD_ARCHITECTURE_ANALYSIS.md b/DGD_ARCHITECTURE_ANALYSIS.md
diff --git a/Earthfile b/Earthfile
@@ -134,7 +134,7 @@ dynamo-build:
 
 dynamo-base-docker:
     ARG IMAGE=dynamo-base-docker
-    ARG DOCKER_SERVER=my-registry
+    ARG DOCKER_SERVER=nvcr.io/nvidia/ai-dynamo
     ARG IMAGE_TAG=latest
 
     FROM ubuntu:24.04
@@ -175,7 +175,7 @@ all-test:
     BUILD ./deploy/cloud/operator+test
 
 all-docker:
-    ARG DOCKER_SERVER=my-registry
+    ARG DOCKER_SERVER=nvcr.io/nvidia/ai-dynamo
     ARG IMAGE_TAG=latest
     BUILD ./deploy/cloud/operator+docker --DOCKER_SERVER=$DOCKER_SERVER --IMAGE_TAG=$IMAGE_TAG
 
@@ -189,6 +189,6 @@ all:
 
 # For testing
 custom:
-    ARG DOCKER_SERVER=my-registry
+    ARG DOCKER_SERVER=nvcr.io/nvidia/ai-dynamo
     ARG IMAGE_TAG=latest
     BUILD +all-test
diff --git a/README.md b/README.md
@@ -59,9 +59,9 @@ Dynamo is designed to be inference engine agnostic (supports TRT-LLM, vLLM, SGLa
 | [**Disaggregated Serving**](/docs/architecture/disagg_serving.md)                                 | ✅   | ✅     | ✅           |
 | [**Conditional Disaggregation**](/docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧   | 🚧     | 🚧           |
 | [**KV-Aware Routing**](/docs/architecture/kv_cache_routing.md)                                    | ✅   | ✅     | ✅           |
-| [**Load Based Planner**](/docs/architecture/load_planner.md)                                      | 🚧   | 🚧     | 🚧           |
-| [**SLA-Based Planner**](/docs/architecture/sla_planner.md)                                        | ✅   | ✅     | ✅           |
-| [**KVBM**](/docs/architecture/kvbm_architecture.md)                                               | ✅   | 🚧     | ✅           |
+| [**Load Based Planner**](docs/planner/load_planner.md)                                      | 🚧   | 🚧     | 🚧           |
+| [**SLA-Based Planner**](docs/planner/sla_planner.md)                                        | ✅   | ✅     | ✅           |
+| [**KVBM**](docs/kvbm/kvbm_architecture.md)                                               | ✅   | 🚧     | ✅           |
 
 To learn more about each framework and their capabilities, check out each framework's README!
 
@@ -74,7 +74,7 @@ Built in Rust for performance and in Python for extensibility, Dynamo is fully o
 # Installation
 
 The following examples require a few system level packages.
-Recommended to use Ubuntu 24.04 with a x86_64 CPU. See [docs/support_matrix.md](docs/support_matrix.md)
+Recommended to use Ubuntu 24.04 with an x86_64 CPU. See [docs/reference/support-matrix.md](docs/reference/support-matrix.md)
 
 ## 1. Initial setup
 

diff --git a/benchmarks/incluster/benchmark_job.yaml b/benchmarks/incluster/benchmark_job.yaml
@@ -18,7 +18,7 @@ spec:
       containers:
       - name: benchmark-runner
         # TODO: update to latest public image in next release
-        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0
         securityContext:
           allowPrivilegeEscalation: false
           capabilities:

@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import argparse
 import asyncio
 import logging
 import math
@@ -22,9 +23,13 @@
 import yaml
 
 from benchmarks.profiler.utils.aiperf import benchmark_decode, benchmark_prefill
-from benchmarks.profiler.utils.config import generate_dgd_config_with_planner
-from benchmarks.profiler.utils.config_modifiers import CONFIG_MODIFIERS
+from benchmarks.profiler.utils.config import (
+    CONFIG_MODIFIERS,
+    WORKER_COMPONENT_NAMES,
+    generate_dgd_config_with_planner,
+)
 from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
+from benchmarks.profiler.utils.planner_utils import add_planner_arguments_to_parser
 from benchmarks.profiler.utils.plot import (
     plot_decode_performance,
     plot_prefill_performance,
@@ -44,12 +49,10 @@
     profile_prefill,
     profile_prefill_aiconfigurator,
 )
-from benchmarks.profiler.utils.profiler_argparse import create_profiler_parser
 from deploy.utils.dynamo_deployment import (
     DynamoDeploymentClient,
     cleanup_remaining_deployments,
 )
-from dynamo.planner.defaults import WORKER_COMPONENT_NAMES
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -740,9 +743,166 @@ async def run_profile(args):
         await cleanup_remaining_deployments(deployment_clients, args.namespace)
         logger.info("Final cleanup completed.")
 
+    # deploy the optimized DGD with planner
+    if args.deploy_after_profile and not args.dry_run:
+        logger.info("Deploying the optimized DGD with planner...")
+        # TODO: check conflicts for dynamo namespace and DGD name
+        # TODO: handle deployment errors and propagate proper error messages to users
+        client = DynamoDeploymentClient(
+            namespace=args.namespace,
+            base_log_dir=f"{args.output_dir}/final_deployment",
+            model_name=model_name,
+            service_name=args.service_name,
+            frontend_port=frontend_port,
+            deployment_name=config["metadata"]["name"],
+        )
+        await client.create_deployment(f"{args.output_dir}/config_with_planner.yaml")
+
 
 if __name__ == "__main__":
-    args = create_profiler_parser()
+    parser = argparse.ArgumentParser(
+        description="Profile the TTFT and ITL of the Prefill and Decode engine with different parallelization mapping. When profiling prefill we mock/fix decode,when profiling decode we mock/fix prefill."
+    )
+    parser.add_argument(
+        "--namespace",
+        type=str,
+        default="dynamo-sla-profiler",
+        help="Kubernetes namespace to deploy the DynamoGraphDeployment",
+    )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="vllm",
+        choices=["vllm", "sglang", "trtllm"],
+        help="backend type, currently support [vllm, sglang, trtllm]",
+    )
+    parser.add_argument(
+        "--config",
+        type=str,
+        required=True,
+        help="Path to the DynamoGraphDeployment config file",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="profiling_results",
+        help="Path to the output results directory",
+    )
+    parser.add_argument(
+        "--min-num-gpus-per-engine",
+        type=int,
+        default=1,
+        help="minimum number of GPUs per engine",
+    )
+    parser.add_argument(
+        "--max-num-gpus-per-engine",
+        type=int,
+        default=8,
+        help="maximum number of GPUs per engine",
+    )
+    parser.add_argument(
+        "--skip-existing-results",
+        action="store_true",
+        help="Skip TP sizes that already have results in the output directory",
+    )
+    parser.add_argument(
+        "--force-rerun",
+        action="store_true",
+        help="Force re-running all tests even if results already exist (overrides --skip-existing-results)",
+    )
+    parser.add_argument(
+        "--isl", type=int, default=3000, help="target input sequence length"
+    )
+    parser.add_argument(
+        "--osl", type=int, default=500, help="target output sequence length"
+    )
+    parser.add_argument(
+        "--ttft", type=int, default=50, help="target Time To First Token in ms"
+    )
+    parser.add_argument(
+        "--itl", type=int, default=10, help="target Inter Token Latency in ms"
+    )
+
+    # arguments used for interpolating TTFT and ITL under different ISL/OSL
+    parser.add_argument(
+        "--max-context-length",
+        type=int,
+        default=16384,
+        help="maximum context length supported by the served model",
+    )
+    parser.add_argument(
+        "--prefill-interpolation-granularity",
+        type=int,
+        default=16,
+        help="how many samples to benchmark to interpolate TTFT under different ISL",
+    )
+    parser.add_argument(
+        "--decode-interpolation-granularity",
+        type=int,
+        default=6,
+        help="how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length",
+    )
+    parser.add_argument(
+        "--service-name",
+        type=str,
+        default="",
+        help="Service name for port forwarding (default: {deployment_name}-frontend)",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Dry run the profile job",
+    )
+    parser.add_argument(
+        "--is-moe-model",
+        action="store_true",
+        dest="is_moe_model",
+        help="Enable MoE (Mixture of Experts) model support, use TEP for prefill and DEP for decode",
+    )
+    parser.add_argument(
+        "--num-gpus-per-node",
+        type=int,
+        default=8,
+        help="Number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size",
+    )
+
+    # arguments for dgd config generation and deployment
+    parser.add_argument(
+        "--deploy-after-profile",
+        action="store_true",
+        help="deploy the optimized DGD with planner",
+    )
+    # Dynamically add all planner arguments from planner_argparse.py
+    add_planner_arguments_to_parser(parser, prefix="planner-")
+
+    # arguments if using aiconfigurator
+    parser.add_argument(
+        "--use-ai-configurator",
+        action="store_true",
+        help="Use ai-configurator to estimate benchmarking results instead of running actual deployment.",
+    )
+    parser.add_argument(
+        "--aic-system",
+        type=str,
+        help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
+    )
+    parser.add_argument(
+        "--aic-model-name",
+        type=str,
+        help="aiconfigurator name of the target model (e.g. QWEN3_32B, DEEPSEEK_V3)",
+    )
+    parser.add_argument(
+        "--aic-backend",
+        type=str,
+        default="",
+        help="aiconfigurator backend of the target model, if not provided, will use args.backend",
+    )
+    parser.add_argument(
+        "--aic-backend-version",
+        type=str,
+        help="Specify backend version when using aiconfigurator to estimate perf.",
+    )
+    args = parser.parse_args()
 
     # setup file logging
     os.makedirs(args.output_dir, exist_ok=True)

@@ -106,7 +106,7 @@ class DgdPlannerServiceConfig(BaseModel):
     volumeMounts: list[VolumeMount] = [VolumeMount()]
     extraPodSpec: PodSpec = PodSpec(
         mainContainer=Container(
-            image="my-registry/dynamo-runtime:my-tag",  # placeholder
+            image="nvcr.io/nvidia/ai-dynamo/dynamo-runtime:0.6.0",  # placeholder
             workingDir="/workspace/components/src/dynamo/planner",
             command=["python3", "-m", "planner_sla"],
             args=[],

diff --git a/components/backends/sglang/deploy/README.md b/components/backends/sglang/deploy/README.md
@@ -61,7 +61,7 @@ resources:
 ```yaml
 extraPodSpec:
   mainContainer:
-    image: my-registry/sglang-runtime:my-tag
+    image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0
     workingDir: /workspace/components/backends/sglang
     args:
       - "python3"
@@ -92,7 +92,7 @@ Edit the template to match your environment:
 
 ```yaml
 # Update image registry and tag
-image: my-registry/sglang-runtime:my-tag
+image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0
 
 # Configure your model
 args:

diff --git a/components/backends/sglang/deploy/agg.yaml b/components/backends/sglang/deploy/agg.yaml
@@ -13,7 +13,7 @@ spec:
       replicas: 1
       extraPodSpec:
         mainContainer:
-          image: my-registry/sglang-runtime:my-tag
+          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0
     decode:
       envFromSecret: hf-token-secret
       dynamoNamespace: sglang-agg
@@ -24,7 +24,7 @@ spec:
           gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: my-registry/sglang-runtime:my-tag
+          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0
           workingDir: /workspace/components/backends/sglang
           command:
           - python3

diff --git a/components/backends/sglang/deploy/agg_logging.yaml b/components/backends/sglang/deploy/agg_logging.yaml
@@ -16,7 +16,7 @@ spec:
       replicas: 1
       extraPodSpec:
         mainContainer:
-          image: my-registry/sglang-runtime:my-tag
+          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0
     decode:
       envFromSecret: hf-token-secret
       dynamoNamespace: sglang-agg
@@ -27,7 +27,7 @@ spec:
           gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: my-registry/sglang-runtime:my-tag
+          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0
           workingDir: /workspace/components/backends/sglang
           command:
           - python3

diff --git a/components/backends/sglang/deploy/agg_router.yaml b/components/backends/sglang/deploy/agg_router.yaml
@@ -13,7 +13,7 @@ spec:
       replicas: 1
       extraPodSpec:
         mainContainer:
-          image: my-registry/sglang-runtime:my-tag
+          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0
       envs:
         - name: DYN_ROUTER_MODE
           value: kv
@@ -27,7 +27,7 @@ spec:
           gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: my-registry/sglang-runtime:my-tag
+          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0
           workingDir: /workspace/components/backends/sglang
           command:
           - python3

diff --git a/components/backends/sglang/deploy/disagg-multinode.yaml b/components/backends/sglang/deploy/disagg-multinode.yaml
@@ -22,7 +22,7 @@ spec:
       replicas: 1
       extraPodSpec:
         mainContainer:
-          image: my-registry/sglang-runtime:my-tag
+          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0
     decode:
       multinode:
         nodeCount: 2
@@ -35,7 +35,7 @@ spec:
           gpu: "4"
       extraPodSpec:
         mainContainer:
-          image: my-registry/sglang-runtime:my-tag
+          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0
           workingDir: /workspace/components/backends/sglang
           command:
           - python3
@@ -72,7 +72,7 @@ spec:
           gpu: "4"
       extraPodSpec:
         mainContainer:
-          image: my-registry/sglang-runtime:my-tag
+          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0
           workingDir: /workspace/components/backends/sglang
           command:
           - python3

diff --git a/components/backends/sglang/deploy/disagg.yaml b/components/backends/sglang/deploy/disagg.yaml
@@ -13,7 +13,7 @@ spec:
       replicas: 1
       extraPodSpec:
         mainContainer:
-          image: my-registry/sglang-runtime:my-tag
+          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0
     decode:
       envFromSecret: hf-token-secret
       dynamoNamespace: sglang-disagg
@@ -25,7 +25,7 @@ spec:
           gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: my-registry/sglang-runtime:my-tag
+          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0
           workingDir: /workspace/components/backends/sglang
           command:
           - python3
@@ -61,7 +61,7 @@ spec:
           gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: my-registry/sglang-runtime:my-tag
+          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0
           workingDir: /workspace/components/backends/sglang
           command:
           - python3