diff --git a/docs/installation.md b/docs/installation.md
index cc8c36a2..10b7e01d 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -151,6 +151,31 @@ benchmark:
   concurrencies: [256, 512]
 ```
 
+## Profiling (torch / nsys)
+
+You can enable profiling via a top-level `profiling` section in your job YAML. Profiling and benchmarking are mutually exclusive; when profiling is enabled, set `benchmark.type: "manual"` (or omit the benchmark section) to avoid conflicts.
+
+Example:
+
+```yaml
+profiling:
+  type: "nsys"       # one of: "none", "torch", "nsys"
+  prefill:
+    isl: 1024        # input sequence length
+    osl: 2           # output sequence length
+    concurrency: 24
+    start_step: 0
+    stop_step: 16
+  decode:
+    isl: 8
+    osl: 16
+    concurrency: 1024
+    start_step: 0
+    stop_step: 16
+```
+
+When `type` is `none`, normal serving runs with `dynamo.sglang`. Otherwise, serving uses `sglang.launch_server`.
+
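+For example, to profile without running an automated benchmark, pair the section above with `benchmark.type: "manual"` (values below are illustrative):
+
+```yaml
+benchmark:
+  type: "manual"
+profiling:
+  type: "torch"
+  decode:
+    isl: 8
+    osl: 16
+    concurrency: 1024
+```
+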
echo "$(date '+%Y-%m-%d %H:%M:%S')" +# Create profiling output directory only when torch profiler dir is provided +ACTIVITIES="" +if [[ -n "${SGLANG_TORCH_PROFILER_DIR}" ]]; then + ACTIVITIES='["GPU"]' + mkdir -p "${SGLANG_TORCH_PROFILER_DIR}" 2>/dev/null || true + export SGLANG_TORCH_PROFILER_DIR=${SGLANG_TORCH_PROFILER_DIR} +else + ACTIVITIES='["CUDA_PROFILER"]' + mkdir -p "/logs/profiles" 2>/dev/null || true +fi + set -x -python3 -m sglang.bench_one_batch_server \ - --model ${model_name} \ - --base-url http://${head_node}:${head_port} \ - --batch-size ${BATCH_SIZE} \ - --input-len ${INPUT_LEN} \ - --output-len ${OUTPUT_LEN} \ - ${PROFILE_STEPS_ARG} \ - --profile + +curl -X POST http://${head_node}:${head_port}/start_profile -H "Content-Type: application/json" -d "{\"start_step\": \"$PROFILE_START_STEP\", \"num_steps\": $((PROFILE_STOP_STEP-PROFILE_START_STEP)), \"activities\": $ACTIVITIES}" + +python3 -m sglang.bench_serving \ +--backend sglang \ +--model ${model_name} \ +--host ${head_node} --port ${head_port} \ +--dataset-name random \ +--max-concurrency $PROFILE_CONCURRENCY \ +--num-prompts 128 \ +--random-input-len $PROFILE_ISL \ +--random-output-len $PROFILE_OSL \ +--random-range-ratio 1 \ +--warmup-request 10 + +pip install lm-eval tenacity +python -m lm_eval \ +--model local-completions \ +--tasks gsm8k \ +--model_args \ +base_url=http://${head_node}:${head_port}/v1/completions,\ +model=${model_name},\ +tokenized_requests=False,tokenizer_backend=None,\ +num_concurrent=${PROFILE_CONCURRENCY},timeout=6000,max_retries=1 \ +--limit 10 + exit_code=$? set +x diff --git a/scripts/templates/job_script_template_agg.j2 b/scripts/templates/job_script_template_agg.j2 index 10efd556..6e5b7038 100755 --- a/scripts/templates/job_script_template_agg.j2 +++ b/scripts/templates/job_script_template_agg.j2 @@ -189,12 +189,10 @@ WORKER_ARGS="--gpu_type ${GPU_TYPE} --gpus_per_node ${GPUS_PER_NODE} --master_ip WORKER_ARGS="$WORKER_ARGS --multiple-frontends-enabled" {% endraw %} {% endif %} -{% if sglang_torch_profiler %} {% raw %} -# Enable torch profiling mode -WORKER_ARGS="$WORKER_ARGS --sglang-torch-profiler" +# Set profiler mode from config +WORKER_ARGS="$WORKER_ARGS --profiler {% endraw %}{{ profiler }}{% raw %}" {% endraw %} -{% endif %} {% raw %} # Add SGLang config path (mounted in container at /logs/) WORKER_ARGS="$WORKER_ARGS --sglang-config-path /logs/sglang_config.yaml" @@ -213,7 +211,7 @@ WORKER_ARGS="$WORKER_ARGS --setup-script {{ setup_script }}" {% raw %} # Launch nginx on node 0 echo "Launching nginx on ${NGINX_NODE}" -cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out --error=${LOG_DIR}/${NGINX_NODE}_nginx.err python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}" +cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}" echo "$cmd" $cmd & {% endraw %} @@ -222,7 +220,7 @@ $cmd & # Launch frontend on master node (node 1) - this will also start NATS/ETCD echo "Launching frontend + NATS/ETCD on master node ${MASTER_NODE}" -cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE --output=${LOG_DIR}/${MASTER_NODE}_frontend_0.out --error=${LOG_DIR}/${MASTER_NODE}_frontend.err python /scripts/worker_setup.py --worker_type frontend --worker_idx 0 ${WORKER_ARGS}" +cmd="srun --overlap $ENROOT_ARGS 
diff --git a/scripts/templates/job_script_template_agg.j2 b/scripts/templates/job_script_template_agg.j2
index 10efd556..6e5b7038 100755
--- a/scripts/templates/job_script_template_agg.j2
+++ b/scripts/templates/job_script_template_agg.j2
@@ -189,12 +189,10 @@ WORKER_ARGS="--gpu_type ${GPU_TYPE} --gpus_per_node ${GPUS_PER_NODE} --master_ip
 WORKER_ARGS="$WORKER_ARGS --multiple-frontends-enabled"
 {% endraw %}
 {% endif %}
-{% if sglang_torch_profiler %}
 {% raw %}
-# Enable torch profiling mode
-WORKER_ARGS="$WORKER_ARGS --sglang-torch-profiler"
+# Set profiler mode from config
+WORKER_ARGS="$WORKER_ARGS --profiler {% endraw %}{{ profiler }}{% raw %}"
 {% endraw %}
-{% endif %}
 {% raw %}
 # Add SGLang config path (mounted in container at /logs/)
 WORKER_ARGS="$WORKER_ARGS --sglang-config-path /logs/sglang_config.yaml"
@@ -213,7 +211,7 @@ WORKER_ARGS="$WORKER_ARGS --setup-script {{ setup_script }}"
 {% raw %}
 # Launch nginx on node 0
 echo "Launching nginx on ${NGINX_NODE}"
-cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out --error=${LOG_DIR}/${NGINX_NODE}_nginx.err python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}"
+cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}"
 echo "$cmd"
 $cmd &
 {% endraw %}
@@ -222,7 +220,7 @@ $cmd &
 
 # Launch frontend on master node (node 1) - this will also start NATS/ETCD
 echo "Launching frontend + NATS/ETCD on master node ${MASTER_NODE}"
-cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE --output=${LOG_DIR}/${MASTER_NODE}_frontend_0.out --error=${LOG_DIR}/${MASTER_NODE}_frontend.err python /scripts/worker_setup.py --worker_type frontend --worker_idx 0 ${WORKER_ARGS}"
+cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE --output=${LOG_DIR}/${MASTER_NODE}_frontend_0.out python /scripts/worker_setup.py --worker_type frontend --worker_idx 0 ${WORKER_ARGS}"
 echo "$cmd"
 $cmd &
@@ -236,7 +234,7 @@ if [ "$ADDITIONAL_FRONTENDS" -gt 0 ]; then
         if [ $frontend_node_idx -lt $TOTAL_NODES ]; then
             node=${nodes[$frontend_node_idx]}
             echo "Launching additional frontend $frontend_idx on node $frontend_node_idx: $node"
-            cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_frontend_${frontend_idx}.out --error=${LOG_DIR}/${node}_frontend_${frontend_idx}.err python /scripts/worker_setup.py --worker_type frontend --worker_idx ${frontend_idx} ${WORKER_ARGS}"
+            cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_frontend_${frontend_idx}.out python /scripts/worker_setup.py --worker_type frontend --worker_idx ${frontend_idx} ${WORKER_ARGS}"
             echo "$cmd"
             $cmd &
             frontend_idx=$((frontend_idx + 1))
@@ -280,7 +278,7 @@ for worker_idx in $(seq 0 $((AGG_WORKERS - 1))); do
 {% endraw %}
 {% endif %}
 {% raw %}
-    cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_agg_w${worker_idx}.out --error=${LOG_DIR}/${node}_agg_w${worker_idx}.err python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${AGG_NODES_PER_WORKER} --worker_type aggregated ${CONFIG_DUMP_ARG} ${WORKER_ARGS}"
+    cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_agg_w${worker_idx}.out python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${AGG_NODES_PER_WORKER} --worker_type aggregated ${CONFIG_DUMP_ARG} ${WORKER_ARGS}"
     echo "$cmd"
     $cmd &
 done
@@ -318,11 +316,11 @@ echo "srun $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --overlap --pty bas
 {% raw %}
 BENCHMARK_TYPE={% endraw %}{{ benchmark_type }}{% raw %}
 BENCHMARK_ARGS="{% endraw %}{{ benchmark_arg }}{% raw %}"
-srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --output=${LOG_DIR}/benchmark.out --error=${LOG_DIR}/benchmark.err --overlap bash /scripts/benchmarks/${BENCHMARK_TYPE}/bench.sh $AGG_WORKERS 0 0 $DECODE_GPUS ${BENCHMARK_ARGS} &
+srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --output=${LOG_DIR}/benchmark.out --overlap bash /scripts/benchmarks/${BENCHMARK_TYPE}/bench.sh $AGG_WORKERS 0 0 $DECODE_GPUS ${BENCHMARK_ARGS} &
 {% endraw %}
 {% endif %}
 
-{% if sglang_torch_profiler %}
+{% if profiler != 'none' %}
 {% raw %}
 # Torch profiling mode for aggregated workers
@@ -336,12 +334,12 @@ echo "Aggregated profiling will run on: $AGG_LEADER_NODE"
 
 # Run profiling on first aggregated worker's leader node
 # Use "decode" mode for aggregated since it profiles the full generation pipeline
 srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w $AGG_LEADER_NODE \
-    --output=${LOG_DIR}/profile_aggregated.out --error=${LOG_DIR}/profile_aggregated.err --overlap \
-    bash -c "PROFILING_MODE=decode SGLANG_TORCH_PROFILER_DIR=/logs/profiles/decode /scripts/profiling/profile.sh 0 $AGG_WORKERS 0 $DECODE_GPUS $TOTAL_GPUS" &
+    --output=${LOG_DIR}/profile_aggregated.out --overlap \
+    bash -c "PROFILING_MODE=decode {% endraw %}{% if profiler == 'torch' %}SGLANG_TORCH_PROFILER_DIR=/logs/profiles/decode {% endif %}{{ decode_profile_env }}{% raw %} /scripts/profiling/profile.sh 0 $AGG_WORKERS 0 $DECODE_GPUS $TOTAL_GPUS" &
 {% endraw %}
 {% endif %}
 
-{% if sglang_torch_profiler %}
+{% if profiler != 'none' %}
 {% raw %}
 # Wait for profiling script to complete
 echo "Waiting for profiling script to complete..."
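For the aggregated template, rendering the profiling launch with the docs example (`type: "nsys"`, so no torch profiler dir is injected) yields roughly the following; the `$AGG_WORKERS`/`$DECODE_GPUS`/`$TOTAL_GPUS` variables are still resolved by the job script at run time:

```bash
bash -c "PROFILING_MODE=decode PROFILE_ISL=8 PROFILE_OSL=16 PROFILE_CONCURRENCY=1024 \
PROFILE_START_STEP=0 PROFILE_STOP_STEP=16 \
 /scripts/profiling/profile.sh 0 $AGG_WORKERS 0 $DECODE_GPUS $TOTAL_GPUS" &
```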
$TOTAL_GPUS" & {% endraw %} {% endif %} -{% if sglang_torch_profiler %} +{% if profiler != 'none' %} {% raw %} # Wait for profiling script to complete echo "Waiting for profiling script to complete..." diff --git a/scripts/templates/job_script_template_disagg.j2 b/scripts/templates/job_script_template_disagg.j2 index d2222cce..3e12acbb 100755 --- a/scripts/templates/job_script_template_disagg.j2 +++ b/scripts/templates/job_script_template_disagg.j2 @@ -9,7 +9,6 @@ #SBATCH --account={{ account }} #SBATCH --time={{ time_limit }} #SBATCH --output={{ log_dir_prefix }}/%j_{{ prefill_workers }}P_{{ decode_workers }}D_{{ timestamp }}/log.out -#SBATCH --error={{ log_dir_prefix }}/%j_{{ prefill_workers }}P_{{ decode_workers }}D_{{ timestamp }}/log.err #SBATCH --partition={{ partition }} # Constants @@ -190,12 +189,10 @@ WORKER_ARGS="--gpu_type ${GPU_TYPE} --gpus_per_node ${GPUS_PER_NODE} --master_ip WORKER_ARGS="$WORKER_ARGS --multiple-frontends-enabled" {% endraw %} {% endif %} -{% if sglang_torch_profiler %} {% raw %} -# Enable torch profiling mode -WORKER_ARGS="$WORKER_ARGS --sglang-torch-profiler" +# Set profiler mode from config +WORKER_ARGS="$WORKER_ARGS --profiler {% endraw %}{{ profiler }}{% raw %}" {% endraw %} -{% endif %} {% raw %} # Add SGLang config path (mounted in container at /logs/) WORKER_ARGS="$WORKER_ARGS --sglang-config-path /logs/sglang_config.yaml" @@ -214,7 +211,7 @@ WORKER_ARGS="$WORKER_ARGS --setup-script {{ setup_script }}" {% raw %} # Launch nginx on node 0 echo "Launching nginx on ${NGINX_NODE}" -cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out --error=${LOG_DIR}/${NGINX_NODE}_nginx.err python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}" +cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}" echo "$cmd" $cmd & {% endraw %} @@ -223,7 +220,7 @@ $cmd & # Launch frontend on master node (node 1) - this will also start NATS/ETCD echo "Launching frontend + NATS/ETCD on master node ${MASTER_NODE}" -cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE --output=${LOG_DIR}/${MASTER_NODE}_frontend_0.out --error=${LOG_DIR}/${MASTER_NODE}_frontend.err python /scripts/worker_setup.py --worker_type frontend --worker_idx 0 ${WORKER_ARGS}" +cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE --output=${LOG_DIR}/${MASTER_NODE}_frontend_0.out python /scripts/worker_setup.py --worker_type frontend --worker_idx 0 ${WORKER_ARGS}" echo "$cmd" $cmd & @@ -237,7 +234,7 @@ if [ "$ADDITIONAL_FRONTENDS" -gt 0 ]; then if [ $frontend_node_idx -lt $TOTAL_NODES ]; then node=${nodes[$frontend_node_idx]} echo "Launching additional frontend $frontend_idx on node $frontend_node_idx: $node" - cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_frontend_${frontend_idx}.out --error=${LOG_DIR}/${node}_frontend_${frontend_idx}.err python /scripts/worker_setup.py --worker_type frontend --worker_idx ${frontend_idx} ${WORKER_ARGS}" + cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_frontend_${frontend_idx}.out python /scripts/worker_setup.py --worker_type frontend --worker_idx ${frontend_idx} ${WORKER_ARGS}" echo "$cmd" $cmd & frontend_idx=$((frontend_idx + 1)) @@ -276,7 +273,7 @@ for 
@@ -276,7 +273,7 @@ for worker_idx in $(seq 0 $((PREFILL_WORKERS - 1))); do
 {% endraw %}
 {% endif %}
 {% raw %}
-    cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill_w${worker_idx}.out --error=${LOG_DIR}/${node}_prefill_w${worker_idx}.err python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${PREFILL_NODES_PER_WORKER} --worker_type prefill ${WORKER_ARGS} ${CONFIG_DUMP_ARG}"
+    cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill_w${worker_idx}.out python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${PREFILL_NODES_PER_WORKER} --worker_type prefill ${WORKER_ARGS} ${CONFIG_DUMP_ARG}"
     echo "$cmd"
     $cmd &
 done
@@ -309,7 +306,7 @@ for worker_idx in $(seq 0 $((DECODE_WORKERS - 1))); do
 {% endraw %}
 {% endif %}
 {% raw %}
-    cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode_w${worker_idx}.out --error=${LOG_DIR}/${node}_decode_w${worker_idx}.err python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${DECODE_NODES_PER_WORKER} --worker_type decode ${CONFIG_DUMP_ARG} ${WORKER_ARGS}"
+    cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode_w${worker_idx}.out python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${DECODE_NODES_PER_WORKER} --worker_type decode ${CONFIG_DUMP_ARG} ${WORKER_ARGS}"
     echo "$cmd"
     $cmd &
 done
@@ -346,14 +343,14 @@ BENCHMARK_ARGS="{{ benchmark_arg }}"
 
 {% if do_benchmark %}
 {% raw %}
-srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --output=${LOG_DIR}/benchmark.out --error=${LOG_DIR}/benchmark.err --overlap bash /scripts/benchmarks/${BENCHMARK_TYPE}/bench.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS ${BENCHMARK_ARGS} &
+srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --output=${LOG_DIR}/benchmark.out --overlap bash /scripts/benchmarks/${BENCHMARK_TYPE}/bench.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS ${BENCHMARK_ARGS} &
 {% endraw %}
 {% endif %}
 
-{% if sglang_torch_profiler %}
+{% if profiler != 'none' %}
 {% raw %}
 # Torch profiling mode: run profiling on prefill and decode workers separately
-echo "Starting torch profiling..."
+echo "Starting profiler..."
 
 # Get leader nodes for first prefill and decode workers
 PREFILL_LEADER_NODE=${nodes[${prefill_leaders[0]}]}
@@ -364,17 +361,17 @@ echo "Decode profiling will run on: $DECODE_LEADER_NODE"
 
 # Run prefill profiling on first prefill worker's leader node
 srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w $PREFILL_LEADER_NODE \
-    --output=${LOG_DIR}/profile_prefill.out --error=${LOG_DIR}/profile_prefill.err --overlap \
-    bash -c "PROFILING_MODE=prefill SGLANG_TORCH_PROFILER_DIR=/logs/profiles/prefill /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
+    --output=${LOG_DIR}/profile_prefill.out --overlap \
+    bash -c "PROFILING_MODE=prefill {% endraw %}{% if profiler == 'torch' %}SGLANG_TORCH_PROFILER_DIR=/logs/profiles/prefill {% endif %}{{ prefill_profile_env }}{% raw %} /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
 
 # Run decode profiling on first decode worker's leader node
 srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w $DECODE_LEADER_NODE \
-    --output=${LOG_DIR}/profile_decode.out --error=${LOG_DIR}/profile_decode.err --overlap \
-    bash -c "PROFILING_MODE=decode SGLANG_TORCH_PROFILER_DIR=/logs/profiles/decode /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
+    --output=${LOG_DIR}/profile_decode.out --overlap \
+    bash -c "PROFILING_MODE=decode {% endraw %}{% if profiler == 'torch' %}SGLANG_TORCH_PROFILER_DIR=/logs/profiles/decode {% endif %}{{ decode_profile_env }}{% raw %} /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
 {% endraw %}
 {% endif %}
 
-{% if sglang_torch_profiler %}
+{% if profiler != 'none' %}
 {% raw %}
 # Wait for all profiling scripts to complete (both prefill and decode)
 echo "Waiting for all profiling scripts to complete..."
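In the disaggregated case the two phases receive independent env prefixes; with the docs example they render as follows (a sketch derived from `build_env_str` in the srtctl backend changes below):

```bash
# Prefill probe: long inputs, minimal generation
PROFILING_MODE=prefill PROFILE_ISL=1024 PROFILE_OSL=2 PROFILE_CONCURRENCY=24 PROFILE_START_STEP=0 PROFILE_STOP_STEP=16
# Decode probe: short inputs, large concurrent batch
PROFILING_MODE=decode PROFILE_ISL=8 PROFILE_OSL=16 PROFILE_CONCURRENCY=1024 PROFILE_START_STEP=0 PROFILE_STOP_STEP=16
```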
diff --git a/scripts/worker_setup.py b/scripts/worker_setup.py
index a6448962..8557634c 100644
--- a/scripts/worker_setup.py
+++ b/scripts/worker_setup.py
@@ -103,9 +103,11 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
     )
 
     parser.add_argument(
-        "--sglang-torch-profiler",
-        action="store_true",
-        help="Enable torch profiling mode: use sglang.launch_server and skip --disaggregation-mode",
+        "--profiler",
+        type=str,
+        choices=["none", "torch", "nsys"],
+        default="none",
+        help="Profiling method for workers",
     )
 
     parser.add_argument(
@@ -182,7 +184,7 @@ def main(input_args: list[str] | None = None):
         args.nodes_per_worker,
         args.gpu_type,
         args.multiple_frontends_enabled,
-        args.sglang_torch_profiler,
+        args.profiler,
         args.sglang_config_path,
         args.dump_config_path,
         args.setup_script,
@@ -195,7 +197,7 @@ def main(input_args: list[str] | None = None):
         args.master_ip,
         args.nodes_per_worker,
         args.gpu_type,
-        args.sglang_torch_profiler,
+        args.profiler,
         args.sglang_config_path,
         args.dump_config_path,
         args.setup_script,
@@ -209,7 +211,7 @@ def main(input_args: list[str] | None = None):
         args.nodes_per_worker,
         args.gpu_type,
         args.multiple_frontends_enabled,
-        args.sglang_torch_profiler,
+        args.profiler,
         args.sglang_config_path,
         args.dump_config_path,
         args.setup_script,
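As a usage sketch, each worker now receives the profiler choice through the new flag; an abridged decode-worker invocation might look like this (the IP is a placeholder and `...` elides the remaining `WORKER_ARGS`):

```bash
python /scripts/worker_setup.py --worker_type decode --worker_idx 0 --local_rank 0 \
  --leader_ip 10.0.0.2 --nodes_per_worker 1 \
  --profiler nsys --sglang-config-path /logs/sglang_config.yaml ...
```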
diff --git a/scripts/worker_setup/command.py b/scripts/worker_setup/command.py
index 1e24c8b2..4a2524eb 100644
--- a/scripts/worker_setup/command.py
+++ b/scripts/worker_setup/command.py
@@ -15,7 +15,7 @@ def build_sglang_command_from_yaml(
     port: int,
     total_nodes: int,
     rank: int,
-    use_profiling: bool = False,
+    profiler: str = "none",
     dump_config_path: str | None = None,
 ) -> str:
     """Build SGLang command using native YAML config support.
@@ -33,7 +33,7 @@ def build_sglang_command_from_yaml(
         port: Port for distributed coordination
         total_nodes: Total number of nodes
         rank: Node rank (0-indexed)
-        use_profiling: Whether to use sglang.launch_server (profiling mode)
+        profiler: Profiling method: "none", "torch", or "nsys"
 
     Returns:
         Full command string ready to execute
@@ -54,16 +54,21 @@ def build_sglang_command_from_yaml(
     env_exports = []
     for key, value in env_vars.items():
         env_exports.append(f"export {key}={value}")
-    if use_profiling:
+    if profiler == "torch":
         env_exports.append(f"export SGLANG_TORCH_PROFILER_DIR=/logs/profiles/{config_key}")
 
     # Determine Python module based on profiling mode
-    python_module = "sglang.launch_server" if use_profiling else "dynamo.sglang"
+    python_module = "sglang.launch_server" if profiler != "none" else "dynamo.sglang"
+    nsys_prefix = f"nsys profile -t cuda,nvtx --cuda-graph-trace=node -c cudaProfilerApi --capture-range-end stop --force-overwrite true -o /logs/profiles/{config_key}_{rank}"
 
-    if use_profiling:
+    if profiler != "none":
         # Profiling mode: inline all flags (sglang.launch_server doesn't support --config)
         mode_config = sglang_config.get(config_key, {})
-        cmd_parts = [f"python3 -m {python_module}"]
+        # Wrap with nsys on all ranks; outputs are isolated per rank
+        if profiler == "nsys":
+            cmd_parts = [f"{nsys_prefix} python3 -m {python_module}"]
+        else:
+            cmd_parts = [f"python3 -m {python_module}"]
 
         # Add all SGLang flags from config
         for key, value in sorted(mode_config.items()):
@@ -145,7 +150,7 @@ def get_gpu_command(
     port: int,
     total_nodes: int,
     rank: int,
-    use_profiling: bool = False,
+    profiler: str = "none",
     dump_config_path: str | None = None,
 ) -> str:
     """Generate command to run SGLang worker using YAML config.
@@ -157,7 +162,7 @@ def get_gpu_command(
         port: Port for distributed coordination
         total_nodes: Total number of nodes
         rank: Node rank (0-indexed)
-        use_profiling: Whether to use sglang.launch_server (profiling mode)
+        profiler: Profiling method: "none", "torch", or "nsys"
 
     Returns:
         Command string to execute
@@ -167,5 +172,5 @@ def get_gpu_command(
     logging.info(f"Building command from YAML config: {sglang_config_path}")
 
     return build_sglang_command_from_yaml(
-        worker_type, sglang_config_path, host_ip, port, total_nodes, rank, use_profiling, dump_config_path
+        worker_type, sglang_config_path, host_ip, port, total_nodes, rank, profiler, dump_config_path
     )
diff --git a/scripts/worker_setup/worker.py b/scripts/worker_setup/worker.py
index aee8762c..c93c43b2 100644
--- a/scripts/worker_setup/worker.py
+++ b/scripts/worker_setup/worker.py
@@ -60,7 +60,7 @@ def setup_prefill_worker(
     nodes_per_worker: int,
     gpu_type: str,
     multiple_frontends_enabled: bool = False,
-    sglang_torch_profiler: bool = False,
+    profiler: str = "none",
     sglang_config_path: str | None = None,
     dump_config_path: str | None = None,
     setup_script: str | None = None,
@@ -111,7 +111,7 @@ def setup_prefill_worker(
         port=DIST_INIT_PORT,
         total_nodes=nodes_per_worker,
         rank=local_rank,
-        use_profiling=sglang_torch_profiler,
+        profiler=profiler,
         dump_config_path=dump_config_path,
     )
     return run_command(cmd_to_run)
@@ -124,7 +124,7 @@ def setup_decode_worker(
     master_ip: str,
     nodes_per_worker: int,
     gpu_type: str,
-    sglang_torch_profiler: bool = False,
+    profiler: str = "none",
     sglang_config_path: str | None = None,
     dump_config_path: str | None = None,
     setup_script: str | None = None,
@@ -153,7 +153,7 @@ def setup_decode_worker(
         port=DIST_INIT_PORT,
         total_nodes=nodes_per_worker,
         rank=local_rank,
-        use_profiling=sglang_torch_profiler,
+        profiler=profiler,
         dump_config_path=dump_config_path,
     )
     return run_command(cmd_to_run)
@@ -167,7 +167,7 @@ def setup_aggregated_worker(
     nodes_per_worker: int,
     gpu_type: str,
     multiple_frontends_enabled: bool = False,
-    sglang_torch_profiler: bool = False,
+    profiler: str = "none",
     sglang_config_path: str | None = None,
     dump_config_path: str | None = None,
     setup_script: str | None = None,
@@ -218,7 +218,7 @@ def setup_aggregated_worker(
         port=DIST_INIT_PORT,
         total_nodes=nodes_per_worker,
         rank=local_rank,
-        use_profiling=sglang_torch_profiler,
+        profiler=profiler,
         dump_config_path=dump_config_path,
     )
     return run_command(cmd_to_run)
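Substituting `config_key="decode"` and `rank=0` into `nsys_prefix` above, an nsys-wrapped worker command begins as follows; the flags after the module come from the YAML config:

```bash
nsys profile -t cuda,nvtx --cuda-graph-trace=node -c cudaProfilerApi \
  --capture-range-end stop --force-overwrite true -o /logs/profiles/decode_0 \
  python3 -m sglang.launch_server ...
```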
diff --git a/src/srtctl/backends/sglang.py b/src/srtctl/backends/sglang.py
index 4cf6e301..91dd6fbd 100644
--- a/src/srtctl/backends/sglang.py
+++ b/src/srtctl/backends/sglang.py
@@ -101,9 +101,12 @@ def render_command(self, mode: str, config_path: Path = None) -> str:
         for key, val in env_vars.items():
             lines.append(f"{key}={val} \\")
 
-        # Python command - use sglang.launch_server for profiling, dynamo.sglang otherwise
-        is_profiling = self.backend_config.get("enable_profiling", False)
-        if is_profiling:
+        # Python command - use sglang.launch_server when profiler != none, dynamo.sglang otherwise
+        profiling_type = (self.config.get("profiling") or {}).get("type", "none")
+        nsys_prefix = "nsys profile -t cuda,nvtx --cuda-graph-trace=node -c cudaProfilerApi --capture-range-end stop --force-overwrite true"
+        if profiling_type == "nsys":
+            lines.append(f"{nsys_prefix} python3 -m sglang.launch_server \\")
+        elif profiling_type == "torch":
             lines.append("python3 -m sglang.launch_server \\")
         else:
             lines.append("python3 -m dynamo.sglang \\")
@@ -133,14 +136,14 @@ def _config_to_flags(self, config: dict) -> list[str]:
             List of flag strings with backslash continuations
         """
         lines = []
-        is_profiling = self.backend_config.get("enable_profiling", False)
+        profiling_type = (self.config.get("profiling") or {}).get("type", "none")
 
         for key, value in sorted(config.items()):
             # Convert underscores to hyphens
             flag_name = key.replace("_", "-")
 
             # Skip disaggregation-mode flag when profiling (sglang.launch_server doesn't accept it)
-            if is_profiling and flag_name == "disaggregation-mode":
+            if profiling_type in ("torch", "nsys") and flag_name == "disaggregation-mode":
                 continue
 
             if isinstance(value, bool):
@@ -188,7 +191,8 @@ def _get_enable_config_dump(self) -> bool:
         enable_config_dump = self.config.get("enable_config_dump", True)
 
         # Auto-disable when profiling is enabled (unless explicitly set to True)
-        if self.backend_config.get("enable_profiling", False):
+        profiling_type = (self.config.get("profiling") or {}).get("type", "none")
+        if profiling_type != "none":
             # When profiling, disable config dump by default
             # User can explicitly set enable_config_dump: true to override
             return False
@@ -277,6 +281,30 @@ def generate_slurm_script(self, config_path: Path = None, timestamp: str = None)
         config_dir_path = srtctl_root / "configs"
         log_dir_path = srtctl_root / "logs"
 
+        # Build profiling env injections
+        profiling_cfg = self.config.get("profiling") or {}
+        prefill_cfg = profiling_cfg.get("prefill") or {}
+        decode_cfg = profiling_cfg.get("decode") or {}
+
+        def build_env_str(cfg: dict) -> str:
+            parts: list[str] = []
+            if "isl" in cfg and cfg["isl"] is not None:
+                parts.append(f"PROFILE_ISL={cfg['isl']}")
+            if "osl" in cfg and cfg["osl"] is not None:
+                parts.append(f"PROFILE_OSL={cfg['osl']}")
+            if "concurrency" in cfg and cfg["concurrency"] is not None:
+                parts.append(f"PROFILE_CONCURRENCY={cfg['concurrency']}")
+            if "start_step" in cfg and cfg["start_step"] is not None:
+                parts.append(f"PROFILE_START_STEP={cfg['start_step']}")
+            if "stop_step" in cfg and cfg["stop_step"] is not None:
+                parts.append(f"PROFILE_STOP_STEP={cfg['stop_step']}")
+            return " ".join(parts)
+
+        prefill_profile_env = build_env_str(prefill_cfg)
+        decode_profile_env = build_env_str(decode_cfg)
+
+        profiler_mode = profiling_cfg.get("type", "none")
+
         # Template variables
         template_vars = {
             "job_name": job_name,
@@ -307,7 +335,9 @@ def generate_slurm_script(self, config_path: Path = None, timestamp: str = None)
             # Auto-disabled when profiling unless explicitly enabled
             "enable_config_dump": self._get_enable_config_dump(),
             "log_dir_prefix": str(log_dir_path),  # Absolute path to logs directory
-            "sglang_torch_profiler": self.backend_config.get("enable_profiling", False),
+            "profiler": profiler_mode,
+            "prefill_profile_env": prefill_profile_env,
+            "decode_profile_env": decode_profile_env,
             "setup_script": self.setup_script,
             "use_gpus_per_node_directive": get_srtslurm_setting("use_gpus_per_node_directive", True),
             "extra_container_mounts": ",".join(self.config.get("extra_mount") or []),
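To make the env-string contract concrete, here is a standalone, more compact rendering of `build_env_str` applied to the decode phase from the docs example; the key-to-variable mapping is the same, only the control flow differs:

```python
def build_env_str(cfg: dict) -> str:
    # Map YAML phase keys to the PROFILE_* env vars consumed by profile.sh
    env_names = {
        "isl": "PROFILE_ISL",
        "osl": "PROFILE_OSL",
        "concurrency": "PROFILE_CONCURRENCY",
        "start_step": "PROFILE_START_STEP",
        "stop_step": "PROFILE_STOP_STEP",
    }
    return " ".join(
        f"{env}={cfg[key]}" for key, env in env_names.items() if cfg.get(key) is not None
    )

# Decode phase from the docs example:
print(build_env_str({"isl": 8, "osl": 16, "concurrency": 1024, "start_step": 0, "stop_step": 16}))
# PROFILE_ISL=8 PROFILE_OSL=16 PROFILE_CONCURRENCY=1024 PROFILE_START_STEP=0 PROFILE_STOP_STEP=16
```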
diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py
index e721a53d..998b5dae 100644
--- a/src/srtctl/core/schema.py
+++ b/src/srtctl/core/schema.py
@@ -162,6 +162,32 @@ class BenchmarkConfig(BaseModel):
     req_rate: Optional[str] = Field("inf", description="Request rate")
 
 
+class ProfilingType(str, Enum):
+    """Supported profiling types."""
+
+    NSYS = "nsys"
+    TORCH = "torch"
+    NONE = "none"
+
+
+class ProfilingPhaseConfig(BaseModel):
+    """Per-phase profiling parameters."""
+
+    isl: Optional[int] = Field(None, description="Input sequence length")
+    osl: Optional[int] = Field(None, description="Output sequence length")
+    concurrency: Optional[int] = Field(None, description="Batch size / concurrency")
+    start_step: Optional[int] = Field(None, description="Profiling start step")
+    stop_step: Optional[int] = Field(None, description="Profiling stop step")
+
+
+class ProfilingConfig(BaseModel):
+    """Profiling configuration."""
+
+    type: ProfilingType = Field(ProfilingType.NONE, description="Profiling type")
+    prefill: Optional[ProfilingPhaseConfig] = None
+    decode: Optional[ProfilingPhaseConfig] = None
+
+
 class SGLangPrefillConfig(BaseModel):
     """SGLang prefill worker configuration.
 
@@ -217,12 +243,6 @@ class BackendConfig(BaseModel):
     enable_multiple_frontends: bool = True
     num_additional_frontends: int = 9
 
-    # Profiling settings
-    enable_profiling: bool = Field(
-        False,
-        description="Enable torch profiling mode (uses sglang.launch_server instead of dynamo.sglang)",
-    )
-
 
 class JobConfig(BaseModel):
     """Complete job configuration."""
@@ -235,6 +255,7 @@ class JobConfig(BaseModel):
     slurm: SlurmConfig = Field(default_factory=SlurmConfig)
     backend: Optional[BackendConfig] = None  # Auto-populated
     benchmark: BenchmarkConfig = Field(default_factory=BenchmarkConfig)
+    profiling: ProfilingConfig = Field(default_factory=ProfilingConfig)
 
     # Additional optional settings
     enable_config_dump: bool = True
@@ -275,7 +296,8 @@ def model_post_init(self, __context: Any) -> None:
 
     def _validate_profiling_mode(self) -> None:
         """Validate profiling mode constraints."""
-        if not self.backend or not self.backend.enable_profiling:
+        prof = getattr(self, "profiling", None)
+        if not prof or prof.type is ProfilingType.NONE:
             return
 
         # Auto-disable config dump when profiling (already handled in backend, but validate here too)
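A minimal sketch (assuming pydantic v2, as the `model_validate`/`model_post_init` usage in this repo suggests) of how the new schema parses the docs example:

```python
from enum import Enum
from typing import Optional

from pydantic import BaseModel, Field


class ProfilingType(str, Enum):
    NSYS = "nsys"
    TORCH = "torch"
    NONE = "none"


class ProfilingPhaseConfig(BaseModel):
    isl: Optional[int] = Field(None, description="Input sequence length")
    osl: Optional[int] = Field(None, description="Output sequence length")
    concurrency: Optional[int] = Field(None, description="Batch size / concurrency")
    start_step: Optional[int] = Field(None, description="Profiling start step")
    stop_step: Optional[int] = Field(None, description="Profiling stop step")


class ProfilingConfig(BaseModel):
    type: ProfilingType = Field(ProfilingType.NONE, description="Profiling type")
    prefill: Optional[ProfilingPhaseConfig] = None
    decode: Optional[ProfilingPhaseConfig] = None


# The string "nsys" is coerced into the enum; omitted phases stay None.
cfg = ProfilingConfig.model_validate(
    {"type": "nsys", "decode": {"isl": 8, "osl": 16, "concurrency": 1024}}
)
assert cfg.type is ProfilingType.NSYS
assert cfg.prefill is None and cfg.decode.concurrency == 1024
```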