diff --git a/examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.router b/examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.router
new file mode 100644
index 000000000000..ba4d8193d985
--- /dev/null
+++ b/examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.router
@@ -0,0 +1,47 @@
+# Dockerfile for vllm-router (Rust binary)
+#
+# Build context: the root of the vllm-router repo (~/ repos/router).
+#
+#   docker build -f Dockerfile.router -t vllm-router:dev .
+#
+# Adapted from the upstream Dockerfile.router, but kept as a standalone file
+# so it can be referenced from the demo without modifying the router repo.
+
+FROM docker.io/rustlang/rust:nightly-bullseye AS rust-builder
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    pkg-config \
+    libssl-dev \
+    protobuf-compiler \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Cache dependency compilation layer separately from source changes
+COPY Cargo.toml Cargo.lock ./
+COPY build.rs ./
+# Dummy main so cargo can resolve/compile deps without full source
+RUN mkdir -p src && echo 'fn main() {}' > src/main.rs
+RUN cargo build --release || true
+RUN rm -f src/main.rs
+
+# Now copy real source and build
+COPY src ./src
+RUN cargo build --release
+
+# ── runtime image ────────────────────────────────────────────────────────────
+FROM docker.io/debian:bullseye-slim AS runtime
+
+RUN apt-get update && apt-get install -y \
+    ca-certificates \
+    libssl1.1 \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY --from=rust-builder /app/target/release/vllm-router /usr/local/bin/vllm-router
+RUN chmod +x /usr/local/bin/vllm-router
+
+EXPOSE 8080
+EXPOSE 29000
+
+CMD ["vllm-router", "--host", "0.0.0.0", "--port", "8080"]
diff --git a/examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.vllm-rocm b/examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.vllm-rocm
new file mode 100644
index 000000000000..ed1bf24af6a0
--- /dev/null
+++ b/examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.vllm-rocm
@@ -0,0 +1,112 @@
+# Dockerfile for vLLM with MoRIIO KV connector (ROCm)
+#
+# Installs vLLM from a local source tree on top of the official ROCm base image,
+# following the same multi-stage approach as docker/Dockerfile.rocm.
+#
+# Build from the root of the vllm repo:
+#
+#   docker build \
+#       -f examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.vllm-rocm \
+#       -t vllm-rocm-moriio:dev \
+#       .
+#
+# The resulting image is used by run_pd_demo.sh.
+
+# ── base: same image used by the official ROCm vLLM image ────────────────────
+# Must be sha256-8404161df58334093533b2419b669e71d8cc4a4da2c74a2563bb833944fda8b4
+ARG BASE_IMAGE=rocm/vllm-dev:base
+FROM ${BASE_IMAGE} AS base
+
+# Basic utilities required for the build
+RUN apt-get update -q -y && apt-get install -q -y \
+    sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
+    apt-transport-https ca-certificates wget curl git \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN python3 -m pip install --upgrade pip
+
+# Install UV (fast Python package installer)
+RUN curl -LsSf --retry 3 --retry-delay 5 https://astral.sh/uv/install.sh -o /tmp/uv-install.sh \
+    && env UV_INSTALL_DIR="/usr/local/bin" sh /tmp/uv-install.sh \
+    && rm -f /tmp/uv-install.sh \
+    && uv --version
+
+ENV UV_HTTP_TIMEOUT=500
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+ENV UV_LINK_MODE=copy
+
+# ── build vLLM from local source ─────────────────────────────────────────────
+FROM base AS build_vllm
+
+WORKDIR /app
+
+# Copy the full source tree (build context = repo root)
+COPY . vllm/
+
+RUN cd vllm \
+    && python3 -m pip install -r requirements/rocm.txt \
+    && python3 setup.py clean --all \
+    && python3 setup.py bdist_wheel --dist-dir=dist
+
+# ── runtime image ─────────────────────────────────────────────────────────────
+FROM base AS final
+
+RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
+
+# Install RDMA userspace libraries needed by MoRIIO / RIXL
+RUN apt-get update -q -y && apt-get install -q -y \
+    librdmacm1 \
+    libibverbs1 \
+    ibverbs-providers \
+    ibverbs-utils \
+    autoconf \
+    libibverbs-dev \
+    libtool \
+    unzip \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Broadcom bnxt_re RDMA user-space driver
+RUN wget -q https://docs.broadcom.com/docs-and-downloads/ethernet-network-adapters/NXE/Thor2/GCA1/bcm5760x_230.2.52.0a.zip && \
+    unzip -q bcm5760x_230.2.52.0a.zip && \
+    cd bcm5760x_230.2.52.0a/drivers_linux/bnxt_rocelib/ && \
+    results=$(find -name "libbnxt*.tar.gz") && tar -xf $results && \
+    untar_dir=$(find . -maxdepth 1 -type d -name "libbnxt*" ! -name "*.tar.gz" | head -n 1) && \
+    cd $untar_dir && sh autogen.sh && ./configure && make && \
+    find /usr/lib64/ /usr/lib -name "libbnxt_re-rdmav*.so" -exec mv {} {}.inbox \; && \
+    make install all && \
+    sh -c "echo /usr/local/lib >> /etc/ld.so.conf" && \
+    ldconfig && \
+    cp -f bnxt_re.driver /etc/libibverbs.d/ && \
+    ibv_devices && \
+    cd / && rm -rf /bcm5760x_230.2.52.0a /bcm5760x_230.2.52.0a.zip
+
+# Install vLLM wheel and its ROCm dependencies
+RUN --mount=type=bind,from=build_vllm,src=/app/vllm,target=/vllm_src \
+    --mount=type=cache,target=/root/.cache/uv \
+    cd /vllm_src \
+    && uv pip install --system -r requirements/rocm.txt \
+    && pip uninstall -y vllm || true \
+    && uv pip install --system dist/*.whl \
+    && uv pip install --system msgpack
+
+# Verify ROCm PyTorch (not CUDA)
+RUN python3 -c "import torch; assert torch.version.hip is not None, \
+    f'Expected ROCm PyTorch but got CUDA (hip={torch.version.hip})'; \
+    print(f'Verified: PyTorch {torch.__version__} ROCm HIP {torch.version.hip}')"
+
+# Copy examples so the proxy server script is available inside the container
+COPY --from=build_vllm /app/vllm/examples /app/vllm/examples
+# Copy the GSM8K evaluation script (used by run_pd_demo.sh with USE_GSM8K=1)
+COPY --from=build_vllm /app/vllm/tests/evals /app/vllm/tests/evals
+
+# Performance / correctness env vars from the official ROCm Dockerfile
+ENV TOKENIZERS_PARALLELISM=false
+ENV SAFETENSORS_FAST_GPU=1
+ENV HIP_FORCE_DEV_KERNARG=1
+ENV MIOPEN_DEBUG_CONV_DIRECT=0
+ENV MIOPEN_DEBUG_CONV_GEMM=0
+
+WORKDIR /app
+
+CMD ["/bin/bash"]
diff --git a/examples/online_serving/disaggregated_serving/moriio_pd_demo/README.md b/examples/online_serving/disaggregated_serving/moriio_pd_demo/README.md
new file mode 100644
index 000000000000..acfe788ba235
--- /dev/null
+++ b/examples/online_serving/disaggregated_serving/moriio_pd_demo/README.md
@@ -0,0 +1,339 @@
+# MoRIIO PD-disaggregation demo
+
+Reproduction scripts for running vLLM PD-disaggregation with the
+MoRIIOConnector KV connector and the vllm-router.
+
+- **Single-host** (`run_pd_demo.sh`): requires two ROCm GPUs on one machine (Qwen3-8B).
+- **Two-node** (`run_pd_demo_2node.sh`): runs DeepSeek-R1 with TP8 across two hosts connected via RDMA.
+
+---
+
+## Files
+
+| File | Purpose |
+|------|---------|
+| `Dockerfile.router` | Builds the `vllm-router` Rust binary |
+| `Dockerfile.vllm-rocm` | Builds vLLM from source on the ROCm base image |
+| `run_pd_demo.sh` | Launches prefill, decode, and router containers on a single host |
+| `run_pd_demo_2node.sh` | Two-node setup (DeepSeek-R1 with TP8); run on both nodes |
+
+---
+
+## 1. Get the Docker images
+
+### Option A — Pull pre-built images (recommended)
+
+```bash
+# Built from vllm PR https://github.com/vllm-project/vllm/pull/39565, commit 619af871380a7c72876f17101a2f9384fd959cff
+docker pull ghcr.io/simondanielsson/vllm-rocm-moriio:dev-0414-0859
+# Basic router support, i.e. PR https://github.com/vllm-project/router/pull/138
+docker pull ghcr.io/simondanielsson/vllm-router:dev
+# Basic router support + streaming, i.e. both PRs https://github.com/vllm-project/router/pull/138 and https://github.com/vllm-project/router/pull/114
+docker pull ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy
+```
+
+### Option B — Build from source
+
+#### Router image
+
+```bash
+# clone the fork with the PR
+git clone git@github.com:simondanielsson/router.git
+cd router
+# To build the image with basic mori support
+git switch feature/moriio-support
+docker build \
+    -f <path-to-this-dir>/Dockerfile.router \
+    -t ghcr.io/simondanielsson/vllm-router:dev \
+    .
+
+# To build the image with mori support + streaming: this branch contains the two PR's on top of each other
+git switch tmp/transfer_id-and-streaming
+docker build \
+    -f <path-to-this-dir>/Dockerfile.router \
+    -t ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy \
+    .
+```
+
+#### vLLM image
+
+Build from the **root of this vllm repo**:
+
+```bash
+# clone the fork
+git clone git@github.com:simondanielsson/vllm.git # vllm branc here
+git clone git@github.com:mpashkov/vllm.git path/to/mpashkov-vllm-fork/ # reproducer in this fork, alt. just set multiple remotes
+cd vllm
+git switch fix/align-mori-messages
+docker build \
+    -f path/to/mpashkov-vllm-fork/examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.vllm-rocm \
+    -t ghcr.io/simondanielsson/vllm-rocm-moriio:dev \
+    .
+```
+
+The vLLM build compiles the ROCm wheel from source, which takes a while.
+
+---
+
+## 2. Run the demo, single node
+
+### PR 1 — basic PD-disaggregation (smoke test only to confirm everything works)
+
+```bash
+MODEL=Qwen/Qwen3-8B \
+PREFILL_GPU=0 \
+DECODE_GPU=1 \
+./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo.sh
+```
+
+This sends a single smoke-test request through the router after all services are healthy.
+
+### PR 2 — with streaming support (full benchmark)
+
+Once the streaming PR is merged into the router image, enable the full two-phase benchmark
+(vllm-router + toy proxy) by setting `USE_BENCH=1`:
+
+```bash
+MODEL=Qwen/Qwen3-8B \
+PREFILL_GPU=0 \
+DECODE_GPU=1 \
+USE_BENCH=1 \
+./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo.sh
+```
+
+Benchmark results are written to `~/moriio-logs/benchmark_results.log`.
+
+Environment variables:
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `MODEL` | `Qwen/Qwen3-8B` | HuggingFace model id |
+| `PREFILL_GPU` | `0` | GPU index for the prefill instance |
+| `DECODE_GPU` | `1` | GPU index for the decode instance |
+| `PREFILL_PORT` | `8100` | HTTP port for the prefill vLLM server |
+| `DECODE_PORT` | `8200` | HTTP port for the decode vLLM server |
+| `ROUTER_PORT` | `8080` | HTTP port for vllm-router |
+| `PROXY_PING_PORT` | `36367` | ZMQ service-discovery port (router ↔ vLLM) |
+| `HF_HOME` | `~/.cache/huggingface` | Host path to HuggingFace model cache |
+| `LOG_DIR` | `~/moriio-logs` | Directory for container and benchmark logs |
+| `USE_BENCH` | `0` | Set to `1` to run the full perf benchmark (requires streaming support) |
+| `USE_GSM8K` | `0` | Set to `1` to run a GSM8K accuracy evaluation instead of the perf benchmark |
+| `KEEP_ALIVE` | `0` | Set to `1` to leave containers running after the script exits |
+| `VLLM_IMAGE` | `ghcr.io/simondanielsson/vllm-rocm-moriio:dev-0414-0859` | vLLM Docker image name |
+| `ROUTER_IMAGE` | `ghcr.io/simondanielsson/vllm-router:dev` | Router image used for smoke-test (no streaming) |
+| `ROUTER_STREAMING_IMAGE` | `ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy` | Router image used for `USE_BENCH=1` / `USE_GSM8K=1` (streaming support required) |
+
+---
+
+## DSR1 on two nodes
+
+Run `run_pd_demo_2node.sh` on two separate nodes. Replace `<node1-ip>` and
+`<node2-ip>` with the actual IP addresses of each host.
+
+The decode node runs the **same script** in all cases — only the prefill node
+changes between benchmark modes.
+
+### Smoke test
+
+#### Node 1 — prefill + router
+
+```bash
+IS_PREFILL=1 PREFILL_IP=<node1-ip> DECODE_IP=<node2-ip> \
+  ./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh
+```
+
+#### Node 2 — decode
+
+```bash
+IS_PREFILL=0 PREFILL_IP=<node1-ip> DECODE_IP=<node2-ip> \
+  ./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh
+```
+
+### Performance benchmark (`USE_BENCH=1`)
+
+#### Node 1 — prefill + router
+
+```bash
+IS_PREFILL=1 PREFILL_IP=<node1-ip> DECODE_IP=<node2-ip> USE_BENCH=1 \
+  ./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh
+```
+
+#### Node 2 — decode
+
+```bash
+IS_PREFILL=0 PREFILL_IP=<node1-ip> DECODE_IP=<node2-ip> \
+  ./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh
+```
+
+Benchmark results (both phases) are written to `~/moriio-logs/benchmark_results.log`
+on Node 1. The decode node automatically restarts for Phase 2 — just leave Node 2's
+script running; it will poll for the Phase 2 restart signal from Node 1.
+
+### GSM8K accuracy evaluation (`USE_GSM8K=1`)
+
+#### Node 1 — prefill + router
+
+```bash
+IS_PREFILL=1 PREFILL_IP=<node1-ip> DECODE_IP=<node2-ip> USE_GSM8K=1 \
+  ./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh
+```
+
+#### Node 2 — decode
+
+```bash
+IS_PREFILL=0 PREFILL_IP=<node1-ip> DECODE_IP=<node2-ip> \
+  ./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh
+```
+
+GSM8K results are written to `~/moriio-logs/gsm8k_results.log` (and `.json`) on Node 1.
+
+> **Note:** `USE_GSM8K=1` and `USE_BENCH=1` are mutually exclusive — GSM8K takes priority
+> if both are set. The GSM8K eval runs only through vllm-router (single-phase); the toy
+> proxy comparison in 2-node mode is not yet supported for GSM8K.
+
+Environment variables for `run_pd_demo_2node.sh`:
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `IS_PREFILL` | *(required)* | `1` = this node runs prefill + router; `0` = decode only |
+| `PREFILL_IP` | *(required)* | IP address of the prefill/router node |
+| `DECODE_IP` | *(required)* | IP address of the decode node |
+| `MODEL` | `deepseek-ai/DeepSeek-R1-0528` | HuggingFace model id |
+| `PREFILL_PORT` | `8100` | HTTP port for the prefill vLLM server |
+| `DECODE_PORT` | `8200` | HTTP port for the decode vLLM server |
+| `ROUTER_PORT` | `8080` | HTTP port for vllm-router (Node 1 only) |
+| `PROXY_PING_PORT` | `36367` | ZMQ service-discovery port |
+| `HF_HOME` | `~/.cache/huggingface` | Host path to HuggingFace model cache |
+| `LOG_DIR` | `~/moriio-logs` | Directory for logs and benchmark results |
+| `USE_BENCH` | `0` | Set to `1` to run the perf benchmark after startup |
+| `USE_GSM8K` | `0` | Set to `1` to run the GSM8K accuracy evaluation |
+| `KEEP_ALIVE` | `0` | Set to `1` to leave containers running after the script exits |
+| `VLLM_IMAGE` | `ghcr.io/simondanielsson/vllm-rocm-moriio:dev-0414-0859` | vLLM Docker image |
+| `PHASE2_SIGNAL_PORT` | `19876` | Port on Node 1 used to signal Node 2 to restart for Phase 2 (`USE_BENCH=1`) |
+| `ROUTER_IMAGE` | `ghcr.io/simondanielsson/vllm-router:dev` | Router image (smoke test) |
+| `ROUTER_STREAMING_IMAGE` | `ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy` | Router image for bench/eval |
+
+---
+
+## 3. Send a test request
+
+Once all three containers are healthy, send requests through the **router**:
+
+```bash
+curl -s http://localhost:8080/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "Qwen/Qwen3-8B",
+        "prompt": "San Francisco is a",
+        "max_tokens": 64,
+        "temperature": 0
+    }' | python3 -m json.tool
+```
+
+---
+
+## 4. Load test with `vllm bench serve`
+
+Run a synthetic load test against the router with 1 000-token inputs, 1 000-token
+outputs, and a max concurrency of 16:
+
+```bash
+vllm bench serve \
+    --base-url http://localhost:8080 \
+    --backend openai-completions \
+    --model Qwen/Qwen3-8B \
+    --dataset-name random \
+    --random-input-len 1000 \
+    --random-output-len 1000 \
+    --max-concurrency 16 \
+    --num-prompts 200
+```
+
+| Flag | Value | Notes |
+|------|-------|-------|
+| `--base-url` | `http://localhost:8080` | Points at the router, not the vLLM instances directly |
+| `--backend` | `openai-completions` | Uses the `/v1/completions` endpoint |
+| `--model` | `Qwen/Qwen3-8B` | Must match the model served |
+| `--dataset-name` | `random` | Fully synthetic, no external dataset file needed |
+| `--random-input-len` | `1000` | Input sequence length (ISL) in tokens |
+| `--random-output-len` | `1000` | Output sequence length (OSL) in tokens |
+| `--max-concurrency` | `16` | Maximum number of in-flight requests |
+| `--num-prompts` | `200` | Total requests to send; increase for a longer run |
+
+---
+
+## 5. GSM8K accuracy evaluation (two-phase comparison)
+
+To compare accuracy between the vllm-router and the toy proxy, run:
+
+```bash
+MODEL=Qwen/Qwen3-8B \
+PREFILL_GPU=0 \
+DECODE_GPU=1 \
+USE_GSM8K=1 \
+./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo.sh
+```
+
+This uses the [`lm_eval`](https://github.com/EleutherAI/lm-evaluation-harness) harness
+with `--model local-completions` pointed at the active proxy, running the full GSM8K
+test set, and runs it **twice**:
+
+1. **Phase 1** — through `vllm-router` (services already running)
+2. **Phase 2** — vllm-router is replaced by `moriio_toy_proxy_server.py`; prefill and
+   decode are restarted so they re-register with the new proxy, then the eval runs again
+
+`lm_eval[api]` is installed automatically inside the container before each run
+(`local-completions` is an API model backend that requires `tenacity`).
+
+At the end the script prints a side-by-side summary of `exact_match,flexible-extract`
+and `exact_match,strict-match` for both routers.
+
+Results are written to:
+
+| File | Contents |
+|------|----------|
+| `~/moriio-logs/gsm8k_results_router.log` | Phase 1 human-readable summary (vllm-router) |
+| `~/moriio-logs/gsm8k_results_router.json` | Phase 1 machine-readable JSON |
+| `~/moriio-logs/gsm8k_results_toy_proxy.log` | Phase 2 human-readable summary (toy proxy) |
+| `~/moriio-logs/gsm8k_results_toy_proxy.json` | Phase 2 machine-readable JSON |
+| `~/moriio-logs/toy_proxy.log` | Toy proxy container logs |
+
+> **Note:** `USE_GSM8K=1` and `USE_BENCH=1` are mutually exclusive — GSM8K takes
+> priority if both are set.
+
+---
+
+## 6. Teardown
+
+The script shuts down all containers automatically when it exits.
+To leave them running (e.g. to inspect logs or send additional requests), set `KEEP_ALIVE=1`.
+
+If you need to tear down manually:
+
+```bash
+docker rm -f moriio-prefill moriio-decode moriio-router
+```
+
+---
+
+## Architecture
+
+```
+Client
+  │
+  ▼
+vllm-router (port 8080)
+  │  ZMQ service-discovery on PROXY_PING_PORT
+  │  (vLLM instances register themselves at startup)
+  ├──► Prefill instance (GPU 0, port 8100)
+  │       kv_role = kv_producer
+  │       MoRIIOConnector writes KV cache → Decode via RDMA
+  │
+  └──► Decode instance (GPU 1, port 8200)
+          kv_role = kv_consumer
+          MoRIIOConnector reads KV cache from Prefill via RDMA
+```
+
+The router uses `--vllm-pd-disaggregation` + `--vllm-discovery-address` so
+that vLLM instances register dynamically at startup rather than being passed
+as static `--prefill`/`--decode` URLs.
diff --git a/examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo.sh b/examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo.sh
new file mode 100755
index 000000000000..6dce239ebb61
--- /dev/null
+++ b/examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo.sh
@@ -0,0 +1,712 @@
+#!/usr/bin/env bash
+# run_pd_demo.sh — Launch a single-node MoRIIO PD-disaggregation demo
+#
+# Prerequisites
+#   • Docker images built (see README in this directory):
+#       vllm-rocm-moriio:dev   (from Dockerfile.vllm-rocm)
+#       vllm-router:dev        (from Dockerfile.router)
+#   • At least 2 ROCm GPUs visible to the host
+#   • A HuggingFace model cache at HF_HOME (default: ~/.cache/huggingface)
+#
+# Usage:
+#   MODEL=meta-llama/Llama-3.1-8B-Instruct ./run_pd_demo.sh
+#   MODEL=meta-llama/Llama-3.1-8B-Instruct PREFILL_GPU=0 DECODE_GPU=1 ./run_pd_demo.sh
+#
+# To tear everything down:
+#   docker rm -f moriio-prefill moriio-decode moriio-router
+
+set -euo pipefail
+
+# ── Configuration ─────────────────────────────────────────────────────────────
+MODEL="${MODEL:-Qwen/Qwen3-8B}"
+
+PREFILL_GPU="${PREFILL_GPU:-0}"            # GPU index for the prefill instance
+DECODE_GPU="${DECODE_GPU:-1}"             # GPU index for the decode instance
+
+PREFILL_PORT="${PREFILL_PORT:-8100}"      # HTTP port exposed by the prefill vLLM
+DECODE_PORT="${DECODE_PORT:-8200}"        # HTTP port exposed by the decode vLLM
+ROUTER_PORT="${ROUTER_PORT:-8080}"        # HTTP port exposed by vllm-router
+PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" # ZMQ service-discovery port (router ↔ vLLM)
+
+# MoRIIO internal ports — must be distinct between prefill and decode on the same host.
+# handshake_port : initial MoRIIO engine handshake (default in code: 6301)
+# notify_port    : prefill↔decode stage synchronisation (default in code: 61005)
+PREFILL_HANDSHAKE_PORT="${PREFILL_HANDSHAKE_PORT:-6301}"
+DECODE_HANDSHAKE_PORT="${DECODE_HANDSHAKE_PORT:-6302}"
+PREFILL_NOTIFY_PORT="${PREFILL_NOTIFY_PORT:-61005}"
+DECODE_NOTIFY_PORT="${DECODE_NOTIFY_PORT:-61006}"
+
+VLLM_IMAGE="${VLLM_IMAGE:-ghcr.io/simondanielsson/vllm-rocm-moriio:dev-0411-1127}"
+# Basic router (smoke-test only — no streaming support)
+ROUTER_IMAGE="${ROUTER_IMAGE:-ghcr.io/simondanielsson/vllm-router:dev}"
+# Streaming-capable router (required for USE_BENCH=1 and USE_GSM8K=1)
+ROUTER_STREAMING_IMAGE="${ROUTER_STREAMING_IMAGE:-ghcr.io/simondanielsson/vllm-router:dev-streaming}"
+
+HF_HOME="${HF_HOME:-${HOME}/.cache/huggingface}"
+
+LOG_DIR="${LOG_DIR:-${HOME}/moriio-logs}"
+SHM_SIZE="${SHM_SIZE:-128G}"
+USE_BENCH="${USE_BENCH:-0}"      # Set to 1 to run full benchmark on both routers
+USE_GSM8K="${USE_GSM8K:-0}"     # Set to 1 to run GSM8K accuracy eval instead of the perf benchmark
+KEEP_ALIVE="${KEEP_ALIVE:-0}"   # Set to 1 to leave containers running after the script exits
+
+# Max tokens the prefill instance is allowed to generate (1 = prefill only)
+# The decode instance does the actual generation.
+PREFILL_MAX_MODEL_LEN="${PREFILL_MAX_MODEL_LEN:-8192}"
+DECODE_MAX_MODEL_LEN="${DECODE_MAX_MODEL_LEN:-8192}"
+
+# ── Derive the host IP that containers will reach each other on ───────────────
+# Use the docker bridge gateway as "this host" so containers can talk to each
+# other through the host network.  On Linux, host.docker.internal may not
+# resolve; the bridge gateway (172.17.0.1) is usually reliable.
+HOST_IP="${HOST_IP:-$(docker network inspect bridge \
+    --format '{{range .IPAM.Config}}{{.Gateway}}{{end}}' 2>/dev/null \
+    || echo "172.17.0.1")}"
+
+mkdir -p "${LOG_DIR}"
+
+# ── Toy-proxy patch: fix non-streaming responses ──────────────────────────────
+# gsm8k_eval.py (and any non-streaming client) posts without "stream": true.
+# The toy proxy always returns an async-generator response, which Quart labels
+# as text/html.  This patch makes handle_request() buffer + return a proper
+# JSON response when the client did not request streaming.
+# The patch script is mounted read-only into the toy-proxy container and run
+# once at startup, before the proxy process is launched.
+TOY_PROXY_PATCH_SCRIPT="$(mktemp /tmp/patch_toy_proxy.XXXXXX.py)"
+cat > "${TOY_PROXY_PATCH_SCRIPT}" << 'PYEOF'
+import pathlib
+
+TARGET = pathlib.Path(
+    "/app/vllm/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py"
+)
+src = TARGET.read_text()
+
+OLD = (
+    "        session, decode_response = await decode_request_task\n"
+    "        stream_generator = stream_decode_response(session, decode_response, request_id)\n"
+    "        response = await make_response(stream_generator)\n"
+    "        return response"
+)
+NEW = (
+    "        session, decode_response = await decode_request_task\n"
+    "        if req_data.get(\"stream\", False):\n"
+    "            stream_generator = stream_decode_response(\n"
+    "                session, decode_response, request_id\n"
+    "            )\n"
+    "            response = await make_response(stream_generator)\n"
+    "            return response\n"
+    "        else:\n"
+    "            try:\n"
+    "                body = await decode_response.read()\n"
+    "                content_type = decode_response.headers.get(\n"
+    "                    \"Content-Type\", \"application/json\"\n"
+    "                )\n"
+    "            finally:\n"
+    "                await session.close()\n"
+    "            response = await make_response(body, decode_response.status)\n"
+    "            response.headers[\"Content-Type\"] = content_type\n"
+    "            return response"
+)
+
+if OLD in src:
+    TARGET.write_text(src.replace(OLD, NEW, 1))
+    print("toy proxy patch: non-streaming fix applied.")
+else:
+    print("toy proxy patch: target not found — skipping (already patched?).")
+PYEOF
+
+echo "=== MoRIIO PD disaggregation demo ==="
+echo "  Model         : ${MODEL}"
+echo "  Host IP       : ${HOST_IP}"
+echo "  Prefill GPU   : GPU ${PREFILL_GPU}  → port ${PREFILL_PORT}"
+echo "  Decode  GPU   : GPU ${DECODE_GPU}  → port ${DECODE_PORT}"
+echo "  Router  port  : ${ROUTER_PORT}"
+echo "  Discovery port: ${PROXY_PING_PORT}"
+echo "  Prefill MoRIIO: handshake=${PREFILL_HANDSHAKE_PORT} notify=${PREFILL_NOTIFY_PORT}"
+echo "  Decode  MoRIIO: handshake=${DECODE_HANDSHAKE_PORT} notify=${DECODE_NOTIFY_PORT}"
+echo "  Log dir       : ${LOG_DIR}"
+echo ""
+
+# ── Helper: wait for a vLLM /health endpoint ─────────────────────────────────
+wait_for_health() {
+    local name="$1"
+    local port="$2"
+    local max_wait=300   # seconds
+    local interval=5
+    local elapsed=0
+
+    echo "Waiting for ${name} (port ${port}) to become healthy..."
+    while true; do
+        if curl -sf "http://localhost:${port}/health" >/dev/null 2>&1; then
+            echo "  ${name} is healthy."
+            return 0
+        fi
+        sleep "${interval}"
+        elapsed=$((elapsed + interval))
+        if [[ "${elapsed}" -ge "${max_wait}" ]]; then
+            echo "ERROR: ${name} did not become healthy after ${max_wait}s" >&2
+            docker logs "${name}" | tail -30 >&2
+            exit 1
+        fi
+        echo "  Still waiting for ${name} (${elapsed}s / ${max_wait}s)..."
+    done
+}
+
+# ── Remove stale containers (if any) ─────────────────────────────────────────
+for cname in moriio-prefill moriio-decode moriio-router moriio-toy-proxy; do
+    if docker ps -a --format '{{.Names}}' | grep -q "^${cname}$"; then
+        echo "Removing existing container: ${cname}"
+        docker rm -f "${cname}"
+    fi
+done
+
+# ── KV-transfer config shared between prefill and decode ─────────────────────
+# proxy_ip   : address that vLLM uses to register with the router's ZMQ socket
+# proxy_ping_port : the port the router listens on for registration messages
+# http_port  : the vLLM instance's own HTTP port (used in the zmq_address that
+#              gets embedded in request IDs by the router)
+
+PREFILL_KV_CONFIG=$(cat <<EOF
+{
+  "kv_connector": "MoRIIOConnector",
+  "kv_role": "kv_producer",
+  "kv_connector_extra_config": {
+    "proxy_ip": "${HOST_IP}",
+    "proxy_ping_port": "${PROXY_PING_PORT}",
+    "http_port": "${PREFILL_PORT}",
+    "handshake_port": "${PREFILL_HANDSHAKE_PORT}",
+    "notify_port": "${PREFILL_NOTIFY_PORT}"
+  }
+}
+EOF
+)
+
+DECODE_KV_CONFIG=$(cat <<EOF
+{
+  "kv_connector": "MoRIIOConnector",
+  "kv_role": "kv_consumer",
+  "kv_connector_extra_config": {
+    "proxy_ip": "${HOST_IP}",
+    "proxy_ping_port": "${PROXY_PING_PORT}",
+    "http_port": "${DECODE_PORT}",
+    "handshake_port": "${DECODE_HANDSHAKE_PORT}",
+    "notify_port": "${DECODE_NOTIFY_PORT}"
+  }
+}
+EOF
+)
+
+# ── Common docker run flags for both vLLM containers ─────────────────────────
+VLLM_COMMON_ARGS=(
+    --init
+    --network host
+    --ipc host
+    --privileged
+    --cap-add SYS_PTRACE
+    --security-opt seccomp=unconfined
+    --ulimit memlock=-1
+    --ulimit stack=67108864
+    --shm-size "${SHM_SIZE}"
+    --group-add video
+    --group-add render
+    --device /dev/kfd
+    --device /dev/dri
+    --device /dev/infiniband
+    -v /sys:/sys
+    -v "${HF_HOME}:/root/.cache/huggingface"
+    -e HF_HOME=/root/.cache/huggingface
+    -e HF_HUB_ENABLE_HF_TRANSFER=0
+    -e VLLM_MORIIO_CONNECTOR_READ_MODE=1
+    -e NCCL_MIN_NCHANNELS=112
+    -e VLLM_USE_V1=1
+    -e VLLM_ROCM_USE_AITER=1
+    -e VLLM_ROCM_USE_AITER_PAGED_ATTN=0
+    -e VLLM_ROCM_USE_AITER_RMSNORM=1
+    -e VLLM_USE_AITER_TRITON_SILU_MUL=0
+    -e VLLM_ENGINE_READY_TIMEOUT_S=3600
+    -e VLLM_SERVER_DEV_MODE=1
+)
+
+# ── Launch prefill instance ───────────────────────────────────────────────────
+echo ""
+echo ">>> Starting prefill instance (GPU ${PREFILL_GPU}, port ${PREFILL_PORT})..."
+
+docker run -d \
+    --name moriio-prefill \
+    "${VLLM_COMMON_ARGS[@]}" \
+    -e HIP_VISIBLE_DEVICES="${PREFILL_GPU}" \
+    "${VLLM_IMAGE}" \
+    vllm serve "${MODEL}" \
+        --port "${PREFILL_PORT}" \
+        --max-model-len "${PREFILL_MAX_MODEL_LEN}" \
+        --trust-remote-code \
+        --kv-transfer-config "${PREFILL_KV_CONFIG}"
+
+docker logs -f moriio-prefill 2>&1 | tee "${LOG_DIR}/prefill.log" &
+
+# ── Launch decode instance ────────────────────────────────────────────────────
+echo ">>> Starting decode instance (GPU ${DECODE_GPU}, port ${DECODE_PORT})..."
+
+docker run -d \
+    --name moriio-decode \
+    "${VLLM_COMMON_ARGS[@]}" \
+    -e HIP_VISIBLE_DEVICES="${DECODE_GPU}" \
+    "${VLLM_IMAGE}" \
+    vllm serve "${MODEL}" \
+        --port "${DECODE_PORT}" \
+        --max-model-len "${DECODE_MAX_MODEL_LEN}" \
+        --trust-remote-code \
+        --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
+        --kv-transfer-config "${DECODE_KV_CONFIG}"
+
+docker logs -f moriio-decode 2>&1 | tee "${LOG_DIR}/decode.log" &
+
+# ── Wait for both vLLM instances to be healthy before starting the router ─────
+wait_for_health "moriio-prefill" "${PREFILL_PORT}"
+wait_for_health "moriio-decode"  "${DECODE_PORT}"
+
+# ── Launch vllm-router ────────────────────────────────────────────────────────
+# USE_BENCH and USE_GSM8K require the streaming-capable router image.
+if [[ "${USE_BENCH}" == "1" || "${USE_GSM8K}" == "1" ]]; then
+    _ACTIVE_ROUTER_IMAGE="${ROUTER_STREAMING_IMAGE}"
+else
+    _ACTIVE_ROUTER_IMAGE="${ROUTER_IMAGE}"
+fi
+
+echo ""
+echo ">>> Starting vllm-router (port ${ROUTER_PORT}, discovery port ${PROXY_PING_PORT})..."
+echo "    Image: ${_ACTIVE_ROUTER_IMAGE}"
+
+docker run -d \
+    --name moriio-router \
+    --network host \
+    "${_ACTIVE_ROUTER_IMAGE}" \
+    vllm-router \
+        --vllm-pd-disaggregation \
+        --vllm-discovery-address "0.0.0.0:${PROXY_PING_PORT}" \
+        --port "${ROUTER_PORT}" \
+        --host 0.0.0.0 \
+        --policy consistent_hash \
+        --prefill-policy consistent_hash \
+        --decode-policy consistent_hash \
+        --log-level info
+
+docker logs -f moriio-router 2>&1 | tee "${LOG_DIR}/router.log" &
+
+# ── Wait briefly for the router to be ready (it starts fast) ─────────────────
+echo "Waiting for router to start..."
+sleep 5
+
+# ── Print summary ─────────────────────────────────────────────────────────────
+echo ""
+echo "=== All services running ==="
+echo "  Prefill  : http://localhost:${PREFILL_PORT}"
+echo "  Decode   : http://localhost:${DECODE_PORT}"
+echo "  Router   : http://localhost:${ROUTER_PORT}  ← send requests here"
+echo ""
+echo "Example request:"
+echo "  curl -s http://localhost:${ROUTER_PORT}/v1/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"prompt\": \"San Francisco is a\","
+echo "      \"max_tokens\": 64,"
+echo "      \"temperature\": 0"
+echo "    }' | python3 -m json.tool"
+echo ""
+echo "To follow logs (log files are written to ${LOG_DIR}):"
+echo "  tail -f ${LOG_DIR}/prefill.log"
+echo "  tail -f ${LOG_DIR}/decode.log"
+echo "  tail -f ${LOG_DIR}/router.log"
+echo ""
+echo "Containers will be shut down automatically when the script exits."
+echo "  (Set KEEP_ALIVE=1 to leave them running.)"
+
+if [[ "${USE_GSM8K}" == "1" ]]; then
+
+# ── GSM8K two-phase accuracy comparison (via lm_eval) ────────────────────────
+# Phase 1 : vllm-router (already running)
+# Phase 2 : toy proxy  (router replaced, vLLM instances restarted)
+#
+# Uses the lm_eval harness with --model local-completions so the full GSM8K
+# test set flows through whichever proxy is active.
+
+# Per-phase output paths
+GSM8K_ROUTER_LOG="${LOG_DIR}/gsm8k_results_router.log"
+GSM8K_ROUTER_JSON="${LOG_DIR}/gsm8k_results_router.json"
+GSM8K_PROXY_LOG="${LOG_DIR}/gsm8k_results_toy_proxy.log"
+GSM8K_PROXY_JSON="${LOG_DIR}/gsm8k_results_toy_proxy.json"
+
+TOY_PROXY_CONTAINER_PATH="/app/vllm/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py"
+TOY_PROXY_HTTP_PORT=10001
+
+# Helper: run lm_eval gsm8k inside a container against the given port.
+# Installs lm_eval[vllm] if needed, writes stdout to log_file, and copies
+# the lm_eval results JSON out of the container to json_dest.
+run_lm_eval() {
+    local container="$1"
+    local port="$2"
+    local log_file="$3"
+    local json_dest="$4"
+    local out_dir="/tmp/lm_eval_out"
+
+    docker exec "${container}" bash -c \
+        "pip install --quiet 'lm_eval[api]' && \
+         rm -rf ${out_dir} && \
+         lm_eval \
+             --model local-completions \
+             --model_args model=${MODEL},base_url=http://127.0.0.1:${port}/v1/completions,tokenized_requests=False,trust_remote_code=True \
+             --tasks gsm8k \
+             --output_path ${out_dir}" \
+        2>&1 | tee -a "${log_file}"
+
+    # lm_eval writes results.json somewhere under out_dir; find and copy it.
+    local remote_json
+    remote_json=$(docker exec "${container}" \
+        find "${out_dir}" -name "results.json" 2>/dev/null | head -1)
+    if [[ -n "${remote_json}" ]]; then
+        docker cp "${container}:${remote_json}" "${json_dest}" 2>/dev/null || true
+    else
+        echo "WARNING: lm_eval results.json not found in ${out_dir}" >&2
+    fi
+}
+
+# ── Phase 1: GSM8K through vllm-router ───────────────────────────────────────
+echo ""
+echo ">>> Phase 1: GSM8K accuracy evaluation (lm_eval) through vllm-router..."
+echo "    (full GSM8K test set, results → ${GSM8K_ROUTER_LOG})"
+{
+    echo "======================================================"
+    echo "  GSM8K evaluation (lm_eval) — Phase 1: vllm-router"
+    echo "  Model : ${MODEL}"
+    echo "  Date  : $(date)"
+    echo "======================================================"
+} | tee "${GSM8K_ROUTER_LOG}"
+
+run_lm_eval "moriio-prefill" "${ROUTER_PORT}" "${GSM8K_ROUTER_LOG}" "${GSM8K_ROUTER_JSON}"
+
+# ── Phase 2: switch to toy proxy and re-run ───────────────────────────────────
+echo ""
+echo ">>> Stopping vllm-router, prefill, and decode for Phase 2..."
+docker rm -f moriio-router moriio-prefill moriio-decode
+
+echo ">>> Starting toy proxy container (HTTP :${TOY_PROXY_HTTP_PORT}, ZMQ :${PROXY_PING_PORT})..."
+docker run -d \
+    --name moriio-toy-proxy \
+    --network host \
+    -v "${TOY_PROXY_PATCH_SCRIPT}:/tmp/patch_toy_proxy.py:ro" \
+    "${VLLM_IMAGE}" \
+    bash -c "pip install --quiet --ignore-installed quart aiohttp msgpack && \
+             python3 /tmp/patch_toy_proxy.py && \
+             python3 -u ${TOY_PROXY_CONTAINER_PATH}"
+
+docker logs -f moriio-toy-proxy 2>&1 | tee "${LOG_DIR}/toy_proxy.log" &
+
+# Wait for the toy proxy HTTP port before starting vLLM instances.
+echo "Waiting for toy proxy HTTP port ${TOY_PROXY_HTTP_PORT} to open..."
+_tp_wait=0
+until curl -sf "http://localhost:${TOY_PROXY_HTTP_PORT}/" >/dev/null 2>&1 \
+   || curl -sf "http://localhost:${TOY_PROXY_HTTP_PORT}/v1/completions" >/dev/null 2>&1 \
+   || nc -z 127.0.0.1 "${TOY_PROXY_HTTP_PORT}" 2>/dev/null; do
+    sleep 2
+    _tp_wait=$((_tp_wait + 2))
+    if [[ "${_tp_wait}" -ge 60 ]]; then
+        echo "WARNING: toy proxy did not open port ${TOY_PROXY_HTTP_PORT} after 60s" >&2
+        docker logs moriio-toy-proxy 2>&1 | tail -20 >&2
+        break
+    fi
+done
+echo "  Toy proxy is up."
+
+echo ">>> Restarting prefill instance (GPU ${PREFILL_GPU}, port ${PREFILL_PORT})..."
+docker run -d \
+    --name moriio-prefill \
+    "${VLLM_COMMON_ARGS[@]}" \
+    -e HIP_VISIBLE_DEVICES="${PREFILL_GPU}" \
+    "${VLLM_IMAGE}" \
+    vllm serve "${MODEL}" \
+        --port "${PREFILL_PORT}" \
+        --max-model-len "${PREFILL_MAX_MODEL_LEN}" \
+        --trust-remote-code \
+        --kv-transfer-config "${PREFILL_KV_CONFIG}"
+
+docker logs -f moriio-prefill 2>&1 | tee "${LOG_DIR}/prefill_phase2.log" &
+
+echo ">>> Restarting decode instance (GPU ${DECODE_GPU}, port ${DECODE_PORT})..."
+docker run -d \
+    --name moriio-decode \
+    "${VLLM_COMMON_ARGS[@]}" \
+    -e HIP_VISIBLE_DEVICES="${DECODE_GPU}" \
+    "${VLLM_IMAGE}" \
+    vllm serve "${MODEL}" \
+        --port "${DECODE_PORT}" \
+        --max-model-len "${DECODE_MAX_MODEL_LEN}" \
+        --trust-remote-code \
+        --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
+        --kv-transfer-config "${DECODE_KV_CONFIG}"
+
+docker logs -f moriio-decode 2>&1 | tee "${LOG_DIR}/decode_phase2.log" &
+
+wait_for_health "moriio-prefill" "${PREFILL_PORT}"
+wait_for_health "moriio-decode"  "${DECODE_PORT}"
+
+# Reuse the same registration-wait helper defined in the USE_BENCH block
+# (define it inline here so USE_GSM8K works standalone).
+echo "Waiting for prefill and decode to register with toy proxy..."
+_reg_wait=0
+while true; do
+    _p=$(docker logs moriio-toy-proxy 2>&1 | grep -c "Registered Prefill" || true)
+    _d=$(docker logs moriio-toy-proxy 2>&1 | grep -c "Registered Decode"  || true)
+    if [[ "${_p:-0}" -ge 1 && "${_d:-0}" -ge 1 ]]; then
+        echo "  Both instances registered."
+        break
+    fi
+    sleep 3
+    _reg_wait=$((_reg_wait + 3))
+    if [[ "${_reg_wait}" -ge 120 ]]; then
+        echo "WARNING: timed out waiting for toy proxy registrations after 120s" >&2
+        break
+    fi
+    echo "  Still waiting (${_reg_wait}s / 120s) — prefill=${_p:-0} decode=${_d:-0}..."
+done
+
+echo ""
+echo ">>> Phase 2: GSM8K accuracy evaluation (lm_eval) through toy proxy..."
+echo "    (full GSM8K test set, results → ${GSM8K_PROXY_LOG})"
+{
+    echo "======================================================"
+    echo "  GSM8K evaluation (lm_eval) — Phase 2: moriio_toy_proxy_server.py"
+    echo "  Model : ${MODEL}"
+    echo "  Date  : $(date)"
+    echo "======================================================"
+} | tee "${GSM8K_PROXY_LOG}"
+
+run_lm_eval "moriio-prefill" "${TOY_PROXY_HTTP_PORT}" "${GSM8K_PROXY_LOG}" "${GSM8K_PROXY_JSON}"
+
+# ── Side-by-side summary ──────────────────────────────────────────────────────
+# lm_eval results.json structure:
+#   { "results": { "gsm8k": { "exact_match,flexible-extract": 0.xx, ... } } }
+_gsm8k_metric() {
+    local json_file="$1"
+    local key="$2"
+    python3 -c "
+import json, sys
+try:
+    d = json.load(open('${json_file}'))
+    v = d['results']['gsm8k'].get('${key}', 'N/A')
+    print(f'{v:.4f}' if isinstance(v, float) else v)
+except Exception as e:
+    print('N/A')
+" 2>/dev/null || echo "N/A"
+}
+
+echo ""
+echo "=== GSM8K comparison complete ==="
+echo ""
+printf "%-35s  %-15s %s\n" "Metric" "vllm-router" "toy-proxy"
+printf "%-35s  %-15s %s\n" "------" "-----------" "---------"
+for key in "exact_match,flexible-extract" "exact_match,strict-match"; do
+    r=$(_gsm8k_metric "${GSM8K_ROUTER_JSON}" "${key}")
+    p=$(_gsm8k_metric "${GSM8K_PROXY_JSON}"  "${key}")
+    printf "%-35s  %-15s %s\n" "${key}" "${r}" "${p}"
+done
+echo ""
+echo "  Phase 1 log  : ${GSM8K_ROUTER_LOG}"
+echo "  Phase 1 JSON : ${GSM8K_ROUTER_JSON}"
+echo "  Phase 2 log  : ${GSM8K_PROXY_LOG}"
+echo "  Phase 2 JSON : ${GSM8K_PROXY_JSON}"
+echo "  Toy proxy log: ${LOG_DIR}/toy_proxy.log"
+
+elif [[ "${USE_BENCH}" == "1" ]]; then
+
+# ── Benchmark helpers ─────────────────────────────────────────────────────────
+# Both the benchmark tool and the toy proxy run inside moriio-prefill because:
+#   • vllm (bench serve) is installed there
+#   • examples/ are copied to /app/vllm/examples/ in the image (see Dockerfile)
+#   • --network host means localhost:ROUTER_PORT is the same inside and outside
+BENCH_LOG="${LOG_DIR}/benchmark_results.log"
+TOY_PROXY_CONTAINER_PATH="/app/vllm/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py"
+
+# Common args shared by both benchmark runs
+BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-16}"
+BENCH_NUM_WARMUPS=$((BENCH_MAX_CONCURRENCY * 2))
+BENCH_NUM_PROMPTS=$((BENCH_MAX_CONCURRENCY * 10))
+
+BENCH_ARGS=(
+    --backend vllm
+    --model "${MODEL}"
+    --dataset-name random
+    --random-input-len 1000
+    --random-output-len 1000
+    --max-concurrency "${BENCH_MAX_CONCURRENCY}"
+    --num-warmups "${BENCH_NUM_WARMUPS}"
+    --num-prompts "${BENCH_NUM_PROMPTS}"
+    --ready_check_timeout_sec 3000
+    --seed 1234
+)
+
+# Poll docker logs of the toy proxy container until both instances have registered.
+# Using docker logs avoids Python stdout-buffering issues (print() to a file is
+# fully buffered; docker logs reads directly from the container's log driver).
+wait_for_toy_proxy_registrations() {
+    local max_wait=120
+    local interval=3
+    local elapsed=0
+    echo "Waiting for prefill and decode to register with toy proxy..."
+    while true; do
+        local p d
+        p=$(docker logs moriio-toy-proxy 2>&1 | grep -c "Registered Prefill" || true)
+        d=$(docker logs moriio-toy-proxy 2>&1 | grep -c "Registered Decode"  || true)
+        if [[ "${p:-0}" -ge 1 && "${d:-0}" -ge 1 ]]; then
+            echo "  Both instances registered."
+            return 0
+        fi
+        sleep "${interval}"
+        elapsed=$((elapsed + interval))
+        if [[ "${elapsed}" -ge "${max_wait}" ]]; then
+            echo "WARNING: timed out waiting for toy proxy registrations after ${max_wait}s" >&2
+            return 0
+        fi
+        echo "  Still waiting (${elapsed}s / ${max_wait}s) — prefill=${p:-0} decode=${d:-0}..."
+    done
+}
+
+# ── Phase 1: benchmark through vllm-router ────────────────────────────────────
+echo ""
+echo ">>> Phase 1: benchmarking through vllm-router..."
+{
+    echo "======================================================"
+    echo "  Router: vllm-router"
+    echo "  Date  : $(date)"
+    echo "======================================================"
+} | tee -a "${BENCH_LOG}"
+
+docker exec moriio-prefill \
+    vllm bench serve \
+        --base-url "http://localhost:${ROUTER_PORT}" \
+        "${BENCH_ARGS[@]}" 2>&1 | tee -a "${BENCH_LOG}"
+
+# ── Phase 2: switch to toy proxy and benchmark again ──────────────────────────
+# The toy proxy's HTTP port is hardcoded in the image as 10001; its ZMQ
+# discovery port defaults to PROXY_PING_PORT (36367), same as vllm-router.
+# We run it as a dedicated container so docker logs captures stdout without
+# any Python output-buffering issues (no grep-on-file race).
+# Prefill and decode are restarted so they register fresh with the new proxy.
+TOY_PROXY_HTTP_PORT=10001
+
+echo ""
+echo ">>> Stopping vllm-router, prefill, and decode..."
+docker rm -f moriio-router moriio-prefill moriio-decode
+
+echo ">>> Starting toy proxy container (HTTP :${TOY_PROXY_HTTP_PORT}, ZMQ :${PROXY_PING_PORT})..."
+docker run -d \
+    --name moriio-toy-proxy \
+    --network host \
+    -v "${TOY_PROXY_PATCH_SCRIPT}:/tmp/patch_toy_proxy.py:ro" \
+    "${VLLM_IMAGE}" \
+    bash -c "pip install --quiet --ignore-installed quart aiohttp msgpack && \
+             python3 /tmp/patch_toy_proxy.py && \
+             python3 -u ${TOY_PROXY_CONTAINER_PATH}"
+
+docker logs -f moriio-toy-proxy 2>&1 | tee "${LOG_DIR}/toy_proxy.log" &
+
+# Wait for the toy proxy HTTP port before starting vLLM (avoids a race where
+# instances start sending heartbeats before the ZMQ socket is bound).
+echo "Waiting for toy proxy HTTP port ${TOY_PROXY_HTTP_PORT} to open..."
+_tp_wait=0
+until curl -sf "http://localhost:${TOY_PROXY_HTTP_PORT}/" >/dev/null 2>&1 \
+   || curl -sf "http://localhost:${TOY_PROXY_HTTP_PORT}/v1/completions" >/dev/null 2>&1 \
+   || nc -z 127.0.0.1 "${TOY_PROXY_HTTP_PORT}" 2>/dev/null; do
+    sleep 2
+    _tp_wait=$((_tp_wait + 2))
+    if [[ "${_tp_wait}" -ge 60 ]]; then
+        echo "WARNING: toy proxy did not open port ${TOY_PROXY_HTTP_PORT} after 60s" >&2
+        docker logs moriio-toy-proxy 2>&1 | tail -20 >&2
+        break
+    fi
+done
+echo "  Toy proxy is up."
+
+echo ">>> Restarting prefill instance (GPU ${PREFILL_GPU}, port ${PREFILL_PORT})..."
+docker run -d \
+    --name moriio-prefill \
+    "${VLLM_COMMON_ARGS[@]}" \
+    -e HIP_VISIBLE_DEVICES="${PREFILL_GPU}" \
+    "${VLLM_IMAGE}" \
+    vllm serve "${MODEL}" \
+        --port "${PREFILL_PORT}" \
+        --max-model-len "${PREFILL_MAX_MODEL_LEN}" \
+        --trust-remote-code \
+        --kv-transfer-config "${PREFILL_KV_CONFIG}"
+
+docker logs -f moriio-prefill 2>&1 | tee "${LOG_DIR}/prefill_phase2.log" &
+
+echo ">>> Restarting decode instance (GPU ${DECODE_GPU}, port ${DECODE_PORT})..."
+docker run -d \
+    --name moriio-decode \
+    "${VLLM_COMMON_ARGS[@]}" \
+    -e HIP_VISIBLE_DEVICES="${DECODE_GPU}" \
+    "${VLLM_IMAGE}" \
+    vllm serve "${MODEL}" \
+        --port "${DECODE_PORT}" \
+        --max-model-len "${DECODE_MAX_MODEL_LEN}" \
+        --trust-remote-code \
+        --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
+        --kv-transfer-config "${DECODE_KV_CONFIG}"
+
+docker logs -f moriio-decode 2>&1 | tee "${LOG_DIR}/decode_phase2.log" &
+
+wait_for_health "moriio-prefill" "${PREFILL_PORT}"
+wait_for_health "moriio-decode"  "${DECODE_PORT}"
+
+wait_for_toy_proxy_registrations
+
+echo ""
+echo ">>> Phase 2: benchmarking through toy proxy..."
+{
+    echo ""
+    echo "======================================================"
+    echo "  Router: moriio_toy_proxy_server.py"
+    echo "  Date  : $(date)"
+    echo "======================================================"
+} | tee -a "${BENCH_LOG}"
+
+docker exec moriio-prefill \
+    vllm bench serve \
+        --base-url "http://localhost:${TOY_PROXY_HTTP_PORT}" \
+        "${BENCH_ARGS[@]}" 2>&1 | tee -a "${BENCH_LOG}"
+
+echo ""
+echo "=== Both benchmark runs complete ==="
+echo "  Results      : ${BENCH_LOG}"
+echo "  Toy proxy log: ${LOG_DIR}/toy_proxy.log"
+
+else
+
+# ── Smoke test: single completion request through the router ──────────────────
+echo ""
+echo ">>> Smoke test: sending a completion request through vllm-router..."
+curl -s "http://localhost:${ROUTER_PORT}/v1/completions" \
+    -H "Content-Type: application/json" \
+    -d "{
+      \"model\": \"${MODEL}\",
+      \"prompt\": \"San Francisco is a\",
+      \"max_tokens\": 64,
+      \"temperature\": 0
+    }" | python3 -m json.tool
+echo ""
+echo "(Set USE_BENCH=1 to run the full perf benchmark, or USE_GSM8K=1 for accuracy evaluation.)"
+
+fi
+
+# ── Teardown ──────────────────────────────────────────────────────────────────
+if [[ "${KEEP_ALIVE}" == "1" ]]; then
+    echo ""
+    echo "KEEP_ALIVE=1 — containers left running."
+    echo "  To tear down: docker rm -f moriio-prefill moriio-decode moriio-router moriio-toy-proxy"
+else
+    echo ""
+    echo ">>> Shutting down containers..."
+    docker rm -f moriio-prefill moriio-decode moriio-router moriio-toy-proxy 2>/dev/null || true
+    echo "Done. All containers removed."
+fi
diff --git a/examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh b/examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh
new file mode 100755
index 000000000000..05229dc7f4da
--- /dev/null
+++ b/examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh
@@ -0,0 +1,740 @@
+#!/usr/bin/env bash
+# run_pd_demo_2node.sh — Two-node MoRIIO PD-disaggregation demo
+#
+# Run the SAME script on BOTH nodes, setting IS_PREFILL to distinguish roles:
+#
+#   Node 1 — prefill instance + vllm-router:
+#     IS_PREFILL=1 PREFILL_IP=<node1-ip> DECODE_IP=<node2-ip> \
+#       ./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh
+#
+#   Node 2 — decode instance:
+#     IS_PREFILL=0 PREFILL_IP=<node1-ip> DECODE_IP=<node2-ip> \
+#       ./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh
+#
+# Prerequisites
+#   • 8 ROCm GPUs on each node
+#   • Docker image pulled: ghcr.io/simondanielsson/vllm-rocm-moriio:dev-0414-0859
+#   • Router image pulled: ghcr.io/simondanielsson/vllm-router:dev (or :dev-streaming-cn-cjy)
+#   • RDMA / InfiniBand devices visible at /dev/infiniband on each node
+#   • Ports PREFILL_PORT, DECODE_PORT, ROUTER_PORT, PROXY_PING_PORT,
+#     HANDSHAKE_PORT, NOTIFY_PORT, PHASE2_SIGNAL_PORT open between the two nodes
+#
+# Phase 2 coordination (USE_BENCH=1 or USE_GSM8K=1)
+#   The prefill node starts a tiny HTTP signal server on PHASE2_SIGNAL_PORT
+#   after Phase 1 completes.  The decode node polls that port; once it gets a
+#   response it tears down its container and restarts so the new instance
+#   registers with the toy proxy instead of the router.  The prefill node waits
+#   for the decode to become healthy again before running Phase 2.
+
+set -euo pipefail
+
+# ── Required env vars ─────────────────────────────────────────────────────────
+IS_PREFILL="${IS_PREFILL:-}"
+PREFILL_IP="${PREFILL_IP:-}"
+DECODE_IP="${DECODE_IP:-}"
+
+if [[ -z "${IS_PREFILL}" || -z "${PREFILL_IP}" || -z "${DECODE_IP}" ]]; then
+    echo "ERROR: IS_PREFILL, PREFILL_IP, and DECODE_IP must all be set." >&2
+    echo "" >&2
+    echo "  Node 1 (prefill + router):" >&2
+    echo "    IS_PREFILL=1 PREFILL_IP=<node1-ip> DECODE_IP=<node2-ip> $0" >&2
+    echo "" >&2
+    echo "  Node 2 (decode):" >&2
+    echo "    IS_PREFILL=0 PREFILL_IP=<node1-ip> DECODE_IP=<node2-ip> $0" >&2
+    exit 1
+fi
+
+# ── Configuration ─────────────────────────────────────────────────────────────
+# DeepSeek-R1-0528 is a 671B MoE model; --load-format dummy skips weight
+# loading so the script can be used for integration testing without the
+# full model checkpoint.  Set MODEL to the real HF id when running with
+# actual weights.
+MODEL="${MODEL:-deepseek-ai/DeepSeek-R1-0528}"
+
+PREFILL_PORT="${PREFILL_PORT:-8100}"      # HTTP port for the prefill vLLM instance
+DECODE_PORT="${DECODE_PORT:-8200}"        # HTTP port for the decode vLLM instance
+ROUTER_PORT="${ROUTER_PORT:-8080}"        # HTTP port for vllm-router (prefill node only)
+PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" # ZMQ service-discovery port (router ↔ vLLM)
+
+# MoRIIO internal ports — nodes are separate machines so no port conflicts;
+# both instances can use the same numbers.
+HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}"  # MoRIIO engine handshake
+NOTIFY_PORT="${NOTIFY_PORT:-61005}"       # Prefill↔decode stage synchronisation
+
+# Port used by the prefill node to signal the decode node to restart for Phase 2.
+# The prefill node starts a simple HTTP server on this port; the decode node polls it.
+PHASE2_SIGNAL_PORT="${PHASE2_SIGNAL_PORT:-19876}"
+
+VLLM_IMAGE="${VLLM_IMAGE:-ghcr.io/simondanielsson/vllm-rocm-moriio:dev}"
+#VLLM_IMAGE="${VLLM_IMAGE:-ghcr.io/simondanielsson/vllm-rocm-moriio:dev-hang-fixes}"
+
+# Basic router (smoke-test only — no streaming support)
+ROUTER_IMAGE="${ROUTER_IMAGE:-ghcr.io/simondanielsson/vllm-router:dev}"
+# Streaming-capable router (required for USE_BENCH=1 and USE_GSM8K=1)
+ROUTER_STREAMING_IMAGE="${ROUTER_STREAMING_IMAGE:-ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy}"
+
+HF_HOME="${HF_HOME:-${HOME}/.cache/huggingface}"
+LOG_DIR="${LOG_DIR:-${HOME}/moriio-logs}"
+SHM_SIZE="${SHM_SIZE:-256G}"
+
+USE_BENCH="${USE_BENCH:-0}"    # Set to 1 to run the perf benchmark (prefill node only)
+USE_GSM8K="${USE_GSM8K:-0}"   # Set to 1 to run GSM8K accuracy eval (prefill node only)
+KEEP_ALIVE="${KEEP_ALIVE:-0}" # Set to 1 to leave containers running after the script exits
+
+BENCH_NUM_PROMPTS_FACTOR=${BENCH_NUM_PROMPTS_FACTOR:-10}
+BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-16}"
+BENCH_NUM_WARMUPS=$((BENCH_MAX_CONCURRENCY * 2))
+BENCH_NUM_PROMPTS=$((BENCH_MAX_CONCURRENCY * $BENCH_NUM_PROMPTS_FACTOR))
+
+# HTTP port the toy proxy listens on (hardcoded in the image).
+TOY_PROXY_HTTP_PORT=10001
+# Path to the toy proxy script inside the vLLM image.
+TOY_PROXY_CONTAINER_PATH="/app/vllm/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py"
+
+mkdir -p "${LOG_DIR}"
+
+# ── Toy-proxy patch: fix non-streaming responses ──────────────────────────────
+# Makes handle_request() return a proper JSON body when the client did not
+# request streaming (needed by lm_eval and any non-streaming client).
+TOY_PROXY_PATCH_SCRIPT="$(mktemp /tmp/patch_toy_proxy.XXXXXX.py)"
+cat > "${TOY_PROXY_PATCH_SCRIPT}" << 'PYEOF'
+import pathlib
+
+TARGET = pathlib.Path(
+    "/app/vllm/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py"
+)
+src = TARGET.read_text()
+
+OLD = (
+    "        session, decode_response = await decode_request_task\n"
+    "        stream_generator = stream_decode_response(session, decode_response, request_id)\n"
+    "        response = await make_response(stream_generator)\n"
+    "        return response"
+)
+NEW = (
+    "        session, decode_response = await decode_request_task\n"
+    "        if req_data.get(\"stream\", False):\n"
+    "            stream_generator = stream_decode_response(\n"
+    "                session, decode_response, request_id\n"
+    "            )\n"
+    "            response = await make_response(stream_generator)\n"
+    "            return response\n"
+    "        else:\n"
+    "            try:\n"
+    "                body = await decode_response.read()\n"
+    "                content_type = decode_response.headers.get(\n"
+    "                    \"Content-Type\", \"application/json\"\n"
+    "                )\n"
+    "            finally:\n"
+    "                await session.close()\n"
+    "            response = await make_response(body, decode_response.status)\n"
+    "            response.headers[\"Content-Type\"] = content_type\n"
+    "            return response"
+)
+
+if OLD in src:
+    TARGET.write_text(src.replace(OLD, NEW, 1))
+    print("toy proxy patch: non-streaming fix applied.")
+else:
+    print("toy proxy patch: target not found — skipping (already patched?).")
+PYEOF
+
+# ── vLLM serve flags shared between prefill and decode ───────────────────────
+VLLM_SERVE_ARGS=(
+    --tensor-parallel-size 8
+    --kv-cache-dtype fp8
+    --gpu-memory-utilization 0.7
+    --max-num-batched-tokens 32768
+    --max-model-len 16384
+    --trust-remote-code
+    --no-enable-prefix-caching
+    --block-size 1
+)
+
+# ── Role-specific vLLM serve flags ───────────────────────────────────────────
+PREFILL_EXTRA_ARGS=(
+    --enforce-eager
+)
+
+DECODE_EXTRA_ARGS=(
+    --enable-expert-parallel
+    --all2all-backend mori
+    --compilation-config '{"cudagraph_mode": "PIECEWISE"}'
+)
+
+# ── KV-transfer configs ───────────────────────────────────────────────────────
+# proxy_ip      : IP of the node running vllm-router (always the prefill node)
+# proxy_ping_port : ZMQ port the router listens on for instance registration
+# http_port     : this instance's own HTTP port (embedded in zmq_address by router)
+# handshake_port / notify_port : MoRIIO RDMA coordination ports on this node
+PREFILL_KV_CONFIG=$(cat <<EOF
+{
+  "kv_connector": "MoRIIOConnector",
+  "kv_role": "kv_producer",
+  "kv_connector_extra_config": {
+    "proxy_ip": "${PREFILL_IP}",
+    "proxy_ping_port": "${PROXY_PING_PORT}",
+    "http_port": "${PREFILL_PORT}",
+    "handshake_port": "${HANDSHAKE_PORT}",
+    "notify_port": "${NOTIFY_PORT}"
+  }
+}
+EOF
+)
+
+DECODE_KV_CONFIG=$(cat <<EOF
+{
+  "kv_connector": "MoRIIOConnector",
+  "kv_role": "kv_consumer",
+  "kv_connector_extra_config": {
+    "proxy_ip": "${PREFILL_IP}",
+    "proxy_ping_port": "${PROXY_PING_PORT}",
+    "http_port": "${DECODE_PORT}",
+    "handshake_port": "${HANDSHAKE_PORT}",
+    "notify_port": "${NOTIFY_PORT}"
+  }
+}
+EOF
+)
+
+# ── Common docker run flags ───────────────────────────────────────────────────
+VLLM_COMMON_ARGS=(
+    --init
+    --network host
+    --ipc host
+    --privileged
+    --cap-add SYS_PTRACE
+    --security-opt seccomp=unconfined
+    --ulimit memlock=-1
+    --ulimit stack=67108864
+    --shm-size "${SHM_SIZE}"
+    --group-add video
+    --group-add render
+    --device /dev/kfd
+    --device /dev/dri
+    --device /dev/infiniband
+    -v /sys:/sys
+    -v "${HF_HOME}:/root/.cache/huggingface"
+    -e HF_HOME=/root/.cache/huggingface
+    -e HF_HUB_ENABLE_HF_TRANSFER=0
+    -e VLLM_MORIIO_CONNECTOR_READ_MODE=1
+    -e NCCL_MIN_NCHANNELS=112
+    -e VLLM_USE_V1=1
+    -e VLLM_ENGINE_READY_TIMEOUT_S=3600
+    -e VLLM_SERVER_DEV_MODE=1
+    -e VLLM_ROCM_USE_AITER=1
+    -e VLLM_ROCM_USE_AITER_PAGED_ATTN=0
+    -e VLLM_ROCM_USE_AITER_RMSNORM=1
+    -e VLLM_USE_AITER_TRITON_SILU_MUL=0
+    
+)
+
+# ── Helpers ───────────────────────────────────────────────────────────────────
+wait_for_health() {
+    local name="$1"
+    local url="$2"
+    local max_wait="${3:-1800}"
+    local interval=10
+    local elapsed=0
+
+    echo "Waiting for ${name} (${url}) to become healthy..."
+    while true; do
+        if curl -sf "${url}" >/dev/null 2>&1; then
+            echo "  ${name} is healthy."
+            return 0
+        fi
+        sleep "${interval}"
+        elapsed=$((elapsed + interval))
+        if [[ "${elapsed}" -ge "${max_wait}" ]]; then
+            echo "ERROR: ${name} did not become healthy after ${max_wait}s" >&2
+            exit 1
+        fi
+        echo "  Still waiting for ${name} (${elapsed}s / ${max_wait}s)..."
+    done
+}
+
+# Poll docker logs until both P and D have registered with the toy proxy.
+wait_for_toy_proxy_registrations() {
+    local max_wait=120 interval=3 elapsed=0
+    echo "Waiting for prefill and decode to register with toy proxy..."
+    while true; do
+        local p d
+        p=$(docker logs moriio-toy-proxy 2>&1 | grep -c "Registered Prefill" || true)
+        d=$(docker logs moriio-toy-proxy 2>&1 | grep -c "Registered Decode"  || true)
+        if [[ "${p:-0}" -ge 1 && "${d:-0}" -ge 1 ]]; then
+            echo "  Both instances registered (prefill=${p} decode=${d})."
+            return 0
+        fi
+        sleep "${interval}"
+        elapsed=$((elapsed + interval))
+        if [[ "${elapsed}" -ge "${max_wait}" ]]; then
+            echo "WARNING: timed out waiting for toy proxy registrations after ${max_wait}s" >&2
+            return 0
+        fi
+        echo "  Still waiting (${elapsed}s / ${max_wait}s) — prefill=${p:-0} decode=${d:-0}..."
+    done
+}
+
+# Stop router+prefill, launch toy proxy, restart prefill, wait for decode Phase 2.
+# Called by both USE_BENCH and USE_GSM8K after Phase 1 completes.
+run_phase2_switchover() {
+    # Signal the decode node to restart by serving a one-line HTTP response on
+    # PHASE2_SIGNAL_PORT.  We use a Python one-liner so the response is proper
+    # HTTP (nc -l alone sends no headers and some curl versions reject it).
+    echo ""
+    echo ">>> Phase 2: signalling decode node to restart (port ${PHASE2_SIGNAL_PORT})..."
+    echo "    The decode node is polling http://${PREFILL_IP}:${PHASE2_SIGNAL_PORT}/phase2"
+    # Serve exactly one request then exit (Python http.server handles one GET then we kill it).
+    python3 -c "
+import http.server, socketserver, threading, time
+
+class Handler(http.server.BaseHTTPRequestHandler):
+    def do_GET(self):
+        self.send_response(200)
+        self.end_headers()
+        self.wfile.write(b'phase2\n')
+        # Schedule shutdown after we've sent the response
+        threading.Thread(target=self.server.shutdown, daemon=True).start()
+    def log_message(self, *a): pass
+
+with socketserver.TCPServer(('0.0.0.0', ${PHASE2_SIGNAL_PORT}), Handler) as srv:
+    srv.serve_forever()
+" &
+    _SIGNAL_PID=$!
+
+    # Now tear down the router and this node's prefill container.
+    echo ">>> Stopping vllm-router and prefill (Phase 1)..."
+    docker rm -f moriio-router moriio-prefill
+    # Wait for GPU memory and host-network TCP sockets (TIME_WAIT) to be released
+    # before starting the Phase 2 prefill container.
+    echo "Waiting 30s for GPU/network resources to be released..."
+    sleep 30
+
+    # Start toy proxy (HTTP :${TOY_PROXY_HTTP_PORT}, ZMQ :${PROXY_PING_PORT})
+    echo ">>> Starting toy proxy container (HTTP :${TOY_PROXY_HTTP_PORT}, ZMQ :${PROXY_PING_PORT})..."
+    docker run -d \
+        --name moriio-toy-proxy \
+        --network host \
+        -v "${TOY_PROXY_PATCH_SCRIPT}:/tmp/patch_toy_proxy.py:ro" \
+        "${VLLM_IMAGE}" \
+        bash -c "pip install --quiet --ignore-installed quart aiohttp msgpack && \
+                 python3 /tmp/patch_toy_proxy.py && \
+                 python3 -u ${TOY_PROXY_CONTAINER_PATH}"
+
+    docker logs -f moriio-toy-proxy 2>&1 | tee "${LOG_DIR}/toy_proxy.log" &
+
+    # Wait for toy proxy HTTP port to open before starting vLLM.
+    echo "Waiting for toy proxy HTTP port ${TOY_PROXY_HTTP_PORT} to open..."
+    _tp_wait=0
+    until curl -sf "http://localhost:${TOY_PROXY_HTTP_PORT}/" >/dev/null 2>&1 \
+       || curl -sf "http://localhost:${TOY_PROXY_HTTP_PORT}/v1/completions" >/dev/null 2>&1 \
+       || nc -z 127.0.0.1 "${TOY_PROXY_HTTP_PORT}" 2>/dev/null; do
+        sleep 2
+        _tp_wait=$((_tp_wait + 2))
+        if [[ "${_tp_wait}" -ge 60 ]]; then
+            echo "WARNING: toy proxy did not open port ${TOY_PROXY_HTTP_PORT} after 60s" >&2
+            docker logs moriio-toy-proxy 2>&1 | tail -20 >&2
+            break
+        fi
+    done
+    echo "  Toy proxy is up."
+
+    # Restart the prefill container so it registers with the toy proxy.
+    echo ">>> Restarting prefill instance (port ${PREFILL_PORT})..."
+    docker run -d \
+        --name moriio-prefill \
+        "${VLLM_COMMON_ARGS[@]}" \
+        "${VLLM_IMAGE}" \
+        vllm serve "${MODEL}" \
+            --port "${PREFILL_PORT}" \
+            "${VLLM_SERVE_ARGS[@]}" \
+            "${PREFILL_EXTRA_ARGS[@]}" \
+            --kv-transfer-config "${PREFILL_KV_CONFIG}"
+
+    docker logs -f moriio-prefill 2>&1 | tee "${LOG_DIR}/prefill_phase2.log" &
+
+    wait_for_health "moriio-prefill" "http://localhost:${PREFILL_PORT}/health" 1800
+
+    # Kill the signal server now that decode has been notified (it may already be gone).
+    kill "${_SIGNAL_PID}" 2>/dev/null || true
+
+    # Wait for the remote decode to come back up (it restarts itself upon seeing the signal).
+    echo ""
+    echo ">>> Waiting for decode node (${DECODE_IP}:${DECODE_PORT}) to come back up..."
+    wait_for_health "moriio-decode (remote, Phase 2)" "http://${DECODE_IP}:${DECODE_PORT}/health" 1200
+
+    echo "Waiting 10s for decode to register with toy proxy..."
+    sleep 10
+
+    wait_for_toy_proxy_registrations
+}
+
+# ── Cleanup trap ──────────────────────────────────────────────────────────────
+_cleanup() {
+    if [[ "${KEEP_ALIVE}" == "1" ]]; then
+        echo ""
+        echo "KEEP_ALIVE=1 — containers left running."
+        if [[ "${IS_PREFILL}" == "1" ]]; then
+            echo "  To tear down: docker rm -f moriio-prefill moriio-router moriio-toy-proxy"
+        else
+            echo "  To tear down: docker rm -f moriio-decode"
+        fi
+        return
+    fi
+    echo ""
+    echo ">>> Shutting down containers..."
+    if [[ "${IS_PREFILL}" == "1" ]]; then
+        docker rm -f moriio-prefill moriio-router moriio-toy-proxy 2>/dev/null || true
+    else
+        docker rm -f moriio-decode 2>/dev/null || true
+    fi
+    echo "Done."
+}
+trap _cleanup EXIT
+
+# ── Remove stale containers ───────────────────────────────────────────────────
+if [[ "${IS_PREFILL}" == "1" ]]; then
+    _stale_containers=(moriio-prefill moriio-router moriio-toy-proxy)
+else
+    _stale_containers=(moriio-decode)
+fi
+for cname in "${_stale_containers[@]}"; do
+    if docker ps -a --format '{{.Names}}' | grep -q "^${cname}$"; then
+        echo "Removing existing container: ${cname}"
+        docker rm -f "${cname}"
+    fi
+done
+
+# ── Print summary ─────────────────────────────────────────────────────────────
+_role="$([ "${IS_PREFILL}" == "1" ] && echo "prefill + router" || echo "decode")"
+echo "=== MoRIIO PD disaggregation demo (2-node) ==="
+echo "  Model       : ${MODEL}"
+echo "  This node   : ${_role}"
+echo "  Prefill     : http://${PREFILL_IP}:${PREFILL_PORT}"
+echo "  Decode      : http://${DECODE_IP}:${DECODE_PORT}"
+echo "  Router      : http://${PREFILL_IP}:${ROUTER_PORT}  (prefill node)"
+echo "  Discovery   : ${PREFILL_IP}:${PROXY_PING_PORT}"
+echo "  MoRIIO ports: handshake=${HANDSHAKE_PORT}  notify=${NOTIFY_PORT}"
+echo "  Phase2 sig  : ${PREFILL_IP}:${PHASE2_SIGNAL_PORT}  (USE_BENCH=1 or USE_GSM8K=1)"
+echo "  Log dir     : ${LOG_DIR}"
+echo ""
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# PREFILL NODE
+# ═══════════════════════════════════════════════════════════════════════════════
+if [[ "${IS_PREFILL}" == "1" ]]; then
+
+echo ">>> Starting prefill instance (port ${PREFILL_PORT})..."
+docker run -d \
+    --name moriio-prefill \
+    "${VLLM_COMMON_ARGS[@]}" \
+    "${VLLM_IMAGE}" \
+    vllm serve "${MODEL}" \
+        --port "${PREFILL_PORT}" \
+        "${VLLM_SERVE_ARGS[@]}" \
+        "${PREFILL_EXTRA_ARGS[@]}" \
+        --kv-transfer-config "${PREFILL_KV_CONFIG}"
+
+docker logs -f moriio-prefill 2>&1 | tee "${LOG_DIR}/prefill.log" &
+
+wait_for_health "moriio-prefill" "http://localhost:${PREFILL_PORT}/health" 1800
+
+# ── Launch vllm-router ────────────────────────────────────────────────────────
+if [[ "${USE_BENCH}" == "1" || "${USE_GSM8K}" == "1" ]]; then
+    _ACTIVE_ROUTER_IMAGE="${ROUTER_STREAMING_IMAGE}"
+else
+    _ACTIVE_ROUTER_IMAGE="${ROUTER_IMAGE}"
+fi
+
+echo ""
+echo ">>> Starting vllm-router (port ${ROUTER_PORT}, discovery port ${PROXY_PING_PORT})..."
+echo "    Image: ${_ACTIVE_ROUTER_IMAGE}"
+docker run -d \
+    --name moriio-router \
+    --network host \
+    "${_ACTIVE_ROUTER_IMAGE}" \
+    vllm-router \
+        --vllm-pd-disaggregation \
+        --kv-connector moriio \
+        --vllm-discovery-address "0.0.0.0:${PROXY_PING_PORT}" \
+        --port "${ROUTER_PORT}" \
+        --host 0.0.0.0 \
+        --policy consistent_hash \
+        --prefill-policy consistent_hash \
+        --decode-policy consistent_hash \
+        --log-level info
+
+docker logs -f moriio-router 2>&1 | tee "${LOG_DIR}/router.log" &
+
+# ── Wait for the decode node to be healthy (cross-node health check) ──────────
+echo ""
+echo ">>> Waiting for decode node (${DECODE_IP}:${DECODE_PORT}) to become healthy..."
+echo "    Start the decode node now if you haven't already."
+wait_for_health "moriio-decode (remote)" "http://${DECODE_IP}:${DECODE_PORT}/health" 1200
+
+# Brief pause for the decode instance to register with the router via ZMQ.
+echo "Waiting 10s for decode instance to register with the router..."
+sleep 10
+
+# ── Summary ───────────────────────────────────────────────────────────────────
+echo ""
+echo "=== All services ready ==="
+echo "  Prefill  : http://localhost:${PREFILL_PORT}"
+echo "  Decode   : http://${DECODE_IP}:${DECODE_PORT}"
+echo "  Router   : http://localhost:${ROUTER_PORT}  ← send requests here"
+echo ""
+echo "To follow logs (written to ${LOG_DIR}):"
+echo "  tail -f ${LOG_DIR}/prefill.log"
+echo "  tail -f ${LOG_DIR}/router.log"
+echo ""
+echo "Containers will be shut down automatically when this script exits."
+echo "(Set KEEP_ALIVE=1 to leave them running.)"
+
+# ── Smoke test / benchmark ────────────────────────────────────────────────────
+if [[ "${USE_GSM8K}" == "1" ]]; then
+
+    GSM8K_LOG="${LOG_DIR}/gsm8k_results.log"
+    GSM8K_JSON_ROUTER="${LOG_DIR}/gsm8k_results_router.json"
+    GSM8K_JSON_PROXY="${LOG_DIR}/gsm8k_results_proxy.json"
+    _out_dir="/tmp/lm_eval_out"
+
+    _run_lm_eval() {
+        local base_url="$1"
+        docker exec moriio-prefill bash -c \
+            "pip install --quiet 'lm_eval[api]' && \
+             rm -rf ${_out_dir} && \
+             lm_eval \
+                 --model local-completions \
+                 --model_args model=${MODEL},base_url=${base_url}/v1/completions,tokenized_requests=False,trust_remote_code=True \
+                 --tasks gsm8k \
+                 --output_path ${_out_dir}" \
+            2>&1 | tee -a "${GSM8K_LOG}"
+    }
+
+    _save_lm_eval_json() {
+        local dest="$1"
+        local _remote_json
+        _remote_json=$(docker exec moriio-prefill \
+            find "${_out_dir}" -name "results.json" 2>/dev/null | head -1)
+        if [[ -n "${_remote_json}" ]]; then
+            docker cp "moriio-prefill:${_remote_json}" "${dest}" 2>/dev/null || true
+        else
+            echo "WARNING: lm_eval results.json not found in ${_out_dir}" >&2
+        fi
+    }
+
+    if [[ "${GSM8K_PHASE2_ONLY:-0}" != "1" ]]; then
+        # ── Phase 1: GSM8K through vllm-router ───────────────────────────────────
+        echo ""
+        echo ">>> Phase 1: running GSM8K accuracy evaluation (lm_eval) through vllm-router..."
+        {
+            echo "======================================================"
+            echo "  GSM8K evaluation (lm_eval) via MoRIIO PD-disaggregation (2-node)"
+            echo "  Router: vllm-router"
+            echo "  Model : ${MODEL}"
+            echo "  Date  : $(date)"
+            echo "======================================================"
+        } | tee "${GSM8K_LOG}"
+
+        _run_lm_eval "http://127.0.0.1:${ROUTER_PORT}"
+        _save_lm_eval_json "${GSM8K_JSON_ROUTER}"
+
+        # ── Phase 2: switch to toy proxy ──────────────────────────────────────────
+        run_phase2_switchover
+    else
+        echo ""
+        echo ">>> GSM8K_PHASE2_ONLY=1: skipping Phase 1 (vllm-router) and running switchover."
+        {
+            echo "======================================================"
+            echo "  GSM8K evaluation (lm_eval) via MoRIIO PD-disaggregation (2-node)"
+            echo "  (Phase 2 only run)"
+            echo "  Date  : $(date)"
+            echo "======================================================"
+        } | tee "${GSM8K_LOG}"
+
+        # The switchover must still happen: it stops the router + prefill (freeing
+        # PROXY_PING_PORT so the toy proxy's ZMQ listener can bind), starts the toy
+        # proxy, restarts prefill, and waits for decode to re-register.
+        run_phase2_switchover
+    fi
+
+    # ── Phase 2: GSM8K through toy proxy ─────────────────────────────────────
+    echo ""
+    echo ">>> Phase 2: running GSM8K accuracy evaluation (lm_eval) through toy proxy..."
+    {
+        echo ""
+        echo "======================================================"
+        echo "  GSM8K evaluation (lm_eval) via MoRIIO PD-disaggregation (2-node)"
+        echo "  Router: moriio_toy_proxy_server.py"
+        echo "  Model : ${MODEL}"
+        echo "  Date  : $(date)"
+        echo "======================================================"
+    } | tee -a "${GSM8K_LOG}"
+
+    _run_lm_eval "http://127.0.0.1:${TOY_PROXY_HTTP_PORT}"
+    _save_lm_eval_json "${GSM8K_JSON_PROXY}"
+
+    echo ""
+    echo "=== GSM8K evaluation complete ==="
+    echo "  Log        : ${GSM8K_LOG}"
+    if [[ "${GSM8K_PHASE2_ONLY:-0}" != "1" ]]; then
+        echo "  Router JSON: ${GSM8K_JSON_ROUTER}"
+    fi
+    echo "  Proxy JSON : ${GSM8K_JSON_PROXY}"
+
+elif [[ "${USE_BENCH}" == "1" ]]; then
+
+    BENCH_LOG="${LOG_DIR}/benchmark_results.log"
+
+    BENCH_ARGS=(
+        --backend vllm
+        --model "${MODEL}"
+        --dataset-name random
+        --random-input-len 1000
+        --random-output-len 1000
+        --max-concurrency "${BENCH_MAX_CONCURRENCY}"
+        --num-warmups "${BENCH_NUM_WARMUPS}"
+        --num-prompts "${BENCH_NUM_PROMPTS}"
+        --ready_check_timeout_sec 3000
+        --seed 1234
+    )
+
+    # ── Phase 1: benchmark through vllm-router ────────────────────────────────
+    echo ""
+    echo ">>> Phase 1: benchmarking through vllm-router..."
+    {
+        echo "======================================================"
+        echo "  Router: vllm-router (2-node)"
+        echo "  Model : ${MODEL}"
+        echo "  Date  : $(date)"
+        echo "======================================================"
+    } | tee "${BENCH_LOG}"
+
+    docker exec moriio-prefill \
+        vllm bench serve \
+            --base-url "http://localhost:${ROUTER_PORT}" \
+            "${BENCH_ARGS[@]}" 2>&1 | tee -a "${BENCH_LOG}"
+
+    # ── Phase 2: switch to toy proxy and benchmark again ──────────────────────
+    run_phase2_switchover
+
+    echo ""
+    echo ">>> Phase 2: benchmarking through toy proxy (HTTP port ${TOY_PROXY_HTTP_PORT})..."
+    {
+        echo ""
+        echo "======================================================"
+        echo "  Router: moriio_toy_proxy_server.py (2-node)"
+        echo "  Model : ${MODEL}"
+        echo "  Date  : $(date)"
+        echo "======================================================"
+    } | tee -a "${BENCH_LOG}"
+
+    docker exec moriio-prefill \
+        vllm bench serve \
+            --base-url "http://localhost:${TOY_PROXY_HTTP_PORT}" \
+            "${BENCH_ARGS[@]}" 2>&1 | tee -a "${BENCH_LOG}"
+
+    echo ""
+    echo "=== Both benchmark runs complete ==="
+    echo "  Results      : ${BENCH_LOG}"
+    echo "  Toy proxy log: ${LOG_DIR}/toy_proxy.log"
+
+else
+
+    echo ""
+    echo ">>> Smoke test: sending a completion request through vllm-router..."
+    curl -s "http://localhost:${ROUTER_PORT}/v1/completions" \
+        -H "Content-Type: application/json" \
+        -d "{
+          \"model\": \"${MODEL}\",
+          \"prompt\": \"San Francisco is a\",
+          \"max_tokens\": 64,
+          \"temperature\": 0
+        }" | python3 -m json.tool
+    echo ""
+    echo "(Set USE_BENCH=1 for a perf benchmark, or USE_GSM8K=1 for accuracy eval.)"
+
+fi
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# DECODE NODE
+# ═══════════════════════════════════════════════════════════════════════════════
+else
+
+# ── Helper: start (or restart) the decode container ──────────────────────────
+start_decode() {
+    local log_suffix="${1:-}"
+    docker run -d \
+        --name moriio-decode \
+        "${VLLM_COMMON_ARGS[@]}" \
+        "${VLLM_IMAGE}" \
+        vllm serve "${MODEL}" \
+            --port "${DECODE_PORT}" \
+            "${VLLM_SERVE_ARGS[@]}" \
+            "${DECODE_EXTRA_ARGS[@]}" \
+            --kv-transfer-config "${DECODE_KV_CONFIG}"
+
+    docker logs -f moriio-decode 2>&1 | tee "${LOG_DIR}/decode${log_suffix}.log" &
+}
+
+echo ">>> Starting decode instance (port ${DECODE_PORT})..."
+start_decode ""
+
+wait_for_health "moriio-decode" "http://localhost:${DECODE_PORT}/health" 1800
+
+echo ""
+echo "=== Decode instance is healthy and registered with router on ${PREFILL_IP} ==="
+echo "  Decode : http://localhost:${DECODE_PORT}"
+echo "  Router : http://${PREFILL_IP}:${ROUTER_PORT}"
+echo ""
+echo "To follow logs:"
+echo "  tail -f ${LOG_DIR}/decode.log"
+echo ""
+
+if [[ "${USE_BENCH}" == "1" || "${USE_GSM8K}" == "1" ]]; then
+    # ── Phase 2: wait for the prefill node to signal a restart ───────────────
+    echo ">>> USE_BENCH/USE_GSM8K: waiting for Phase 2 restart signal from prefill node"
+    echo "    (polling http://${PREFILL_IP}:${PHASE2_SIGNAL_PORT}/phase2 ...)"
+    _sig_wait=0
+    until curl -sf --max-time 5 "http://${PREFILL_IP}:${PHASE2_SIGNAL_PORT}/phase2" >/dev/null 2>&1; do
+        sleep 5
+        _sig_wait=$((_sig_wait + 5))
+        if [[ "${_sig_wait}" -ge 7200 ]]; then
+            echo "WARNING: Phase 2 signal never arrived after ${_sig_wait}s — exiting." >&2
+            exit 0
+        fi
+        if (( _sig_wait % 60 == 0 )); then
+            echo "  Still waiting for Phase 2 signal (${_sig_wait}s)..."
+        fi
+    done
+
+    echo ""
+    echo ">>> Phase 2 signal received — restarting decode container to register with toy proxy..."
+    docker rm -f moriio-decode 2>/dev/null || true
+
+    # The kv_transfer_config proxy_ip still points at PREFILL_IP where the toy proxy runs.
+    start_decode "_phase2"
+
+    wait_for_health "moriio-decode (Phase 2)" "http://localhost:${DECODE_PORT}/health" 1800
+
+    echo ""
+    echo "=== Decode Phase 2 instance is healthy and registered with toy proxy ==="
+    echo "  Decode : http://localhost:${DECODE_PORT}"
+    echo "  Proxy  : http://${PREFILL_IP}:10001"
+    echo ""
+    echo "Containers will be shut down when the prefill node finishes (Ctrl+C to abort)."
+    echo "(Set KEEP_ALIVE=1 to leave the container running after exit.)"
+    echo ""
+
+    # Block until the user stops the script (prefill node runs the benchmark).
+    wait
+else
+    echo "Containers will be shut down when this script exits (Ctrl+C)."
+    echo "(Set KEEP_ALIVE=1 to leave the container running after exit.)"
+    echo ""
+
+    # Block until the user stops the script; the cleanup trap handles teardown.
+    wait
+fi
+
+fi
diff --git a/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py b/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py
index eaff0904b877..762d6896dfed 100644
--- a/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py
+++ b/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py
@@ -283,10 +283,12 @@ async def handle_request():
 
 
 if __name__ == "__main__":
-    t = start_service_discovery("0.0.0.0", 36367)
+    _ping_port = int(os.environ.get("PROXY_PING_PORT", "36367"))
+    _http_port = int(os.environ.get("PROXY_HTTP_PORT", "10001"))
+    t = start_service_discovery("0.0.0.0", _ping_port)
     app.debug = True
     app.config["BODY_TIMEOUT"] = 360000
     app.config["RESPONSE_TIMEOUT"] = 360000
 
-    app.run(host="0.0.0.0", port=10001)
+    app.run(host="0.0.0.0", port=_http_port)
     t.join()