mpashkovskii · simondanielsson · Apr 10, 2026 · Apr 10, 2026 · Apr 10, 2026 · Apr 10, 2026
diff --git a/examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.router b/examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.router
@@ -0,0 +1,47 @@
+# Dockerfile for vllm-router (Rust binary)
+#
+# Build context: the root of the vllm-router repo (~/repos/router).
+#
+#   docker build -f <path-to-this-dir>/Dockerfile.router -t vllm-router:dev .
+#
+# Adapted from the upstream Dockerfile.router, but kept as a standalone file
+# so it can be referenced from the demo without modifying the router repo.
+
+FROM docker.io/rustlang/rust:nightly-bullseye AS rust-builder
+
+# Native toolchain and libraries installed for compiling the router:
+# C toolchain, pkg-config, OpenSSL headers and the protobuf compiler.
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    pkg-config \
+    libssl-dev \
+    protobuf-compiler \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Cache dependency compilation layer separately from source changes
+COPY Cargo.toml Cargo.lock ./
+COPY build.rs ./
+# Dummy main so cargo can resolve/compile deps without full source.
+# `|| true` is deliberate: this warm-up is best-effort. If it fails (e.g. the
+# manifest declares targets that the stub tree does not contain) only the
+# dependency cache is lost; the real build below still runs and fails loudly.
+RUN mkdir -p src && echo 'fn main() {}' > src/main.rs
+RUN cargo build --release || true
+RUN rm -f src/main.rs
+
+# Now copy real source and build.
+# COPY keeps the modification times from the build context, which are normally
+# older than the stub binary produced above. Cargo's freshness check is
+# mtime-based, so without refreshing the timestamps it may consider the stub
+# build up to date and the runtime stage would ship a binary that does nothing.
+COPY src ./src
+RUN find src -type f -name '*.rs' -exec touch {} + \
+    && cargo build --release
+
+# ── runtime image ────────────────────────────────────────────────────────────
+FROM docker.io/debian:bullseye-slim AS runtime
+
+# Runtime-only packages: the CA bundle for outbound TLS and the OpenSSL 1.1
+# shared library matching the bullseye libssl-dev used in the builder stage.
+# --no-install-recommends keeps optional extras out of the slim image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    libssl1.1 \
+    && rm -rf /var/lib/apt/lists/*
+
+# The linker already marks the binary executable and COPY preserves the mode,
+# so no separate `chmod` layer is needed (it would only duplicate the file in
+# another image layer).
+COPY --from=rust-builder /app/target/release/vllm-router /usr/local/bin/vllm-router
+
+# 8080 is the HTTP port passed via --port in CMD below.
+# NOTE(review): 29000 is presumably the router's metrics port — confirm.
+EXPOSE 8080
+EXPOSE 29000
+
+# Listen on all interfaces so the port can be published from the container.
+CMD ["vllm-router", "--host", "0.0.0.0", "--port", "8080"]
diff --git a/examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.vllm-rocm b/examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.vllm-rocm
@@ -0,0 +1,109 @@
+# Dockerfile for vLLM with MoRIIO KV connector (ROCm)
+#
+# Installs vLLM from a local source tree on top of the official ROCm base image,
+# following the same multi-stage approach as docker/Dockerfile.rocm.
+#
+# Build from the root of the vllm repo:
+#
+#   docker build \
+#       -f examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.vllm-rocm \
+#       -t vllm-rocm-moriio:dev \
+#       .
+#
+# The resulting image is used by run_pd_demo.sh.
+
+# ── base: same image used by the official ROCm vLLM image ────────────────────
+ARG BASE_IMAGE=docker.io/rocm/vllm-dev:base
+FROM ${BASE_IMAGE} AS base
+
+# OS packages shared by the build and runtime stages: download/VCS tools plus
+# the sqlite, fmt, msgpack and SuiteSparse libraries and headers.
+RUN apt-get update -q -y && apt-get install -q -y \
+    apt-transport-https \
+    ca-certificates \
+    curl \
+    git \
+    libfmt-dev \
+    libmsgpack-dev \
+    libsqlite3-dev \
+    libsuitesparse-dev \
+    sqlite3 \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Bring pip itself up to date before any Python packages are installed.
+RUN python3 -m pip install --upgrade pip
+
+# Fetch the uv installer to a temp file, run it with /usr/local/bin as the
+# install directory, delete the script, and confirm `uv` is on PATH.
+RUN curl --location --silent --show-error --fail \
+        --retry 3 --retry-delay 5 \
+        --output /tmp/uv-install.sh \
+        https://astral.sh/uv/install.sh \
+    && env UV_INSTALL_DIR="/usr/local/bin" sh /tmp/uv-install.sh \
+    && rm -f /tmp/uv-install.sh \
+    && uv --version
+
+# uv settings inherited by every later stage.
+ENV UV_HTTP_TIMEOUT=500 \
+    UV_INDEX_STRATEGY="unsafe-best-match" \
+    UV_LINK_MODE=copy
+
+# ── build vLLM from local source ─────────────────────────────────────────────
+FROM base AS build_vllm
+
+# The source tree lives at /app/vllm; later stages read the wheel and the
+# examples from this path.
+WORKDIR /app/vllm
+
+# Build context = repo root, so this copies the whole checkout.
+COPY . .
+
+# Install the ROCm requirements, wipe stale build output, then build the wheel
+# into ./dist.
+RUN python3 -m pip install -r requirements/rocm.txt \
+    && python3 setup.py clean --all \
+    && python3 setup.py bdist_wheel --dist-dir=dist
+
+# ── runtime image ─────────────────────────────────────────────────────────────
+FROM base AS final
+
+RUN python3 -m pip install -U pip \
+    && rm -rf /var/lib/apt/lists/*
+
+# RDMA userspace stack needed by MoRIIO / RIXL (librdmacm1, libibverbs1,
+# ibverbs-providers, ibverbs-utils), plus the tools the Broadcom driver build
+# in the following step relies on (autoconf, libtool, libibverbs-dev, unzip,
+# wget).
+RUN apt-get update -q -y && apt-get install -q -y \
+    autoconf \
+    ibverbs-providers \
+    ibverbs-utils \
+    libibverbs-dev \
+    libibverbs1 \
+    librdmacm1 \
+    libtool \
+    unzip \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Broadcom bnxt_re RDMA user-space driver.
+#
+# Everything happens in one layer so the downloaded sources never persist:
+#   1. download and unpack the Broadcom bundle inside a scratch directory
+#   2. extract the libbnxt tarball and build it (autogen / configure / make)
+#   3. rename any already-installed libbnxt_re-rdmav*.so to *.inbox
+#      (presumably so the distro-provided provider does not shadow the new one)
+#   4. `make install`, add /usr/local/lib to the linker config, run ldconfig,
+#      and drop bnxt_re.driver into /etc/libibverbs.d/
+#   5. run ibv_devices as a smoke check, then delete the scratch directory
+#
+# The scratch directory (mktemp -d) makes the cleanup independent of the
+# WORKDIR inherited from the base image: the bundle is downloaded into the
+# current directory, so removing hard-coded /bcm5760x_* paths only works when
+# that directory happens to be /.
+RUN bnxt_build_dir=$(mktemp -d) && cd "$bnxt_build_dir" && \
+    wget -q https://docs.broadcom.com/docs-and-downloads/ethernet-network-adapters/NXE/Thor2/GCA1/bcm5760x_230.2.52.0a.zip && \
+    unzip -q bcm5760x_230.2.52.0a.zip && \
+    cd bcm5760x_230.2.52.0a/drivers_linux/bnxt_rocelib/ && \
+    results=$(find . -name "libbnxt*.tar.gz") && tar -xf "$results" && \
+    untar_dir=$(find . -maxdepth 1 -type d -name "libbnxt*" ! -name "*.tar.gz" | head -n 1) && \
+    cd "$untar_dir" && sh autogen.sh && ./configure && make && \
+    find /usr/lib64/ /usr/lib -name "libbnxt_re-rdmav*.so" -exec mv {} {}.inbox \; && \
+    make install all && \
+    sh -c "echo /usr/local/lib >> /etc/ld.so.conf" && \
+    ldconfig && \
+    cp -f bnxt_re.driver /etc/libibverbs.d/ && \
+    ibv_devices && \
+    cd / && rm -rf "$bnxt_build_dir"
+
+# Install vLLM wheel and its ROCm dependencies.
+#
+# The build stage's source tree is bind-mounted at /vllm_src (nothing is copied
+# into this layer) and uv's download cache is kept in a cache mount.
+#
+# The uninstall is grouped in a subshell on purpose: `&&` and `||` have equal
+# precedence and associate left to right, so an ungrouped `... || true` would
+# also swallow a failure of the `cd` or of the requirements install before it
+# and let the build continue with missing dependencies. Only the uninstall
+# itself is allowed to fail (e.g. when no vllm is installed yet).
+RUN --mount=type=bind,from=build_vllm,src=/app/vllm,target=/vllm_src \
+    --mount=type=cache,target=/root/.cache/uv \
+    cd /vllm_src \
+    && uv pip install --system -r requirements/rocm.txt \
+    && (pip uninstall -y vllm || true) \
+    && uv pip install --system dist/*.whl \
+    && uv pip install --system msgpack
+
+# Fail the build if the installed PyTorch is not a ROCm build, i.e. if
+# torch.version.hip is None; otherwise print the detected versions.
+RUN python3 -c "import torch; \
+    assert torch.version.hip is not None, f'Expected ROCm PyTorch but got CUDA (hip={torch.version.hip})'; \
+    print(f'Verified: PyTorch {torch.__version__} ROCm HIP {torch.version.hip}')"
+
+# Ship the examples directory from the build stage so the proxy server script
+# can be run from inside the container.
+COPY --from=build_vllm /app/vllm/examples /app/vllm/examples
+
+# Performance / correctness settings carried over from the official ROCm
+# Dockerfile.
+ENV TOKENIZERS_PARALLELISM=false \
+    SAFETENSORS_FAST_GPU=1 \
+    HIP_FORCE_DEV_KERNARG=1 \
+    MIOPEN_DEBUG_CONV_DIRECT=0 \
+    MIOPEN_DEBUG_CONV_GEMM=0
+
+WORKDIR /app
+
+# No service is started by default; the container opens a shell.
+CMD ["/bin/bash"]
diff --git a/examples/online_serving/disaggregated_serving/moriio_pd_demo/README.md b/examples/online_serving/disaggregated_serving/moriio_pd_demo/README.md
@@ -0,0 +1,217 @@
+# MoRIIO PD-disaggregation demo
+
+Minimal reproduction script for running vLLM PD-disaggregation with the
+MoRIIOConnector KV connector and the vllm-router.
+
+Requires two ROCm GPUs on a single host.
+
+---
+
+## Files
+
+| File | Purpose |
+|------|---------|
+| `Dockerfile.router` | Builds the `vllm-router` Rust binary |
+| `Dockerfile.vllm-rocm` | Builds vLLM from source on the ROCm base image |
+| `run_pd_demo.sh` | Launches prefill, decode, and router containers |
+
+---
+
+## 1. Get the Docker images
+
+### Option A — Pull pre-built images (recommended)
+
+```bash
+docker pull ghcr.io/simondanielsson/vllm-rocm-moriio:dev
+docker pull ghcr.io/simondanielsson/vllm-router:dev
+```
+
+### Option B — Build from source
+
+#### Router image
+
+Build from the **root of the vllm-router repo** (`~/repos/router`):
+
+```bash
+cd ~/repos/router
+docker build \
+    -f <path-to-this-dir>/Dockerfile.router \
+    -t ghcr.io/simondanielsson/vllm-router:dev \
+    .
+```
+
+#### vLLM image
+
+Build from the **root of this vllm repo**:
+
+```bash
+cd <path-to-vllm-repo>
+docker build \
+    -f examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.vllm-rocm \
+    -t ghcr.io/simondanielsson/vllm-rocm-moriio:dev \
+    .
+```
+
+The vLLM build compiles the ROCm wheel from source, which takes a while.
+
+---
+
+## 2. Run the demo
+
+### PR 1 — basic PD-disaggregation (smoke test only to confirm everything works)
+
+```bash
+MODEL=Qwen/Qwen3-8B \
+PREFILL_GPU=0 \
+DECODE_GPU=1 \
+./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo.sh
+```
+
+This sends a single smoke-test request through the router after all services are healthy.
+
+### PR 2 — with streaming support (full benchmark)
+
+Once the streaming PR is merged into the router image, enable the full two-phase benchmark
+(vllm-router + toy proxy) by setting `USE_BENCH=1`:
+
+```bash
+MODEL=Qwen/Qwen3-8B \
+PREFILL_GPU=0 \
+DECODE_GPU=1 \
+USE_BENCH=1 \
+./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo.sh
+```
+
+Benchmark results are written to `~/moriio-logs/benchmark_results.log`.
+
+Environment variables:
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `MODEL` | `Qwen/Qwen3-8B` | HuggingFace model id |
+| `PREFILL_GPU` | `0` | GPU index for the prefill instance |
+| `DECODE_GPU` | `1` | GPU index for the decode instance |
+| `PREFILL_PORT` | `8100` | HTTP port for the prefill vLLM server |
+| `DECODE_PORT` | `8200` | HTTP port for the decode vLLM server |
+| `ROUTER_PORT` | `8080` | HTTP port for vllm-router |
+| `PROXY_PING_PORT` | `36367` | ZMQ service-discovery port (router ↔ vLLM) |
+| `HF_HOME` | `~/.cache/huggingface` | Host path to HuggingFace model cache |
+| `LOG_DIR` | `~/moriio-logs` | Directory for container and benchmark logs |
+| `USE_BENCH` | `0` | Set to `1` to run the full perf benchmark (requires streaming support) |
+| `USE_GSM8K` | `0` | Set to `1` to run a GSM8K accuracy evaluation instead of the perf benchmark |
+| `KEEP_ALIVE` | `0` | Set to `1` to leave containers running after the script exits |
+| `VLLM_IMAGE` | `ghcr.io/simondanielsson/vllm-rocm-moriio:dev` | vLLM Docker image name |
+| `ROUTER_IMAGE` | `ghcr.io/simondanielsson/vllm-router:dev` | Router Docker image name |
+
+---
+
+## 3. Send a test request
+
+Once all three containers are healthy, send requests through the **router**:
+
+```bash
+curl -s http://localhost:8080/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "Qwen/Qwen3-8B",
+        "prompt": "San Francisco is a",
+        "max_tokens": 64,
+        "temperature": 0
+    }' | python3 -m json.tool
+```
+
+---
+
+## 4. Load test with `vllm bench serve`
+
+Run a synthetic load test against the router with 1 000-token inputs, 1 000-token
+outputs, and a max concurrency of 16:
+
+```bash
+vllm bench serve \
+    --base-url http://localhost:8080 \
+    --backend openai-completions \
+    --model Qwen/Qwen3-8B \
+    --dataset-name random \
+    --random-input-len 1000 \
+    --random-output-len 1000 \
+    --max-concurrency 16 \
+    --num-prompts 200
+```
+
+| Flag | Value | Notes |
+|------|-------|-------|
+| `--base-url` | `http://localhost:8080` | Points at the router, not the vLLM instances directly |
+| `--backend` | `openai-completions` | Uses the `/v1/completions` endpoint |
+| `--model` | `Qwen/Qwen3-8B` | Must match the model served |
+| `--dataset-name` | `random` | Fully synthetic, no external dataset file needed |
+| `--random-input-len` | `1000` | Input sequence length (ISL) in tokens |
+| `--random-output-len` | `1000` | Output sequence length (OSL) in tokens |
+| `--max-concurrency` | `16` | Maximum number of in-flight requests |
+| `--num-prompts` | `200` | Total requests to send; increase for a longer run |
+
+---
+
+## 5. GSM8K accuracy evaluation
+
+To validate that PD-disaggregation produces correct outputs, run the GSM8K accuracy
+evaluation against the router instead of a perf benchmark:
+
+```bash
+MODEL=Qwen/Qwen3-8B \
+PREFILL_GPU=0 \
+DECODE_GPU=1 \
+USE_GSM8K=1 \
+./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo.sh
+```
+
+This uses vLLM's built-in `tests/evals/gsm8k/gsm8k_eval.py` script, which sends the
+full GSM8K test set (1 319 questions, 5-shot, temperature=0) through the router so all
+requests flow through the MoRIIO PD-disaggregation pipeline.
+
+Results are written to:
+
+| File | Contents |
+|------|----------|
+| `~/moriio-logs/gsm8k_results.log` | Human-readable accuracy summary |
+| `~/moriio-logs/gsm8k_results.json` | Machine-readable JSON with all metrics |
+
+> **Note:** `USE_GSM8K=1` and `USE_BENCH=1` are mutually exclusive — GSM8K takes
+> priority if both are set.
+
+---
+
+## 6. Teardown
+
+The script shuts down all containers automatically when it exits.
+To leave them running (e.g. to inspect logs or send additional requests), set `KEEP_ALIVE=1`.
+
+If you need to tear down manually:
+
+```bash
+docker rm -f moriio-prefill moriio-decode moriio-router
+```
+
+---
+
+## Architecture
+
+```
+Client
+  │
+  ▼
+vllm-router (port 8080)
+  │  ZMQ service-discovery on PROXY_PING_PORT
+  │  (vLLM instances register themselves at startup)
+  ├──► Prefill instance (GPU 0, port 8100)
+  │       kv_role = kv_producer
+  │       MoRIIOConnector writes KV cache → Decode via RDMA
+  │
+  └──► Decode instance (GPU 1, port 8200)
+          kv_role = kv_consumer
+          MoRIIOConnector reads KV cache from Prefill via RDMA
+```
+
+The router uses `--vllm-pd-disaggregation` + `--vllm-discovery-address` so
+that vLLM instances register dynamically at startup rather than being passed
+as static `--prefill`/`--decode` URLs.