diff --git a/examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.router b/examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.router new file mode 100644 index 000000000000..ba4d8193d985 --- /dev/null +++ b/examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.router @@ -0,0 +1,47 @@ +# Dockerfile for vllm-router (Rust binary) +# +# Build context: the root of the vllm-router repo (~/ repos/router). +# +# docker build -f Dockerfile.router -t vllm-router:dev . +# +# Adapted from the upstream Dockerfile.router, but kept as a standalone file +# so it can be referenced from the demo without modifying the router repo. + +FROM docker.io/rustlang/rust:nightly-bullseye AS rust-builder + +RUN apt-get update && apt-get install -y \ + build-essential \ + pkg-config \ + libssl-dev \ + protobuf-compiler \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Cache dependency compilation layer separately from source changes +COPY Cargo.toml Cargo.lock ./ +COPY build.rs ./ +# Dummy main so cargo can resolve/compile deps without full source +RUN mkdir -p src && echo 'fn main() {}' > src/main.rs +RUN cargo build --release || true +RUN rm -f src/main.rs + +# Now copy real source and build +COPY src ./src +RUN cargo build --release + +# ── runtime image ──────────────────────────────────────────────────────────── +FROM docker.io/debian:bullseye-slim AS runtime + +RUN apt-get update && apt-get install -y \ + ca-certificates \ + libssl1.1 \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=rust-builder /app/target/release/vllm-router /usr/local/bin/vllm-router +RUN chmod +x /usr/local/bin/vllm-router + +EXPOSE 8080 +EXPOSE 29000 + +CMD ["vllm-router", "--host", "0.0.0.0", "--port", "8080"] diff --git a/examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.vllm-rocm b/examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.vllm-rocm new file mode 100644 index 000000000000..ed1bf24af6a0 --- /dev/null +++ b/examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.vllm-rocm @@ -0,0 +1,112 @@ +# Dockerfile for vLLM with MoRIIO KV connector (ROCm) +# +# Installs vLLM from a local source tree on top of the official ROCm base image, +# following the same multi-stage approach as docker/Dockerfile.rocm. +# +# Build from the root of the vllm repo: +# +# docker build \ +# -f examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.vllm-rocm \ +# -t vllm-rocm-moriio:dev \ +# . +# +# The resulting image is used by run_pd_demo.sh. + +# ── base: same image used by the official ROCm vLLM image ──────────────────── +# Must be sha256-8404161df58334093533b2419b669e71d8cc4a4da2c74a2563bb833944fda8b4 +ARG BASE_IMAGE=rocm/vllm-dev:base +FROM ${BASE_IMAGE} AS base + +# Basic utilities required for the build +RUN apt-get update -q -y && apt-get install -q -y \ + sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \ + apt-transport-https ca-certificates wget curl git \ + && rm -rf /var/lib/apt/lists/* + +RUN python3 -m pip install --upgrade pip + +# Install UV (fast Python package installer) +RUN curl -LsSf --retry 3 --retry-delay 5 https://astral.sh/uv/install.sh -o /tmp/uv-install.sh \ + && env UV_INSTALL_DIR="/usr/local/bin" sh /tmp/uv-install.sh \ + && rm -f /tmp/uv-install.sh \ + && uv --version + +ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" +ENV UV_LINK_MODE=copy + +# ── build vLLM from local source ───────────────────────────────────────────── +FROM base AS build_vllm + +WORKDIR /app + +# Copy the full source tree (build context = repo root) +COPY . vllm/ + +RUN cd vllm \ + && python3 -m pip install -r requirements/rocm.txt \ + && python3 setup.py clean --all \ + && python3 setup.py bdist_wheel --dist-dir=dist + +# ── runtime image ───────────────────────────────────────────────────────────── +FROM base AS final + +RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* + +# Install RDMA userspace libraries needed by MoRIIO / RIXL +RUN apt-get update -q -y && apt-get install -q -y \ + librdmacm1 \ + libibverbs1 \ + ibverbs-providers \ + ibverbs-utils \ + autoconf \ + libibverbs-dev \ + libtool \ + unzip \ + wget \ + && rm -rf /var/lib/apt/lists/* + +# Install Broadcom bnxt_re RDMA user-space driver +RUN wget -q https://docs.broadcom.com/docs-and-downloads/ethernet-network-adapters/NXE/Thor2/GCA1/bcm5760x_230.2.52.0a.zip && \ + unzip -q bcm5760x_230.2.52.0a.zip && \ + cd bcm5760x_230.2.52.0a/drivers_linux/bnxt_rocelib/ && \ + results=$(find -name "libbnxt*.tar.gz") && tar -xf $results && \ + untar_dir=$(find . -maxdepth 1 -type d -name "libbnxt*" ! -name "*.tar.gz" | head -n 1) && \ + cd $untar_dir && sh autogen.sh && ./configure && make && \ + find /usr/lib64/ /usr/lib -name "libbnxt_re-rdmav*.so" -exec mv {} {}.inbox \; && \ + make install all && \ + sh -c "echo /usr/local/lib >> /etc/ld.so.conf" && \ + ldconfig && \ + cp -f bnxt_re.driver /etc/libibverbs.d/ && \ + ibv_devices && \ + cd / && rm -rf /bcm5760x_230.2.52.0a /bcm5760x_230.2.52.0a.zip + +# Install vLLM wheel and its ROCm dependencies +RUN --mount=type=bind,from=build_vllm,src=/app/vllm,target=/vllm_src \ + --mount=type=cache,target=/root/.cache/uv \ + cd /vllm_src \ + && uv pip install --system -r requirements/rocm.txt \ + && pip uninstall -y vllm || true \ + && uv pip install --system dist/*.whl \ + && uv pip install --system msgpack + +# Verify ROCm PyTorch (not CUDA) +RUN python3 -c "import torch; assert torch.version.hip is not None, \ + f'Expected ROCm PyTorch but got CUDA (hip={torch.version.hip})'; \ + print(f'Verified: PyTorch {torch.__version__} ROCm HIP {torch.version.hip}')" + +# Copy examples so the proxy server script is available inside the container +COPY --from=build_vllm /app/vllm/examples /app/vllm/examples +# Copy the GSM8K evaluation script (used by run_pd_demo.sh with USE_GSM8K=1) +COPY --from=build_vllm /app/vllm/tests/evals /app/vllm/tests/evals + +# Performance / correctness env vars from the official ROCm Dockerfile +ENV TOKENIZERS_PARALLELISM=false +ENV SAFETENSORS_FAST_GPU=1 +ENV HIP_FORCE_DEV_KERNARG=1 +ENV MIOPEN_DEBUG_CONV_DIRECT=0 +ENV MIOPEN_DEBUG_CONV_GEMM=0 + +WORKDIR /app + +CMD ["/bin/bash"] diff --git a/examples/online_serving/disaggregated_serving/moriio_pd_demo/README.md b/examples/online_serving/disaggregated_serving/moriio_pd_demo/README.md new file mode 100644 index 000000000000..acfe788ba235 --- /dev/null +++ b/examples/online_serving/disaggregated_serving/moriio_pd_demo/README.md @@ -0,0 +1,339 @@ +# MoRIIO PD-disaggregation demo + +Reproduction scripts for running vLLM PD-disaggregation with the +MoRIIOConnector KV connector and the vllm-router. + +- **Single-host** (`run_pd_demo.sh`): requires two ROCm GPUs on one machine (Qwen3-8B). +- **Two-node** (`run_pd_demo_2node.sh`): runs DeepSeek-R1 with TP8 across two hosts connected via RDMA. + +--- + +## Files + +| File | Purpose | +|------|---------| +| `Dockerfile.router` | Builds the `vllm-router` Rust binary | +| `Dockerfile.vllm-rocm` | Builds vLLM from source on the ROCm base image | +| `run_pd_demo.sh` | Launches prefill, decode, and router containers on a single host | +| `run_pd_demo_2node.sh` | Two-node setup (DeepSeek-R1 with TP8); run on both nodes | + +--- + +## 1. Get the Docker images + +### Option A — Pull pre-built images (recommended) + +```bash +# Built from vllm PR https://github.com/vllm-project/vllm/pull/39565, commit 619af871380a7c72876f17101a2f9384fd959cff +docker pull ghcr.io/simondanielsson/vllm-rocm-moriio:dev-0414-0859 +# Basic router support, i.e. PR https://github.com/vllm-project/router/pull/138 +docker pull ghcr.io/simondanielsson/vllm-router:dev +# Basic router support + streaming, i.e. both PRs https://github.com/vllm-project/router/pull/138 and https://github.com/vllm-project/router/pull/114 +docker pull ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy +``` + +### Option B — Build from source + +#### Router image + +```bash +# clone the fork with the PR +git clone git@github.com:simondanielsson/router.git +cd router +# To build the image with basic mori support +git switch feature/moriio-support +docker build \ + -f /Dockerfile.router \ + -t ghcr.io/simondanielsson/vllm-router:dev \ + . + +# To build the image with mori support + streaming: this branch contains the two PR's on top of each other +git switch tmp/transfer_id-and-streaming +docker build \ + -f /Dockerfile.router \ + -t ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy \ + . +``` + +#### vLLM image + +Build from the **root of this vllm repo**: + +```bash +# clone the fork +git clone git@github.com:simondanielsson/vllm.git # vllm branc here +git clone git@github.com:mpashkov/vllm.git path/to/mpashkov-vllm-fork/ # reproducer in this fork, alt. just set multiple remotes +cd vllm +git switch fix/align-mori-messages +docker build \ + -f path/to/mpashkov-vllm-fork/examples/online_serving/disaggregated_serving/moriio_pd_demo/Dockerfile.vllm-rocm \ + -t ghcr.io/simondanielsson/vllm-rocm-moriio:dev \ + . +``` + +The vLLM build compiles the ROCm wheel from source, which takes a while. + +--- + +## 2. Run the demo, single node + +### PR 1 — basic PD-disaggregation (smoke test only to confirm everything works) + +```bash +MODEL=Qwen/Qwen3-8B \ +PREFILL_GPU=0 \ +DECODE_GPU=1 \ +./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo.sh +``` + +This sends a single smoke-test request through the router after all services are healthy. + +### PR 2 — with streaming support (full benchmark) + +Once the streaming PR is merged into the router image, enable the full two-phase benchmark +(vllm-router + toy proxy) by setting `USE_BENCH=1`: + +```bash +MODEL=Qwen/Qwen3-8B \ +PREFILL_GPU=0 \ +DECODE_GPU=1 \ +USE_BENCH=1 \ +./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo.sh +``` + +Benchmark results are written to `~/moriio-logs/benchmark_results.log`. + +Environment variables: + +| Variable | Default | Description | +|----------|---------|-------------| +| `MODEL` | `Qwen/Qwen3-8B` | HuggingFace model id | +| `PREFILL_GPU` | `0` | GPU index for the prefill instance | +| `DECODE_GPU` | `1` | GPU index for the decode instance | +| `PREFILL_PORT` | `8100` | HTTP port for the prefill vLLM server | +| `DECODE_PORT` | `8200` | HTTP port for the decode vLLM server | +| `ROUTER_PORT` | `8080` | HTTP port for vllm-router | +| `PROXY_PING_PORT` | `36367` | ZMQ service-discovery port (router ↔ vLLM) | +| `HF_HOME` | `~/.cache/huggingface` | Host path to HuggingFace model cache | +| `LOG_DIR` | `~/moriio-logs` | Directory for container and benchmark logs | +| `USE_BENCH` | `0` | Set to `1` to run the full perf benchmark (requires streaming support) | +| `USE_GSM8K` | `0` | Set to `1` to run a GSM8K accuracy evaluation instead of the perf benchmark | +| `KEEP_ALIVE` | `0` | Set to `1` to leave containers running after the script exits | +| `VLLM_IMAGE` | `ghcr.io/simondanielsson/vllm-rocm-moriio:dev-0414-0859` | vLLM Docker image name | +| `ROUTER_IMAGE` | `ghcr.io/simondanielsson/vllm-router:dev` | Router image used for smoke-test (no streaming) | +| `ROUTER_STREAMING_IMAGE` | `ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy` | Router image used for `USE_BENCH=1` / `USE_GSM8K=1` (streaming support required) | + +--- + +## DSR1 on two nodes + +Run `run_pd_demo_2node.sh` on two separate nodes. Replace `` and +`` with the actual IP addresses of each host. + +The decode node runs the **same script** in all cases — only the prefill node +changes between benchmark modes. + +### Smoke test + +#### Node 1 — prefill + router + +```bash +IS_PREFILL=1 PREFILL_IP= DECODE_IP= \ + ./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh +``` + +#### Node 2 — decode + +```bash +IS_PREFILL=0 PREFILL_IP= DECODE_IP= \ + ./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh +``` + +### Performance benchmark (`USE_BENCH=1`) + +#### Node 1 — prefill + router + +```bash +IS_PREFILL=1 PREFILL_IP= DECODE_IP= USE_BENCH=1 \ + ./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh +``` + +#### Node 2 — decode + +```bash +IS_PREFILL=0 PREFILL_IP= DECODE_IP= \ + ./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh +``` + +Benchmark results (both phases) are written to `~/moriio-logs/benchmark_results.log` +on Node 1. The decode node automatically restarts for Phase 2 — just leave Node 2's +script running; it will poll for the Phase 2 restart signal from Node 1. + +### GSM8K accuracy evaluation (`USE_GSM8K=1`) + +#### Node 1 — prefill + router + +```bash +IS_PREFILL=1 PREFILL_IP= DECODE_IP= USE_GSM8K=1 \ + ./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh +``` + +#### Node 2 — decode + +```bash +IS_PREFILL=0 PREFILL_IP= DECODE_IP= \ + ./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh +``` + +GSM8K results are written to `~/moriio-logs/gsm8k_results.log` (and `.json`) on Node 1. + +> **Note:** `USE_GSM8K=1` and `USE_BENCH=1` are mutually exclusive — GSM8K takes priority +> if both are set. The GSM8K eval runs only through vllm-router (single-phase); the toy +> proxy comparison in 2-node mode is not yet supported for GSM8K. + +Environment variables for `run_pd_demo_2node.sh`: + +| Variable | Default | Description | +|----------|---------|-------------| +| `IS_PREFILL` | *(required)* | `1` = this node runs prefill + router; `0` = decode only | +| `PREFILL_IP` | *(required)* | IP address of the prefill/router node | +| `DECODE_IP` | *(required)* | IP address of the decode node | +| `MODEL` | `deepseek-ai/DeepSeek-R1-0528` | HuggingFace model id | +| `PREFILL_PORT` | `8100` | HTTP port for the prefill vLLM server | +| `DECODE_PORT` | `8200` | HTTP port for the decode vLLM server | +| `ROUTER_PORT` | `8080` | HTTP port for vllm-router (Node 1 only) | +| `PROXY_PING_PORT` | `36367` | ZMQ service-discovery port | +| `HF_HOME` | `~/.cache/huggingface` | Host path to HuggingFace model cache | +| `LOG_DIR` | `~/moriio-logs` | Directory for logs and benchmark results | +| `USE_BENCH` | `0` | Set to `1` to run the perf benchmark after startup | +| `USE_GSM8K` | `0` | Set to `1` to run the GSM8K accuracy evaluation | +| `KEEP_ALIVE` | `0` | Set to `1` to leave containers running after the script exits | +| `VLLM_IMAGE` | `ghcr.io/simondanielsson/vllm-rocm-moriio:dev-0414-0859` | vLLM Docker image | +| `PHASE2_SIGNAL_PORT` | `19876` | Port on Node 1 used to signal Node 2 to restart for Phase 2 (`USE_BENCH=1`) | +| `ROUTER_IMAGE` | `ghcr.io/simondanielsson/vllm-router:dev` | Router image (smoke test) | +| `ROUTER_STREAMING_IMAGE` | `ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy` | Router image for bench/eval | + +--- + +## 3. Send a test request + +Once all three containers are healthy, send requests through the **router**: + +```bash +curl -s http://localhost:8080/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-8B", + "prompt": "San Francisco is a", + "max_tokens": 64, + "temperature": 0 + }' | python3 -m json.tool +``` + +--- + +## 4. Load test with `vllm bench serve` + +Run a synthetic load test against the router with 1 000-token inputs, 1 000-token +outputs, and a max concurrency of 16: + +```bash +vllm bench serve \ + --base-url http://localhost:8080 \ + --backend openai-completions \ + --model Qwen/Qwen3-8B \ + --dataset-name random \ + --random-input-len 1000 \ + --random-output-len 1000 \ + --max-concurrency 16 \ + --num-prompts 200 +``` + +| Flag | Value | Notes | +|------|-------|-------| +| `--base-url` | `http://localhost:8080` | Points at the router, not the vLLM instances directly | +| `--backend` | `openai-completions` | Uses the `/v1/completions` endpoint | +| `--model` | `Qwen/Qwen3-8B` | Must match the model served | +| `--dataset-name` | `random` | Fully synthetic, no external dataset file needed | +| `--random-input-len` | `1000` | Input sequence length (ISL) in tokens | +| `--random-output-len` | `1000` | Output sequence length (OSL) in tokens | +| `--max-concurrency` | `16` | Maximum number of in-flight requests | +| `--num-prompts` | `200` | Total requests to send; increase for a longer run | + +--- + +## 5. GSM8K accuracy evaluation (two-phase comparison) + +To compare accuracy between the vllm-router and the toy proxy, run: + +```bash +MODEL=Qwen/Qwen3-8B \ +PREFILL_GPU=0 \ +DECODE_GPU=1 \ +USE_GSM8K=1 \ +./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo.sh +``` + +This uses the [`lm_eval`](https://github.com/EleutherAI/lm-evaluation-harness) harness +with `--model local-completions` pointed at the active proxy, running the full GSM8K +test set, and runs it **twice**: + +1. **Phase 1** — through `vllm-router` (services already running) +2. **Phase 2** — vllm-router is replaced by `moriio_toy_proxy_server.py`; prefill and + decode are restarted so they re-register with the new proxy, then the eval runs again + +`lm_eval[api]` is installed automatically inside the container before each run +(`local-completions` is an API model backend that requires `tenacity`). + +At the end the script prints a side-by-side summary of `exact_match,flexible-extract` +and `exact_match,strict-match` for both routers. + +Results are written to: + +| File | Contents | +|------|----------| +| `~/moriio-logs/gsm8k_results_router.log` | Phase 1 human-readable summary (vllm-router) | +| `~/moriio-logs/gsm8k_results_router.json` | Phase 1 machine-readable JSON | +| `~/moriio-logs/gsm8k_results_toy_proxy.log` | Phase 2 human-readable summary (toy proxy) | +| `~/moriio-logs/gsm8k_results_toy_proxy.json` | Phase 2 machine-readable JSON | +| `~/moriio-logs/toy_proxy.log` | Toy proxy container logs | + +> **Note:** `USE_GSM8K=1` and `USE_BENCH=1` are mutually exclusive — GSM8K takes +> priority if both are set. + +--- + +## 6. Teardown + +The script shuts down all containers automatically when it exits. +To leave them running (e.g. to inspect logs or send additional requests), set `KEEP_ALIVE=1`. + +If you need to tear down manually: + +```bash +docker rm -f moriio-prefill moriio-decode moriio-router +``` + +--- + +## Architecture + +``` +Client + │ + ▼ +vllm-router (port 8080) + │ ZMQ service-discovery on PROXY_PING_PORT + │ (vLLM instances register themselves at startup) + ├──► Prefill instance (GPU 0, port 8100) + │ kv_role = kv_producer + │ MoRIIOConnector writes KV cache → Decode via RDMA + │ + └──► Decode instance (GPU 1, port 8200) + kv_role = kv_consumer + MoRIIOConnector reads KV cache from Prefill via RDMA +``` + +The router uses `--vllm-pd-disaggregation` + `--vllm-discovery-address` so +that vLLM instances register dynamically at startup rather than being passed +as static `--prefill`/`--decode` URLs. diff --git a/examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo.sh b/examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo.sh new file mode 100755 index 000000000000..6dce239ebb61 --- /dev/null +++ b/examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo.sh @@ -0,0 +1,712 @@ +#!/usr/bin/env bash +# run_pd_demo.sh — Launch a single-node MoRIIO PD-disaggregation demo +# +# Prerequisites +# • Docker images built (see README in this directory): +# vllm-rocm-moriio:dev (from Dockerfile.vllm-rocm) +# vllm-router:dev (from Dockerfile.router) +# • At least 2 ROCm GPUs visible to the host +# • A HuggingFace model cache at HF_HOME (default: ~/.cache/huggingface) +# +# Usage: +# MODEL=meta-llama/Llama-3.1-8B-Instruct ./run_pd_demo.sh +# MODEL=meta-llama/Llama-3.1-8B-Instruct PREFILL_GPU=0 DECODE_GPU=1 ./run_pd_demo.sh +# +# To tear everything down: +# docker rm -f moriio-prefill moriio-decode moriio-router + +set -euo pipefail + +# ── Configuration ───────────────────────────────────────────────────────────── +MODEL="${MODEL:-Qwen/Qwen3-8B}" + +PREFILL_GPU="${PREFILL_GPU:-0}" # GPU index for the prefill instance +DECODE_GPU="${DECODE_GPU:-1}" # GPU index for the decode instance + +PREFILL_PORT="${PREFILL_PORT:-8100}" # HTTP port exposed by the prefill vLLM +DECODE_PORT="${DECODE_PORT:-8200}" # HTTP port exposed by the decode vLLM +ROUTER_PORT="${ROUTER_PORT:-8080}" # HTTP port exposed by vllm-router +PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" # ZMQ service-discovery port (router ↔ vLLM) + +# MoRIIO internal ports — must be distinct between prefill and decode on the same host. +# handshake_port : initial MoRIIO engine handshake (default in code: 6301) +# notify_port : prefill↔decode stage synchronisation (default in code: 61005) +PREFILL_HANDSHAKE_PORT="${PREFILL_HANDSHAKE_PORT:-6301}" +DECODE_HANDSHAKE_PORT="${DECODE_HANDSHAKE_PORT:-6302}" +PREFILL_NOTIFY_PORT="${PREFILL_NOTIFY_PORT:-61005}" +DECODE_NOTIFY_PORT="${DECODE_NOTIFY_PORT:-61006}" + +VLLM_IMAGE="${VLLM_IMAGE:-ghcr.io/simondanielsson/vllm-rocm-moriio:dev-0411-1127}" +# Basic router (smoke-test only — no streaming support) +ROUTER_IMAGE="${ROUTER_IMAGE:-ghcr.io/simondanielsson/vllm-router:dev}" +# Streaming-capable router (required for USE_BENCH=1 and USE_GSM8K=1) +ROUTER_STREAMING_IMAGE="${ROUTER_STREAMING_IMAGE:-ghcr.io/simondanielsson/vllm-router:dev-streaming}" + +HF_HOME="${HF_HOME:-${HOME}/.cache/huggingface}" + +LOG_DIR="${LOG_DIR:-${HOME}/moriio-logs}" +SHM_SIZE="${SHM_SIZE:-128G}" +USE_BENCH="${USE_BENCH:-0}" # Set to 1 to run full benchmark on both routers +USE_GSM8K="${USE_GSM8K:-0}" # Set to 1 to run GSM8K accuracy eval instead of the perf benchmark +KEEP_ALIVE="${KEEP_ALIVE:-0}" # Set to 1 to leave containers running after the script exits + +# Max tokens the prefill instance is allowed to generate (1 = prefill only) +# The decode instance does the actual generation. +PREFILL_MAX_MODEL_LEN="${PREFILL_MAX_MODEL_LEN:-8192}" +DECODE_MAX_MODEL_LEN="${DECODE_MAX_MODEL_LEN:-8192}" + +# ── Derive the host IP that containers will reach each other on ─────────────── +# Use the docker bridge gateway as "this host" so containers can talk to each +# other through the host network. On Linux, host.docker.internal may not +# resolve; the bridge gateway (172.17.0.1) is usually reliable. +HOST_IP="${HOST_IP:-$(docker network inspect bridge \ + --format '{{range .IPAM.Config}}{{.Gateway}}{{end}}' 2>/dev/null \ + || echo "172.17.0.1")}" + +mkdir -p "${LOG_DIR}" + +# ── Toy-proxy patch: fix non-streaming responses ────────────────────────────── +# gsm8k_eval.py (and any non-streaming client) posts without "stream": true. +# The toy proxy always returns an async-generator response, which Quart labels +# as text/html. This patch makes handle_request() buffer + return a proper +# JSON response when the client did not request streaming. +# The patch script is mounted read-only into the toy-proxy container and run +# once at startup, before the proxy process is launched. +TOY_PROXY_PATCH_SCRIPT="$(mktemp /tmp/patch_toy_proxy.XXXXXX.py)" +cat > "${TOY_PROXY_PATCH_SCRIPT}" << 'PYEOF' +import pathlib + +TARGET = pathlib.Path( + "/app/vllm/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py" +) +src = TARGET.read_text() + +OLD = ( + " session, decode_response = await decode_request_task\n" + " stream_generator = stream_decode_response(session, decode_response, request_id)\n" + " response = await make_response(stream_generator)\n" + " return response" +) +NEW = ( + " session, decode_response = await decode_request_task\n" + " if req_data.get(\"stream\", False):\n" + " stream_generator = stream_decode_response(\n" + " session, decode_response, request_id\n" + " )\n" + " response = await make_response(stream_generator)\n" + " return response\n" + " else:\n" + " try:\n" + " body = await decode_response.read()\n" + " content_type = decode_response.headers.get(\n" + " \"Content-Type\", \"application/json\"\n" + " )\n" + " finally:\n" + " await session.close()\n" + " response = await make_response(body, decode_response.status)\n" + " response.headers[\"Content-Type\"] = content_type\n" + " return response" +) + +if OLD in src: + TARGET.write_text(src.replace(OLD, NEW, 1)) + print("toy proxy patch: non-streaming fix applied.") +else: + print("toy proxy patch: target not found — skipping (already patched?).") +PYEOF + +echo "=== MoRIIO PD disaggregation demo ===" +echo " Model : ${MODEL}" +echo " Host IP : ${HOST_IP}" +echo " Prefill GPU : GPU ${PREFILL_GPU} → port ${PREFILL_PORT}" +echo " Decode GPU : GPU ${DECODE_GPU} → port ${DECODE_PORT}" +echo " Router port : ${ROUTER_PORT}" +echo " Discovery port: ${PROXY_PING_PORT}" +echo " Prefill MoRIIO: handshake=${PREFILL_HANDSHAKE_PORT} notify=${PREFILL_NOTIFY_PORT}" +echo " Decode MoRIIO: handshake=${DECODE_HANDSHAKE_PORT} notify=${DECODE_NOTIFY_PORT}" +echo " Log dir : ${LOG_DIR}" +echo "" + +# ── Helper: wait for a vLLM /health endpoint ───────────────────────────────── +wait_for_health() { + local name="$1" + local port="$2" + local max_wait=300 # seconds + local interval=5 + local elapsed=0 + + echo "Waiting for ${name} (port ${port}) to become healthy..." + while true; do + if curl -sf "http://localhost:${port}/health" >/dev/null 2>&1; then + echo " ${name} is healthy." + return 0 + fi + sleep "${interval}" + elapsed=$((elapsed + interval)) + if [[ "${elapsed}" -ge "${max_wait}" ]]; then + echo "ERROR: ${name} did not become healthy after ${max_wait}s" >&2 + docker logs "${name}" | tail -30 >&2 + exit 1 + fi + echo " Still waiting for ${name} (${elapsed}s / ${max_wait}s)..." + done +} + +# ── Remove stale containers (if any) ───────────────────────────────────────── +for cname in moriio-prefill moriio-decode moriio-router moriio-toy-proxy; do + if docker ps -a --format '{{.Names}}' | grep -q "^${cname}$"; then + echo "Removing existing container: ${cname}" + docker rm -f "${cname}" + fi +done + +# ── KV-transfer config shared between prefill and decode ───────────────────── +# proxy_ip : address that vLLM uses to register with the router's ZMQ socket +# proxy_ping_port : the port the router listens on for registration messages +# http_port : the vLLM instance's own HTTP port (used in the zmq_address that +# gets embedded in request IDs by the router) + +PREFILL_KV_CONFIG=$(cat <>> Starting prefill instance (GPU ${PREFILL_GPU}, port ${PREFILL_PORT})..." + +docker run -d \ + --name moriio-prefill \ + "${VLLM_COMMON_ARGS[@]}" \ + -e HIP_VISIBLE_DEVICES="${PREFILL_GPU}" \ + "${VLLM_IMAGE}" \ + vllm serve "${MODEL}" \ + --port "${PREFILL_PORT}" \ + --max-model-len "${PREFILL_MAX_MODEL_LEN}" \ + --trust-remote-code \ + --kv-transfer-config "${PREFILL_KV_CONFIG}" + +docker logs -f moriio-prefill 2>&1 | tee "${LOG_DIR}/prefill.log" & + +# ── Launch decode instance ──────────────────────────────────────────────────── +echo ">>> Starting decode instance (GPU ${DECODE_GPU}, port ${DECODE_PORT})..." + +docker run -d \ + --name moriio-decode \ + "${VLLM_COMMON_ARGS[@]}" \ + -e HIP_VISIBLE_DEVICES="${DECODE_GPU}" \ + "${VLLM_IMAGE}" \ + vllm serve "${MODEL}" \ + --port "${DECODE_PORT}" \ + --max-model-len "${DECODE_MAX_MODEL_LEN}" \ + --trust-remote-code \ + --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \ + --kv-transfer-config "${DECODE_KV_CONFIG}" + +docker logs -f moriio-decode 2>&1 | tee "${LOG_DIR}/decode.log" & + +# ── Wait for both vLLM instances to be healthy before starting the router ───── +wait_for_health "moriio-prefill" "${PREFILL_PORT}" +wait_for_health "moriio-decode" "${DECODE_PORT}" + +# ── Launch vllm-router ──────────────────────────────────────────────────────── +# USE_BENCH and USE_GSM8K require the streaming-capable router image. +if [[ "${USE_BENCH}" == "1" || "${USE_GSM8K}" == "1" ]]; then + _ACTIVE_ROUTER_IMAGE="${ROUTER_STREAMING_IMAGE}" +else + _ACTIVE_ROUTER_IMAGE="${ROUTER_IMAGE}" +fi + +echo "" +echo ">>> Starting vllm-router (port ${ROUTER_PORT}, discovery port ${PROXY_PING_PORT})..." +echo " Image: ${_ACTIVE_ROUTER_IMAGE}" + +docker run -d \ + --name moriio-router \ + --network host \ + "${_ACTIVE_ROUTER_IMAGE}" \ + vllm-router \ + --vllm-pd-disaggregation \ + --vllm-discovery-address "0.0.0.0:${PROXY_PING_PORT}" \ + --port "${ROUTER_PORT}" \ + --host 0.0.0.0 \ + --policy consistent_hash \ + --prefill-policy consistent_hash \ + --decode-policy consistent_hash \ + --log-level info + +docker logs -f moriio-router 2>&1 | tee "${LOG_DIR}/router.log" & + +# ── Wait briefly for the router to be ready (it starts fast) ───────────────── +echo "Waiting for router to start..." +sleep 5 + +# ── Print summary ───────────────────────────────────────────────────────────── +echo "" +echo "=== All services running ===" +echo " Prefill : http://localhost:${PREFILL_PORT}" +echo " Decode : http://localhost:${DECODE_PORT}" +echo " Router : http://localhost:${ROUTER_PORT} ← send requests here" +echo "" +echo "Example request:" +echo " curl -s http://localhost:${ROUTER_PORT}/v1/completions \\" +echo " -H 'Content-Type: application/json' \\" +echo " -d '{" +echo " \"model\": \"${MODEL}\"," +echo " \"prompt\": \"San Francisco is a\"," +echo " \"max_tokens\": 64," +echo " \"temperature\": 0" +echo " }' | python3 -m json.tool" +echo "" +echo "To follow logs (log files are written to ${LOG_DIR}):" +echo " tail -f ${LOG_DIR}/prefill.log" +echo " tail -f ${LOG_DIR}/decode.log" +echo " tail -f ${LOG_DIR}/router.log" +echo "" +echo "Containers will be shut down automatically when the script exits." +echo " (Set KEEP_ALIVE=1 to leave them running.)" + +if [[ "${USE_GSM8K}" == "1" ]]; then + +# ── GSM8K two-phase accuracy comparison (via lm_eval) ──────────────────────── +# Phase 1 : vllm-router (already running) +# Phase 2 : toy proxy (router replaced, vLLM instances restarted) +# +# Uses the lm_eval harness with --model local-completions so the full GSM8K +# test set flows through whichever proxy is active. + +# Per-phase output paths +GSM8K_ROUTER_LOG="${LOG_DIR}/gsm8k_results_router.log" +GSM8K_ROUTER_JSON="${LOG_DIR}/gsm8k_results_router.json" +GSM8K_PROXY_LOG="${LOG_DIR}/gsm8k_results_toy_proxy.log" +GSM8K_PROXY_JSON="${LOG_DIR}/gsm8k_results_toy_proxy.json" + +TOY_PROXY_CONTAINER_PATH="/app/vllm/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py" +TOY_PROXY_HTTP_PORT=10001 + +# Helper: run lm_eval gsm8k inside a container against the given port. +# Installs lm_eval[vllm] if needed, writes stdout to log_file, and copies +# the lm_eval results JSON out of the container to json_dest. +run_lm_eval() { + local container="$1" + local port="$2" + local log_file="$3" + local json_dest="$4" + local out_dir="/tmp/lm_eval_out" + + docker exec "${container}" bash -c \ + "pip install --quiet 'lm_eval[api]' && \ + rm -rf ${out_dir} && \ + lm_eval \ + --model local-completions \ + --model_args model=${MODEL},base_url=http://127.0.0.1:${port}/v1/completions,tokenized_requests=False,trust_remote_code=True \ + --tasks gsm8k \ + --output_path ${out_dir}" \ + 2>&1 | tee -a "${log_file}" + + # lm_eval writes results.json somewhere under out_dir; find and copy it. + local remote_json + remote_json=$(docker exec "${container}" \ + find "${out_dir}" -name "results.json" 2>/dev/null | head -1) + if [[ -n "${remote_json}" ]]; then + docker cp "${container}:${remote_json}" "${json_dest}" 2>/dev/null || true + else + echo "WARNING: lm_eval results.json not found in ${out_dir}" >&2 + fi +} + +# ── Phase 1: GSM8K through vllm-router ─────────────────────────────────────── +echo "" +echo ">>> Phase 1: GSM8K accuracy evaluation (lm_eval) through vllm-router..." +echo " (full GSM8K test set, results → ${GSM8K_ROUTER_LOG})" +{ + echo "======================================================" + echo " GSM8K evaluation (lm_eval) — Phase 1: vllm-router" + echo " Model : ${MODEL}" + echo " Date : $(date)" + echo "======================================================" +} | tee "${GSM8K_ROUTER_LOG}" + +run_lm_eval "moriio-prefill" "${ROUTER_PORT}" "${GSM8K_ROUTER_LOG}" "${GSM8K_ROUTER_JSON}" + +# ── Phase 2: switch to toy proxy and re-run ─────────────────────────────────── +echo "" +echo ">>> Stopping vllm-router, prefill, and decode for Phase 2..." +docker rm -f moriio-router moriio-prefill moriio-decode + +echo ">>> Starting toy proxy container (HTTP :${TOY_PROXY_HTTP_PORT}, ZMQ :${PROXY_PING_PORT})..." +docker run -d \ + --name moriio-toy-proxy \ + --network host \ + -v "${TOY_PROXY_PATCH_SCRIPT}:/tmp/patch_toy_proxy.py:ro" \ + "${VLLM_IMAGE}" \ + bash -c "pip install --quiet --ignore-installed quart aiohttp msgpack && \ + python3 /tmp/patch_toy_proxy.py && \ + python3 -u ${TOY_PROXY_CONTAINER_PATH}" + +docker logs -f moriio-toy-proxy 2>&1 | tee "${LOG_DIR}/toy_proxy.log" & + +# Wait for the toy proxy HTTP port before starting vLLM instances. +echo "Waiting for toy proxy HTTP port ${TOY_PROXY_HTTP_PORT} to open..." +_tp_wait=0 +until curl -sf "http://localhost:${TOY_PROXY_HTTP_PORT}/" >/dev/null 2>&1 \ + || curl -sf "http://localhost:${TOY_PROXY_HTTP_PORT}/v1/completions" >/dev/null 2>&1 \ + || nc -z 127.0.0.1 "${TOY_PROXY_HTTP_PORT}" 2>/dev/null; do + sleep 2 + _tp_wait=$((_tp_wait + 2)) + if [[ "${_tp_wait}" -ge 60 ]]; then + echo "WARNING: toy proxy did not open port ${TOY_PROXY_HTTP_PORT} after 60s" >&2 + docker logs moriio-toy-proxy 2>&1 | tail -20 >&2 + break + fi +done +echo " Toy proxy is up." + +echo ">>> Restarting prefill instance (GPU ${PREFILL_GPU}, port ${PREFILL_PORT})..." +docker run -d \ + --name moriio-prefill \ + "${VLLM_COMMON_ARGS[@]}" \ + -e HIP_VISIBLE_DEVICES="${PREFILL_GPU}" \ + "${VLLM_IMAGE}" \ + vllm serve "${MODEL}" \ + --port "${PREFILL_PORT}" \ + --max-model-len "${PREFILL_MAX_MODEL_LEN}" \ + --trust-remote-code \ + --kv-transfer-config "${PREFILL_KV_CONFIG}" + +docker logs -f moriio-prefill 2>&1 | tee "${LOG_DIR}/prefill_phase2.log" & + +echo ">>> Restarting decode instance (GPU ${DECODE_GPU}, port ${DECODE_PORT})..." +docker run -d \ + --name moriio-decode \ + "${VLLM_COMMON_ARGS[@]}" \ + -e HIP_VISIBLE_DEVICES="${DECODE_GPU}" \ + "${VLLM_IMAGE}" \ + vllm serve "${MODEL}" \ + --port "${DECODE_PORT}" \ + --max-model-len "${DECODE_MAX_MODEL_LEN}" \ + --trust-remote-code \ + --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \ + --kv-transfer-config "${DECODE_KV_CONFIG}" + +docker logs -f moriio-decode 2>&1 | tee "${LOG_DIR}/decode_phase2.log" & + +wait_for_health "moriio-prefill" "${PREFILL_PORT}" +wait_for_health "moriio-decode" "${DECODE_PORT}" + +# Reuse the same registration-wait helper defined in the USE_BENCH block +# (define it inline here so USE_GSM8K works standalone). +echo "Waiting for prefill and decode to register with toy proxy..." +_reg_wait=0 +while true; do + _p=$(docker logs moriio-toy-proxy 2>&1 | grep -c "Registered Prefill" || true) + _d=$(docker logs moriio-toy-proxy 2>&1 | grep -c "Registered Decode" || true) + if [[ "${_p:-0}" -ge 1 && "${_d:-0}" -ge 1 ]]; then + echo " Both instances registered." + break + fi + sleep 3 + _reg_wait=$((_reg_wait + 3)) + if [[ "${_reg_wait}" -ge 120 ]]; then + echo "WARNING: timed out waiting for toy proxy registrations after 120s" >&2 + break + fi + echo " Still waiting (${_reg_wait}s / 120s) — prefill=${_p:-0} decode=${_d:-0}..." +done + +echo "" +echo ">>> Phase 2: GSM8K accuracy evaluation (lm_eval) through toy proxy..." +echo " (full GSM8K test set, results → ${GSM8K_PROXY_LOG})" +{ + echo "======================================================" + echo " GSM8K evaluation (lm_eval) — Phase 2: moriio_toy_proxy_server.py" + echo " Model : ${MODEL}" + echo " Date : $(date)" + echo "======================================================" +} | tee "${GSM8K_PROXY_LOG}" + +run_lm_eval "moriio-prefill" "${TOY_PROXY_HTTP_PORT}" "${GSM8K_PROXY_LOG}" "${GSM8K_PROXY_JSON}" + +# ── Side-by-side summary ────────────────────────────────────────────────────── +# lm_eval results.json structure: +# { "results": { "gsm8k": { "exact_match,flexible-extract": 0.xx, ... } } } +_gsm8k_metric() { + local json_file="$1" + local key="$2" + python3 -c " +import json, sys +try: + d = json.load(open('${json_file}')) + v = d['results']['gsm8k'].get('${key}', 'N/A') + print(f'{v:.4f}' if isinstance(v, float) else v) +except Exception as e: + print('N/A') +" 2>/dev/null || echo "N/A" +} + +echo "" +echo "=== GSM8K comparison complete ===" +echo "" +printf "%-35s %-15s %s\n" "Metric" "vllm-router" "toy-proxy" +printf "%-35s %-15s %s\n" "------" "-----------" "---------" +for key in "exact_match,flexible-extract" "exact_match,strict-match"; do + r=$(_gsm8k_metric "${GSM8K_ROUTER_JSON}" "${key}") + p=$(_gsm8k_metric "${GSM8K_PROXY_JSON}" "${key}") + printf "%-35s %-15s %s\n" "${key}" "${r}" "${p}" +done +echo "" +echo " Phase 1 log : ${GSM8K_ROUTER_LOG}" +echo " Phase 1 JSON : ${GSM8K_ROUTER_JSON}" +echo " Phase 2 log : ${GSM8K_PROXY_LOG}" +echo " Phase 2 JSON : ${GSM8K_PROXY_JSON}" +echo " Toy proxy log: ${LOG_DIR}/toy_proxy.log" + +elif [[ "${USE_BENCH}" == "1" ]]; then + +# ── Benchmark helpers ───────────────────────────────────────────────────────── +# Both the benchmark tool and the toy proxy run inside moriio-prefill because: +# • vllm (bench serve) is installed there +# • examples/ are copied to /app/vllm/examples/ in the image (see Dockerfile) +# • --network host means localhost:ROUTER_PORT is the same inside and outside +BENCH_LOG="${LOG_DIR}/benchmark_results.log" +TOY_PROXY_CONTAINER_PATH="/app/vllm/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py" + +# Common args shared by both benchmark runs +BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-16}" +BENCH_NUM_WARMUPS=$((BENCH_MAX_CONCURRENCY * 2)) +BENCH_NUM_PROMPTS=$((BENCH_MAX_CONCURRENCY * 10)) + +BENCH_ARGS=( + --backend vllm + --model "${MODEL}" + --dataset-name random + --random-input-len 1000 + --random-output-len 1000 + --max-concurrency "${BENCH_MAX_CONCURRENCY}" + --num-warmups "${BENCH_NUM_WARMUPS}" + --num-prompts "${BENCH_NUM_PROMPTS}" + --ready_check_timeout_sec 3000 + --seed 1234 +) + +# Poll docker logs of the toy proxy container until both instances have registered. +# Using docker logs avoids Python stdout-buffering issues (print() to a file is +# fully buffered; docker logs reads directly from the container's log driver). +wait_for_toy_proxy_registrations() { + local max_wait=120 + local interval=3 + local elapsed=0 + echo "Waiting for prefill and decode to register with toy proxy..." + while true; do + local p d + p=$(docker logs moriio-toy-proxy 2>&1 | grep -c "Registered Prefill" || true) + d=$(docker logs moriio-toy-proxy 2>&1 | grep -c "Registered Decode" || true) + if [[ "${p:-0}" -ge 1 && "${d:-0}" -ge 1 ]]; then + echo " Both instances registered." + return 0 + fi + sleep "${interval}" + elapsed=$((elapsed + interval)) + if [[ "${elapsed}" -ge "${max_wait}" ]]; then + echo "WARNING: timed out waiting for toy proxy registrations after ${max_wait}s" >&2 + return 0 + fi + echo " Still waiting (${elapsed}s / ${max_wait}s) — prefill=${p:-0} decode=${d:-0}..." + done +} + +# ── Phase 1: benchmark through vllm-router ──────────────────────────────────── +echo "" +echo ">>> Phase 1: benchmarking through vllm-router..." +{ + echo "======================================================" + echo " Router: vllm-router" + echo " Date : $(date)" + echo "======================================================" +} | tee -a "${BENCH_LOG}" + +docker exec moriio-prefill \ + vllm bench serve \ + --base-url "http://localhost:${ROUTER_PORT}" \ + "${BENCH_ARGS[@]}" 2>&1 | tee -a "${BENCH_LOG}" + +# ── Phase 2: switch to toy proxy and benchmark again ────────────────────────── +# The toy proxy's HTTP port is hardcoded in the image as 10001; its ZMQ +# discovery port defaults to PROXY_PING_PORT (36367), same as vllm-router. +# We run it as a dedicated container so docker logs captures stdout without +# any Python output-buffering issues (no grep-on-file race). +# Prefill and decode are restarted so they register fresh with the new proxy. +TOY_PROXY_HTTP_PORT=10001 + +echo "" +echo ">>> Stopping vllm-router, prefill, and decode..." +docker rm -f moriio-router moriio-prefill moriio-decode + +echo ">>> Starting toy proxy container (HTTP :${TOY_PROXY_HTTP_PORT}, ZMQ :${PROXY_PING_PORT})..." +docker run -d \ + --name moriio-toy-proxy \ + --network host \ + -v "${TOY_PROXY_PATCH_SCRIPT}:/tmp/patch_toy_proxy.py:ro" \ + "${VLLM_IMAGE}" \ + bash -c "pip install --quiet --ignore-installed quart aiohttp msgpack && \ + python3 /tmp/patch_toy_proxy.py && \ + python3 -u ${TOY_PROXY_CONTAINER_PATH}" + +docker logs -f moriio-toy-proxy 2>&1 | tee "${LOG_DIR}/toy_proxy.log" & + +# Wait for the toy proxy HTTP port before starting vLLM (avoids a race where +# instances start sending heartbeats before the ZMQ socket is bound). +echo "Waiting for toy proxy HTTP port ${TOY_PROXY_HTTP_PORT} to open..." +_tp_wait=0 +until curl -sf "http://localhost:${TOY_PROXY_HTTP_PORT}/" >/dev/null 2>&1 \ + || curl -sf "http://localhost:${TOY_PROXY_HTTP_PORT}/v1/completions" >/dev/null 2>&1 \ + || nc -z 127.0.0.1 "${TOY_PROXY_HTTP_PORT}" 2>/dev/null; do + sleep 2 + _tp_wait=$((_tp_wait + 2)) + if [[ "${_tp_wait}" -ge 60 ]]; then + echo "WARNING: toy proxy did not open port ${TOY_PROXY_HTTP_PORT} after 60s" >&2 + docker logs moriio-toy-proxy 2>&1 | tail -20 >&2 + break + fi +done +echo " Toy proxy is up." + +echo ">>> Restarting prefill instance (GPU ${PREFILL_GPU}, port ${PREFILL_PORT})..." +docker run -d \ + --name moriio-prefill \ + "${VLLM_COMMON_ARGS[@]}" \ + -e HIP_VISIBLE_DEVICES="${PREFILL_GPU}" \ + "${VLLM_IMAGE}" \ + vllm serve "${MODEL}" \ + --port "${PREFILL_PORT}" \ + --max-model-len "${PREFILL_MAX_MODEL_LEN}" \ + --trust-remote-code \ + --kv-transfer-config "${PREFILL_KV_CONFIG}" + +docker logs -f moriio-prefill 2>&1 | tee "${LOG_DIR}/prefill_phase2.log" & + +echo ">>> Restarting decode instance (GPU ${DECODE_GPU}, port ${DECODE_PORT})..." +docker run -d \ + --name moriio-decode \ + "${VLLM_COMMON_ARGS[@]}" \ + -e HIP_VISIBLE_DEVICES="${DECODE_GPU}" \ + "${VLLM_IMAGE}" \ + vllm serve "${MODEL}" \ + --port "${DECODE_PORT}" \ + --max-model-len "${DECODE_MAX_MODEL_LEN}" \ + --trust-remote-code \ + --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \ + --kv-transfer-config "${DECODE_KV_CONFIG}" + +docker logs -f moriio-decode 2>&1 | tee "${LOG_DIR}/decode_phase2.log" & + +wait_for_health "moriio-prefill" "${PREFILL_PORT}" +wait_for_health "moriio-decode" "${DECODE_PORT}" + +wait_for_toy_proxy_registrations + +echo "" +echo ">>> Phase 2: benchmarking through toy proxy..." +{ + echo "" + echo "======================================================" + echo " Router: moriio_toy_proxy_server.py" + echo " Date : $(date)" + echo "======================================================" +} | tee -a "${BENCH_LOG}" + +docker exec moriio-prefill \ + vllm bench serve \ + --base-url "http://localhost:${TOY_PROXY_HTTP_PORT}" \ + "${BENCH_ARGS[@]}" 2>&1 | tee -a "${BENCH_LOG}" + +echo "" +echo "=== Both benchmark runs complete ===" +echo " Results : ${BENCH_LOG}" +echo " Toy proxy log: ${LOG_DIR}/toy_proxy.log" + +else + +# ── Smoke test: single completion request through the router ────────────────── +echo "" +echo ">>> Smoke test: sending a completion request through vllm-router..." +curl -s "http://localhost:${ROUTER_PORT}/v1/completions" \ + -H "Content-Type: application/json" \ + -d "{ + \"model\": \"${MODEL}\", + \"prompt\": \"San Francisco is a\", + \"max_tokens\": 64, + \"temperature\": 0 + }" | python3 -m json.tool +echo "" +echo "(Set USE_BENCH=1 to run the full perf benchmark, or USE_GSM8K=1 for accuracy evaluation.)" + +fi + +# ── Teardown ────────────────────────────────────────────────────────────────── +if [[ "${KEEP_ALIVE}" == "1" ]]; then + echo "" + echo "KEEP_ALIVE=1 — containers left running." + echo " To tear down: docker rm -f moriio-prefill moriio-decode moriio-router moriio-toy-proxy" +else + echo "" + echo ">>> Shutting down containers..." + docker rm -f moriio-prefill moriio-decode moriio-router moriio-toy-proxy 2>/dev/null || true + echo "Done. All containers removed." +fi diff --git a/examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh b/examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh new file mode 100755 index 000000000000..05229dc7f4da --- /dev/null +++ b/examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh @@ -0,0 +1,740 @@ +#!/usr/bin/env bash +# run_pd_demo_2node.sh — Two-node MoRIIO PD-disaggregation demo +# +# Run the SAME script on BOTH nodes, setting IS_PREFILL to distinguish roles: +# +# Node 1 — prefill instance + vllm-router: +# IS_PREFILL=1 PREFILL_IP= DECODE_IP= \ +# ./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh +# +# Node 2 — decode instance: +# IS_PREFILL=0 PREFILL_IP= DECODE_IP= \ +# ./examples/online_serving/disaggregated_serving/moriio_pd_demo/run_pd_demo_2node.sh +# +# Prerequisites +# • 8 ROCm GPUs on each node +# • Docker image pulled: ghcr.io/simondanielsson/vllm-rocm-moriio:dev-0414-0859 +# • Router image pulled: ghcr.io/simondanielsson/vllm-router:dev (or :dev-streaming-cn-cjy) +# • RDMA / InfiniBand devices visible at /dev/infiniband on each node +# • Ports PREFILL_PORT, DECODE_PORT, ROUTER_PORT, PROXY_PING_PORT, +# HANDSHAKE_PORT, NOTIFY_PORT, PHASE2_SIGNAL_PORT open between the two nodes +# +# Phase 2 coordination (USE_BENCH=1 or USE_GSM8K=1) +# The prefill node starts a tiny HTTP signal server on PHASE2_SIGNAL_PORT +# after Phase 1 completes. The decode node polls that port; once it gets a +# response it tears down its container and restarts so the new instance +# registers with the toy proxy instead of the router. The prefill node waits +# for the decode to become healthy again before running Phase 2. + +set -euo pipefail + +# ── Required env vars ───────────────────────────────────────────────────────── +IS_PREFILL="${IS_PREFILL:-}" +PREFILL_IP="${PREFILL_IP:-}" +DECODE_IP="${DECODE_IP:-}" + +if [[ -z "${IS_PREFILL}" || -z "${PREFILL_IP}" || -z "${DECODE_IP}" ]]; then + echo "ERROR: IS_PREFILL, PREFILL_IP, and DECODE_IP must all be set." >&2 + echo "" >&2 + echo " Node 1 (prefill + router):" >&2 + echo " IS_PREFILL=1 PREFILL_IP= DECODE_IP= $0" >&2 + echo "" >&2 + echo " Node 2 (decode):" >&2 + echo " IS_PREFILL=0 PREFILL_IP= DECODE_IP= $0" >&2 + exit 1 +fi + +# ── Configuration ───────────────────────────────────────────────────────────── +# DeepSeek-R1-0528 is a 671B MoE model; --load-format dummy skips weight +# loading so the script can be used for integration testing without the +# full model checkpoint. Set MODEL to the real HF id when running with +# actual weights. +MODEL="${MODEL:-deepseek-ai/DeepSeek-R1-0528}" + +PREFILL_PORT="${PREFILL_PORT:-8100}" # HTTP port for the prefill vLLM instance +DECODE_PORT="${DECODE_PORT:-8200}" # HTTP port for the decode vLLM instance +ROUTER_PORT="${ROUTER_PORT:-8080}" # HTTP port for vllm-router (prefill node only) +PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" # ZMQ service-discovery port (router ↔ vLLM) + +# MoRIIO internal ports — nodes are separate machines so no port conflicts; +# both instances can use the same numbers. +HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}" # MoRIIO engine handshake +NOTIFY_PORT="${NOTIFY_PORT:-61005}" # Prefill↔decode stage synchronisation + +# Port used by the prefill node to signal the decode node to restart for Phase 2. +# The prefill node starts a simple HTTP server on this port; the decode node polls it. +PHASE2_SIGNAL_PORT="${PHASE2_SIGNAL_PORT:-19876}" + +VLLM_IMAGE="${VLLM_IMAGE:-ghcr.io/simondanielsson/vllm-rocm-moriio:dev}" +#VLLM_IMAGE="${VLLM_IMAGE:-ghcr.io/simondanielsson/vllm-rocm-moriio:dev-hang-fixes}" + +# Basic router (smoke-test only — no streaming support) +ROUTER_IMAGE="${ROUTER_IMAGE:-ghcr.io/simondanielsson/vllm-router:dev}" +# Streaming-capable router (required for USE_BENCH=1 and USE_GSM8K=1) +ROUTER_STREAMING_IMAGE="${ROUTER_STREAMING_IMAGE:-ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy}" + +HF_HOME="${HF_HOME:-${HOME}/.cache/huggingface}" +LOG_DIR="${LOG_DIR:-${HOME}/moriio-logs}" +SHM_SIZE="${SHM_SIZE:-256G}" + +USE_BENCH="${USE_BENCH:-0}" # Set to 1 to run the perf benchmark (prefill node only) +USE_GSM8K="${USE_GSM8K:-0}" # Set to 1 to run GSM8K accuracy eval (prefill node only) +KEEP_ALIVE="${KEEP_ALIVE:-0}" # Set to 1 to leave containers running after the script exits + +BENCH_NUM_PROMPTS_FACTOR=${BENCH_NUM_PROMPTS_FACTOR:-10} +BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-16}" +BENCH_NUM_WARMUPS=$((BENCH_MAX_CONCURRENCY * 2)) +BENCH_NUM_PROMPTS=$((BENCH_MAX_CONCURRENCY * $BENCH_NUM_PROMPTS_FACTOR)) + +# HTTP port the toy proxy listens on (hardcoded in the image). +TOY_PROXY_HTTP_PORT=10001 +# Path to the toy proxy script inside the vLLM image. +TOY_PROXY_CONTAINER_PATH="/app/vllm/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py" + +mkdir -p "${LOG_DIR}" + +# ── Toy-proxy patch: fix non-streaming responses ────────────────────────────── +# Makes handle_request() return a proper JSON body when the client did not +# request streaming (needed by lm_eval and any non-streaming client). +TOY_PROXY_PATCH_SCRIPT="$(mktemp /tmp/patch_toy_proxy.XXXXXX.py)" +cat > "${TOY_PROXY_PATCH_SCRIPT}" << 'PYEOF' +import pathlib + +TARGET = pathlib.Path( + "/app/vllm/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py" +) +src = TARGET.read_text() + +OLD = ( + " session, decode_response = await decode_request_task\n" + " stream_generator = stream_decode_response(session, decode_response, request_id)\n" + " response = await make_response(stream_generator)\n" + " return response" +) +NEW = ( + " session, decode_response = await decode_request_task\n" + " if req_data.get(\"stream\", False):\n" + " stream_generator = stream_decode_response(\n" + " session, decode_response, request_id\n" + " )\n" + " response = await make_response(stream_generator)\n" + " return response\n" + " else:\n" + " try:\n" + " body = await decode_response.read()\n" + " content_type = decode_response.headers.get(\n" + " \"Content-Type\", \"application/json\"\n" + " )\n" + " finally:\n" + " await session.close()\n" + " response = await make_response(body, decode_response.status)\n" + " response.headers[\"Content-Type\"] = content_type\n" + " return response" +) + +if OLD in src: + TARGET.write_text(src.replace(OLD, NEW, 1)) + print("toy proxy patch: non-streaming fix applied.") +else: + print("toy proxy patch: target not found — skipping (already patched?).") +PYEOF + +# ── vLLM serve flags shared between prefill and decode ─────────────────────── +VLLM_SERVE_ARGS=( + --tensor-parallel-size 8 + --kv-cache-dtype fp8 + --gpu-memory-utilization 0.7 + --max-num-batched-tokens 32768 + --max-model-len 16384 + --trust-remote-code + --no-enable-prefix-caching + --block-size 1 +) + +# ── Role-specific vLLM serve flags ─────────────────────────────────────────── +PREFILL_EXTRA_ARGS=( + --enforce-eager +) + +DECODE_EXTRA_ARGS=( + --enable-expert-parallel + --all2all-backend mori + --compilation-config '{"cudagraph_mode": "PIECEWISE"}' +) + +# ── KV-transfer configs ─────────────────────────────────────────────────────── +# proxy_ip : IP of the node running vllm-router (always the prefill node) +# proxy_ping_port : ZMQ port the router listens on for instance registration +# http_port : this instance's own HTTP port (embedded in zmq_address by router) +# handshake_port / notify_port : MoRIIO RDMA coordination ports on this node +PREFILL_KV_CONFIG=$(cat </dev/null 2>&1; then + echo " ${name} is healthy." + return 0 + fi + sleep "${interval}" + elapsed=$((elapsed + interval)) + if [[ "${elapsed}" -ge "${max_wait}" ]]; then + echo "ERROR: ${name} did not become healthy after ${max_wait}s" >&2 + exit 1 + fi + echo " Still waiting for ${name} (${elapsed}s / ${max_wait}s)..." + done +} + +# Poll docker logs until both P and D have registered with the toy proxy. +wait_for_toy_proxy_registrations() { + local max_wait=120 interval=3 elapsed=0 + echo "Waiting for prefill and decode to register with toy proxy..." + while true; do + local p d + p=$(docker logs moriio-toy-proxy 2>&1 | grep -c "Registered Prefill" || true) + d=$(docker logs moriio-toy-proxy 2>&1 | grep -c "Registered Decode" || true) + if [[ "${p:-0}" -ge 1 && "${d:-0}" -ge 1 ]]; then + echo " Both instances registered (prefill=${p} decode=${d})." + return 0 + fi + sleep "${interval}" + elapsed=$((elapsed + interval)) + if [[ "${elapsed}" -ge "${max_wait}" ]]; then + echo "WARNING: timed out waiting for toy proxy registrations after ${max_wait}s" >&2 + return 0 + fi + echo " Still waiting (${elapsed}s / ${max_wait}s) — prefill=${p:-0} decode=${d:-0}..." + done +} + +# Stop router+prefill, launch toy proxy, restart prefill, wait for decode Phase 2. +# Called by both USE_BENCH and USE_GSM8K after Phase 1 completes. +run_phase2_switchover() { + # Signal the decode node to restart by serving a one-line HTTP response on + # PHASE2_SIGNAL_PORT. We use a Python one-liner so the response is proper + # HTTP (nc -l alone sends no headers and some curl versions reject it). + echo "" + echo ">>> Phase 2: signalling decode node to restart (port ${PHASE2_SIGNAL_PORT})..." + echo " The decode node is polling http://${PREFILL_IP}:${PHASE2_SIGNAL_PORT}/phase2" + # Serve exactly one request then exit (Python http.server handles one GET then we kill it). + python3 -c " +import http.server, socketserver, threading, time + +class Handler(http.server.BaseHTTPRequestHandler): + def do_GET(self): + self.send_response(200) + self.end_headers() + self.wfile.write(b'phase2\n') + # Schedule shutdown after we've sent the response + threading.Thread(target=self.server.shutdown, daemon=True).start() + def log_message(self, *a): pass + +with socketserver.TCPServer(('0.0.0.0', ${PHASE2_SIGNAL_PORT}), Handler) as srv: + srv.serve_forever() +" & + _SIGNAL_PID=$! + + # Now tear down the router and this node's prefill container. + echo ">>> Stopping vllm-router and prefill (Phase 1)..." + docker rm -f moriio-router moriio-prefill + # Wait for GPU memory and host-network TCP sockets (TIME_WAIT) to be released + # before starting the Phase 2 prefill container. + echo "Waiting 30s for GPU/network resources to be released..." + sleep 30 + + # Start toy proxy (HTTP :${TOY_PROXY_HTTP_PORT}, ZMQ :${PROXY_PING_PORT}) + echo ">>> Starting toy proxy container (HTTP :${TOY_PROXY_HTTP_PORT}, ZMQ :${PROXY_PING_PORT})..." + docker run -d \ + --name moriio-toy-proxy \ + --network host \ + -v "${TOY_PROXY_PATCH_SCRIPT}:/tmp/patch_toy_proxy.py:ro" \ + "${VLLM_IMAGE}" \ + bash -c "pip install --quiet --ignore-installed quart aiohttp msgpack && \ + python3 /tmp/patch_toy_proxy.py && \ + python3 -u ${TOY_PROXY_CONTAINER_PATH}" + + docker logs -f moriio-toy-proxy 2>&1 | tee "${LOG_DIR}/toy_proxy.log" & + + # Wait for toy proxy HTTP port to open before starting vLLM. + echo "Waiting for toy proxy HTTP port ${TOY_PROXY_HTTP_PORT} to open..." + _tp_wait=0 + until curl -sf "http://localhost:${TOY_PROXY_HTTP_PORT}/" >/dev/null 2>&1 \ + || curl -sf "http://localhost:${TOY_PROXY_HTTP_PORT}/v1/completions" >/dev/null 2>&1 \ + || nc -z 127.0.0.1 "${TOY_PROXY_HTTP_PORT}" 2>/dev/null; do + sleep 2 + _tp_wait=$((_tp_wait + 2)) + if [[ "${_tp_wait}" -ge 60 ]]; then + echo "WARNING: toy proxy did not open port ${TOY_PROXY_HTTP_PORT} after 60s" >&2 + docker logs moriio-toy-proxy 2>&1 | tail -20 >&2 + break + fi + done + echo " Toy proxy is up." + + # Restart the prefill container so it registers with the toy proxy. + echo ">>> Restarting prefill instance (port ${PREFILL_PORT})..." + docker run -d \ + --name moriio-prefill \ + "${VLLM_COMMON_ARGS[@]}" \ + "${VLLM_IMAGE}" \ + vllm serve "${MODEL}" \ + --port "${PREFILL_PORT}" \ + "${VLLM_SERVE_ARGS[@]}" \ + "${PREFILL_EXTRA_ARGS[@]}" \ + --kv-transfer-config "${PREFILL_KV_CONFIG}" + + docker logs -f moriio-prefill 2>&1 | tee "${LOG_DIR}/prefill_phase2.log" & + + wait_for_health "moriio-prefill" "http://localhost:${PREFILL_PORT}/health" 1800 + + # Kill the signal server now that decode has been notified (it may already be gone). + kill "${_SIGNAL_PID}" 2>/dev/null || true + + # Wait for the remote decode to come back up (it restarts itself upon seeing the signal). + echo "" + echo ">>> Waiting for decode node (${DECODE_IP}:${DECODE_PORT}) to come back up..." + wait_for_health "moriio-decode (remote, Phase 2)" "http://${DECODE_IP}:${DECODE_PORT}/health" 1200 + + echo "Waiting 10s for decode to register with toy proxy..." + sleep 10 + + wait_for_toy_proxy_registrations +} + +# ── Cleanup trap ────────────────────────────────────────────────────────────── +_cleanup() { + if [[ "${KEEP_ALIVE}" == "1" ]]; then + echo "" + echo "KEEP_ALIVE=1 — containers left running." + if [[ "${IS_PREFILL}" == "1" ]]; then + echo " To tear down: docker rm -f moriio-prefill moriio-router moriio-toy-proxy" + else + echo " To tear down: docker rm -f moriio-decode" + fi + return + fi + echo "" + echo ">>> Shutting down containers..." + if [[ "${IS_PREFILL}" == "1" ]]; then + docker rm -f moriio-prefill moriio-router moriio-toy-proxy 2>/dev/null || true + else + docker rm -f moriio-decode 2>/dev/null || true + fi + echo "Done." +} +trap _cleanup EXIT + +# ── Remove stale containers ─────────────────────────────────────────────────── +if [[ "${IS_PREFILL}" == "1" ]]; then + _stale_containers=(moriio-prefill moriio-router moriio-toy-proxy) +else + _stale_containers=(moriio-decode) +fi +for cname in "${_stale_containers[@]}"; do + if docker ps -a --format '{{.Names}}' | grep -q "^${cname}$"; then + echo "Removing existing container: ${cname}" + docker rm -f "${cname}" + fi +done + +# ── Print summary ───────────────────────────────────────────────────────────── +_role="$([ "${IS_PREFILL}" == "1" ] && echo "prefill + router" || echo "decode")" +echo "=== MoRIIO PD disaggregation demo (2-node) ===" +echo " Model : ${MODEL}" +echo " This node : ${_role}" +echo " Prefill : http://${PREFILL_IP}:${PREFILL_PORT}" +echo " Decode : http://${DECODE_IP}:${DECODE_PORT}" +echo " Router : http://${PREFILL_IP}:${ROUTER_PORT} (prefill node)" +echo " Discovery : ${PREFILL_IP}:${PROXY_PING_PORT}" +echo " MoRIIO ports: handshake=${HANDSHAKE_PORT} notify=${NOTIFY_PORT}" +echo " Phase2 sig : ${PREFILL_IP}:${PHASE2_SIGNAL_PORT} (USE_BENCH=1 or USE_GSM8K=1)" +echo " Log dir : ${LOG_DIR}" +echo "" + +# ═══════════════════════════════════════════════════════════════════════════════ +# PREFILL NODE +# ═══════════════════════════════════════════════════════════════════════════════ +if [[ "${IS_PREFILL}" == "1" ]]; then + +echo ">>> Starting prefill instance (port ${PREFILL_PORT})..." +docker run -d \ + --name moriio-prefill \ + "${VLLM_COMMON_ARGS[@]}" \ + "${VLLM_IMAGE}" \ + vllm serve "${MODEL}" \ + --port "${PREFILL_PORT}" \ + "${VLLM_SERVE_ARGS[@]}" \ + "${PREFILL_EXTRA_ARGS[@]}" \ + --kv-transfer-config "${PREFILL_KV_CONFIG}" + +docker logs -f moriio-prefill 2>&1 | tee "${LOG_DIR}/prefill.log" & + +wait_for_health "moriio-prefill" "http://localhost:${PREFILL_PORT}/health" 1800 + +# ── Launch vllm-router ──────────────────────────────────────────────────────── +if [[ "${USE_BENCH}" == "1" || "${USE_GSM8K}" == "1" ]]; then + _ACTIVE_ROUTER_IMAGE="${ROUTER_STREAMING_IMAGE}" +else + _ACTIVE_ROUTER_IMAGE="${ROUTER_IMAGE}" +fi + +echo "" +echo ">>> Starting vllm-router (port ${ROUTER_PORT}, discovery port ${PROXY_PING_PORT})..." +echo " Image: ${_ACTIVE_ROUTER_IMAGE}" +docker run -d \ + --name moriio-router \ + --network host \ + "${_ACTIVE_ROUTER_IMAGE}" \ + vllm-router \ + --vllm-pd-disaggregation \ + --kv-connector moriio \ + --vllm-discovery-address "0.0.0.0:${PROXY_PING_PORT}" \ + --port "${ROUTER_PORT}" \ + --host 0.0.0.0 \ + --policy consistent_hash \ + --prefill-policy consistent_hash \ + --decode-policy consistent_hash \ + --log-level info + +docker logs -f moriio-router 2>&1 | tee "${LOG_DIR}/router.log" & + +# ── Wait for the decode node to be healthy (cross-node health check) ────────── +echo "" +echo ">>> Waiting for decode node (${DECODE_IP}:${DECODE_PORT}) to become healthy..." +echo " Start the decode node now if you haven't already." +wait_for_health "moriio-decode (remote)" "http://${DECODE_IP}:${DECODE_PORT}/health" 1200 + +# Brief pause for the decode instance to register with the router via ZMQ. +echo "Waiting 10s for decode instance to register with the router..." +sleep 10 + +# ── Summary ─────────────────────────────────────────────────────────────────── +echo "" +echo "=== All services ready ===" +echo " Prefill : http://localhost:${PREFILL_PORT}" +echo " Decode : http://${DECODE_IP}:${DECODE_PORT}" +echo " Router : http://localhost:${ROUTER_PORT} ← send requests here" +echo "" +echo "To follow logs (written to ${LOG_DIR}):" +echo " tail -f ${LOG_DIR}/prefill.log" +echo " tail -f ${LOG_DIR}/router.log" +echo "" +echo "Containers will be shut down automatically when this script exits." +echo "(Set KEEP_ALIVE=1 to leave them running.)" + +# ── Smoke test / benchmark ──────────────────────────────────────────────────── +if [[ "${USE_GSM8K}" == "1" ]]; then + + GSM8K_LOG="${LOG_DIR}/gsm8k_results.log" + GSM8K_JSON_ROUTER="${LOG_DIR}/gsm8k_results_router.json" + GSM8K_JSON_PROXY="${LOG_DIR}/gsm8k_results_proxy.json" + _out_dir="/tmp/lm_eval_out" + + _run_lm_eval() { + local base_url="$1" + docker exec moriio-prefill bash -c \ + "pip install --quiet 'lm_eval[api]' && \ + rm -rf ${_out_dir} && \ + lm_eval \ + --model local-completions \ + --model_args model=${MODEL},base_url=${base_url}/v1/completions,tokenized_requests=False,trust_remote_code=True \ + --tasks gsm8k \ + --output_path ${_out_dir}" \ + 2>&1 | tee -a "${GSM8K_LOG}" + } + + _save_lm_eval_json() { + local dest="$1" + local _remote_json + _remote_json=$(docker exec moriio-prefill \ + find "${_out_dir}" -name "results.json" 2>/dev/null | head -1) + if [[ -n "${_remote_json}" ]]; then + docker cp "moriio-prefill:${_remote_json}" "${dest}" 2>/dev/null || true + else + echo "WARNING: lm_eval results.json not found in ${_out_dir}" >&2 + fi + } + + if [[ "${GSM8K_PHASE2_ONLY:-0}" != "1" ]]; then + # ── Phase 1: GSM8K through vllm-router ─────────────────────────────────── + echo "" + echo ">>> Phase 1: running GSM8K accuracy evaluation (lm_eval) through vllm-router..." + { + echo "======================================================" + echo " GSM8K evaluation (lm_eval) via MoRIIO PD-disaggregation (2-node)" + echo " Router: vllm-router" + echo " Model : ${MODEL}" + echo " Date : $(date)" + echo "======================================================" + } | tee "${GSM8K_LOG}" + + _run_lm_eval "http://127.0.0.1:${ROUTER_PORT}" + _save_lm_eval_json "${GSM8K_JSON_ROUTER}" + + # ── Phase 2: switch to toy proxy ────────────────────────────────────────── + run_phase2_switchover + else + echo "" + echo ">>> GSM8K_PHASE2_ONLY=1: skipping Phase 1 (vllm-router) and running switchover." + { + echo "======================================================" + echo " GSM8K evaluation (lm_eval) via MoRIIO PD-disaggregation (2-node)" + echo " (Phase 2 only run)" + echo " Date : $(date)" + echo "======================================================" + } | tee "${GSM8K_LOG}" + + # The switchover must still happen: it stops the router + prefill (freeing + # PROXY_PING_PORT so the toy proxy's ZMQ listener can bind), starts the toy + # proxy, restarts prefill, and waits for decode to re-register. + run_phase2_switchover + fi + + # ── Phase 2: GSM8K through toy proxy ───────────────────────────────────── + echo "" + echo ">>> Phase 2: running GSM8K accuracy evaluation (lm_eval) through toy proxy..." + { + echo "" + echo "======================================================" + echo " GSM8K evaluation (lm_eval) via MoRIIO PD-disaggregation (2-node)" + echo " Router: moriio_toy_proxy_server.py" + echo " Model : ${MODEL}" + echo " Date : $(date)" + echo "======================================================" + } | tee -a "${GSM8K_LOG}" + + _run_lm_eval "http://127.0.0.1:${TOY_PROXY_HTTP_PORT}" + _save_lm_eval_json "${GSM8K_JSON_PROXY}" + + echo "" + echo "=== GSM8K evaluation complete ===" + echo " Log : ${GSM8K_LOG}" + if [[ "${GSM8K_PHASE2_ONLY:-0}" != "1" ]]; then + echo " Router JSON: ${GSM8K_JSON_ROUTER}" + fi + echo " Proxy JSON : ${GSM8K_JSON_PROXY}" + +elif [[ "${USE_BENCH}" == "1" ]]; then + + BENCH_LOG="${LOG_DIR}/benchmark_results.log" + + BENCH_ARGS=( + --backend vllm + --model "${MODEL}" + --dataset-name random + --random-input-len 1000 + --random-output-len 1000 + --max-concurrency "${BENCH_MAX_CONCURRENCY}" + --num-warmups "${BENCH_NUM_WARMUPS}" + --num-prompts "${BENCH_NUM_PROMPTS}" + --ready_check_timeout_sec 3000 + --seed 1234 + ) + + # ── Phase 1: benchmark through vllm-router ──────────────────────────────── + echo "" + echo ">>> Phase 1: benchmarking through vllm-router..." + { + echo "======================================================" + echo " Router: vllm-router (2-node)" + echo " Model : ${MODEL}" + echo " Date : $(date)" + echo "======================================================" + } | tee "${BENCH_LOG}" + + docker exec moriio-prefill \ + vllm bench serve \ + --base-url "http://localhost:${ROUTER_PORT}" \ + "${BENCH_ARGS[@]}" 2>&1 | tee -a "${BENCH_LOG}" + + # ── Phase 2: switch to toy proxy and benchmark again ────────────────────── + run_phase2_switchover + + echo "" + echo ">>> Phase 2: benchmarking through toy proxy (HTTP port ${TOY_PROXY_HTTP_PORT})..." + { + echo "" + echo "======================================================" + echo " Router: moriio_toy_proxy_server.py (2-node)" + echo " Model : ${MODEL}" + echo " Date : $(date)" + echo "======================================================" + } | tee -a "${BENCH_LOG}" + + docker exec moriio-prefill \ + vllm bench serve \ + --base-url "http://localhost:${TOY_PROXY_HTTP_PORT}" \ + "${BENCH_ARGS[@]}" 2>&1 | tee -a "${BENCH_LOG}" + + echo "" + echo "=== Both benchmark runs complete ===" + echo " Results : ${BENCH_LOG}" + echo " Toy proxy log: ${LOG_DIR}/toy_proxy.log" + +else + + echo "" + echo ">>> Smoke test: sending a completion request through vllm-router..." + curl -s "http://localhost:${ROUTER_PORT}/v1/completions" \ + -H "Content-Type: application/json" \ + -d "{ + \"model\": \"${MODEL}\", + \"prompt\": \"San Francisco is a\", + \"max_tokens\": 64, + \"temperature\": 0 + }" | python3 -m json.tool + echo "" + echo "(Set USE_BENCH=1 for a perf benchmark, or USE_GSM8K=1 for accuracy eval.)" + +fi + +# ═══════════════════════════════════════════════════════════════════════════════ +# DECODE NODE +# ═══════════════════════════════════════════════════════════════════════════════ +else + +# ── Helper: start (or restart) the decode container ────────────────────────── +start_decode() { + local log_suffix="${1:-}" + docker run -d \ + --name moriio-decode \ + "${VLLM_COMMON_ARGS[@]}" \ + "${VLLM_IMAGE}" \ + vllm serve "${MODEL}" \ + --port "${DECODE_PORT}" \ + "${VLLM_SERVE_ARGS[@]}" \ + "${DECODE_EXTRA_ARGS[@]}" \ + --kv-transfer-config "${DECODE_KV_CONFIG}" + + docker logs -f moriio-decode 2>&1 | tee "${LOG_DIR}/decode${log_suffix}.log" & +} + +echo ">>> Starting decode instance (port ${DECODE_PORT})..." +start_decode "" + +wait_for_health "moriio-decode" "http://localhost:${DECODE_PORT}/health" 1800 + +echo "" +echo "=== Decode instance is healthy and registered with router on ${PREFILL_IP} ===" +echo " Decode : http://localhost:${DECODE_PORT}" +echo " Router : http://${PREFILL_IP}:${ROUTER_PORT}" +echo "" +echo "To follow logs:" +echo " tail -f ${LOG_DIR}/decode.log" +echo "" + +if [[ "${USE_BENCH}" == "1" || "${USE_GSM8K}" == "1" ]]; then + # ── Phase 2: wait for the prefill node to signal a restart ─────────────── + echo ">>> USE_BENCH/USE_GSM8K: waiting for Phase 2 restart signal from prefill node" + echo " (polling http://${PREFILL_IP}:${PHASE2_SIGNAL_PORT}/phase2 ...)" + _sig_wait=0 + until curl -sf --max-time 5 "http://${PREFILL_IP}:${PHASE2_SIGNAL_PORT}/phase2" >/dev/null 2>&1; do + sleep 5 + _sig_wait=$((_sig_wait + 5)) + if [[ "${_sig_wait}" -ge 7200 ]]; then + echo "WARNING: Phase 2 signal never arrived after ${_sig_wait}s — exiting." >&2 + exit 0 + fi + if (( _sig_wait % 60 == 0 )); then + echo " Still waiting for Phase 2 signal (${_sig_wait}s)..." + fi + done + + echo "" + echo ">>> Phase 2 signal received — restarting decode container to register with toy proxy..." + docker rm -f moriio-decode 2>/dev/null || true + + # The kv_transfer_config proxy_ip still points at PREFILL_IP where the toy proxy runs. + start_decode "_phase2" + + wait_for_health "moriio-decode (Phase 2)" "http://localhost:${DECODE_PORT}/health" 1800 + + echo "" + echo "=== Decode Phase 2 instance is healthy and registered with toy proxy ===" + echo " Decode : http://localhost:${DECODE_PORT}" + echo " Proxy : http://${PREFILL_IP}:10001" + echo "" + echo "Containers will be shut down when the prefill node finishes (Ctrl+C to abort)." + echo "(Set KEEP_ALIVE=1 to leave the container running after exit.)" + echo "" + + # Block until the user stops the script (prefill node runs the benchmark). + wait +else + echo "Containers will be shut down when this script exits (Ctrl+C)." + echo "(Set KEEP_ALIVE=1 to leave the container running after exit.)" + echo "" + + # Block until the user stops the script; the cleanup trap handles teardown. + wait +fi + +fi diff --git a/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py b/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py index eaff0904b877..762d6896dfed 100644 --- a/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py +++ b/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py @@ -283,10 +283,12 @@ async def handle_request(): if __name__ == "__main__": - t = start_service_discovery("0.0.0.0", 36367) + _ping_port = int(os.environ.get("PROXY_PING_PORT", "36367")) + _http_port = int(os.environ.get("PROXY_HTTP_PORT", "10001")) + t = start_service_discovery("0.0.0.0", _ping_port) app.debug = True app.config["BODY_TIMEOUT"] = 360000 app.config["RESPONSE_TIMEOUT"] = 360000 - app.run(host="0.0.0.0", port=10001) + app.run(host="0.0.0.0", port=_http_port) t.join()