vllm-project · timzsu · May 21, 2026 · May 22, 2026 · May 24, 2026 · Jun 1, 2026
@@ -22,6 +22,7 @@ th {
 | `MingFlashOmniForConditionalGeneration` + `MingImagePipeline` | Ming-flash-omni-2.0 (omni-speech + imagegen<sup>1</sup>) | `Jonathan1909/Ming-flash-omni-2.0` | ✅︎ |   |   |   |
 | `BagelForConditionalGeneration` | BAGEL (DiT-only) | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ | | ✅︎ |
 | `InternVLAA1Pipeline` | InternVLA-A1 | `InternRobotics/InternVLA-A1-3B` | ✅︎ | ✅︎ | | |
+| `Gr00tN1d7Pipeline` | GR00T N1.7 | `nvidia/GR00T-N1.7-3B` | ✅︎ | | | |
 | `HunyuanImage3ForCausalMM` | HunyuanImage3.0 (DiT-only) | `tencent/HunyuanImage-3.0`, `tencent/HunyuanImage-3.0-Instruct` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `QwenImagePipeline` | Qwen-Image | `Qwen/Qwen-Image` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `QwenImagePipeline` | Qwen-Image-2512 | `Qwen/Qwen-Image-2512` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |

@@ -0,0 +1,86 @@
+# GR00T-N1.7
+
+> NVIDIA Isaac GR00T-N1.7-3B robot VLA policy served over the OpenPI WebSocket protocol
+
+## Summary
+
+- Vendor: NVIDIA
+- Model: `nvidia/GR00T-N1.7-3B`
+- Task: Vision-Language-Action (VLA) inference for robot manipulation
+- Mode: Online serving via OpenPI WebSocket endpoint
+- Maintainer: timzsu
+
+## When to use this recipe
+
+Use this recipe when you need to serve GR00T-N1.7 as a real-time robot policy
+over the OpenPI WebSocket API. It configures the DROID embodiment
+(`OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT`) and exposes the standard DROID action
+keys (`eef_9d`, `gripper_position`, `joint_position`) with an action horizon of 40.
+
+## References
+
+- Upstream model: <https://huggingface.co/nvidia/GR00T-N1.7-3B>
+- Upstream codebase: <https://github.com/NVIDIA/Isaac-GR00T>
+- OpenPI client library: <https://github.com/Physical-Intelligence/openpi>
+- Pipeline: `vllm_omni.diffusion.models.gr00t.pipeline_gr00t.Gr00tN1d7Pipeline`
+- Deploy config: [`vllm_omni/deploy/Gr00tN1d7.yaml`](../../vllm_omni/deploy/Gr00tN1d7.yaml)
+- E2E test: [`tests/e2e/online_serving/test_gr00t_openpi.py`](../../tests/e2e/online_serving/test_gr00t_openpi.py)
+
+## Environment
+
+- OS: Linux
+- Python: 3.11+
+- Driver / runtime: NVIDIA CUDA
+- vLLM-Omni version or commit: use the version in your current checkout
+
+## Start server
+
+From repository root:
+
+```bash
+vllm serve nvidia/GR00T-N1.7-3B \
+  --omni \
+  --host 127.0.0.1 \
+  --port 8000 \
+  --served-model-name gr00t-n1d7 \
+  --stage-configs-path vllm_omni/deploy/Gr00tN1d7.yaml
+```
+
+Notes:
+
+- Only `max_num_seqs: 1` is supported (configured in the deploy YAML); GR00T
+  policy state is per-session and not designed for concurrent batching.
+- The WebSocket endpoint is `ws://127.0.0.1:8000/v1/realtime/robot/openpi`.
+  The server handshake message (first frame after connect) is a msgpack-encoded
+  dict with `action_horizon`, `action_keys`, `embodiment_tag`, and
+  `needs_session_id`.
+
+## Verification
+
+```python
+from tests.gr00t.openpi_client_helper import run_policy_session, validate_session_result
+validate_session_result(run_policy_session(host="127.0.0.1", port=8000))
+```
+
+Or run the e2e test suite:
+
+```bash
+python -m pytest tests/e2e/online_serving/test_gr00t_openpi.py -v
+```
+
+The test sends a synthetic two-frame DROID observation and checks:
+
+- GR00T metadata contract: `image_resolution`, `action_horizon`, `action_keys`, `embodiment_tag`
+- Action shapes: `eef_9d` is `(1, 40, 9)`, `gripper_position` is `(1, 40, 1)`, `joint_position` is `(1, 40, 7)`
+- All action values are finite `float32`
+- Reset response is `"reset successful"`
+
+## Notes
+
+- To switch embodiment, edit `embodiment_tag` under both `model_config` and
+  `policy_server_config` in `vllm_omni/deploy/Gr00tN1d7.yaml`. Supported values:
+  `OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT` (default), `XDOF`, `XDOF_SUBTASK`,
+  `REAL_G1`, `REAL_R1_PRO_SHARPA`, `LIBERO_PANDA`, `SIMPLER_ENV_GOOGLE`,
+  `SIMPLER_ENV_WIDOWX`.
+- GR00T weights are loaded directly by `Gr00tPolicy` via `AutoModel.from_pretrained`;
+  the pipeline's `load_weights` is intentionally a no-op.
@@ -0,0 +1,142 @@
+from types import SimpleNamespace
+
+import numpy as np
+import pytest
+
+from vllm_omni.diffusion.models.gr00t import pipeline_gr00t
+from vllm_omni.diffusion.models.gr00t.pipeline_gr00t import Gr00tN1d7Pipeline
+from vllm_omni.diffusion.request import OmniDiffusionRequest
+from vllm_omni.inputs.data import OmniDiffusionSamplingParams
+
+# Module-level pytest marks: every test in this file is tagged `core_model` and `cpu`.
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
+
+class FakeGr00tPolicy:
+    instances = []
+
+    def __init__(self, **kwargs):
+        self.kwargs = kwargs
+        self.reset_calls = 0
+        self.seen_obs = None
+        self.embodiment_tag = SimpleNamespace(value="fake_embodiment")
+        self.language_key = "annotation.language.language_instruction"
+        self.modality_configs = {
+            "action": SimpleNamespace(
+                delta_indices=[0, 1],
+                modality_keys=["arm", "gripper"],
+            )
+        }
+        self.processor = SimpleNamespace(
+            state_action_processor=SimpleNamespace(
+                norm_params={
+                    "fake_embodiment": {
+                        "action": {
+                            "arm": {"dim": np.array(2)},
+                            "gripper": {"dim": np.array(1)},
+                        }
+                    }
+                }
+            )
+        )
+        FakeGr00tPolicy.instances.append(self)
+
+    def get_action(self, obs):
+        self.seen_obs = obs
+        return {
+            "arm": np.array([[[1.0, 2.0]]], dtype=np.float64),
+            "gripper": [[[3.0]]],
+        }, {"latency_ms": 1.0}
+
+    def reset(self):
+        self.reset_calls += 1
+        return {"reset": True}
+
+
+@pytest.fixture(autouse=True)
+def fake_gr00t_policy(monkeypatch):
+    FakeGr00tPolicy.instances.clear()
+    monkeypatch.setattr(pipeline_gr00t, "Gr00tPolicy", FakeGr00tPolicy)
+
+
+def _pipeline():
+    od_config = SimpleNamespace(
+        model="nvidia/GR00T-N1.7-3B",
+        model_config={
+            "embodiment_tag": "LIBERO_PANDA",
+            "strict": False,
+        },
+        custom_pipeline_args={},
+    )
+    return Gr00tN1d7Pipeline(od_config=od_config)
+
+
+def test_pipeline_initializes_local_policy():
+    pipeline = _pipeline()
+
+    policy = FakeGr00tPolicy.instances[0]
+    assert policy.kwargs["model_path"] == "nvidia/GR00T-N1.7-3B"
+    assert policy.kwargs["embodiment_tag"] == "LIBERO_PANDA"
+    assert policy.kwargs["strict"] is False
+    assert pipeline.weights_sources == ()
+    assert pipeline.load_weights(iter(())) == set()
+
+
+def test_forward_returns_dict_actions_in_multimodal_output():
+    pipeline = _pipeline()
+    req = OmniDiffusionRequest(
+        prompts=["pick"],
+        request_ids=["req"],
+        sampling_params=OmniDiffusionSamplingParams(
+            extra_args={
+                "robot_obs": {
+                    "images": {"cam": np.zeros((1, 1, 8, 8, 3), dtype=np.uint8)},
+                    "state": {"joint": np.zeros((1, 1, 2), dtype=np.float32)},
+                    "prompt": "pick the cube",
+                    "session_id": "session-a",
+                },
+                "reset": True,
+            }
+        ),
+    )
+
+    output = pipeline.forward(req)
+
+    assert output.error is None
+    assert output.output is None
+    actions = output.multimodal_output["actions"]
+    assert set(actions) == {"arm", "gripper"}
+    assert actions["arm"].dtype == np.float32
+    np.testing.assert_allclose(actions["arm"], np.array([[[1.0, 2.0]]], dtype=np.float32))
+    policy = FakeGr00tPolicy.instances[0]
+    assert "video" in policy.seen_obs
+    assert policy.seen_obs["language"] == {"annotation.language.language_instruction": [["pick the cube"]]}
+    assert "images" not in policy.seen_obs
+    assert "prompt" not in policy.seen_obs
+    assert "session_id" not in policy.seen_obs
+    assert policy.reset_calls == 1
+
+
+def test_dummy_warmup_returns_shape_correct_zero_actions():
+    pipeline = _pipeline()
+    req = OmniDiffusionRequest(
+        prompts=["dummy run"],
+        request_ids=["dummy_req_id"],
+        sampling_params=OmniDiffusionSamplingParams(num_inference_steps=1),
+    )
+
+    output = pipeline.forward(req)
+
+    assert output.error is None
+    actions = output.multimodal_output["actions"]
+    assert set(actions) == {"arm", "gripper"}
+    assert actions["arm"].shape == (1, 2, 2)
+    assert actions["gripper"].shape == (1, 2, 1)
+    assert not actions["arm"].any()
+    assert FakeGr00tPolicy.instances[0].seen_obs is None
+
+
+def test_reset_delegates_to_policy():
+    pipeline = _pipeline()
+
+    assert pipeline.reset() == {"reset": True}
+    assert FakeGr00tPolicy.instances[0].reset_calls == 1
@@ -0,0 +1,161 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""End-to-end online serving test for GR00T N1.7 through the OpenPI robot endpoint."""
+
+import os
+
+import numpy as np
+import pytest
+
+from tests.gr00t import openpi_client_helper as openpi_client
+from tests.helpers.mark import hardware_test
+from tests.helpers.runtime import OmniServerParams
+from tests.helpers.stage_config import get_deploy_config_path
+
+# Set at import time, i.e. before any test in this module runs.
+# NOTE(review): presumably selects the "spawn" start method for vLLM worker
+# processes — confirm against vLLM's environment-variable documentation.
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+MODEL = "nvidia/GR00T-N1.7-3B"
+# NOTE(review): presumably skips or fails this module when the OpenPI client
+# dependencies are not installed — confirm in the helper.
+openpi_client.require_dependencies()
+
+# Single server configuration, passed to the `omni_server` fixture through
+# indirect parametrization by both tests in this module.
+test_params = [
+    pytest.param(
+        OmniServerParams(
+            model=MODEL,
+            stage_config_path=get_deploy_config_path("Gr00tN1d7.yaml"),
+            server_args=["--disable-log-stats"],
+            # GR00T_NOISE_SEED=42 is the seed the precision test's docstring refers to.
+            env_dict={"VLLM_DISABLE_COMPILE_CACHE": "1", "GR00T_NOISE_SEED": "42"},
+            # NOTE(review): timeout units are presumably seconds — confirm in OmniServerParams.
+            init_timeout=1200,
+            stage_init_timeout=900,
+        ),
+        id="gr00t-n1d7-openpi",
+    )
+]
+
+# Reference values captured from the Isaac-GR00T ZMQ server (nvidia/GR00T-N1.7-3B,
+# OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT embodiment) using build_droid_observation()
+# inputs: zero images (256×256), identity eef_9d, zero gripper/joint, "pick up the object".
+# Outputs are bit-reproducible across resets and across runs (max_diff=0.0).
+# The precision test in this module compares against these values with
+# atol=1e-2 and rtol=0.0.
+# Expected `eef_9d` actions: one 9-value row per sampled step.
+_REF_EEF_9D = np.array(
+    [
+        [
+            0.014888550154864788,
+            -0.00039258378092199564,
+            -0.013574761338531971,
+            0.9999837875366211,
+            0.005678884219378233,
+            -0.00034708858584053814,
+            -0.005677700508385897,
+            0.9999783635139465,
+            0.0033210734836757183,
+        ],
+        [
+            0.020188070833683014,
+            -0.0003105341165792197,
+            -0.02232760563492775,
+            0.9997346997261047,
+            0.02301345206797123,
+            0.000983047066256404,
+            -0.02302766777575016,
+            0.9995648860931396,
+            0.018431292846798897,
+        ],
+        [
+            -0.007266733795404434,
+            -0.05537768080830574,
+            0.03667901083827019,
+            0.992686927318573,
+            0.1206541359424591,
+            0.003906540106981993,
+            -0.12024091929197311,
+            0.9853788614273071,
+            0.12070894986391068,
+        ],
+    ],
+    dtype=np.float32,
+)  # rows = step 0, step 4, step 39
+
+# Expected `gripper_position` actions: one single-value row per sampled step.
+_REF_GRIPPER = np.array([[0.0], [0.0078125], [0.939453125]], dtype=np.float32)  # steps 0, 4, 39
+
+# Expected `joint_position` actions: one 7-value row per sampled step.
+_REF_JOINT = np.array(
+    [
+        [
+            -0.0010484338272362947,
+            0.0014262489276006818,
+            -0.003565810853615403,
+            -3.846167237497866e-05,
+            -0.0002604846959002316,
+            0.008521700277924538,
+            -0.006872728932648897,
+        ],
+        [
+            -0.009435676969587803,
+            0.0021475711837410927,
+            -0.0031688229646533728,
+            -1.8328893929719925e-05,
+            0.0005945992306806147,
+            0.019159257411956787,
+            0.0009468861389905214,
+        ],
+        [
+            -0.02944088727235794,
+            -0.08419207483530045,
+            -0.0251418836414814,
+            0.00540524534881115,
+            0.04752273112535477,
+            -0.012884500436484814,
+            0.024298785254359245,
+        ],
+    ],
+    dtype=np.float32,
+)  # rows = step 0, step 4, step 39
+
+
+@pytest.mark.advanced_model
+@pytest.mark.diffusion
+@hardware_test(res={"cuda": "H100"}, num_cards=1)
+@pytest.mark.parametrize("omni_server", test_params, indirect=True)
+def test_gr00t_n1d7_openpi_online(omni_server) -> None:
+    result = openpi_client.run_policy_session(
+        host=omni_server.host,
+        port=omni_server.port,
+        session_id="gr00t-online-e2e",
+    )
+    openpi_client.validate_session_result(result)
+
+
+@pytest.mark.advanced_model
+@pytest.mark.diffusion
+@hardware_test(res={"cuda": "H100"}, num_cards=1)
+@pytest.mark.parametrize("omni_server", test_params, indirect=True)
+def test_gr00t_n1d7_openpi_precision(omni_server) -> None:
+    """Assert actions match Isaac-GR00T reference (GR00T_NOISE_SEED=42, zero inputs)."""
+    client = openpi_client.OpenPIWebsocketClient(host=omni_server.host, port=omni_server.port)
+    try:
+        obs = openpi_client.build_droid_observation(session_id="gr00t-precision-e2e")
+        client.reset({})
+        actions = client.infer(obs)
+    finally:
+        client.close()
+
+    steps = [0, 4, 39]
+    np.testing.assert_allclose(
+        actions["eef_9d"][0, steps, :],
+        _REF_EEF_9D,
+        atol=1e-2,
+        rtol=0.0,
+        err_msg="eef_9d action mismatch vs Isaac-GR00T reference",
+    )
+    np.testing.assert_allclose(
+        actions["gripper_position"][0, steps, :],
+        _REF_GRIPPER,
+        atol=1e-2,
+        rtol=0.0,
+        err_msg="gripper_position action mismatch vs Isaac-GR00T reference",
+    )
+    np.testing.assert_allclose(
+        actions["joint_position"][0, steps, :],
+        _REF_JOINT,
+        atol=1e-2,
+        rtol=0.0,
+        err_msg="joint_position action mismatch vs Isaac-GR00T reference",
+    )