Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/models/supported_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ th {
| `MingFlashOmniForConditionalGeneration` + `MingImagePipeline` | Ming-flash-omni-2.0 (omni-speech + imagegen<sup>1</sup>) | `Jonathan1909/Ming-flash-omni-2.0` | ✅︎ | | | |
| `BagelForConditionalGeneration` | BAGEL (DiT-only) | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ | | ✅︎ |
| `InternVLAA1Pipeline` | InternVLA-A1 | `InternRobotics/InternVLA-A1-3B` | ✅︎ | ✅︎ | | |
| `Gr00tN1d7Pipeline` | GR00T N1.7 | `nvidia/GR00T-N1.7-3B` | ✅︎ | | | |
| `HunyuanImage3ForCausalMM` | HunyuanImage3.0 (DiT-only) | `tencent/HunyuanImage-3.0`, `tencent/HunyuanImage-3.0-Instruct` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
| `QwenImagePipeline` | Qwen-Image | `Qwen/Qwen-Image` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
| `QwenImagePipeline` | Qwen-Image-2512 | `Qwen/Qwen-Image-2512` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
Expand Down
86 changes: 86 additions & 0 deletions recipes/NVIDIA/GR00T-N1.7.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# GR00T-N1.7

> NVIDIA Isaac GR00T-N1.7-3B robot VLA policy served over the OpenPI WebSocket protocol

## Summary

- Vendor: NVIDIA
- Model: `nvidia/GR00T-N1.7-3B`
- Task: Vision-Language-Action (VLA) inference for robot manipulation
- Mode: Online serving via OpenPI WebSocket endpoint
- Maintainer: timzsu

## When to use this recipe

Use this recipe when you need to serve GR00T-N1.7 as a real-time robot policy
over the OpenPI WebSocket API. It configures the DROID embodiment
(`OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT`) and exposes the standard DROID action
keys (`eef_9d`, `gripper_position`, `joint_position`) with an action horizon of 40.

## References

- Upstream model: <https://huggingface.co/nvidia/GR00T-N1.7-3B>
- Upstream codebase: <https://github.com/NVIDIA/Isaac-GR00T>
- OpenPI client library: <https://github.com/Physical-Intelligence/openpi>
- Pipeline: `vllm_omni.diffusion.models.gr00t.pipeline_gr00t.Gr00tN1d7Pipeline`
- Deploy config: [`vllm_omni/deploy/Gr00tN1d7.yaml`](../../vllm_omni/deploy/Gr00tN1d7.yaml)
- E2E test: [`tests/e2e/online_serving/test_gr00t_openpi.py`](../../tests/e2e/online_serving/test_gr00t_openpi.py)

## Environment

- OS: Linux
- Python: 3.11+
- Driver / runtime: NVIDIA CUDA
- vLLM-Omni version or commit: use versions from your current checkout

## Start server

From repository root:

```bash
vllm serve nvidia/GR00T-N1.7-3B \
--omni \
--host 127.0.0.1 \
--port 8000 \
--served-model-name gr00t-n1d7 \
--stage-configs-path vllm_omni/deploy/Gr00tN1d7.yaml
```

Notes:

- Only `max_num_seqs: 1` is supported (configured in the deploy YAML); GR00T
  policy state is per-session and is not designed for concurrent batching.
- The WebSocket endpoint is `ws://127.0.0.1:8000/v1/realtime/robot/openpi`.
  The server handshake message (the first frame after connecting) is a
  msgpack-encoded dict with `action_horizon`, `action_keys`, `embodiment_tag`,
  and `needs_session_id`.

## Verification

```python
from tests.gr00t.openpi_client_helper import run_policy_session, validate_session_result
validate_session_result(run_policy_session(host="127.0.0.1", port=8000))
```

Or run the e2e test suite:

```bash
python -m pytest tests/e2e/online_serving/test_gr00t_openpi.py -v
```

The test sends a synthetic two-frame DROID observation and checks:

- GR00T metadata contract: `image_resolution`, `action_horizon`, `action_keys`, `embodiment_tag`
- Action shapes: `eef_9d (1,40,9)`, `gripper_position (1,40,1)`, `joint_position (1,40,7)`
- All action values are finite float32
- Reset response is `"reset successful"`

## Notes

- To switch the embodiment, edit `embodiment_tag` under both `model_config` and
  `policy_server_config` in `vllm_omni/deploy/Gr00tN1d7.yaml`. Supported values:
  `OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT` (default), `XDOF`, `XDOF_SUBTASK`,
  `REAL_G1`, `REAL_R1_PRO_SHARPA`, `LIBERO_PANDA`, `SIMPLER_ENV_GOOGLE`,
  `SIMPLER_ENV_WIDOWX`.
- GR00T weights are loaded directly by `Gr00tPolicy` via `AutoModel.from_pretrained`;
  the pipeline's `load_weights` is intentionally a no-op.
Empty file.
142 changes: 142 additions & 0 deletions tests/diffusion/models/gr00t/test_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
from types import SimpleNamespace

import numpy as np
import pytest

from vllm_omni.diffusion.models.gr00t import pipeline_gr00t
from vllm_omni.diffusion.models.gr00t.pipeline_gr00t import Gr00tN1d7Pipeline
from vllm_omni.diffusion.request import OmniDiffusionRequest
from vllm_omni.inputs.data import OmniDiffusionSamplingParams

# Module-wide pytest markers: every test in this file is tagged core_model and cpu.
pytestmark = [pytest.mark.core_model, pytest.mark.cpu]


class FakeGr00tPolicy:
    """In-memory stand-in for ``Gr00tPolicy`` used by the pipeline unit tests.

    Each constructed instance appends itself to the class-level ``instances``
    list so a test can look up the policy the pipeline created and inspect
    the constructor kwargs, the last observation passed to ``get_action`` and
    the number of ``reset`` calls.
    """

    # Deliberately shared by all instances: it is the registry tests read from.
    instances = []

    def __init__(self, **kwargs):
        """Record ``kwargs`` and populate the fake policy metadata.

        Args:
            **kwargs: Arbitrary constructor arguments, stored verbatim on
                ``self.kwargs`` for later assertions.
        """
        self.kwargs = kwargs
        self.reset_calls = 0
        self.seen_obs = None
        self.embodiment_tag = SimpleNamespace(value="fake_embodiment")
        self.language_key = "annotation.language.language_instruction"
        # Two action keys with two delta indices each.
        self.modality_configs = {
            "action": SimpleNamespace(
                delta_indices=[0, 1],
                modality_keys=["arm", "gripper"],
            )
        }
        # Normalisation parameters keyed by embodiment tag value, then by
        # modality, then by action key; "dim" is stored as a 0-d numpy array.
        self.processor = SimpleNamespace(
            state_action_processor=SimpleNamespace(
                norm_params={
                    "fake_embodiment": {
                        "action": {
                            "arm": {"dim": np.array(2)},
                            "gripper": {"dim": np.array(1)},
                        }
                    }
                }
            )
        )
        FakeGr00tPolicy.instances.append(self)

    def get_action(self, obs):
        """Remember ``obs`` and return a fixed ``(actions, info)`` pair.

        The "arm" action is a float64 array and the "gripper" action is a
        plain nested list, so callers are exercised with both kinds of value.

        Args:
            obs: The observation handed over by the caller; stored on
                ``self.seen_obs`` unchanged.

        Returns:
            A tuple of the action dict and an info dict with ``latency_ms``.
        """
        self.seen_obs = obs
        return {
            "arm": np.array([[[1.0, 2.0]]], dtype=np.float64),
            "gripper": [[[3.0]]],
        }, {"latency_ms": 1.0}

    def reset(self):
        """Count the call and return a fixed ``{"reset": True}`` payload."""
        self.reset_calls += 1
        return {"reset": True}


@pytest.fixture(autouse=True)
def fake_gr00t_policy(monkeypatch):
    """Replace ``pipeline_gr00t.Gr00tPolicy`` with ``FakeGr00tPolicy`` for every test.

    Args:
        monkeypatch: pytest's built-in fixture; the patch is undone
            automatically when the test finishes.
    """
    # Empty the shared registry first so ``instances[0]`` is always the
    # policy created by the test that is currently running.
    FakeGr00tPolicy.instances.clear()
    monkeypatch.setattr(pipeline_gr00t, "Gr00tPolicy", FakeGr00tPolicy)


def _pipeline():
    """Build a ``Gr00tN1d7Pipeline`` from a minimal stand-in diffusion config.

    Returns:
        A pipeline constructed with the GR00T N1.7 model id, the
        ``LIBERO_PANDA`` embodiment tag and ``strict=False``.
    """
    # A SimpleNamespace carries just the attributes set here instead of a
    # full project config object.
    od_config = SimpleNamespace(
        model="nvidia/GR00T-N1.7-3B",
        model_config={
            "embodiment_tag": "LIBERO_PANDA",
            "strict": False,
        },
        custom_pipeline_args={},
    )
    return Gr00tN1d7Pipeline(od_config=od_config)


def test_pipeline_initializes_local_policy():
    """The pipeline forwards its config to the policy and loads no weights itself."""
    pipeline = _pipeline()

    # The autouse fixture cleared the registry, so index 0 is this pipeline's policy.
    policy = FakeGr00tPolicy.instances[0]
    assert policy.kwargs["model_path"] == "nvidia/GR00T-N1.7-3B"
    assert policy.kwargs["embodiment_tag"] == "LIBERO_PANDA"
    assert policy.kwargs["strict"] is False
    # Weight loading is expected to be a no-op on the pipeline side.
    assert pipeline.weights_sources == ()
    assert pipeline.load_weights(iter(())) == set()


def test_forward_returns_dict_actions_in_multimodal_output():
    """``forward`` returns per-key actions and hands the policy a translated observation."""
    pipeline = _pipeline()
    req = OmniDiffusionRequest(
        prompts=["pick"],
        request_ids=["req"],
        sampling_params=OmniDiffusionSamplingParams(
            extra_args={
                "robot_obs": {
                    "images": {"cam": np.zeros((1, 1, 8, 8, 3), dtype=np.uint8)},
                    "state": {"joint": np.zeros((1, 1, 2), dtype=np.float32)},
                    "prompt": "pick the cube",
                    "session_id": "session-a",
                },
                "reset": True,
            }
        ),
    )

    output = pipeline.forward(req)

    # Actions come back through ``multimodal_output``; ``output`` stays empty.
    assert output.error is None
    assert output.output is None
    actions = output.multimodal_output["actions"]
    assert set(actions) == {"arm", "gripper"}
    # The fake policy emits float64 for "arm"; the pipeline must return float32.
    assert actions["arm"].dtype == np.float32
    np.testing.assert_allclose(actions["arm"], np.array([[[1.0, 2.0]]], dtype=np.float32))

    # The request-level keys must have been translated before reaching the policy.
    policy = FakeGr00tPolicy.instances[0]
    assert "video" in policy.seen_obs
    assert policy.seen_obs["language"] == {"annotation.language.language_instruction": [["pick the cube"]]}
    assert "images" not in policy.seen_obs
    assert "prompt" not in policy.seen_obs
    assert "session_id" not in policy.seen_obs
    # ``"reset": True`` in extra_args must trigger exactly one policy reset.
    assert policy.reset_calls == 1


def test_dummy_warmup_returns_shape_correct_zero_actions():
    """A request without ``robot_obs`` yields zero actions and never calls the policy."""
    pipeline = _pipeline()
    req = OmniDiffusionRequest(
        prompts=["dummy run"],
        request_ids=["dummy_req_id"],
        sampling_params=OmniDiffusionSamplingParams(num_inference_steps=1),
    )

    output = pipeline.forward(req)

    assert output.error is None
    actions = output.multimodal_output["actions"]
    assert set(actions) == {"arm", "gripper"}
    # NOTE(review): the expected shapes line up with the fake policy's two
    # delta indices and its per-key "dim" values (arm=2, gripper=1) -- confirm
    # that is how the pipeline derives them.
    assert actions["arm"].shape == (1, 2, 2)
    assert actions["gripper"].shape == (1, 2, 1)
    assert not actions["arm"].any()
    # ``get_action`` was never invoked, so no observation was recorded.
    assert FakeGr00tPolicy.instances[0].seen_obs is None


def test_reset_delegates_to_policy():
    """``pipeline.reset()`` returns the policy's reset payload and calls it once."""
    pipeline = _pipeline()

    assert pipeline.reset() == {"reset": True}
    assert FakeGr00tPolicy.instances[0].reset_calls == 1
161 changes: 161 additions & 0 deletions tests/e2e/online_serving/test_gr00t_openpi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""End-to-end online serving test for GR00T N1.7 through the OpenPI robot endpoint."""

import os

import numpy as np
import pytest

from tests.gr00t import openpi_client_helper as openpi_client
from tests.helpers.mark import hardware_test
from tests.helpers.runtime import OmniServerParams
from tests.helpers.stage_config import get_deploy_config_path

# Set at import time, i.e. before any fixture in this module runs.
# NOTE(review): presumably this makes vLLM workers use the "spawn" start
# method -- confirm against the vLLM environment variable documentation.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

MODEL = "nvidia/GR00T-N1.7-3B"

# NOTE(review): presumably skips or aborts collection when the OpenPI client
# dependencies are missing -- confirm in tests/gr00t/openpi_client_helper.
openpi_client.require_dependencies()

# The one server configuration shared by every test in this module.
_GR00T_SERVER_PARAMS = OmniServerParams(
    model=MODEL,
    stage_config_path=get_deploy_config_path("Gr00tN1d7.yaml"),
    server_args=["--disable-log-stats"],
    env_dict={"VLLM_DISABLE_COMPILE_CACHE": "1", "GR00T_NOISE_SEED": "42"},
    init_timeout=1200,
    stage_init_timeout=900,
)

test_params = [pytest.param(_GR00T_SERVER_PARAMS, id="gr00t-n1d7-openpi")]

# Reference actions recorded from the Isaac-GR00T ZMQ server
# (nvidia/GR00T-N1.7-3B, OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT embodiment).
# Inputs were those produced by build_droid_observation(): zero images
# (256×256), identity eef_9d, zero gripper/joint, prompt "pick up the object".
# The recorded outputs were bit-reproducible across resets and across runs
# (max_diff=0.0).
#
# Each array holds three rows, one per sampled action step: 0, 4 and 39.
_REF_EEF_9D = np.array(
    [
        # step 0
        [
            0.014888550154864788,
            -0.00039258378092199564,
            -0.013574761338531971,
            0.9999837875366211,
            0.005678884219378233,
            -0.00034708858584053814,
            -0.005677700508385897,
            0.9999783635139465,
            0.0033210734836757183,
        ],
        # step 4
        [
            0.020188070833683014,
            -0.0003105341165792197,
            -0.02232760563492775,
            0.9997346997261047,
            0.02301345206797123,
            0.000983047066256404,
            -0.02302766777575016,
            0.9995648860931396,
            0.018431292846798897,
        ],
        # step 39
        [
            -0.007266733795404434,
            -0.05537768080830574,
            0.03667901083827019,
            0.992686927318573,
            0.1206541359424591,
            0.003906540106981993,
            -0.12024091929197311,
            0.9853788614273071,
            0.12070894986391068,
        ],
    ],
    dtype=np.float32,
)

# One gripper value per sampled step (0, 4, 39).
_REF_GRIPPER = np.array(
    [
        [0.0],
        [0.0078125],
        [0.939453125],
    ],
    dtype=np.float32,
)

_REF_JOINT = np.array(
    [
        # step 0
        [
            -0.0010484338272362947,
            0.0014262489276006818,
            -0.003565810853615403,
            -3.846167237497866e-05,
            -0.0002604846959002316,
            0.008521700277924538,
            -0.006872728932648897,
        ],
        # step 4
        [
            -0.009435676969587803,
            0.0021475711837410927,
            -0.0031688229646533728,
            -1.8328893929719925e-05,
            0.0005945992306806147,
            0.019159257411956787,
            0.0009468861389905214,
        ],
        # step 39
        [
            -0.02944088727235794,
            -0.08419207483530045,
            -0.0251418836414814,
            0.00540524534881115,
            0.04752273112535477,
            -0.012884500436484814,
            0.024298785254359245,
        ],
    ],
    dtype=np.float32,
)


@pytest.mark.advanced_model
@pytest.mark.diffusion
@hardware_test(res={"cuda": "H100"}, num_cards=1)
@pytest.mark.parametrize("omni_server", test_params, indirect=True)
def test_gr00t_n1d7_openpi_online(omni_server) -> None:
    """Run one OpenPI policy session against the live server and validate the result.

    Args:
        omni_server: Running server handle provided by the indirectly
            parametrized fixture; only its ``host`` and ``port`` are used.
    """
    result = openpi_client.run_policy_session(
        host=omni_server.host,
        port=omni_server.port,
        session_id="gr00t-online-e2e",
    )
    openpi_client.validate_session_result(result)


@pytest.mark.advanced_model
@pytest.mark.diffusion
@hardware_test(res={"cuda": "H100"}, num_cards=1)
@pytest.mark.parametrize("omni_server", test_params, indirect=True)
def test_gr00t_n1d7_openpi_precision(omni_server) -> None:
    """Assert actions match Isaac-GR00T reference (GR00T_NOISE_SEED=42, zero inputs).

    Args:
        omni_server: Running server handle provided by the indirectly
            parametrized fixture; only its ``host`` and ``port`` are used.
    """
    client = openpi_client.OpenPIWebsocketClient(host=omni_server.host, port=omni_server.port)
    try:
        obs = openpi_client.build_droid_observation(session_id="gr00t-precision-e2e")
        client.reset({})
        actions = client.infer(obs)
    finally:
        # Always release the connection, even when reset/infer raises.
        client.close()

    # Only three steps of the action horizon are compared; they correspond to
    # the three rows of each reference array.
    steps = [0, 4, 39]
    references = {
        "eef_9d": _REF_EEF_9D,
        "gripper_position": _REF_GRIPPER,
        "joint_position": _REF_JOINT,
    }
    for key, expected in references.items():
        # Absolute tolerance only: rtol=0.0 keeps near-zero reference values
        # from tightening the bound.
        np.testing.assert_allclose(
            actions[key][0, steps, :],
            expected,
            atol=1e-2,
            rtol=0.0,
            err_msg=f"{key} action mismatch vs Isaac-GR00T reference",
        )
Loading
Loading