Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
25cc283
Init
fsiino-nvidia Nov 8, 2025
296526d
Parse server info
fsiino-nvidia Nov 10, 2025
64705b9
Print output
fsiino-nvidia Nov 11, 2025
c36f84b
Merge remote-tracking branch 'github/main' into fsiino/server-health-…
fsiino-nvidia Nov 11, 2025
8cc480a
Round uptime seconds, add tests
fsiino-nvidia Nov 12, 2025
dfc3cfe
Add docs
fsiino-nvidia Nov 13, 2025
4cf6d1f
Fix copyrights
fsiino-nvidia Nov 13, 2025
c3d2fe6
Clean print statement
fsiino-nvidia Nov 13, 2025
2647e8f
Merge remote-tracking branch 'github/main' into fsiino/server-health-…
fsiino-nvidia Nov 13, 2025
4261772
Merge remote-tracking branch 'github/main' into fsiino/server-health-…
fsiino-nvidia Nov 15, 2025
40445c8
Update uv.lock
fsiino-nvidia Nov 18, 2025
1087b83
Merge branch 'main' into fsiino/server-health-status
fsiino-nvidia Dec 1, 2025
ab6feb4
Merge remote-tracking branch 'github/main' into fsiino/server-health-…
fsiino-nvidia Dec 15, 2025
696fdc0
Improve test coverage
fsiino-nvidia Dec 15, 2025
91a525c
Merge remote-tracking branch 'github/main' into fsiino/server-health-…
fsiino-nvidia Dec 16, 2025
5f2532a
Merge remote-tracking branch 'github/main' into fsiino/server-health-…
fsiino-nvidia Dec 16, 2025
00abd06
Move docs
fsiino-nvidia Dec 16, 2025
35e0206
Reuse server info base class
fsiino-nvidia Dec 16, 2025
00e7629
Call headserver instead of process search
fsiino-nvidia Dec 16, 2025
06994c3
Use ServerInstanceDisplayConfig for ng_run and ng_status
fsiino-nvidia Dec 17, 2025
6d174d2
Small format adjustment, update output in docs
fsiino-nvidia Dec 17, 2025
bacf133
Remove ng_version from faq
fsiino-nvidia Dec 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions docs/reference/cli-commands.md
Original file line number Diff line number Diff line change
Expand Up @@ -584,6 +584,49 @@ ng_pip_list +entrypoint=resources_servers/example_single_tool_call +format=json
ng_pip_list +entrypoint=resources_servers/example_single_tool_call +outdated=true
```

---

### `ng_status` / `nemo_gym_status`

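View all currently running NeMo Gym servers and their health status. The command queries the head server started by `ng_run`, so `ng_run` must already be running; otherwise no servers are reported.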

**Example**

```bash
ng_status

NeMo Gym Server Status:

[1] ✓ example_single_tool_call (resources_servers/example_single_tool_call)
{
'server_type': 'resources_servers',
'name': 'example_single_tool_call',
'port': 58117,
'pid': 89904,
'uptime_seconds': '0d 0h 0m 41.5s',
}
[2] ✓ example_single_tool_call_simple_agent (responses_api_agents/simple_agent)
{
'server_type': 'responses_api_agents',
'name': 'simple_agent',
'port': 58118,
'pid': 89905,
'uptime_seconds': '0d 0h 0m 41.5s',
}
[3] ✓ policy_model (responses_api_models/openai_model)
{
'server_type': 'responses_api_models',
'name': 'openai_model',
'port': 58119,
'pid': 89907,
'uptime_seconds': '0d 0h 0m 41.5s',
}

3 servers found (3 healthy, 0 unhealthy)

```

---

## Getting Help

Expand Down
40 changes: 23 additions & 17 deletions nemo_gym/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,15 @@
from signal import SIGINT
from subprocess import Popen
from threading import Thread
from time import sleep
from time import sleep, time
from typing import Dict, List, Optional, Tuple

import psutil
import rich
import uvicorn
from devtools import pprint
from omegaconf import DictConfig, OmegaConf
from pydantic import BaseModel, Field
from pydantic import Field
from tqdm.auto import tqdm

from nemo_gym import PARENT_DIR, __version__
Expand All @@ -49,10 +49,12 @@
GlobalConfigDictParserConfig,
get_global_config_dict,
)
from nemo_gym.server_status import StatusCommand
from nemo_gym.server_utils import (
HEAD_SERVER_KEY_NAME,
HeadServer,
ServerClient,
ServerInstanceDisplayConfig,
ServerStatus,
initialize_ray,
)
Expand Down Expand Up @@ -146,22 +148,10 @@ def dir_path(self) -> Path:
return self._dir_path


class ServerInstanceDisplayConfig(BaseModel):
process_name: str
server_type: str
name: str
dir_path: Path
entrypoint: str
host: Optional[str] = None
port: Optional[int] = None
pid: Optional[int] = None
config_path: str
url: Optional[str] = None


class RunHelper: # pragma: no cover
_head_server: uvicorn.Server
_head_server_thread: Thread
_head_server_instance: HeadServer

_processes: Dict[str, Popen]
_server_instance_display_configs: List[ServerInstanceDisplayConfig]
Expand All @@ -178,13 +168,15 @@ def start(self, global_config_dict_parser_config: GlobalConfigDictParserConfig)
escaped_config_dict_yaml_str = shlex.quote(OmegaConf.to_yaml(global_config_dict))

# We always run the head server in this `run` command.
self._head_server, self._head_server_thread = HeadServer.run_webserver()
self._head_server, self._head_server_thread, self._head_server_instance = HeadServer.run_webserver()

top_level_paths = [k for k in global_config_dict.keys() if k not in NEMO_GYM_RESERVED_TOP_LEVEL_KEYS]

self._processes: Dict[str, Popen] = dict()
self._server_instance_display_configs: List[ServerInstanceDisplayConfig] = []

start_time = time()

# TODO there is a better way to resolve this that uses nemo_gym/global_config.py::ServerInstanceConfig
for top_level_path in top_level_paths:
server_config_dict = global_config_dict[top_level_path]
Expand Down Expand Up @@ -232,9 +224,14 @@ def start(self, global_config_dict_parser_config: GlobalConfigDictParserConfig)
url=f"http://{host}:{port}" if host and port else None,
pid=process.pid,
config_path=top_level_path,
start_time=start_time,
)
)

self._head_server_instance.set_server_instances(
[inst.model_dump(mode="json") for inst in self._server_instance_display_configs]
)

self._server_client = ServerClient(
head_server_config=ServerClient.load_head_server_config(),
global_config_dict=global_config_dict,
Expand Down Expand Up @@ -267,7 +264,7 @@ def display_server_instance_info(self) -> None:

for i, inst in enumerate(self._server_instance_display_configs, 1):
print(f"[{i}] {inst.process_name} ({inst.server_type}/{inst.name})")
pprint(inst.model_dump(mode="json"))
pprint(inst.model_dump(mode="json", exclude={"start_time", "status", "uptime_seconds"}))
print(f"{'#' * 100}\n")

def poll(self) -> None:
Expand Down Expand Up @@ -791,6 +788,15 @@ def display_help(): # pragma: no cover
print(script)


def status():  # pragma: no cover
    """CLI entry point for ``ng_status`` / ``nemo_gym_status``.

    Loads the global config and validates it against ``BaseNeMoGymCLIConfig``,
    then discovers the registered servers and prints their status.
    """
    BaseNeMoGymCLIConfig.model_validate(get_global_config_dict())

    command = StatusCommand()
    command.display_status(command.discover_servers())


class PipListConfig(RunConfig):
format: Optional[str] = Field(
default=None,
Expand Down
5 changes: 3 additions & 2 deletions nemo_gym/dataset_orchestrator.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this intended? can we double check this is using the right license?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. This appears to be the updated licensing info that we have been using for all files.

Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
Expand Down
5 changes: 3 additions & 2 deletions nemo_gym/hf_utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
Expand Down
113 changes: 113 additions & 0 deletions nemo_gym/server_status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from time import time
from typing import List

import requests
from devtools import pprint

from nemo_gym.server_utils import ServerClient, ServerInstanceDisplayConfig, ServerStatus


class StatusCommand:
    """Discover NeMo Gym servers through the head server and report their health."""

    def check_health(self, server_info: ServerInstanceDisplayConfig) -> ServerStatus:
        """Probe a server's URL and classify the outcome.

        Args:
            server_info: Server to probe; only its ``url`` is read.

        Returns:
            ``"success"`` if any HTTP response arrives within 2 seconds (the
            status code is not inspected), ``"timeout"`` if the request timed
            out, ``"connection_error"`` if the connection failed, and
            ``"unknown_error"`` if there is no URL or anything else went wrong.
        """
        if not server_info.url:
            return "unknown_error"

        try:
            requests.get(server_info.url, timeout=2)
        except requests.exceptions.Timeout:
            # Must be tested before ConnectionError: requests' ConnectTimeout
            # derives from both, and a connect timeout should read as "timeout".
            return "timeout"
        except requests.exceptions.ConnectionError:
            return "connection_error"
        except Exception:
            # Deliberate catch-all: a health probe must never crash the command.
            return "unknown_error"
        return "success"

    def discover_servers(self) -> List[ServerInstanceDisplayConfig]:
        """Fetch the registered server instances from the head server and health-check each.

        Returns:
            One entry per instance returned by the head server's
            ``/server_instances`` endpoint, with ``uptime_seconds`` and ``status``
            filled in. An empty list (after printing a hint) if the head server
            cannot be reached or returns an error response.
        """
        try:
            head_server_config = ServerClient.load_head_server_config()
            head_url = f"http://{head_server_config.host}:{head_server_config.port}"

            response = requests.get(f"{head_url}/server_instances", timeout=5)
            response.raise_for_status()
            instances = response.json()
        except (requests.RequestException, ConnectionError) as e:
            print(f"""
Could not connect to head server: {e}
Is the head server running? Start it with: `ng_run`
""")
            return []

        servers = []
        current_time = time()

        for inst in instances:
            # `start_time` may be absent *or* explicitly null in the payload;
            # both mean "unknown" and are reported as zero uptime. The clamp
            # guards against a negative value after a wall-clock adjustment.
            started_at = inst.get("start_time")
            uptime = 0.0 if started_at is None else max(0.0, current_time - started_at)

            server_info = ServerInstanceDisplayConfig(
                process_name=inst["process_name"],
                server_type=inst["server_type"],
                name=inst["name"],
                host=inst.get("host"),
                port=inst.get("port"),
                url=inst.get("url"),
                entrypoint=inst.get("entrypoint"),
                pid=inst.get("pid"),
                uptime_seconds=uptime,
                status="unknown_error",
            )
            server_info.status = self.check_health(server_info)
            servers.append(server_info)

        return servers

    @staticmethod
    def _format_uptime(uptime_seconds: float) -> str:
        """Format a duration in seconds as ``"<d>d <h>h <m>m <s.s>s"``.

        Negative durations are treated as zero.
        """
        # Round to tenths of a second *before* splitting into units so that
        # e.g. 59.96 s carries into the minutes field instead of showing "60.0s".
        tenths = round(max(uptime_seconds, 0.0) * 10)
        minutes, tenths = divmod(tenths, 600)
        hours, minutes = divmod(minutes, 60)
        days, hours = divmod(hours, 24)
        return f"{days}d {hours}h {minutes}m {tenths / 10:.1f}s"

    def display_status(self, servers: List[ServerInstanceDisplayConfig]) -> None:
        """Print one pretty-printed summary per server followed by a health tally.

        Args:
            servers: Servers to display, typically the result of
                ``discover_servers``. An empty list prints a "none found" message.
        """
        if not servers:
            print("No NeMo Gym servers found running.")
            return

        print("\nNeMo Gym Server Status:\n")

        for i, server in enumerate(servers, 1):
            status_icon = "✓" if server.status == "success" else "✗"
            print(f"[{i}] {status_icon} {server.process_name} ({server.server_type}/{server.name})")

            # `uptime_seconds` is Optional on the model; don't crash on None.
            if server.uptime_seconds is None:
                uptime_text = "unknown"
            else:
                uptime_text = self._format_uptime(server.uptime_seconds)

            display_dict = {
                "server_type": server.server_type,
                "name": server.name,
                "port": server.port,
                "pid": server.pid,
                "uptime_seconds": uptime_text,
            }
            pprint(display_dict)

        healthy_count = sum(1 for s in servers if s.status == "success")
        print(f"""
{len(servers)} servers found ({healthy_count} healthy, {len(servers) - healthy_count} unhealthy)
""")
38 changes: 31 additions & 7 deletions nemo_gym/server_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from pathlib import Path
from threading import Thread
from traceback import print_exc
from typing import Literal, Optional, Tuple, Type, Union, Unpack
from typing import List, Literal, Optional, Tuple, Type, Union, Unpack
from uuid import uuid4

import ray
Expand All @@ -45,7 +45,7 @@
from fastapi import FastAPI, Request, Response
from fastapi.exception_handlers import request_validation_exception_handler
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse
from fastapi.responses import JSONResponse, PlainTextResponse
from omegaconf import DictConfig, OmegaConf, open_dict
from pydantic import BaseModel, ConfigDict
from requests.exceptions import ConnectionError
Expand Down Expand Up @@ -543,16 +543,24 @@ def filter(self, record: LogRecord) -> bool:

class HeadServer(BaseServer):
config: BaseServerConfig
_server_instances: List[dict] = []

def setup_webserver(self) -> FastAPI:
app = FastAPI()

app.get("/global_config_dict_yaml")(self.global_config_dict_yaml)
app.get("/global_config_dict_yaml", response_class=PlainTextResponse)(self.global_config_dict_yaml)
app.get("/server_instances")(self.get_server_instances)

return app

def get_server_instances(self) -> List[dict]:
return self._server_instances

def set_server_instances(self, instances: List) -> None:
self._server_instances = instances

@classmethod
def run_webserver(cls) -> Tuple[uvicorn.Server, Thread]: # pragma: no cover
def run_webserver(cls) -> Tuple[uvicorn.Server, Thread, "HeadServer"]: # pragma: no cover
config = ServerClient.load_head_server_config()
server = cls(config=config)

Expand All @@ -563,12 +571,28 @@ def run_webserver(cls) -> Tuple[uvicorn.Server, Thread]: # pragma: no cover
host=server.config.host,
port=server.config.port,
)
server = uvicorn.Server(config=config)
uvicorn_server = uvicorn.Server(config=config)

thread = Thread(target=server.run, daemon=True)
thread = Thread(target=uvicorn_server.run, daemon=True)
thread.start()

return server, thread
return uvicorn_server, thread, server

async def global_config_dict_yaml(self) -> str:
return OmegaConf.to_yaml(get_global_config_dict())


class ServerInstanceDisplayConfig(BaseModel):
    """Display/status record describing one launched server instance.

    Shared by ``ng_run`` (which builds one record per spawned process and
    registers the JSON dumps with the head server) and ``ng_status`` (which
    rebuilds records from the head server's ``/server_instances`` response).
    Every field is optional so that either side can populate only what it knows.
    """

    # Top-level config key the server was started from (set by RunHelper.start).
    config_path: Optional[str] = None
    dir_path: Optional[Path] = None
    entrypoint: Optional[str] = None
    host: Optional[str] = None
    name: Optional[str] = None
    # PID of the spawned server process.
    pid: Optional[int] = None
    port: Optional[int] = None
    process_name: Optional[str] = None
    server_type: Optional[str] = None
    # Wall-clock `time()` value captured by RunHelper.start before launching servers.
    start_time: Optional[float] = None
    # Health-check outcome; filled in by StatusCommand.check_health.
    status: Optional[ServerStatus] = None
    # Seconds elapsed since `start_time`; computed by StatusCommand.discover_servers.
    uptime_seconds: Optional[float] = None
    # "http://{host}:{port}" when both host and port are known, else None.
    url: Optional[str] = None
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,10 @@ ng_dump_config = "nemo_gym.cli:dump_config"
nemo_gym_help = "nemo_gym.cli:display_help"
ng_help = "nemo_gym.cli:display_help"

# Server status
nemo_gym_status = "nemo_gym.cli:status"
ng_status = "nemo_gym.cli:status"

# Environment-specific uv pip list
nemo_gym_pip_list = "nemo_gym.cli:pip_list"
ng_pip_list = "nemo_gym.cli:pip_list"
Expand Down
Loading