NVIDIA-NeMo · bxyu-nvidia · Dec 18, 2025 · Nov 8, 2025 · Nov 10, 2025 · Nov 11, 2025
diff --git a/docs/reference/cli-commands.md b/docs/reference/cli-commands.md
@@ -584,6 +584,49 @@ ng_pip_list +entrypoint=resources_servers/example_single_tool_call +format=json
 ng_pip_list +entrypoint=resources_servers/example_single_tool_call +outdated=true
 ```
 
+---
+
+### `ng_status` / `nemo_gym_status`
+
+View all currently running NeMo Gym servers and their health status. The command queries the head server, so start the servers with `ng_run` first.
+
+**Example**
+
+```bash
+ng_status
+
+NeMo Gym Server Status:
+
+[1] ✓ example_single_tool_call (resources_servers/example_single_tool_call)
+{
+    'server_type': 'resources_servers',
+    'name': 'example_single_tool_call',
+    'port': 58117,
+    'pid': 89904,
+    'uptime_seconds': '0d 0h 0m 41.5s',
+}
+[2] ✓ example_single_tool_call_simple_agent (responses_api_agents/simple_agent)
+{
+    'server_type': 'responses_api_agents',
+    'name': 'simple_agent',
+    'port': 58118,
+    'pid': 89905,
+    'uptime_seconds': '0d 0h 0m 41.5s',
+}
+[3] ✓ policy_model (responses_api_models/openai_model)
+{
+    'server_type': 'responses_api_models',
+    'name': 'openai_model',
+    'port': 58119,
+    'pid': 89907,
+    'uptime_seconds': '0d 0h 0m 41.5s',
+}
+
+3 servers found (3 healthy, 0 unhealthy)
+
+```
+
+---
 
 ## Getting Help
 

diff --git a/nemo_gym/cli.py b/nemo_gym/cli.py
@@ -27,15 +27,15 @@
 from signal import SIGINT
 from subprocess import Popen
 from threading import Thread
-from time import sleep
+from time import sleep, time
 from typing import Dict, List, Optional, Tuple
 
 import psutil
 import rich
 import uvicorn
 from devtools import pprint
 from omegaconf import DictConfig, OmegaConf
-from pydantic import BaseModel, Field
+from pydantic import Field
 from tqdm.auto import tqdm
 
 from nemo_gym import PARENT_DIR, __version__
@@ -49,10 +49,12 @@
     GlobalConfigDictParserConfig,
     get_global_config_dict,
 )
+from nemo_gym.server_status import StatusCommand
 from nemo_gym.server_utils import (
     HEAD_SERVER_KEY_NAME,
     HeadServer,
     ServerClient,
+    ServerInstanceDisplayConfig,
     ServerStatus,
     initialize_ray,
 )
@@ -146,22 +148,10 @@ def dir_path(self) -> Path:
         return self._dir_path
 
 
-class ServerInstanceDisplayConfig(BaseModel):
-    process_name: str
-    server_type: str
-    name: str
-    dir_path: Path
-    entrypoint: str
-    host: Optional[str] = None
-    port: Optional[int] = None
-    pid: Optional[int] = None
-    config_path: str
-    url: Optional[str] = None
-
-
 class RunHelper:  # pragma: no cover
     _head_server: uvicorn.Server
     _head_server_thread: Thread
+    _head_server_instance: HeadServer
 
     _processes: Dict[str, Popen]
     _server_instance_display_configs: List[ServerInstanceDisplayConfig]
@@ -178,13 +168,15 @@ def start(self, global_config_dict_parser_config: GlobalConfigDictParserConfig)
         escaped_config_dict_yaml_str = shlex.quote(OmegaConf.to_yaml(global_config_dict))
 
         # We always run the head server in this `run` command.
-        self._head_server, self._head_server_thread = HeadServer.run_webserver()
+        self._head_server, self._head_server_thread, self._head_server_instance = HeadServer.run_webserver()
 
         top_level_paths = [k for k in global_config_dict.keys() if k not in NEMO_GYM_RESERVED_TOP_LEVEL_KEYS]
 
         self._processes: Dict[str, Popen] = dict()
         self._server_instance_display_configs: List[ServerInstanceDisplayConfig] = []
 
+        start_time = time()
+
         # TODO there is a better way to resolve this that uses nemo_gym/global_config.py::ServerInstanceConfig
         for top_level_path in top_level_paths:
             server_config_dict = global_config_dict[top_level_path]
@@ -232,9 +224,14 @@ def start(self, global_config_dict_parser_config: GlobalConfigDictParserConfig)
                     url=f"http://{host}:{port}" if host and port else None,
                     pid=process.pid,
                     config_path=top_level_path,
+                    start_time=start_time,
                 )
             )
 
+        self._head_server_instance.set_server_instances(
+            [inst.model_dump(mode="json") for inst in self._server_instance_display_configs]
+        )
+
         self._server_client = ServerClient(
             head_server_config=ServerClient.load_head_server_config(),
             global_config_dict=global_config_dict,
@@ -267,7 +264,7 @@ def display_server_instance_info(self) -> None:
 
         for i, inst in enumerate(self._server_instance_display_configs, 1):
             print(f"[{i}] {inst.process_name} ({inst.server_type}/{inst.name})")
-            pprint(inst.model_dump(mode="json"))
+            pprint(inst.model_dump(mode="json", exclude={"start_time", "status", "uptime_seconds"}))
         print(f"{'#' * 100}\n")
 
     def poll(self) -> None:
@@ -791,6 +788,15 @@ def display_help():  # pragma: no cover
         print(script)
 
 
+def status():  # pragma: no cover
+    global_config_dict = get_global_config_dict()
+    BaseNeMoGymCLIConfig.model_validate(global_config_dict)  # result unused; presumably run only to validate the CLI config — confirm
+
+    status_cmd = StatusCommand()
+    servers = status_cmd.discover_servers()  # asks the head server for its instances; [] if the request fails
+    status_cmd.display_status(servers)
+
+
 class PipListConfig(RunConfig):
     format: Optional[str] = Field(
         default=None,

diff --git a/nemo_gym/dataset_orchestrator.py b/nemo_gym/dataset_orchestrator.py
@@ -1,10 +1,11 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,

diff --git a/nemo_gym/hf_utils.py b/nemo_gym/hf_utils.py
@@ -1,10 +1,11 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,

diff --git a/nemo_gym/server_status.py b/nemo_gym/server_status.py
@@ -0,0 +1,126 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from time import time
+from typing import List
+
+import requests
+from devtools import pprint
+
+from nemo_gym.server_utils import ServerClient, ServerInstanceDisplayConfig, ServerStatus
+
+
+class StatusCommand:
+    """Backs the `ng_status` command: lists servers known to the head server and probes each one."""
+
+    def check_health(self, server_info: ServerInstanceDisplayConfig) -> ServerStatus:
+        """Probe a server with an HTTP GET and classify the outcome.
+
+        Any completed response, whatever its status code, is reported as "success"; the
+        probe only establishes that something answered at the URL. A server with no
+        URL cannot be probed and is reported as "unknown_error".
+        """
+        if not server_info.url:
+            return "unknown_error"
+
+        try:
+            requests.get(server_info.url, timeout=2)
+            return "success"
+        except requests.exceptions.Timeout:
+            # Checked before ConnectionError: ConnectTimeout subclasses both, and a
+            # connect timeout should be reported as "timeout".
+            return "timeout"
+        except requests.exceptions.ConnectionError:
+            return "connection_error"
+        except Exception:
+            return "unknown_error"
+
+    def discover_servers(self) -> List[ServerInstanceDisplayConfig]:
+        """Fetch the instances registered with the head server and health-check each one.
+
+        Returns an empty list, after printing a hint, if the head server request fails.
+        """
+
+        try:
+            head_server_config = ServerClient.load_head_server_config()
+            head_url = f"http://{head_server_config.host}:{head_server_config.port}"
+
+            response = requests.get(f"{head_url}/server_instances", timeout=5)
+            response.raise_for_status()
+            instances = response.json()
+
+            servers = []
+            current_time = time()
+
+            for inst in instances:
+                # `start_time` is optional and may arrive as null; fall back to zero
+                # uptime instead of raising TypeError on `float - None`.
+                start_time = inst.get("start_time")
+                uptime = current_time - start_time if start_time is not None else 0.0
+                server_info = ServerInstanceDisplayConfig(
+                    process_name=inst["process_name"],
+                    server_type=inst["server_type"],
+                    name=inst["name"],
+                    host=inst.get("host"),
+                    port=inst.get("port"),
+                    url=inst.get("url"),
+                    entrypoint=inst.get("entrypoint"),
+                    pid=inst.get("pid"),
+                    uptime_seconds=uptime,
+                    status="unknown_error",
+                )
+                server_info.status = self.check_health(server_info)
+                servers.append(server_info)
+
+            return servers
+
+        except (requests.RequestException, ConnectionError) as e:
+            print(f"""
+Could not connect to head server: {e}
+Is the head server running? Start it with: `ng_run`
+            """)
+            return []
+
+    def display_status(self, servers: List[ServerInstanceDisplayConfig]) -> None:
+        """Print a numbered entry per server, then the healthy/unhealthy totals."""
+
+        def format_uptime(uptime_seconds: float) -> str:
+            """Format uptime in a human readable format"""
+            minutes, seconds = divmod(uptime_seconds, 60)
+            hours, minutes = divmod(minutes, 60)
+            days, hours = divmod(hours, 24)
+            return f"{int(days)}d {int(hours)}h {int(minutes)}m {seconds:.1f}s"
+
+        if not servers:
+            print("No NeMo Gym servers found running.")
+            return
+
+        print("\nNeMo Gym Server Status:\n")
+
+        for i, server in enumerate(servers, 1):
+            status_icon = "✓" if server.status == "success" else "✗"
+            print(f"[{i}] {status_icon} {server.process_name} ({server.server_type}/{server.name})")
+            display_dict = {
+                "server_type": server.server_type,
+                "name": server.name,
+                "port": server.port,
+                "pid": server.pid,
+                "uptime_seconds": format_uptime(server.uptime_seconds),
+            }
+            pprint(display_dict)
+
+        healthy_count = sum(1 for s in servers if s.status == "success")
+        print(f"""
+{len(servers)} servers found ({healthy_count} healthy, {len(servers) - healthy_count} unhealthy)
+""")
diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py
@@ -25,7 +25,7 @@
 from pathlib import Path
 from threading import Thread
 from traceback import print_exc
-from typing import Literal, Optional, Tuple, Type, Union, Unpack
+from typing import List, Literal, Optional, Tuple, Type, Union, Unpack
 from uuid import uuid4
 
 import ray
@@ -45,7 +45,7 @@
 from fastapi import FastAPI, Request, Response
 from fastapi.exception_handlers import request_validation_exception_handler
 from fastapi.exceptions import RequestValidationError
-from fastapi.responses import JSONResponse
+from fastapi.responses import JSONResponse, PlainTextResponse
 from omegaconf import DictConfig, OmegaConf, open_dict
 from pydantic import BaseModel, ConfigDict
 from requests.exceptions import ConnectionError
@@ -543,16 +543,24 @@ def filter(self, record: LogRecord) -> bool:
 
 class HeadServer(BaseServer):
     config: BaseServerConfig
+    _server_instances: List[dict] = []
 
     def setup_webserver(self) -> FastAPI:
         app = FastAPI()
 
-        app.get("/global_config_dict_yaml")(self.global_config_dict_yaml)
+        app.get("/global_config_dict_yaml", response_class=PlainTextResponse)(self.global_config_dict_yaml)
+        app.get("/server_instances")(self.get_server_instances)
 
         return app
 
+    def get_server_instances(self) -> List[dict]:  # handler registered for GET /server_instances
+        return self._server_instances
+
+    def set_server_instances(self, instances: List) -> None:  # replaces (does not extend) the stored list
+        self._server_instances = instances
+
     @classmethod
-    def run_webserver(cls) -> Tuple[uvicorn.Server, Thread]:  # pragma: no cover
+    def run_webserver(cls) -> Tuple[uvicorn.Server, Thread, "HeadServer"]:  # pragma: no cover
         config = ServerClient.load_head_server_config()
         server = cls(config=config)
 
@@ -563,12 +571,28 @@ def run_webserver(cls) -> Tuple[uvicorn.Server, Thread]:  # pragma: no cover
             host=server.config.host,
             port=server.config.port,
         )
-        server = uvicorn.Server(config=config)
+        uvicorn_server = uvicorn.Server(config=config)
 
-        thread = Thread(target=server.run, daemon=True)
+        thread = Thread(target=uvicorn_server.run, daemon=True)
         thread.start()
 
-        return server, thread
+        return uvicorn_server, thread, server
 
     async def global_config_dict_yaml(self) -> str:
         return OmegaConf.to_yaml(get_global_config_dict())
+
+
+class ServerInstanceDisplayConfig(BaseModel):  # per-server record built by the CLI; every field is optional
+    config_path: Optional[str] = None  # top-level config key the server was declared under
+    dir_path: Optional[Path] = None
+    entrypoint: Optional[str] = None
+    host: Optional[str] = None
+    name: Optional[str] = None
+    pid: Optional[int] = None  # PID of the spawned server process
+    port: Optional[int] = None
+    process_name: Optional[str] = None
+    server_type: Optional[str] = None
+    start_time: Optional[float] = None  # time.time() value captured when the servers were launched
+    status: Optional[ServerStatus] = None  # result of the most recent health probe, if one was run
+    uptime_seconds: Optional[float] = None  # seconds elapsed since start_time, computed at query time
+    url: Optional[str] = None  # "http://{host}:{port}" when both are known, else None
@@ -285,6 +285,10 @@ ng_dump_config = "nemo_gym.cli:dump_config"
 nemo_gym_help = "nemo_gym.cli:display_help"
 ng_help = "nemo_gym.cli:display_help"
 
+# Server status
+nemo_gym_status = "nemo_gym.cli:status"
+ng_status = "nemo_gym.cli:status"
+
 # Environment-specific uv pip list
 nemo_gym_pip_list = "nemo_gym.cli:pip_list"
 ng_pip_list = "nemo_gym.cli:pip_list"