-
Notifications
You must be signed in to change notification settings - Fork 37
Add job-based port offset to avoid cross-job port conflicts #195
base: main
Are you sure you want to change the base?
Changes from all commits
c8c3ec6
5453864
f9fdd32
13c084d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -110,13 +110,29 @@ def start_worker(self, process: "Process", endpoint_processes: list["Process"]) | |
| ) | ||
|
|
||
| # Environment variables | ||
| env_to_set = { | ||
| "HEAD_NODE_IP": self.runtime.head_node_ip, | ||
| "ETCD_ENDPOINTS": f"http://{self.runtime.nodes.infra}:2379", | ||
| "NATS_SERVER": f"nats://{self.runtime.nodes.infra}:4222", | ||
| "DYN_SYSTEM_PORT": str(process.sys_port), | ||
| "DYN_REQUEST_PLANE": "nats", | ||
| } | ||
| env_to_set: dict[str, str] = {"HEAD_NODE_IP": self.runtime.head_node_ip} | ||
|
|
||
| # Only Dynamo workers require etcd/NATS + system status server port. | ||
| if self.config.frontend.type == "dynamo": | ||
| from srtctl.core.slurm import get_port_offset | ||
|
|
||
| port_offset = get_port_offset(self.runtime.job_id) | ||
| nats_port = 4222 + port_offset | ||
| etcd_port = 2379 + port_offset | ||
|
|
||
| env_to_set.update( | ||
| { | ||
| "ETCD_ENDPOINTS": f"http://{self.runtime.infra_node_ip}:{etcd_port}", | ||
| "NATS_SERVER": f"nats://{self.runtime.infra_node_ip}:{nats_port}", | ||
| "DYN_SYSTEM_PORT": str(process.sys_port), | ||
| } | ||
| ) | ||
|
|
||
| # Keep request-plane consistent across frontend/workers | ||
| frontend_plane = None | ||
| if self.config.frontend.env: | ||
| frontend_plane = self.config.frontend.env.get("DYN_REQUEST_PLANE") | ||
| env_to_set["DYN_REQUEST_PLANE"] = frontend_plane if frontend_plane else "nats" | ||
|
|
||
|
Comment on lines
+113
to
136
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Offset/profile fixes are incomplete across worker launch modes.
🛠️ Proposed parity fix for endpoint-launch mode:
@@
- env_to_set = {
- "HEAD_NODE_IP": self.runtime.head_node_ip,
- "ETCD_ENDPOINTS": f"http://{self.runtime.nodes.infra}:2379",
- "NATS_SERVER": f"nats://{self.runtime.nodes.infra}:4222",
- "DYN_SYSTEM_PORT": str(leader.sys_port),
- }
+ env_to_set = {"HEAD_NODE_IP": self.runtime.head_node_ip, "DYN_SYSTEM_PORT": str(leader.sys_port)}
+ if self.config.frontend.type == "dynamo":
+ from srtctl.core.slurm import get_port_offset
+ port_offset = get_port_offset(self.runtime.job_id)
+ env_to_set["ETCD_ENDPOINTS"] = f"http://{self.runtime.infra_node_ip}:{2379 + port_offset}"
+ env_to_set["NATS_SERVER"] = f"nats://{self.runtime.infra_node_ip}:{4222 + port_offset}"
@@
- if profiling.enabled:
- profile_dir = str(self.runtime.log_dir / "profiles")
- env_to_set.update(profiling.get_env_vars(mode, profile_dir))
+ if profiling.enabled:
+ env_to_set.update(profiling.get_env_vars(mode, "/logs/profiles"))
Also applies to: 158-160
🤖 Prompt for AI Agents
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Added a TODO for TRT-LLM.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
🧠 Learnings used |
||
| # Add mode-specific environment variables from backend | ||
| # Support simple {node} and {node_id} templating | ||
|
|
@@ -139,8 +155,9 @@ def __missing__(self, key: str) -> str: | |
|
|
||
| # Add profiling environment variables | ||
| if profiling.enabled: | ||
| profile_dir = str(self.runtime.log_dir / "profiles") | ||
| env_to_set.update(profiling.get_env_vars(mode, profile_dir)) | ||
| # /logs is the mounted host log directory inside the container. | ||
| profile_dir_in_container = "/logs/profiles" | ||
| env_to_set.update(profiling.get_env_vars(mode, profile_dir_in_container)) | ||
|
|
||
| # Set CUDA_VISIBLE_DEVICES if not using all GPUs | ||
| if len(process.gpu_indices) < self.runtime.gpus_per_node: | ||
|
|
@@ -228,6 +245,9 @@ def start_endpoint_worker(self, endpoint_processes: list["Process"]) -> ManagedP | |
| ) | ||
|
|
||
| # Environment variables | ||
| # TODO: port-offset is only applied in start_worker() (SGLang path). | ||
| # This MPI-style path (TRTLLM) still uses hardcoded NATS/etcd ports. | ||
| # If TRTLLM needs port-offset support, mirror the dynamo env logic from start_worker(). | ||
| env_to_set = { | ||
| "HEAD_NODE_IP": self.runtime.head_node_ip, | ||
| "ETCD_ENDPOINTS": f"http://{self.runtime.nodes.infra}:2379", | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.