Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
84c8c0e
chore: Add evaluation pipeline
ko3n1g Jan 7, 2026
ab8060c
refactor
ko3n1g Jan 7, 2026
8e5256a
revert
ko3n1g Jan 8, 2026
81e0a4b
refactor
ko3n1g Jan 9, 2026
01883ec
upload to wandb
ko3n1g Jan 9, 2026
74a02e5
write to wandb
ko3n1g Jan 9, 2026
7696d43
docstrings
ko3n1g Jan 9, 2026
9b40152
linting
ko3n1g Jan 9, 2026
b932ca4
copyright
ko3n1g Jan 9, 2026
d3251cc
add argument builder
ko3n1g Jan 9, 2026
c869b88
move scripts
ko3n1g Jan 12, 2026
1228135
kuberay_executor
ko3n1g Jan 13, 2026
9732ecd
KubeRayExecutor
ko3n1g Jan 13, 2026
cf93c86
KubeRayExecutor
ko3n1g Jan 13, 2026
bfcf4d5
tmp
ko3n1g Jan 14, 2026
79d5cce
tmp
ko3n1g Jan 14, 2026
0fd27e6
dont fail on missing dir
ko3n1g Jan 18, 2026
1d5b07f
fixes
ko3n1g Jan 18, 2026
33c0536
logging
ko3n1g Jan 18, 2026
5745dfa
tmp
ko3n1g Jan 19, 2026
67fa44d
update
ko3n1g Jan 19, 2026
4aff1d3
update
ko3n1g Jan 19, 2026
a74faba
d(0GL9TJ2y{d6(w0/<(V
ko3n1g Jan 19, 2026
4d8f1c8
fix
ko3n1g Jan 19, 2026
34d14f6
/examples/evaluation
ko3n1g Jan 19, 2026
ea42d5a
fix
ko3n1g Jan 19, 2026
d07b929
fixes
ko3n1g Jan 23, 2026
1e226ae
fix
ko3n1g Jan 23, 2026
653f35e
cleanup
ko3n1g Jan 23, 2026
69f22ae
fix
ko3n1g Jan 23, 2026
39e435f
cleanup
ko3n1g Jan 23, 2026
5b6b7a3
attach to existing run
ko3n1g Jan 23, 2026
d5aac3e
fix
ko3n1g Jan 23, 2026
ddaaf99
fix
ko3n1g Jan 23, 2026
c598a42
fix
ko3n1g Jan 23, 2026
5c075ee
Merge remote-tracking branch 'origin/main' into ko3n1g/chore/evaluati…
ko3n1g Jan 23, 2026
896ede1
status
ko3n1g Jan 23, 2026
c417081
fix
ko3n1g Jan 23, 2026
75b9d61
fix
ko3n1g Jan 23, 2026
3b78b5e
cleanup
ko3n1g Jan 23, 2026
b2f092a
copyright
ko3n1g Jan 24, 2026
4308a35
Merge remote-tracking branch 'origin/main' into ko3n1g/chore/evaluati…
ko3n1g Jan 24, 2026
d86bbfe
desc
ko3n1g Jan 24, 2026
7e335c1
fixes
ko3n1g Jan 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions examples/evaluation/argument_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import shlex

from argument_parser import parse_cli_args


def list_of_strings(arg):
    """Turn a comma-separated string into the list of its pieces."""
    pieces = arg.split(",")
    return pieces


def normalize_arg_name(arg_name: str) -> str:
    """
    Map a command-line flag (e.g. '--model_family_name' or '-m') onto the
    environment-variable name it corresponds to (e.g. 'MODEL_FAMILY_NAME').
    """
    stripped = arg_name.lstrip("-")
    return stripped.upper().replace("-", "_")


def build_cli_args_from_env_vars(parser: argparse.ArgumentParser) -> str:
"""
Inspects an argparse.ArgumentParser, checks for corresponding environment
variables, and constructs a CLI argument string from them.
"""
cli_arg_string = []

for action in parser._actions:
if action.option_strings:
long_arg_name = action.option_strings[-1]
env_var_name = normalize_arg_name(long_arg_name)
env_value = os.getenv(env_var_name)

if env_value is not None:
if isinstance(action, argparse._StoreTrueAction):
is_true = env_value.lower() in ("true", "1", "yes", "on")
if is_true:
cli_arg_string.append(long_arg_name)
continue
elif action.type is list_of_strings:
if env_value:
cli_arg_string.append(long_arg_name)
cli_arg_string.append(env_value)
continue
else:
cli_arg_string.append(long_arg_name)
cli_arg_string.append(env_value)

return " ".join(cli_arg_string)


if __name__ == "__main__":
    # Emit the env-var-derived CLI string so a wrapping shell script can
    # capture it and splice it into the eventual command line.
    print(build_cli_args_from_env_vars(parse_cli_args()))
247 changes: 247 additions & 0 deletions examples/evaluation/argument_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse


def list_of_strings(arg):
    """Parse a comma-separated CLI value into its component strings."""
    separator = ","
    return arg.split(separator)


def to_dict(arg):
    """
    Parse a comma-separated ``KEY=VALUE`` string into a dictionary.

    Each item is split on the FIRST '=' only, so values may themselves
    contain '=' (e.g. ``"OPTS=a=b"`` parses to ``{"OPTS": "a=b"}``);
    the original unbounded split raised ValueError on such input.
    """
    return dict(item.split("=", 1) for item in arg.split(","))


# Maps the user-facing --endpoint_type choice to the server URL path suffix
# for that API flavor (OpenAI-style chat vs. plain completions).
ENDPOINT_TYPES = {"chat": "chat/completions/", "completions": "completions/"}


def parse_cli_args():
    """
    Build the command line parser for launching Megatron-Bridge Evaluation.

    Returns:
        argparse.ArgumentParser: the fully configured parser. Note that,
        despite the name, this function does NOT call ``parse_args()``;
        callers invoke it themselves (or only inspect the parser's options,
        as ``argument_builder.build_cli_args_from_env_vars`` does).
    """
    parser = argparse.ArgumentParser(description="Launch Megatron-Bridge Evaluation")
    parser.add_argument(
        "--dryrun",
        action="store_true",
        help="Dry run the experiment.",
        default=False,
    )

    # Deployment args
    deployment_args = parser.add_argument_group("Deployment arguments")
    deployment_args.add_argument("--megatron_checkpoint", type=str, help="Megatron checkpoint to evaluate")
    deployment_args.add_argument(
        "--host",
        type=str,
        help="Server address to use for evaluation",
        default="0.0.0.0",
    )
    deployment_args.add_argument("--port", type=int, help="Server port to use for evaluation", default=8000)
    deployment_args.add_argument("--gpus_per_node", type=int, help="Number of GPUs per node", default=8)
    # Help text fixed: this flag controls the GPU count, not the node count.
    deployment_args.add_argument("--num_gpus", type=int, help="Number of GPUs to use for evaluation", default=8)
    deployment_args.add_argument("--num_replicas", type=int, default=1, help="Num of replicas for Ray server")
    deployment_args.add_argument(
        "--tensor_model_parallel_size",
        type=int,
        help="Tensor model parallel size to use for evaluation",
        default=1,
    )
    deployment_args.add_argument(
        "--pipeline_model_parallel_size",
        type=int,
        help="Pipeline model parallel size to use for evaluation",
        default=1,
    )
    deployment_args.add_argument(
        "--context_model_parallel_size",
        type=int,
        help="Context model parallel size to use for evaluation",
        default=1,
    )

    # Evaluation args
    evaluation_args = parser.add_argument_group("Evaluation arguments")
    evaluation_args.add_argument(
        "--endpoint_type",
        type=str,
        default="completions",
        help="Whether to use completions or chat endpoint. Refer to the docs for details on tasks that are completions"
        "v/s chat.",
        choices=list(ENDPOINT_TYPES),
    )
    evaluation_args.add_argument(
        "--limit_samples",
        type=float,
        default=None,
        help="Limit evaluation to `limit` samples. Default: use all samples.",
    )
    evaluation_args.add_argument(
        "--parallelism",
        type=int,
        default=8,
        help="Number of parallel requests to send to server. Default: use default for the task.",
    )
    evaluation_args.add_argument(
        "--request_timeout",
        type=int,
        default=1000,
        help="Time in seconds for the eval client. Default: 1000s",
    )
    evaluation_args.add_argument(
        "--temperature",
        type=float,
        default=None,
        help="Sampling temperature for generation. Higher values = more random. Default: use task default.",
    )
    evaluation_args.add_argument(
        "--top_p",
        type=float,
        default=None,
        help="Top-p (nucleus) sampling threshold. Default: use task default.",
    )
    evaluation_args.add_argument(
        "--top_k",
        type=int,
        default=None,
        help="Top-k sampling threshold. Default: use task default.",
    )
    evaluation_args.add_argument(
        "--eval_task",
        type=str,
        default="mmlu",
        help="Evaluation benchmark to run. Refer to the docs for more details on the tasks/benchmarks.",
    )

    # Slurm args
    slurm_args = parser.add_argument_group("Slurm arguments")
    slurm_args.add_argument(
        "--custom_mounts", type=list_of_strings, help="Comma separated string of mounts", default=[], required=False
    )
    slurm_args.add_argument(
        "--custom_env_vars",
        type=to_dict,
        help="Comma separated string of environment variables",
        default={},
        required=False,
    )
    slurm_args.add_argument("--account", type=str, help="Cluster account to run test")
    slurm_args.add_argument("--partition", type=str, help="Cluster partition to run test")
    slurm_args.add_argument("--time_limit", type=str, default="04:00:00", help="Time limit of run")
    slurm_args.add_argument("--container_image", type=str, default="", help="Container image to run")

    # Logging args
    logging_args = parser.add_argument_group("Logging arguments")
    logging_args.add_argument(
        "--output_dir",
        type=str,
        help="Output directory to save the results",
        required=False,
    )
    logging_args.add_argument(
        "--experiment_name",
        type=str,
        help="wandb job name",
        required=False,
    )
    logging_args.add_argument(
        "--wandb_key",
        type=str,
        help="wandb key. Needed for wandb logger projection to server",
        required=False,
    )
    logging_args.add_argument(
        "--wandb_project_name",
        type=str,
        help="wandb project name",
        required=False,
    )
    logging_args.add_argument(
        "--wandb_entity_name",
        type=str,
        help="wandb entity name",
        required=False,
    )
    logging_args.add_argument(
        "--wandb_experiment_name",
        type=str,
        help="wandb job name",
        required=False,
    )

    # Tokenizer args
    tokenizer_args = parser.add_argument_group("Tokenizer arguments")
    tokenizer_args.add_argument(
        "-hf",
        "--hf_token",
        type=str,
        help="HuggingFace token. Defaults to None. Required for accessing tokenizers and checkpoints.",
    )

    # DGXCloud
    dgxc_args = parser.add_argument_group("DGXCloud arguments")
    dgxc_args.add_argument(
        "--dgxc_cluster",
        type=str,
        help="DGXCloud cluster to use for experiment",
        required=False,
    )
    dgxc_args.add_argument(
        "--dgxc_base_url",
        type=str,
        help="DGXCloud base url",
        required=False,
    )
    dgxc_args.add_argument(
        "--dgxc_kube_apiserver_url",
        type=str,
        help="DGXCloud kube apiserver url",
        required=False,
    )
    dgxc_args.add_argument(
        "--dgxc_app_id",
        type=str,
        help="DGXCloud app id",
        required=False,
    )
    dgxc_args.add_argument(
        "--dgxc_app_secret",
        type=str,
        help="DGXCloud app secret",
        required=False,
    )
    dgxc_args.add_argument(
        "--dgxc_project_name",
        type=str,
        help="DGXCloud project name",
        required=False,
    )
    dgxc_args.add_argument(
        "--dgxc_pvc_claim_name",
        type=str,
        help="DGXCloud pvc claim name",
        required=False,
    )
    dgxc_args.add_argument(
        "--dgxc_pvc_mount_path",
        type=str,
        help="DGXCloud pvc mount path",
        required=False,
    )
    dgxc_args.add_argument(
        "--dgxc_namespace",
        type=str,
        help="DGXCloud namespace",
        required=False,
    )

    return parser
19 changes: 19 additions & 0 deletions examples/evaluation/deploy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash
# Deploy a Megatron checkpoint behind a Ray in-framework inference server.
#
# Usage: deploy.sh MEGATRON_CHECKPOINT NUM_REPLICAS NUM_GPUS

# Unset SLURM/PMI/PMIX env vars to prevent MPI initialization issues
for i in $(env | grep ^SLURM_ | cut -d"=" -f 1); do unset -v "$i"; done
for i in $(env | grep ^PMI_ | cut -d"=" -f 1); do unset -v "$i"; done
for i in $(env | grep ^PMIX_ | cut -d"=" -f 1); do unset -v "$i"; done

MEGATRON_CHECKPOINT=$1
NUM_REPLICAS=$2
NUM_GPUS=$3

# Serve on all interfaces at port 8000; parallelism sizes are fixed at 1
# here (the deploy script shards across replicas/GPUs instead).
python \
    /opt/Export-Deploy/scripts/deploy/nlp/deploy_ray_inframework.py \
    --megatron_checkpoint "$MEGATRON_CHECKPOINT" \
    --model_id megatron_model \
    --host 0.0.0.0 \
    --port 8000 \
    --num_gpus "$NUM_GPUS" \
    --num_replicas "$NUM_REPLICAS" \
    --tensor_model_parallel_size 1 \
    --pipeline_model_parallel_size 1 \
    --context_parallel_size 1
Loading
Loading