Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions examples/evaluation/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ for i in $(env | grep ^PMIX_ | cut -d"=" -f 1); do unset -v $i; done
MEGATRON_CHECKPOINT=$1
NUM_REPLICAS=$2
NUM_GPUS=$3
TP=$4
PP=$5
CP=$6
python \
/opt/Export-Deploy/scripts/deploy/nlp/deploy_ray_inframework.py \
--megatron_checkpoint "$MEGATRON_CHECKPOINT" \
Expand All @@ -14,6 +17,6 @@ python \
--port 8000 \
--num_gpus "$NUM_GPUS" \
--num_replicas "$NUM_REPLICAS" \
--tensor_model_parallel_size 1 \
--pipeline_model_parallel_size 1 \
--context_parallel_size 1
--tensor_model_parallel_size "$TP" \
--pipeline_model_parallel_size "$PP" \
--context_parallel_size "$CP"
14 changes: 13 additions & 1 deletion examples/evaluation/launch_evaluation_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,19 @@ def main(args):
executor=executor,
)
job.start(
command=f"bash /opt/Megatron-Bridge/examples/evaluation/deploy.sh {args.megatron_checkpoint} {args.num_replicas} {args.num_gpus} | tee -a deploy.log & sleep 120; bash /opt/Megatron-Bridge/examples/evaluation/eval.sh {args.output_dir} {args.parallelism} | tee -a eval.log",
command=f"""
bash /opt/Megatron-Bridge/examples/evaluation/deploy.sh \
{args.megatron_checkpoint} \
{args.num_replicas} \
{args.num_gpus} \
{args.tensor_model_parallel_size} \
{args.pipeline_model_parallel_size} \
{args.context_model_parallel_size} | tee -a deploy.log & \
sleep 120; \
bash /opt/Megatron-Bridge/examples/evaluation/eval.sh \
{args.output_dir} \
{args.parallelism} | tee -a eval.log
""",
workdir=None,
)

Expand Down
2 changes: 1 addition & 1 deletion scripts/performance/setup_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ def main(
error_msg = None
n_attempts = 0
exp_name = (
exp_name[:37] if dgxc_cluster is not None else exp_name
exp_name[:33] if dgxc_cluster is not None else exp_name
) # Some k8s clusters have a limit on the length of the experiment name.
wandb_run_id = None
while n_attempts <= max_retries:
Expand Down
Loading