diff --git a/examples/evaluation/deploy.sh b/examples/evaluation/deploy.sh index c1418f9ed5..bac60d966e 100644 --- a/examples/evaluation/deploy.sh +++ b/examples/evaluation/deploy.sh @@ -13,7 +13,4 @@ python \ --host 0.0.0.0 \ --port 8000 \ --num_gpus "$NUM_GPUS" \ - --num_replicas "$NUM_REPLICAS" \ - --tensor_model_parallel_size 1 \ - --pipeline_model_parallel_size 1 \ - --context_parallel_size 1 + --num_replicas "$NUM_REPLICAS" diff --git a/examples/evaluation/launch_evaluation_pipeline.py b/examples/evaluation/launch_evaluation_pipeline.py index 1c3a1aaadb..5b8910ad69 100644 --- a/examples/evaluation/launch_evaluation_pipeline.py +++ b/examples/evaluation/launch_evaluation_pipeline.py @@ -111,7 +111,16 @@ def main(args): executor=executor, ) job.start( - command=f"bash /opt/Megatron-Bridge/examples/evaluation/deploy.sh {args.megatron_checkpoint} {args.num_replicas} {args.num_gpus} | tee -a deploy.log & sleep 120; bash /opt/Megatron-Bridge/examples/evaluation/eval.sh {args.output_dir} {args.parallelism} | tee -a eval.log", + command=f""" + bash /opt/Megatron-Bridge/examples/evaluation/deploy.sh \ + {args.megatron_checkpoint} \ + {args.num_replicas} \ + {args.num_gpus}| tee -a deploy.log & \ + sleep 120; \ + bash /opt/Megatron-Bridge/examples/evaluation/eval.sh \ + {args.output_dir} \ + {args.parallelism} | tee -a eval.log + """, workdir=None, ) diff --git a/scripts/performance/setup_experiment.py b/scripts/performance/setup_experiment.py index 045b45f184..b140960493 100755 --- a/scripts/performance/setup_experiment.py +++ b/scripts/performance/setup_experiment.py @@ -403,7 +403,7 @@ def main( error_msg = None n_attempts = 0 exp_name = ( - exp_name[:37] if dgxc_cluster is not None else exp_name + exp_name[:33] if dgxc_cluster is not None else exp_name ) # Some k8s clusters have a limit on the length of the experiment name. wandb_run_id = None while n_attempts <= max_retries: