Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions examples/evaluation/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,4 @@ python \
--host 0.0.0.0 \
--port 8000 \
--num_gpus "$NUM_GPUS" \
--num_replicas "$NUM_REPLICAS" \
--tensor_model_parallel_size 1 \
--pipeline_model_parallel_size 1 \
--context_parallel_size 1
--num_replicas "$NUM_REPLICAS"
11 changes: 10 additions & 1 deletion examples/evaluation/launch_evaluation_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,16 @@ def main(args):
executor=executor,
)
job.start(
command=f"bash /opt/Megatron-Bridge/examples/evaluation/deploy.sh {args.megatron_checkpoint} {args.num_replicas} {args.num_gpus} | tee -a deploy.log & sleep 120; bash /opt/Megatron-Bridge/examples/evaluation/eval.sh {args.output_dir} {args.parallelism} | tee -a eval.log",
command=f"""
bash /opt/Megatron-Bridge/examples/evaluation/deploy.sh \
{args.megatron_checkpoint} \
{args.num_replicas} \
{args.num_gpus}| tee -a deploy.log & \
sleep 120; \
bash /opt/Megatron-Bridge/examples/evaluation/eval.sh \
{args.output_dir} \
{args.parallelism} | tee -a eval.log
""",
workdir=None,
)

Expand Down
2 changes: 1 addition & 1 deletion scripts/performance/setup_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ def main(
error_msg = None
n_attempts = 0
exp_name = (
exp_name[:37] if dgxc_cluster is not None else exp_name
exp_name[:33] if dgxc_cluster is not None else exp_name
) # Some k8s clusters have a limit on the length of the experiment name.
wandb_run_id = None
while n_attempts <= max_retries:
Expand Down
Loading