diff --git a/BingBertSquad/nvidia_run_squad_deepspeed.py b/BingBertSquad/nvidia_run_squad_deepspeed.py index 7c112b725..454d2c7c9 100755 --- a/BingBertSquad/nvidia_run_squad_deepspeed.py +++ b/BingBertSquad/nvidia_run_squad_deepspeed.py @@ -741,6 +741,8 @@ def set_optimizer_params_grad(named_params_optimizer, def main(): parser = get_argument_parser() + torch.distributed.init_process_group(backend='nccl') + # Include DeepSpeed configuration arguments parser = deepspeed.add_config_arguments(parser) diff --git a/BingBertSquad/run_squad_deepspeed.sh b/BingBertSquad/run_squad_deepspeed.sh index 3c5c15b72..6095c76ca 100755 --- a/BingBertSquad/run_squad_deepspeed.sh +++ b/BingBertSquad/run_squad_deepspeed.sh @@ -32,7 +32,7 @@ else GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE)) fi JOB_NAME="deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size" -config_json=onebit_deepspeed_bsz24_config.json +config_json=deepspeed_bsz24_config.json run_cmd="deepspeed --num_nodes ${NUM_NODES} --num_gpus ${NGPU_PER_NODE} \ --master_port=${MASTER_PORT} \ --hostfile ${HOSTFILE} \ diff --git a/BingBertSquad/turing/nvidia_modelingpreln.py b/BingBertSquad/turing/nvidia_modelingpreln.py index 91e9b3c4f..ebe4e75f9 100755 --- a/BingBertSquad/turing/nvidia_modelingpreln.py +++ b/BingBertSquad/turing/nvidia_modelingpreln.py @@ -529,7 +529,6 @@ def __init__(self, config, args): cuda_config = DeepSpeedTransformerConfig( batch_size=ds_config.train_micro_batch_size_per_gpu, - max_seq_length=args.max_seq_length, hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, heads=config.num_attention_heads, @@ -537,6 +536,7 @@ def __init__(self, config, args): hidden_dropout_ratio=config.hidden_dropout_prob, num_hidden_layers=config.num_hidden_layers, initializer_range=config.initializer_range, + local_rank=args.local_rank, seed=args.seed, fp16=ds_config.fp16_enabled, pre_layer_norm=True) diff --git a/bing_bert/nvidia/modelingpreln.py b/bing_bert/nvidia/modelingpreln.py index 099b894f3..3a9d36f81 100755 --- a/bing_bert/nvidia/modelingpreln.py +++ b/bing_bert/nvidia/modelingpreln.py @@ -574,7 +574,6 @@ def __init__(self, config, args, sparse_attention_config=None): ds_config = get_deepspeed_config(args) cuda_config = DeepSpeedTransformerConfig( batch_size=ds_config.train_micro_batch_size_per_gpu, - max_seq_length=args.max_seq_length, hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, heads=config.num_attention_heads,