diff --git a/.gitignore b/.gitignore index faf4e2aff..0e8a0cfad 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,7 @@ __pycache__/ # C extensions *.so - +.vscode # Distribution / packaging .idea .Python @@ -129,3 +129,8 @@ dmypy.json # Pyre type checker .pyre/ +applications/DeepSpeed-Chat/training/output/ds_tensorboard_logs/* +applications/DeepSpeed-Chat/training/output/* +applications/DeepSpeed-Chat/training/step1_supervised_finetuning/output/* +applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/output/* +applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/output/* \ No newline at end of file diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py index f7b91fa17..0fbc6f02d 100755 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py @@ -63,9 +63,17 @@ def parse_args(): parser.add_argument( "--model_name_or_path", type=str, + default="facebook/opt-1.3b", help= "Path to pretrained model or model identifier from huggingface.co/models.", - required=True, + # required=True, + ) + parser.add_argument( + "--tokenizer_name_or_path", + type=str, + default=None, + help= + "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models, if None, use `model_name_or_path`" ) parser.add_argument( "--per_device_train_batch_size", @@ -209,7 +217,10 @@ def main(): torch.distributed.barrier() # load_hf_tokenizer will get the correct tokenizer and set padding tokens based on the model family - tokenizer = load_hf_tokenizer(args.model_name_or_path, fast_tokenizer=True) + # Occasionally , some repo owners of huggingface hub, such as bigscience, would like to separate model and tokenizer + tokenizer_name_or_path = args.model_name_or_path if not args.tokenizer_name_or_path else args.tokenizer_name_or_path + + tokenizer = load_hf_tokenizer(tokenizer_name_or_path, fast_tokenizer=True) model = create_hf_model(AutoModelForCausalLM, args.model_name_or_path, tokenizer, diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_log_output/opt-1.3b-globalBatchSize128.log b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_log_output/opt-1.3b-globalBatchSize128.log deleted file mode 100644 index ed4b8e003..000000000 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_log_output/opt-1.3b-globalBatchSize128.log +++ /dev/null @@ -1,1680 +0,0 @@ -cmd = deepspeed main.py --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets --data_split 2,4,4 --model_name_or_path facebook/opt-1.3b --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --max_seq_len 512 --learning_rate 9.65e-6 --weight_decay 0. --num_train_epochs 16 --gradient_accumulation_steps 1 --lr_scheduler_type cosine --num_warmup_steps 0 --seed 1234 --zero_stage 2 --deepspeed --output_dir ./output_fourDatasets_reproduce_withDropout -***** Running training ***** -***** Evaluating perplexity, Epoch 0/16 ***** -ppl: 4998.29638671875 -Beginning of Epoch 1/16, Total Micro Batches 460 -[2023-04-18 01:46:52,554] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 01:46:53,259] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 01:46:53,967] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384 -[2023-04-18 01:46:54,678] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192 -[2023-04-18 01:46:55,387] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096 -[2023-04-18 01:46:57,680] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048 -[2023-04-18 01:46:59,142] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=6, lr=[9.649992967150629e-06, 9.649992967150629e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:46:59,171] [INFO] [timer.py:199:stop] epoch=0/micro_step=10/global_step=10, RunningAvgSamplesPerSec=173.4455644119137, CurrSamplesPerSec=171.97903593624417, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:47:06,590] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=6, lr=[9.649913847830647e-06, 9.649913847830647e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:47:06,619] [INFO] [timer.py:199:stop] epoch=0/micro_step=20/global_step=20, RunningAvgSamplesPerSec=172.66319574149418, CurrSamplesPerSec=171.4073346136102, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:47:14,046] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=6, lr=[9.649746819575313e-06, 9.649746819575313e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:47:14,074] [INFO] [timer.py:199:stop] epoch=0/micro_step=30/global_step=30, RunningAvgSamplesPerSec=172.38095135217128, CurrSamplesPerSec=171.8685942346471, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:47:21,493] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=6, lr=[9.649491885427845e-06, 9.649491885427845e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:47:21,522] [INFO] [timer.py:199:stop] epoch=0/micro_step=40/global_step=40, RunningAvgSamplesPerSec=172.29462323261063, CurrSamplesPerSec=172.31751921224577, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:47:28,945] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=6, lr=[9.649149050033092e-06, 9.649149050033092e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:47:28,973] [INFO] [timer.py:199:stop] epoch=0/micro_step=50/global_step=50, RunningAvgSamplesPerSec=172.2269859085574, CurrSamplesPerSec=172.28478301819888, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:47:36,399] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=6, lr=[9.648718319637444e-06, 9.648718319637444e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:47:36,427] [INFO] [timer.py:199:stop] epoch=0/micro_step=60/global_step=60, RunningAvgSamplesPerSec=172.16926659466986, CurrSamplesPerSec=171.84587384312238, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:47:43,854] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=6, lr=[9.64819970208872e-06, 9.64819970208872e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:47:43,882] [INFO] [timer.py:199:stop] epoch=0/micro_step=70/global_step=70, RunningAvgSamplesPerSec=172.1259539036514, CurrSamplesPerSec=171.66114319917736, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:47:51,314] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=6, lr=[9.647593206836023e-06, 9.647593206836023e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:47:51,343] [INFO] [timer.py:199:stop] epoch=0/micro_step=80/global_step=80, RunningAvgSamplesPerSec=172.07719993527658, CurrSamplesPerSec=172.17173979204222, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:47:58,776] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=6, lr=[9.646898844929575e-06, 9.646898844929575e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:47:58,805] [INFO] [timer.py:199:stop] epoch=0/micro_step=90/global_step=90, RunningAvgSamplesPerSec=172.0364017305206, CurrSamplesPerSec=171.45145463795086, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:48:06,240] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=6, lr=[9.646116629020505e-06, 9.646116629020505e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:48:06,269] [INFO] [timer.py:199:stop] epoch=0/micro_step=100/global_step=100, RunningAvgSamplesPerSec=171.9985274515761, CurrSamplesPerSec=171.50260973455426, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:48:13,710] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=6, lr=[9.645246573360623e-06, 9.645246573360623e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:48:13,739] [INFO] [timer.py:199:stop] epoch=0/micro_step=110/global_step=110, RunningAvgSamplesPerSec=171.9544086575348, CurrSamplesPerSec=171.72445192494382, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:48:21,166] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=6, lr=[9.644288693802169e-06, 9.644288693802169e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:48:21,194] [INFO] [timer.py:199:stop] epoch=0/micro_step=120/global_step=120, RunningAvgSamplesPerSec=171.94589011187213, CurrSamplesPerSec=171.57682241939304, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:48:28,625] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=6, lr=[9.643243007797506e-06, 9.643243007797506e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:48:28,653] [INFO] [timer.py:199:stop] epoch=0/micro_step=130/global_step=130, RunningAvgSamplesPerSec=171.9330502115857, CurrSamplesPerSec=171.58789953717658, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:48:36,074] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=6, lr=[9.64210953439882e-06, 9.64210953439882e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:48:36,103] [INFO] [timer.py:199:stop] epoch=0/micro_step=140/global_step=140, RunningAvgSamplesPerSec=171.9366358553962, CurrSamplesPerSec=172.04952628536196, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:48:43,537] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=6, lr=[9.640888294257762e-06, 9.640888294257762e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:48:43,565] [INFO] [timer.py:199:stop] epoch=0/micro_step=150/global_step=150, RunningAvgSamplesPerSec=171.92053870967507, CurrSamplesPerSec=171.6720665237321, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:48:51,003] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=6, lr=[9.639579309625075e-06, 9.639579309625075e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:48:51,032] [INFO] [timer.py:199:stop] epoch=0/micro_step=160/global_step=160, RunningAvgSamplesPerSec=171.9012752098011, CurrSamplesPerSec=171.94576737157897, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:48:58,454] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=6, lr=[9.63818260435019e-06, 9.63818260435019e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:48:58,483] [INFO] [timer.py:199:stop] epoch=0/micro_step=170/global_step=170, RunningAvgSamplesPerSec=171.90498511270908, CurrSamplesPerSec=171.7715931534641, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:49:05,915] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=6, lr=[9.636698203880791e-06, 9.636698203880791e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:49:05,943] [INFO] [timer.py:199:stop] epoch=0/micro_step=180/global_step=180, RunningAvgSamplesPerSec=171.89603026552297, CurrSamplesPerSec=171.13692588403754, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:49:13,375] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=6, lr=[9.635126135262344e-06, 9.635126135262344e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:49:13,403] [INFO] [timer.py:199:stop] epoch=0/micro_step=190/global_step=190, RunningAvgSamplesPerSec=171.88860494064227, CurrSamplesPerSec=171.63128954542898, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:49:20,830] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=6, lr=[9.633466427137616e-06, 9.633466427137616e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:49:20,858] [INFO] [timer.py:199:stop] epoch=0/micro_step=200/global_step=200, RunningAvgSamplesPerSec=171.88763615643478, CurrSamplesPerSec=171.59513882252318, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:49:28,279] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=6, lr=[9.63171910974615e-06, 9.63171910974615e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:49:28,307] [INFO] [timer.py:199:stop] epoch=0/micro_step=210/global_step=210, RunningAvgSamplesPerSec=171.89352099985894, CurrSamplesPerSec=172.0804081177377, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:49:35,734] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=6, lr=[9.629884214923708e-06, 9.629884214923708e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:49:35,763] [INFO] [timer.py:199:stop] epoch=0/micro_step=220/global_step=220, RunningAvgSamplesPerSec=171.89231328806113, CurrSamplesPerSec=171.38998841801885, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:49:43,195] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=6, lr=[9.62796177610169e-06, 9.62796177610169e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:49:43,224] [INFO] [timer.py:199:stop] epoch=0/micro_step=230/global_step=230, RunningAvgSamplesPerSec=171.88530080226192, CurrSamplesPerSec=171.93337757065376, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:49:50,654] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=6, lr=[9.625951828306541e-06, 9.625951828306541e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:49:50,682] [INFO] [timer.py:199:stop] epoch=0/micro_step=240/global_step=240, RunningAvgSamplesPerSec=171.88128683583537, CurrSamplesPerSec=171.95893004937722, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:49:58,115] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=6, lr=[9.623854408159094e-06, 9.623854408159094e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:49:58,144] [INFO] [timer.py:199:stop] epoch=0/micro_step=250/global_step=250, RunningAvgSamplesPerSec=171.8757891092885, CurrSamplesPerSec=171.67914821551457, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:50:05,573] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=6, lr=[9.621669553873909e-06, 9.621669553873909e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:50:05,601] [INFO] [timer.py:199:stop] epoch=0/micro_step=260/global_step=260, RunningAvgSamplesPerSec=171.87339691841152, CurrSamplesPerSec=171.914823741734, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:50:13,030] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=6, lr=[9.619397305258584e-06, 9.619397305258584e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:50:13,058] [INFO] [timer.py:199:stop] epoch=0/micro_step=270/global_step=270, RunningAvgSamplesPerSec=171.87169636957984, CurrSamplesPerSec=171.99550974170762, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:50:20,487] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=6, lr=[9.617037703713017e-06, 9.617037703713017e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:50:20,515] [INFO] [timer.py:199:stop] epoch=0/micro_step=280/global_step=280, RunningAvgSamplesPerSec=171.87001072289559, CurrSamplesPerSec=172.20752628400032, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:50:27,951] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=6, lr=[9.614590792228664e-06, 9.614590792228664e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:50:27,980] [INFO] [timer.py:199:stop] epoch=0/micro_step=290/global_step=290, RunningAvgSamplesPerSec=171.86220676479607, CurrSamplesPerSec=172.3759444026155, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:50:35,405] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=6, lr=[9.612056615387746e-06, 9.612056615387746e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:50:35,434] [INFO] [timer.py:199:stop] epoch=0/micro_step=300/global_step=300, RunningAvgSamplesPerSec=171.86358383822733, CurrSamplesPerSec=171.66800442159393, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:50:42,865] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=6, lr=[9.609435219362444e-06, 9.609435219362444e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:50:42,894] [INFO] [timer.py:199:stop] epoch=0/micro_step=310/global_step=310, RunningAvgSamplesPerSec=171.85987931370724, CurrSamplesPerSec=171.21327939509231, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:50:50,335] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=6, lr=[9.606726651914051e-06, 9.606726651914051e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:50:50,364] [INFO] [timer.py:199:stop] epoch=0/micro_step=320/global_step=320, RunningAvgSamplesPerSec=171.84931800059496, CurrSamplesPerSec=171.59519366792534, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:50:57,800] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=6, lr=[9.60393096239211e-06, 9.60393096239211e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:50:57,828] [INFO] [timer.py:199:stop] epoch=0/micro_step=330/global_step=330, RunningAvgSamplesPerSec=171.84357369309876, CurrSamplesPerSec=171.63507555517475, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:51:05,272] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=6, lr=[9.601048201733503e-06, 9.601048201733503e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:51:05,300] [INFO] [timer.py:199:stop] epoch=0/micro_step=340/global_step=340, RunningAvgSamplesPerSec=171.83282739708662, CurrSamplesPerSec=171.62991784398895, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:51:12,730] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=6, lr=[9.598078422461542e-06, 9.598078422461542e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:51:12,759] [INFO] [timer.py:199:stop] epoch=0/micro_step=350/global_step=350, RunningAvgSamplesPerSec=171.83152264224154, CurrSamplesPerSec=171.91350255354536, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:51:20,195] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=6, lr=[9.595021678684986e-06, 9.595021678684986e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:51:20,223] [INFO] [timer.py:199:stop] epoch=0/micro_step=360/global_step=360, RunningAvgSamplesPerSec=171.826180803588, CurrSamplesPerSec=171.81606588058185, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:51:27,649] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=6, lr=[9.59187802609708e-06, 9.59187802609708e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:51:27,677] [INFO] [timer.py:199:stop] epoch=0/micro_step=370/global_step=370, RunningAvgSamplesPerSec=171.82845252797784, CurrSamplesPerSec=172.36000633101784, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:51:35,107] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=6, lr=[9.588647521974525e-06, 9.588647521974525e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:51:35,136] [INFO] [timer.py:199:stop] epoch=0/micro_step=380/global_step=380, RunningAvgSamplesPerSec=171.82720912003316, CurrSamplesPerSec=171.83432340530132, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:51:42,573] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=6, lr=[9.585330225176441e-06, 9.585330225176441e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:51:42,602] [INFO] [timer.py:199:stop] epoch=0/micro_step=390/global_step=390, RunningAvgSamplesPerSec=171.8217298392455, CurrSamplesPerSec=172.15103681005988, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:51:50,030] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=6, lr=[9.58192619614329e-06, 9.58192619614329e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:51:50,058] [INFO] [timer.py:199:stop] epoch=0/micro_step=400/global_step=400, RunningAvgSamplesPerSec=171.82201307778388, CurrSamplesPerSec=171.5359262467382, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:51:57,504] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=6, lr=[9.578435496895777e-06, 9.578435496895777e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:51:57,532] [INFO] [timer.py:199:stop] epoch=0/micro_step=410/global_step=410, RunningAvgSamplesPerSec=171.81378544522852, CurrSamplesPerSec=171.16709893828565, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:52:04,979] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=6, lr=[9.574858191033728e-06, 9.574858191033728e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:52:05,008] [INFO] [timer.py:199:stop] epoch=0/micro_step=420/global_step=420, RunningAvgSamplesPerSec=171.8038774564413, CurrSamplesPerSec=171.170700773004, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:52:12,458] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=6, lr=[9.571194343734914e-06, 9.571194343734914e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:52:12,487] [INFO] [timer.py:199:stop] epoch=0/micro_step=430/global_step=430, RunningAvgSamplesPerSec=171.79267094037905, CurrSamplesPerSec=171.72011271642097, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:52:19,931] [INFO] [logging.py:96:log_dist] [Rank 0] step=440, skipped=6, lr=[9.56744402175388e-06, 9.56744402175388e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:52:19,960] [INFO] [timer.py:199:stop] epoch=0/micro_step=440/global_step=440, RunningAvgSamplesPerSec=171.78476506868265, CurrSamplesPerSec=171.1726654740152, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:52:27,404] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=6, lr=[9.563607293420714e-06, 9.563607293420714e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:52:27,432] [INFO] [timer.py:199:stop] epoch=0/micro_step=450/global_step=450, RunningAvgSamplesPerSec=171.777961675585, CurrSamplesPerSec=171.17817779262884, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:52:34,866] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=6, lr=[9.559684228639823e-06, 9.559684228639823e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:52:34,894] [INFO] [timer.py:199:stop] epoch=0/micro_step=460/global_step=460, RunningAvgSamplesPerSec=171.77639769395282, CurrSamplesPerSec=171.47982168198592, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -***** Evaluating perplexity, Epoch 1/16 ***** -ppl: 2.0179834365844727 -Beginning of Epoch 2/16, Total Micro Batches 460 -[2023-04-18 01:52:50,545] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=6, lr=[9.55567489888863e-06, 9.55567489888863e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:52:50,573] [INFO] [timer.py:199:stop] epoch=1/micro_step=10/global_step=470, RunningAvgSamplesPerSec=171.74928288886312, CurrSamplesPerSec=171.30970541685812, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:52:58,017] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=6, lr=[9.551579377216302e-06, 9.551579377216302e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:52:58,046] [INFO] [timer.py:199:stop] epoch=1/micro_step=20/global_step=480, RunningAvgSamplesPerSec=171.7433379992504, CurrSamplesPerSec=171.72780260308545, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:53:05,502] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=6, lr=[9.547397738242398e-06, 9.547397738242398e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:53:05,531] [INFO] [timer.py:199:stop] epoch=1/micro_step=30/global_step=490, RunningAvgSamplesPerSec=171.73157215972623, CurrSamplesPerSec=171.53159656036394, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:53:12,974] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=6, lr=[9.543130058155516e-06, 9.543130058155516e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:53:13,003] [INFO] [timer.py:199:stop] epoch=1/micro_step=40/global_step=500, RunningAvgSamplesPerSec=171.72650381098111, CurrSamplesPerSec=171.49921305419625, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:53:19,688] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 01:53:20,393] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 01:53:20,393] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=8, lr=[9.539654016684232e-06, 9.539654016684232e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:53:20,394] [INFO] [timer.py:199:stop] epoch=1/micro_step=50/global_step=510, RunningAvgSamplesPerSec=171.75800656076402, CurrSamplesPerSec=181.55637378899496, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:53:27,831] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=8, lr=[9.535231659592303e-06, 9.535231659592303e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:53:27,859] [INFO] [timer.py:199:stop] epoch=1/micro_step=60/global_step=520, RunningAvgSamplesPerSec=171.75564910508115, CurrSamplesPerSec=171.66619300559057, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:53:35,301] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=8, lr=[9.530723483050843e-06, 9.530723483050843e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:53:35,329] [INFO] [timer.py:199:stop] epoch=1/micro_step=70/global_step=530, RunningAvgSamplesPerSec=171.7513182463925, CurrSamplesPerSec=171.68150890927888, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:53:42,768] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=8, lr=[9.526129569197897e-06, 9.526129569197897e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:53:42,797] [INFO] [timer.py:199:stop] epoch=1/micro_step=80/global_step=540, RunningAvgSamplesPerSec=171.74791156276183, CurrSamplesPerSec=171.4919270808378, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:53:50,238] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=8, lr=[9.521450001733628e-06, 9.521450001733628e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:53:50,267] [INFO] [timer.py:199:stop] epoch=1/micro_step=90/global_step=550, RunningAvgSamplesPerSec=171.74377976253504, CurrSamplesPerSec=171.54359964558304, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:53:57,710] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=8, lr=[9.51668486591879e-06, 9.51668486591879e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:53:57,739] [INFO] [timer.py:199:stop] epoch=1/micro_step=100/global_step=560, RunningAvgSamplesPerSec=171.73902662439036, CurrSamplesPerSec=171.32709003887538, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:54:05,190] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=8, lr=[9.511834248573178e-06, 9.511834248573178e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:54:05,219] [INFO] [timer.py:199:stop] epoch=1/micro_step=110/global_step=570, RunningAvgSamplesPerSec=171.7310081508191, CurrSamplesPerSec=171.40197169239963, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:54:12,666] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=8, lr=[9.506898238074036e-06, 9.506898238074036e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:54:12,695] [INFO] [timer.py:199:stop] epoch=1/micro_step=120/global_step=580, RunningAvgSamplesPerSec=171.72508207965402, CurrSamplesPerSec=170.96983010365747, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:54:20,139] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=8, lr=[9.501876924354459e-06, 9.501876924354459e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:54:20,167] [INFO] [timer.py:199:stop] epoch=1/micro_step=130/global_step=590, RunningAvgSamplesPerSec=171.72049271067945, CurrSamplesPerSec=171.61142944818613, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:54:27,616] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=8, lr=[9.496770398901747e-06, 9.496770398901747e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:54:27,645] [INFO] [timer.py:199:stop] epoch=1/micro_step=140/global_step=600, RunningAvgSamplesPerSec=171.71432435414476, CurrSamplesPerSec=171.73917391188908, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:54:35,082] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=8, lr=[9.491578754755742e-06, 9.491578754755742e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:54:35,111] [INFO] [timer.py:199:stop] epoch=1/micro_step=150/global_step=610, RunningAvgSamplesPerSec=171.7126021001861, CurrSamplesPerSec=171.62958863890586, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:54:35,816] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 01:54:36,521] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 01:54:42,472] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=10, lr=[9.487364217492641e-06, 9.487364217492641e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:54:42,500] [INFO] [timer.py:199:stop] epoch=1/micro_step=160/global_step=620, RunningAvgSamplesPerSec=171.7392848537013, CurrSamplesPerSec=171.41828039215372, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:54:49,948] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=10, lr=[9.482019599108305e-06, 9.482019599108305e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:54:49,976] [INFO] [timer.py:199:stop] epoch=1/micro_step=170/global_step=630, RunningAvgSamplesPerSec=171.73352472098838, CurrSamplesPerSec=171.52934959409336, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:54:57,424] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=10, lr=[9.476590130787202e-06, 9.476590130787202e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:54:57,452] [INFO] [timer.py:199:stop] epoch=1/micro_step=180/global_step=640, RunningAvgSamplesPerSec=171.72836464957518, CurrSamplesPerSec=171.20028521108458, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:55:04,895] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=10, lr=[9.471075911453133e-06, 9.471075911453133e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:55:04,921] [INFO] [timer.py:199:stop] epoch=1/micro_step=190/global_step=650, RunningAvgSamplesPerSec=171.72551200948914, CurrSamplesPerSec=171.99892610982002, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:55:12,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=10, lr=[9.46547704157403e-06, 9.46547704157403e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:55:12,388] [INFO] [timer.py:199:stop] epoch=1/micro_step=200/global_step=660, RunningAvgSamplesPerSec=171.723585267757, CurrSamplesPerSec=170.92535897384468, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:55:19,826] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=10, lr=[9.459793623160152e-06, 9.459793623160152e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:55:19,854] [INFO] [timer.py:199:stop] epoch=1/micro_step=210/global_step=670, RunningAvgSamplesPerSec=171.72198735060735, CurrSamplesPerSec=171.7000124088684, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:55:27,300] [INFO] [logging.py:96:log_dist] [Rank 0] step=680, skipped=10, lr=[9.454025759762207e-06, 9.454025759762207e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:55:27,329] [INFO] [timer.py:199:stop] epoch=1/micro_step=220/global_step=680, RunningAvgSamplesPerSec=171.7171984034206, CurrSamplesPerSec=171.40093198120712, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:55:34,766] [INFO] [logging.py:96:log_dist] [Rank 0] step=690, skipped=10, lr=[9.448173556469482e-06, 9.448173556469482e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:55:34,794] [INFO] [timer.py:199:stop] epoch=1/micro_step=230/global_step=690, RunningAvgSamplesPerSec=171.7157280648616, CurrSamplesPerSec=171.60396938386074, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:55:42,228] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=10, lr=[9.442237119907909e-06, 9.442237119907909e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:55:42,256] [INFO] [timer.py:199:stop] epoch=1/micro_step=240/global_step=700, RunningAvgSamplesPerSec=171.71560172070284, CurrSamplesPerSec=171.64555654027703, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:55:49,697] [INFO] [logging.py:96:log_dist] [Rank 0] step=710, skipped=10, lr=[9.43621655823814e-06, 9.43621655823814e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:55:49,725] [INFO] [timer.py:199:stop] epoch=1/micro_step=250/global_step=710, RunningAvgSamplesPerSec=171.71313081520353, CurrSamplesPerSec=171.35065784791192, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:55:51,924] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 01:55:52,631] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 01:55:57,085] [INFO] [logging.py:96:log_dist] [Rank 0] step=720, skipped=12, lr=[9.431339612486935e-06, 9.431339612486935e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:55:57,113] [INFO] [timer.py:199:stop] epoch=1/micro_step=260/global_step=720, RunningAvgSamplesPerSec=171.73647076945858, CurrSamplesPerSec=171.1418357829183, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:56:04,549] [INFO] [logging.py:96:log_dist] [Rank 0] step=730, skipped=12, lr=[9.425167903078489e-06, 9.425167903078489e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:56:04,578] [INFO] [timer.py:199:stop] epoch=1/micro_step=270/global_step=730, RunningAvgSamplesPerSec=171.73513904377364, CurrSamplesPerSec=171.77511055282332, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:56:12,014] [INFO] [logging.py:96:log_dist] [Rank 0] step=740, skipped=12, lr=[9.418912379559457e-06, 9.418912379559457e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:56:12,043] [INFO] [timer.py:199:stop] epoch=1/micro_step=280/global_step=740, RunningAvgSamplesPerSec=171.73403793065222, CurrSamplesPerSec=171.95204555230538, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:56:19,487] [INFO] [logging.py:96:log_dist] [Rank 0] step=750, skipped=12, lr=[9.41257315590419e-06, 9.41257315590419e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:56:19,515] [INFO] [timer.py:199:stop] epoch=1/micro_step=290/global_step=750, RunningAvgSamplesPerSec=171.73032480384842, CurrSamplesPerSec=171.46706082699572, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:56:26,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=760, skipped=12, lr=[9.406150347612033e-06, 9.406150347612033e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:56:26,992] [INFO] [timer.py:199:stop] epoch=1/micro_step=300/global_step=760, RunningAvgSamplesPerSec=171.72551650726314, CurrSamplesPerSec=171.0122543656038, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:56:34,433] [INFO] [logging.py:96:log_dist] [Rank 0] step=770, skipped=12, lr=[9.399644071705231e-06, 9.399644071705231e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:56:34,462] [INFO] [timer.py:199:stop] epoch=1/micro_step=310/global_step=770, RunningAvgSamplesPerSec=171.72284672408261, CurrSamplesPerSec=171.49893913422363, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:56:41,903] [INFO] [logging.py:96:log_dist] [Rank 0] step=780, skipped=12, lr=[9.393054446726786e-06, 9.393054446726786e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:56:41,931] [INFO] [timer.py:199:stop] epoch=1/micro_step=320/global_step=780, RunningAvgSamplesPerSec=171.72033418286748, CurrSamplesPerSec=171.740327607066, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:56:49,382] [INFO] [logging.py:96:log_dist] [Rank 0] step=790, skipped=12, lr=[9.38638159273831e-06, 9.38638159273831e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:56:49,411] [INFO] [timer.py:199:stop] epoch=1/micro_step=330/global_step=790, RunningAvgSamplesPerSec=171.71507472238937, CurrSamplesPerSec=171.53028125589077, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:56:56,838] [INFO] [logging.py:96:log_dist] [Rank 0] step=800, skipped=12, lr=[9.379625631317826e-06, 9.379625631317826e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:56:56,867] [INFO] [timer.py:199:stop] epoch=1/micro_step=340/global_step=800, RunningAvgSamplesPerSec=171.7164949497059, CurrSamplesPerSec=171.96983622100714, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:57:04,316] [INFO] [logging.py:96:log_dist] [Rank 0] step=810, skipped=12, lr=[9.372786685557555e-06, 9.372786685557555e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:57:04,344] [INFO] [timer.py:199:stop] epoch=1/micro_step=350/global_step=810, RunningAvgSamplesPerSec=171.71185813940698, CurrSamplesPerSec=171.50885559475333, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:57:08,037] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 01:57:08,742] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 01:57:11,697] [INFO] [logging.py:96:log_dist] [Rank 0] step=820, skipped=14, lr=[9.367255863907959e-06, 9.367255863907959e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:57:11,726] [INFO] [timer.py:199:stop] epoch=1/micro_step=360/global_step=820, RunningAvgSamplesPerSec=171.73439065475588, CurrSamplesPerSec=171.70737099181173, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:57:19,166] [INFO] [logging.py:96:log_dist] [Rank 0] step=830, skipped=14, lr=[9.3602678613532e-06, 9.3602678613532e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:57:19,194] [INFO] [timer.py:199:stop] epoch=1/micro_step=370/global_step=830, RunningAvgSamplesPerSec=171.73231754815504, CurrSamplesPerSec=171.1907319222805, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:57:26,633] [INFO] [logging.py:96:log_dist] [Rank 0] step=840, skipped=14, lr=[9.353197227153232e-06, 9.353197227153232e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:57:26,662] [INFO] [timer.py:199:stop] epoch=1/micro_step=380/global_step=840, RunningAvgSamplesPerSec=171.73032549739324, CurrSamplesPerSec=171.9638321472877, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:57:34,102] [INFO] [logging.py:96:log_dist] [Rank 0] step=850, skipped=14, lr=[9.346044090133554e-06, 9.346044090133554e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:57:34,131] [INFO] [timer.py:199:stop] epoch=1/micro_step=390/global_step=850, RunningAvgSamplesPerSec=171.72807414536518, CurrSamplesPerSec=171.67629350977927, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:57:41,574] [INFO] [logging.py:96:log_dist] [Rank 0] step=860, skipped=14, lr=[9.338808580622845e-06, 9.338808580622845e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:57:41,602] [INFO] [timer.py:199:stop] epoch=1/micro_step=400/global_step=860, RunningAvgSamplesPerSec=171.72513078233638, CurrSamplesPerSec=171.60786389534837, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:57:49,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=870, skipped=14, lr=[9.3314908304506e-06, 9.3314908304506e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:57:49,068] [INFO] [timer.py:199:stop] epoch=1/micro_step=410/global_step=870, RunningAvgSamplesPerSec=171.72369343987896, CurrSamplesPerSec=171.47209922093984, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:57:56,505] [INFO] [logging.py:96:log_dist] [Rank 0] step=880, skipped=14, lr=[9.324090972944714e-06, 9.324090972944714e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:57:56,533] [INFO] [timer.py:199:stop] epoch=1/micro_step=420/global_step=880, RunningAvgSamplesPerSec=171.72264006820868, CurrSamplesPerSec=171.62980810882092, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:58:03,969] [INFO] [logging.py:96:log_dist] [Rank 0] step=890, skipped=14, lr=[9.316609142929066e-06, 9.316609142929066e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:58:03,997] [INFO] [timer.py:199:stop] epoch=1/micro_step=430/global_step=890, RunningAvgSamplesPerSec=171.7218321801215, CurrSamplesPerSec=171.86325742855075, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:58:11,440] [INFO] [logging.py:96:log_dist] [Rank 0] step=900, skipped=14, lr=[9.309045476721057e-06, 9.309045476721057e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:58:11,468] [INFO] [timer.py:199:stop] epoch=1/micro_step=440/global_step=900, RunningAvgSamplesPerSec=171.71921770778084, CurrSamplesPerSec=171.48119098526695, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:58:18,910] [INFO] [logging.py:96:log_dist] [Rank 0] step=910, skipped=14, lr=[9.301400112129126e-06, 9.301400112129126e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:58:18,939] [INFO] [timer.py:199:stop] epoch=1/micro_step=450/global_step=910, RunningAvgSamplesPerSec=171.71702455449616, CurrSamplesPerSec=171.73681163203275, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:58:24,124] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 01:58:24,829] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 01:58:26,295] [INFO] [logging.py:96:log_dist] [Rank 0] step=920, skipped=16, lr=[9.295225091176768e-06, 9.295225091176768e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:58:26,323] [INFO] [timer.py:199:stop] epoch=1/micro_step=460/global_step=920, RunningAvgSamplesPerSec=171.73617448468602, CurrSamplesPerSec=171.39611664123308, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -***** Evaluating perplexity, Epoch 2/16 ***** -ppl: 1.9718239307403564 -Beginning of Epoch 3/16, Total Micro Batches 460 -[2023-04-18 01:58:41,966] [INFO] [logging.py:96:log_dist] [Rank 0] step=930, skipped=16, lr=[9.2874330215208e-06, 9.2874330215208e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:58:41,995] [INFO] [timer.py:199:stop] epoch=2/micro_step=10/global_step=930, RunningAvgSamplesPerSec=171.72739139690447, CurrSamplesPerSec=171.75785469246833, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:58:49,433] [INFO] [logging.py:96:log_dist] [Rank 0] step=940, skipped=16, lr=[9.279559647255388e-06, 9.279559647255388e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:58:49,462] [INFO] [timer.py:199:stop] epoch=2/micro_step=20/global_step=940, RunningAvgSamplesPerSec=171.72572863161994, CurrSamplesPerSec=171.7845093109113, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:58:56,903] [INFO] [logging.py:96:log_dist] [Rank 0] step=950, skipped=16, lr=[9.271605111831786e-06, 9.271605111831786e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:58:56,931] [INFO] [timer.py:199:stop] epoch=2/micro_step=30/global_step=950, RunningAvgSamplesPerSec=171.7237482342529, CurrSamplesPerSec=171.28735113639954, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:59:04,368] [INFO] [logging.py:96:log_dist] [Rank 0] step=960, skipped=16, lr=[9.263569560179989e-06, 9.263569560179989e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:59:04,394] [INFO] [timer.py:199:stop] epoch=2/micro_step=40/global_step=960, RunningAvgSamplesPerSec=171.72324449043057, CurrSamplesPerSec=171.60682168481065, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:59:11,828] [INFO] [logging.py:96:log_dist] [Rank 0] step=970, skipped=16, lr=[9.255453138706092e-06, 9.255453138706092e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:59:11,856] [INFO] [timer.py:199:stop] epoch=2/micro_step=50/global_step=970, RunningAvgSamplesPerSec=171.72294135933964, CurrSamplesPerSec=171.89539331246186, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:59:19,295] [INFO] [logging.py:96:log_dist] [Rank 0] step=980, skipped=16, lr=[9.247255995289618e-06, 9.247255995289618e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:59:19,323] [INFO] [timer.py:199:stop] epoch=2/micro_step=60/global_step=980, RunningAvgSamplesPerSec=171.72158869216292, CurrSamplesPerSec=171.69929855005535, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:59:26,756] [INFO] [logging.py:96:log_dist] [Rank 0] step=990, skipped=16, lr=[9.238978279280831e-06, 9.238978279280831e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:59:26,785] [INFO] [timer.py:199:stop] epoch=2/micro_step=70/global_step=990, RunningAvgSamplesPerSec=171.7215426207357, CurrSamplesPerSec=171.59053193390034, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:59:34,224] [INFO] [logging.py:96:log_dist] [Rank 0] step=1000, skipped=16, lr=[9.23062014149801e-06, 9.23062014149801e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:59:34,252] [INFO] [timer.py:199:stop] epoch=2/micro_step=80/global_step=1000, RunningAvgSamplesPerSec=171.72011282649171, CurrSamplesPerSec=171.94719919700248, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:59:41,698] [INFO] [logging.py:96:log_dist] [Rank 0] step=1010, skipped=16, lr=[9.2221817342247e-06, 9.2221817342247e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:59:41,726] [INFO] [timer.py:199:stop] epoch=2/micro_step=90/global_step=1010, RunningAvgSamplesPerSec=171.71735979324785, CurrSamplesPerSec=171.67426233465983, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:59:48,404] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 01:59:49,112] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 01:59:49,112] [INFO] [logging.py:96:log_dist] [Rank 0] step=1020, skipped=18, lr=[9.215373317641175e-06, 9.215373317641175e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:59:49,113] [INFO] [timer.py:199:stop] epoch=2/micro_step=100/global_step=1020, RunningAvgSamplesPerSec=171.73398841131632, CurrSamplesPerSec=180.83512426730692, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 01:59:56,561] [INFO] [logging.py:96:log_dist] [Rank 0] step=1030, skipped=18, lr=[9.206790813706018e-06, 9.206790813706018e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 01:59:56,590] [INFO] [timer.py:199:stop] epoch=2/micro_step=110/global_step=1030, RunningAvgSamplesPerSec=171.73053633350855, CurrSamplesPerSec=171.58702208955032, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:00:04,041] [INFO] [logging.py:96:log_dist] [Rank 0] step=1040, skipped=18, lr=[9.198128474445814e-06, 9.198128474445814e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:00:04,069] [INFO] [timer.py:199:stop] epoch=2/micro_step=120/global_step=1040, RunningAvgSamplesPerSec=171.72641718728505, CurrSamplesPerSec=170.84599987398272, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:00:11,506] [INFO] [logging.py:96:log_dist] [Rank 0] step=1050, skipped=18, lr=[9.189386457686596e-06, 9.189386457686596e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:00:11,535] [INFO] [timer.py:199:stop] epoch=2/micro_step=130/global_step=1050, RunningAvgSamplesPerSec=171.72545300548566, CurrSamplesPerSec=171.8516496443544, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:00:18,973] [INFO] [logging.py:96:log_dist] [Rank 0] step=1060, skipped=18, lr=[9.180564922706106e-06, 9.180564922706106e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:00:19,001] [INFO] [timer.py:199:stop] epoch=2/micro_step=140/global_step=1060, RunningAvgSamplesPerSec=171.72426445322972, CurrSamplesPerSec=171.3855019610322, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:00:26,449] [INFO] [logging.py:96:log_dist] [Rank 0] step=1070, skipped=18, lr=[9.171664030230894e-06, 9.171664030230894e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:00:26,477] [INFO] [timer.py:199:stop] epoch=2/micro_step=150/global_step=1070, RunningAvgSamplesPerSec=171.72111145759462, CurrSamplesPerSec=171.58751565273587, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:00:33,923] [INFO] [logging.py:96:log_dist] [Rank 0] step=1080, skipped=18, lr=[9.162683942433385e-06, 9.162683942433385e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:00:33,951] [INFO] [timer.py:199:stop] epoch=2/micro_step=160/global_step=1080, RunningAvgSamplesPerSec=171.71845712684373, CurrSamplesPerSec=170.97718068225788, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:00:41,391] [INFO] [logging.py:96:log_dist] [Rank 0] step=1090, skipped=18, lr=[9.153624822928926e-06, 9.153624822928926e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:00:41,420] [INFO] [timer.py:199:stop] epoch=2/micro_step=170/global_step=1090, RunningAvgSamplesPerSec=171.71682236009843, CurrSamplesPerSec=171.42895387228825, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:00:48,862] [INFO] [logging.py:96:log_dist] [Rank 0] step=1100, skipped=18, lr=[9.144486836772807e-06, 9.144486836772807e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:00:48,890] [INFO] [timer.py:199:stop] epoch=2/micro_step=180/global_step=1100, RunningAvgSamplesPerSec=171.71491042743668, CurrSamplesPerSec=171.60391453284865, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:00:56,324] [INFO] [logging.py:96:log_dist] [Rank 0] step=1110, skipped=18, lr=[9.135270150457251e-06, 9.135270150457251e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:00:56,352] [INFO] [timer.py:199:stop] epoch=2/micro_step=190/global_step=1110, RunningAvgSamplesPerSec=171.7147998988086, CurrSamplesPerSec=171.70369162149964, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:01:03,791] [INFO] [logging.py:96:log_dist] [Rank 0] step=1120, skipped=18, lr=[9.125974931908382e-06, 9.125974931908382e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:01:03,820] [INFO] [timer.py:199:stop] epoch=2/micro_step=200/global_step=1120, RunningAvgSamplesPerSec=171.71349889377774, CurrSamplesPerSec=171.76977955070603, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:01:04,526] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:01:05,233] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:01:11,179] [INFO] [logging.py:96:log_dist] [Rank 0] step=1130, skipped=20, lr=[9.118482327621185e-06, 9.118482327621185e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:01:11,208] [INFO] [timer.py:199:stop] epoch=2/micro_step=210/global_step=1130, RunningAvgSamplesPerSec=171.72845017829988, CurrSamplesPerSec=171.0141064817189, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:01:18,646] [INFO] [logging.py:96:log_dist] [Rank 0] step=1140, skipped=20, lr=[9.10904617879146e-06, 9.10904617879146e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:01:18,674] [INFO] [timer.py:199:stop] epoch=2/micro_step=220/global_step=1140, RunningAvgSamplesPerSec=171.72730948750296, CurrSamplesPerSec=171.5252942430002, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:01:26,104] [INFO] [logging.py:96:log_dist] [Rank 0] step=1150, skipped=20, lr=[9.099531975523781e-06, 9.099531975523781e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:01:26,133] [INFO] [timer.py:199:stop] epoch=2/micro_step=230/global_step=1150, RunningAvgSamplesPerSec=171.7279580634092, CurrSamplesPerSec=171.82585407101155, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:01:33,576] [INFO] [logging.py:96:log_dist] [Rank 0] step=1160, skipped=20, lr=[9.089939891164966e-06, 9.089939891164966e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:01:33,603] [INFO] [timer.py:199:stop] epoch=2/micro_step=240/global_step=1160, RunningAvgSamplesPerSec=171.72623060750578, CurrSamplesPerSec=171.32348161962275, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:01:41,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=1170, skipped=20, lr=[9.080270100480813e-06, 9.080270100480813e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:01:41,069] [INFO] [timer.py:199:stop] epoch=2/micro_step=250/global_step=1170, RunningAvgSamplesPerSec=171.72541801557455, CurrSamplesPerSec=171.4676632280784, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:01:48,508] [INFO] [logging.py:96:log_dist] [Rank 0] step=1180, skipped=20, lr=[9.070522779652917e-06, 9.070522779652917e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:01:48,537] [INFO] [timer.py:199:stop] epoch=2/micro_step=260/global_step=1180, RunningAvgSamplesPerSec=171.72401128911508, CurrSamplesPerSec=171.7809915266331, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:01:55,984] [INFO] [logging.py:96:log_dist] [Rank 0] step=1190, skipped=20, lr=[9.060698106275454e-06, 9.060698106275454e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:01:56,013] [INFO] [timer.py:199:stop] epoch=2/micro_step=270/global_step=1190, RunningAvgSamplesPerSec=171.7212405418612, CurrSamplesPerSec=172.0786982978703, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:02:03,445] [INFO] [logging.py:96:log_dist] [Rank 0] step=1200, skipped=20, lr=[9.050796259351945e-06, 9.050796259351945e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:02:03,473] [INFO] [timer.py:199:stop] epoch=2/micro_step=280/global_step=1200, RunningAvgSamplesPerSec=171.72145637954995, CurrSamplesPerSec=171.5682139640892, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:02:10,915] [INFO] [logging.py:96:log_dist] [Rank 0] step=1210, skipped=20, lr=[9.040817419292e-06, 9.040817419292e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:02:10,943] [INFO] [timer.py:199:stop] epoch=2/micro_step=290/global_step=1210, RunningAvgSamplesPerSec=171.71982299503398, CurrSamplesPerSec=171.4598871030843, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:02:18,389] [INFO] [logging.py:96:log_dist] [Rank 0] step=1220, skipped=20, lr=[9.030761767908025e-06, 9.030761767908025e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:02:18,417] [INFO] [timer.py:199:stop] epoch=2/micro_step=300/global_step=1220, RunningAvgSamplesPerSec=171.71740843193538, CurrSamplesPerSec=171.0290338231504, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:02:20,617] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:02:21,323] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:02:25,780] [INFO] [logging.py:96:log_dist] [Rank 0] step=1230, skipped=22, lr=[9.022662065719001e-06, 9.022662065719001e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:02:25,809] [INFO] [timer.py:199:stop] epoch=2/micro_step=310/global_step=1230, RunningAvgSamplesPerSec=171.73073837839212, CurrSamplesPerSec=171.60961922592165, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:02:33,245] [INFO] [logging.py:96:log_dist] [Rank 0] step=1240, skipped=22, lr=[9.012468616584076e-06, 9.012468616584076e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:02:33,273] [INFO] [timer.py:199:stop] epoch=2/micro_step=320/global_step=1240, RunningAvgSamplesPerSec=171.73014151763317, CurrSamplesPerSec=171.68925024432426, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:02:40,711] [INFO] [logging.py:96:log_dist] [Rank 0] step=1250, skipped=22, lr=[9.002198872634462e-06, 9.002198872634462e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:02:40,740] [INFO] [timer.py:199:stop] epoch=2/micro_step=330/global_step=1250, RunningAvgSamplesPerSec=171.7290888954101, CurrSamplesPerSec=171.3187799680511, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:02:48,174] [INFO] [logging.py:96:log_dist] [Rank 0] step=1260, skipped=22, lr=[8.991853020982774e-06, 8.991853020982774e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:02:48,203] [INFO] [timer.py:199:stop] epoch=2/micro_step=340/global_step=1260, RunningAvgSamplesPerSec=171.7287726203879, CurrSamplesPerSec=171.1018010484019, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:02:55,641] [INFO] [logging.py:96:log_dist] [Rank 0] step=1270, skipped=22, lr=[8.9814312501283e-06, 8.9814312501283e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:02:55,670] [INFO] [timer.py:199:stop] epoch=2/micro_step=350/global_step=1270, RunningAvgSamplesPerSec=171.7282312158373, CurrSamplesPerSec=171.32266154553602, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:03:03,115] [INFO] [logging.py:96:log_dist] [Rank 0] step=1280, skipped=22, lr=[8.970933749953554e-06, 8.970933749953554e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:03:03,143] [INFO] [timer.py:199:stop] epoch=2/micro_step=360/global_step=1280, RunningAvgSamplesPerSec=171.7262635391197, CurrSamplesPerSec=171.64050794787525, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:03:10,585] [INFO] [logging.py:96:log_dist] [Rank 0] step=1290, skipped=22, lr=[8.960360711720823e-06, 8.960360711720823e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:03:10,613] [INFO] [timer.py:199:stop] epoch=2/micro_step=370/global_step=1290, RunningAvgSamplesPerSec=171.72462860760044, CurrSamplesPerSec=171.78340998784748, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:03:18,057] [INFO] [logging.py:96:log_dist] [Rank 0] step=1300, skipped=22, lr=[8.949712328068686e-06, 8.949712328068686e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:03:18,086] [INFO] [timer.py:199:stop] epoch=2/micro_step=380/global_step=1300, RunningAvgSamplesPerSec=171.72261748214746, CurrSamplesPerSec=171.78868686687687, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:03:25,540] [INFO] [logging.py:96:log_dist] [Rank 0] step=1310, skipped=22, lr=[8.938988793008496e-06, 8.938988793008496e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:03:25,569] [INFO] [timer.py:199:stop] epoch=2/micro_step=390/global_step=1310, RunningAvgSamplesPerSec=171.71880598287805, CurrSamplesPerSec=172.03783819812244, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:03:33,006] [INFO] [logging.py:96:log_dist] [Rank 0] step=1320, skipped=22, lr=[8.92819030192085e-06, 8.92819030192085e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:03:33,034] [INFO] [timer.py:199:stop] epoch=2/micro_step=400/global_step=1320, RunningAvgSamplesPerSec=171.71820318742775, CurrSamplesPerSec=172.29993302115048, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:03:36,723] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:03:37,428] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:03:40,386] [INFO] [logging.py:96:log_dist] [Rank 0] step=1330, skipped=24, lr=[8.91949767287867e-06, 8.91949767287867e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:03:40,414] [INFO] [timer.py:199:stop] epoch=2/micro_step=410/global_step=1330, RunningAvgSamplesPerSec=171.73217036996667, CurrSamplesPerSec=171.34294701580947, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:03:47,855] [INFO] [logging.py:96:log_dist] [Rank 0] step=1340, skipped=24, lr=[8.908564757657683e-06, 8.908564757657683e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:03:47,883] [INFO] [timer.py:199:stop] epoch=2/micro_step=420/global_step=1340, RunningAvgSamplesPerSec=171.73073735575207, CurrSamplesPerSec=171.98079886830567, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:03:55,318] [INFO] [logging.py:96:log_dist] [Rank 0] step=1350, skipped=24, lr=[8.89755744072889e-06, 8.89755744072889e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:03:55,346] [INFO] [timer.py:199:stop] epoch=2/micro_step=430/global_step=1350, RunningAvgSamplesPerSec=171.73036578002265, CurrSamplesPerSec=172.2752741584319, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:04:02,798] [INFO] [logging.py:96:log_dist] [Rank 0] step=1360, skipped=24, lr=[8.886475922643337e-06, 8.886475922643337e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:04:02,826] [INFO] [timer.py:199:stop] epoch=2/micro_step=440/global_step=1360, RunningAvgSamplesPerSec=171.72714384744634, CurrSamplesPerSec=171.1929154371507, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:04:10,270] [INFO] [logging.py:96:log_dist] [Rank 0] step=1370, skipped=24, lr=[8.875320405303997e-06, 8.875320405303997e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:04:10,299] [INFO] [timer.py:199:stop] epoch=2/micro_step=450/global_step=1370, RunningAvgSamplesPerSec=171.7251878358574, CurrSamplesPerSec=170.86872853283631, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:04:17,728] [INFO] [logging.py:96:log_dist] [Rank 0] step=1380, skipped=24, lr=[8.864091091962097e-06, 8.864091091962097e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:04:17,757] [INFO] [timer.py:199:stop] epoch=2/micro_step=460/global_step=1380, RunningAvgSamplesPerSec=171.7256568832341, CurrSamplesPerSec=172.02918342111494, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -***** Evaluating perplexity, Epoch 3/16 ***** -ppl: 1.934177279472351 -Beginning of Epoch 4/16, Total Micro Batches 460 -[2023-04-18 02:04:33,417] [INFO] [logging.py:96:log_dist] [Rank 0] step=1390, skipped=24, lr=[8.852788187213409e-06, 8.852788187213409e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:04:33,446] [INFO] [timer.py:199:stop] epoch=3/micro_step=10/global_step=1390, RunningAvgSamplesPerSec=171.71794149285893, CurrSamplesPerSec=171.69578425474907, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:04:40,887] [INFO] [logging.py:96:log_dist] [Rank 0] step=1400, skipped=24, lr=[8.841411896994526e-06, 8.841411896994526e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:04:40,916] [INFO] [timer.py:199:stop] epoch=3/micro_step=20/global_step=1400, RunningAvgSamplesPerSec=171.71655722142134, CurrSamplesPerSec=172.0620982649919, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:04:48,357] [INFO] [logging.py:96:log_dist] [Rank 0] step=1410, skipped=24, lr=[8.829962428579115e-06, 8.829962428579115e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:04:48,385] [INFO] [timer.py:199:stop] epoch=3/micro_step=30/global_step=1410, RunningAvgSamplesPerSec=171.7152129185277, CurrSamplesPerSec=171.13245266587742, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:04:55,832] [INFO] [logging.py:96:log_dist] [Rank 0] step=1420, skipped=24, lr=[8.818439990574122e-06, 8.818439990574122e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:04:55,860] [INFO] [timer.py:199:stop] epoch=3/micro_step=40/global_step=1420, RunningAvgSamplesPerSec=171.7131965127464, CurrSamplesPerSec=171.41685736188924, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:05:01,044] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:05:01,751] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:05:03,217] [INFO] [logging.py:96:log_dist] [Rank 0] step=1430, skipped=26, lr=[8.809169643098351e-06, 8.809169643098351e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:05:03,245] [INFO] [timer.py:199:stop] epoch=3/micro_step=50/global_step=1430, RunningAvgSamplesPerSec=171.72557765020426, CurrSamplesPerSec=171.45621833125642, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:05:10,699] [INFO] [logging.py:96:log_dist] [Rank 0] step=1440, skipped=26, lr=[8.797516389762936e-06, 8.797516389762936e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:05:10,728] [INFO] [timer.py:199:stop] epoch=3/micro_step=60/global_step=1440, RunningAvgSamplesPerSec=171.7220927238255, CurrSamplesPerSec=171.51767727754847, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:05:18,168] [INFO] [logging.py:96:log_dist] [Rank 0] step=1450, skipped=26, lr=[8.785790757998078e-06, 8.785790757998078e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:05:18,197] [INFO] [timer.py:199:stop] epoch=3/micro_step=70/global_step=1450, RunningAvgSamplesPerSec=171.72081362379495, CurrSamplesPerSec=171.40246419157762, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:05:25,638] [INFO] [logging.py:96:log_dist] [Rank 0] step=1460, skipped=26, lr=[8.773992961442371e-06, 8.773992961442371e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:05:25,666] [INFO] [timer.py:199:stop] epoch=3/micro_step=80/global_step=1460, RunningAvgSamplesPerSec=171.71951208133456, CurrSamplesPerSec=171.55593333209774, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:05:33,107] [INFO] [logging.py:96:log_dist] [Rank 0] step=1470, skipped=26, lr=[8.762123215049236e-06, 8.762123215049236e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:05:33,136] [INFO] [timer.py:199:stop] epoch=3/micro_step=90/global_step=1470, RunningAvgSamplesPerSec=171.71827228954007, CurrSamplesPerSec=171.41696682491693, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:05:40,577] [INFO] [logging.py:96:log_dist] [Rank 0] step=1480, skipped=26, lr=[8.750181735083004e-06, 8.750181735083004e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:05:40,605] [INFO] [timer.py:199:stop] epoch=3/micro_step=100/global_step=1480, RunningAvgSamplesPerSec=171.7169287058911, CurrSamplesPerSec=171.2390552093271, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:05:48,045] [INFO] [logging.py:96:log_dist] [Rank 0] step=1490, skipped=26, lr=[8.738168739114978e-06, 8.738168739114978e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:05:48,074] [INFO] [timer.py:199:stop] epoch=3/micro_step=110/global_step=1490, RunningAvgSamplesPerSec=171.71588977324467, CurrSamplesPerSec=171.38987898944407, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:05:55,514] [INFO] [logging.py:96:log_dist] [Rank 0] step=1500, skipped=26, lr=[8.726084446019468e-06, 8.726084446019468e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:05:55,542] [INFO] [timer.py:199:stop] epoch=3/micro_step=120/global_step=1500, RunningAvgSamplesPerSec=171.71480620889938, CurrSamplesPerSec=171.45150939154252, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:06:02,980] [INFO] [logging.py:96:log_dist] [Rank 0] step=1510, skipped=26, lr=[8.7139290759698e-06, 8.7139290759698e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:06:03,009] [INFO] [timer.py:199:stop] epoch=3/micro_step=130/global_step=1510, RunningAvgSamplesPerSec=171.71391381126156, CurrSamplesPerSec=171.55905814757946, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:06:10,445] [INFO] [logging.py:96:log_dist] [Rank 0] step=1520, skipped=26, lr=[8.70170285043431e-06, 8.70170285043431e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:06:10,474] [INFO] [timer.py:199:stop] epoch=3/micro_step=140/global_step=1520, RunningAvgSamplesPerSec=171.7134993206875, CurrSamplesPerSec=171.99165271501403, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:06:17,152] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:06:17,858] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:06:17,859] [INFO] [logging.py:96:log_dist] [Rank 0] step=1530, skipped=28, lr=[8.691871003707109e-06, 8.691871003707109e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:06:17,859] [INFO] [timer.py:199:stop] epoch=3/micro_step=150/global_step=1530, RunningAvgSamplesPerSec=171.7249116252993, CurrSamplesPerSec=181.32397382092503, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:06:25,294] [INFO] [logging.py:96:log_dist] [Rank 0] step=1540, skipped=28, lr=[8.67951780051554e-06, 8.67951780051554e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:06:25,323] [INFO] [timer.py:199:stop] epoch=3/micro_step=160/global_step=1540, RunningAvgSamplesPerSec=171.7245048530562, CurrSamplesPerSec=171.60495670807487, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:06:32,765] [INFO] [logging.py:96:log_dist] [Rank 0] step=1550, skipped=28, lr=[8.667094368804494e-06, 8.667094368804494e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:06:32,794] [INFO] [timer.py:199:stop] epoch=3/micro_step=170/global_step=1550, RunningAvgSamplesPerSec=171.72302465837373, CurrSamplesPerSec=171.37910095953873, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:06:40,241] [INFO] [logging.py:96:log_dist] [Rank 0] step=1560, skipped=28, lr=[8.654600934926332e-06, 8.654600934926332e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:06:40,270] [INFO] [timer.py:199:stop] epoch=3/micro_step=180/global_step=1560, RunningAvgSamplesPerSec=171.72082209478907, CurrSamplesPerSec=171.13354367313477, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:06:47,713] [INFO] [logging.py:96:log_dist] [Rank 0] step=1570, skipped=28, lr=[8.642037726508847e-06, 8.642037726508847e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:06:47,741] [INFO] [timer.py:199:stop] epoch=3/micro_step=190/global_step=1570, RunningAvgSamplesPerSec=171.71927591530394, CurrSamplesPerSec=171.29177780443666, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:06:55,180] [INFO] [logging.py:96:log_dist] [Rank 0] step=1580, skipped=28, lr=[8.629404972451102e-06, 8.629404972451102e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:06:55,208] [INFO] [timer.py:199:stop] epoch=3/micro_step=200/global_step=1580, RunningAvgSamplesPerSec=171.71853723095109, CurrSamplesPerSec=171.70039679684174, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:07:02,647] [INFO] [logging.py:96:log_dist] [Rank 0] step=1590, skipped=28, lr=[8.616702902919272e-06, 8.616702902919272e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:07:02,676] [INFO] [timer.py:199:stop] epoch=3/micro_step=210/global_step=1590, RunningAvgSamplesPerSec=171.717678985868, CurrSamplesPerSec=171.93464400170504, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:07:10,112] [INFO] [logging.py:96:log_dist] [Rank 0] step=1600, skipped=28, lr=[8.603931749342444e-06, 8.603931749342444e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:07:10,140] [INFO] [timer.py:199:stop] epoch=3/micro_step=220/global_step=1600, RunningAvgSamplesPerSec=171.71726513763085, CurrSamplesPerSec=171.11772542929958, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:07:17,589] [INFO] [logging.py:96:log_dist] [Rank 0] step=1610, skipped=28, lr=[8.591091744408404e-06, 8.591091744408404e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:07:17,618] [INFO] [timer.py:199:stop] epoch=3/micro_step=230/global_step=1610, RunningAvgSamplesPerSec=171.71498326313906, CurrSamplesPerSec=171.47993122544364, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:07:25,056] [INFO] [logging.py:96:log_dist] [Rank 0] step=1620, skipped=28, lr=[8.578183122059393e-06, 8.578183122059393e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:07:25,085] [INFO] [timer.py:199:stop] epoch=3/micro_step=240/global_step=1620, RunningAvgSamplesPerSec=171.71416618907068, CurrSamplesPerSec=171.6361729805282, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:07:32,536] [INFO] [logging.py:96:log_dist] [Rank 0] step=1630, skipped=28, lr=[8.56520611748785e-06, 8.56520611748785e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:07:32,565] [INFO] [timer.py:199:stop] epoch=3/micro_step=250/global_step=1630, RunningAvgSamplesPerSec=171.71155016057583, CurrSamplesPerSec=171.15171098819087, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:07:33,271] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:07:33,977] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:07:39,916] [INFO] [logging.py:96:log_dist] [Rank 0] step=1640, skipped=30, lr=[8.554775437475205e-06, 8.554775437475205e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:07:39,944] [INFO] [timer.py:199:stop] epoch=3/micro_step=260/global_step=1640, RunningAvgSamplesPerSec=171.72310988621712, CurrSamplesPerSec=171.49641911153077, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:07:47,383] [INFO] [logging.py:96:log_dist] [Rank 0] step=1650, skipped=30, lr=[8.54167594156224e-06, 8.54167594156224e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:07:47,412] [INFO] [timer.py:199:stop] epoch=3/micro_step=270/global_step=1650, RunningAvgSamplesPerSec=171.72210411500566, CurrSamplesPerSec=171.85819602179058, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:07:54,848] [INFO] [logging.py:96:log_dist] [Rank 0] step=1660, skipped=30, lr=[8.528508728580041e-06, 8.528508728580041e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:07:54,876] [INFO] [timer.py:199:stop] epoch=3/micro_step=280/global_step=1660, RunningAvgSamplesPerSec=171.72172832394043, CurrSamplesPerSec=171.7609868681627, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:08:02,311] [INFO] [logging.py:96:log_dist] [Rank 0] step=1670, skipped=30, lr=[8.515274038432512e-06, 8.515274038432512e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:08:02,339] [INFO] [timer.py:199:stop] epoch=3/micro_step=290/global_step=1670, RunningAvgSamplesPerSec=171.7214215818611, CurrSamplesPerSec=171.88724815024702, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:08:09,774] [INFO] [logging.py:96:log_dist] [Rank 0] step=1680, skipped=30, lr=[8.501972112252983e-06, 8.501972112252983e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:08:09,802] [INFO] [timer.py:199:stop] epoch=3/micro_step=300/global_step=1680, RunningAvgSamplesPerSec=171.7211461890102, CurrSamplesPerSec=171.46125608801879, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:08:17,259] [INFO] [logging.py:96:log_dist] [Rank 0] step=1690, skipped=30, lr=[8.488603192399804e-06, 8.488603192399804e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:08:17,287] [INFO] [timer.py:199:stop] epoch=3/micro_step=310/global_step=1690, RunningAvgSamplesPerSec=171.71785605803674, CurrSamplesPerSec=170.20470435911238, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:08:24,733] [INFO] [logging.py:96:log_dist] [Rank 0] step=1700, skipped=30, lr=[8.475167522451937e-06, 8.475167522451937e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:08:24,761] [INFO] [timer.py:199:stop] epoch=3/micro_step=320/global_step=1700, RunningAvgSamplesPerSec=171.7161269399177, CurrSamplesPerSec=171.59459037042978, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:08:32,206] [INFO] [logging.py:96:log_dist] [Rank 0] step=1710, skipped=30, lr=[8.461665347204519e-06, 8.461665347204519e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:08:32,235] [INFO] [timer.py:199:stop] epoch=3/micro_step=330/global_step=1710, RunningAvgSamplesPerSec=171.71456592243348, CurrSamplesPerSec=171.6868344336298, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:08:39,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=1720, skipped=30, lr=[8.448096912664396e-06, 8.448096912664396e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:08:39,696] [INFO] [timer.py:199:stop] epoch=3/micro_step=340/global_step=1720, RunningAvgSamplesPerSec=171.71450060815874, CurrSamplesPerSec=171.4999252502201, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:08:47,139] [INFO] [logging.py:96:log_dist] [Rank 0] step=1730, skipped=30, lr=[8.434462466045652e-06, 8.434462466045652e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:08:47,167] [INFO] [timer.py:199:stop] epoch=3/micro_step=350/global_step=1730, RunningAvgSamplesPerSec=171.71319494986975, CurrSamplesPerSec=171.38708860799903, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:08:49,371] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:08:50,077] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:08:54,526] [INFO] [logging.py:96:log_dist] [Rank 0] step=1740, skipped=32, lr=[8.423507546949846e-06, 8.423507546949846e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:08:54,555] [INFO] [timer.py:199:stop] epoch=3/micro_step=360/global_step=1740, RunningAvgSamplesPerSec=171.72298433382502, CurrSamplesPerSec=172.35369832519888, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:09:01,999] [INFO] [logging.py:96:log_dist] [Rank 0] step=1750, skipped=32, lr=[8.409754905405272e-06, 8.409754905405272e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:09:02,027] [INFO] [timer.py:199:stop] epoch=3/micro_step=370/global_step=1750, RunningAvgSamplesPerSec=171.72158433750252, CurrSamplesPerSec=171.82035495035214, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:09:09,465] [INFO] [logging.py:96:log_dist] [Rank 0] step=1760, skipped=32, lr=[8.395936950365541e-06, 8.395936950365541e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:09:09,493] [INFO] [timer.py:199:stop] epoch=3/micro_step=380/global_step=1760, RunningAvgSamplesPerSec=171.72087011406032, CurrSamplesPerSec=172.10120445313956, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:09:16,923] [INFO] [logging.py:96:log_dist] [Rank 0] step=1770, skipped=32, lr=[8.382053933590945e-06, 8.382053933590945e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:09:16,952] [INFO] [timer.py:199:stop] epoch=3/micro_step=390/global_step=1770, RunningAvgSamplesPerSec=171.7211834558278, CurrSamplesPerSec=171.68381476586387, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:09:24,386] [INFO] [logging.py:96:log_dist] [Rank 0] step=1780, skipped=32, lr=[8.368106108027184e-06, 8.368106108027184e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:09:24,414] [INFO] [timer.py:199:stop] epoch=3/micro_step=400/global_step=1780, RunningAvgSamplesPerSec=171.72111549518812, CurrSamplesPerSec=171.94466598363346, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:09:31,847] [INFO] [logging.py:96:log_dist] [Rank 0] step=1790, skipped=32, lr=[8.354093727800765e-06, 8.354093727800765e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:09:31,875] [INFO] [timer.py:199:stop] epoch=3/micro_step=410/global_step=1790, RunningAvgSamplesPerSec=171.72105477653548, CurrSamplesPerSec=171.45994186206195, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:09:39,318] [INFO] [logging.py:96:log_dist] [Rank 0] step=1800, skipped=32, lr=[8.340017048214367e-06, 8.340017048214367e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:09:39,347] [INFO] [timer.py:199:stop] epoch=3/micro_step=420/global_step=1800, RunningAvgSamplesPerSec=171.71971693745775, CurrSamplesPerSec=171.6022141688652, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:09:46,787] [INFO] [logging.py:96:log_dist] [Rank 0] step=1810, skipped=32, lr=[8.325876325742187e-06, 8.325876325742187e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:09:46,816] [INFO] [timer.py:199:stop] epoch=3/micro_step=430/global_step=1810, RunningAvgSamplesPerSec=171.7187439085993, CurrSamplesPerSec=171.91867732327745, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:09:54,258] [INFO] [logging.py:96:log_dist] [Rank 0] step=1820, skipped=32, lr=[8.311671818025275e-06, 8.311671818025275e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:09:54,286] [INFO] [timer.py:199:stop] epoch=3/micro_step=440/global_step=1820, RunningAvgSamplesPerSec=171.71762398372076, CurrSamplesPerSec=171.41609112460986, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:10:01,727] [INFO] [logging.py:96:log_dist] [Rank 0] step=1830, skipped=32, lr=[8.297403783866833e-06, 8.297403783866833e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:10:01,755] [INFO] [timer.py:199:stop] epoch=3/micro_step=450/global_step=1830, RunningAvgSamplesPerSec=171.71663500565327, CurrSamplesPerSec=171.85599550314058, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:10:05,449] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:10:06,155] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:10:09,114] [INFO] [logging.py:96:log_dist] [Rank 0] step=1840, skipped=34, lr=[8.285943792156956e-06, 8.285943792156956e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:10:09,142] [INFO] [timer.py:199:stop] epoch=3/micro_step=460/global_step=1840, RunningAvgSamplesPerSec=171.72605952764602, CurrSamplesPerSec=171.36444063826738, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -***** Evaluating perplexity, Epoch 4/16 ***** -ppl: 1.9056392908096313 -Beginning of Epoch 5/16, Total Micro Batches 460 -[2023-04-18 02:10:24,789] [INFO] [logging.py:96:log_dist] [Rank 0] step=1850, skipped=34, lr=[8.271562066279346e-06, 8.271562066279346e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:10:24,817] [INFO] [timer.py:199:stop] epoch=4/micro_step=10/global_step=1850, RunningAvgSamplesPerSec=171.71164560205727, CurrSamplesPerSec=171.58329303725276, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:10:32,250] [INFO] [logging.py:96:log_dist] [Rank 0] step=1860, skipped=34, lr=[8.257117544751607e-06, 8.257117544751607e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:10:32,279] [INFO] [timer.py:199:stop] epoch=4/micro_step=20/global_step=1860, RunningAvgSamplesPerSec=171.71159058302052, CurrSamplesPerSec=171.93172573210248, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:10:39,716] [INFO] [logging.py:96:log_dist] [Rank 0] step=1870, skipped=34, lr=[8.242610490749946e-06, 8.242610490749946e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:10:39,745] [INFO] [timer.py:199:stop] epoch=4/micro_step=30/global_step=1870, RunningAvgSamplesPerSec=171.7109944831215, CurrSamplesPerSec=171.52217065735303, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:10:47,177] [INFO] [logging.py:96:log_dist] [Rank 0] step=1880, skipped=34, lr=[8.228041168589898e-06, 8.228041168589898e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:10:47,206] [INFO] [timer.py:199:stop] epoch=4/micro_step=40/global_step=1880, RunningAvgSamplesPerSec=171.71108856576365, CurrSamplesPerSec=171.86738379595556, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:10:54,658] [INFO] [logging.py:96:log_dist] [Rank 0] step=1890, skipped=34, lr=[8.213409843721504e-06, 8.213409843721504e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:10:54,687] [INFO] [timer.py:199:stop] epoch=4/micro_step=50/global_step=1890, RunningAvgSamplesPerSec=171.7086976526796, CurrSamplesPerSec=171.34365791454948, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:11:02,140] [INFO] [logging.py:96:log_dist] [Rank 0] step=1900, skipped=34, lr=[8.198716782724485e-06, 8.198716782724485e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:11:02,169] [INFO] [timer.py:199:stop] epoch=4/micro_step=60/global_step=1900, RunningAvgSamplesPerSec=171.70623394232194, CurrSamplesPerSec=171.65916726884387, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:11:09,608] [INFO] [logging.py:96:log_dist] [Rank 0] step=1910, skipped=34, lr=[8.18396225330339e-06, 8.18396225330339e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:11:09,637] [INFO] [timer.py:199:stop] epoch=4/micro_step=70/global_step=1910, RunningAvgSamplesPerSec=171.70547424112425, CurrSamplesPerSec=171.68156380990632, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:11:17,069] [INFO] [logging.py:96:log_dist] [Rank 0] step=1920, skipped=34, lr=[8.169146524282695e-06, 8.169146524282695e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:11:17,098] [INFO] [timer.py:199:stop] epoch=4/micro_step=80/global_step=1920, RunningAvgSamplesPerSec=171.70560106290168, CurrSamplesPerSec=171.81656076231928, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:11:24,537] [INFO] [logging.py:96:log_dist] [Rank 0] step=1930, skipped=34, lr=[8.154269865601928e-06, 8.154269865601928e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:11:24,566] [INFO] [timer.py:199:stop] epoch=4/micro_step=90/global_step=1930, RunningAvgSamplesPerSec=171.70481379336366, CurrSamplesPerSec=171.69880434358507, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:11:29,748] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:11:30,454] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:11:31,920] [INFO] [logging.py:96:log_dist] [Rank 0] step=1940, skipped=36, lr=[8.142324851410258e-06, 8.142324851410258e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:11:31,948] [INFO] [timer.py:199:stop] epoch=4/micro_step=100/global_step=1940, RunningAvgSamplesPerSec=171.7142794110302, CurrSamplesPerSec=171.5509448275179, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:11:39,388] [INFO] [logging.py:96:log_dist] [Rank 0] step=1950, skipped=36, lr=[8.127339203129347e-06, 8.127339203129347e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:11:39,417] [INFO] [timer.py:199:stop] epoch=4/micro_step=110/global_step=1950, RunningAvgSamplesPerSec=171.713382741549, CurrSamplesPerSec=171.73807516898836, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:11:46,858] [INFO] [logging.py:96:log_dist] [Rank 0] step=1960, skipped=36, lr=[8.112293386909134e-06, 8.112293386909134e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:11:46,887] [INFO] [timer.py:199:stop] epoch=4/micro_step=120/global_step=1960, RunningAvgSamplesPerSec=171.7123891861792, CurrSamplesPerSec=171.55626225468265, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:11:54,321] [INFO] [logging.py:96:log_dist] [Rank 0] step=1970, skipped=36, lr=[8.097187676881293e-06, 8.097187676881293e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:11:54,350] [INFO] [timer.py:199:stop] epoch=4/micro_step=130/global_step=1970, RunningAvgSamplesPerSec=171.71219256289453, CurrSamplesPerSec=171.45884668915437, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:12:01,792] [INFO] [logging.py:96:log_dist] [Rank 0] step=1980, skipped=36, lr=[8.082022348268742e-06, 8.082022348268742e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:12:01,820] [INFO] [timer.py:199:stop] epoch=4/micro_step=140/global_step=1980, RunningAvgSamplesPerSec=171.71108605923897, CurrSamplesPerSec=171.5101157795939, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:12:09,263] [INFO] [logging.py:96:log_dist] [Rank 0] step=1990, skipped=36, lr=[8.06679767738064e-06, 8.06679767738064e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:12:09,291] [INFO] [timer.py:199:stop] epoch=4/micro_step=150/global_step=1990, RunningAvgSamplesPerSec=171.71003302321958, CurrSamplesPerSec=171.7294505255481, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:12:16,731] [INFO] [logging.py:96:log_dist] [Rank 0] step=2000, skipped=36, lr=[8.051513941607355e-06, 8.051513941607355e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:12:16,760] [INFO] [timer.py:199:stop] epoch=4/micro_step=160/global_step=2000, RunningAvgSamplesPerSec=171.70918179266272, CurrSamplesPerSec=171.52381463561755, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:12:24,203] [INFO] [logging.py:96:log_dist] [Rank 0] step=2010, skipped=36, lr=[8.036171419415398e-06, 8.036171419415398e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:12:24,231] [INFO] [timer.py:199:stop] epoch=4/micro_step=170/global_step=2010, RunningAvgSamplesPerSec=171.70810491016908, CurrSamplesPerSec=171.48475127614373, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:12:31,674] [INFO] [logging.py:96:log_dist] [Rank 0] step=2020, skipped=36, lr=[8.02077039034236e-06, 8.02077039034236e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:12:31,702] [INFO] [timer.py:199:stop] epoch=4/micro_step=180/global_step=2020, RunningAvgSamplesPerSec=171.7070297877095, CurrSamplesPerSec=171.33168279231202, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:12:39,137] [INFO] [logging.py:96:log_dist] [Rank 0] step=2030, skipped=36, lr=[8.005311134991816e-06, 8.005311134991816e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:12:39,166] [INFO] [timer.py:199:stop] epoch=4/micro_step=190/global_step=2030, RunningAvgSamplesPerSec=171.7068514197215, CurrSamplesPerSec=171.5416264203116, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:12:45,837] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:12:46,545] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:12:46,545] [INFO] [logging.py:96:log_dist] [Rank 0] step=2040, skipped=38, lr=[7.99290199703457e-06, 7.99290199703457e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:12:46,546] [INFO] [timer.py:199:stop] epoch=4/micro_step=200/global_step=2040, RunningAvgSamplesPerSec=171.71601589328668, CurrSamplesPerSec=181.00016081533713, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:12:53,989] [INFO] [logging.py:96:log_dist] [Rank 0] step=2050, skipped=38, lr=[7.977338644888544e-06, 7.977338644888544e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:12:54,018] [INFO] [timer.py:199:stop] epoch=4/micro_step=210/global_step=2050, RunningAvgSamplesPerSec=171.71482779377982, CurrSamplesPerSec=171.80374974239243, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:13:01,454] [INFO] [logging.py:96:log_dist] [Rank 0] step=2060, skipped=38, lr=[7.96171785778246e-06, 7.96171785778246e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:13:01,482] [INFO] [timer.py:199:stop] epoch=4/micro_step=220/global_step=2060, RunningAvgSamplesPerSec=171.71448738079127, CurrSamplesPerSec=171.93657121518888, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:13:08,924] [INFO] [logging.py:96:log_dist] [Rank 0] step=2070, skipped=38, lr=[7.946039920323833e-06, 7.946039920323833e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:13:08,952] [INFO] [timer.py:199:stop] epoch=4/micro_step=230/global_step=2070, RunningAvgSamplesPerSec=171.7135305643368, CurrSamplesPerSec=171.59738751276103, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:13:16,395] [INFO] [logging.py:96:log_dist] [Rank 0] step=2080, skipped=38, lr=[7.930305118161453e-06, 7.930305118161453e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:13:16,424] [INFO] [timer.py:199:stop] epoch=4/micro_step=240/global_step=2080, RunningAvgSamplesPerSec=171.7124224612466, CurrSamplesPerSec=171.49680258846746, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:13:23,870] [INFO] [logging.py:96:log_dist] [Rank 0] step=2090, skipped=38, lr=[7.914513737980172e-06, 7.914513737980172e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:13:23,899] [INFO] [timer.py:199:stop] epoch=4/micro_step=250/global_step=2090, RunningAvgSamplesPerSec=171.7109233738895, CurrSamplesPerSec=170.77535128697602, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:13:31,336] [INFO] [logging.py:96:log_dist] [Rank 0] step=2100, skipped=38, lr=[7.898666067495678e-06, 7.898666067495678e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:13:31,365] [INFO] [timer.py:199:stop] epoch=4/micro_step=260/global_step=2100, RunningAvgSamplesPerSec=171.71045915552952, CurrSamplesPerSec=171.35432210357524, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:13:38,802] [INFO] [logging.py:96:log_dist] [Rank 0] step=2110, skipped=38, lr=[7.88276239544926e-06, 7.88276239544926e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:13:38,830] [INFO] [timer.py:199:stop] epoch=4/micro_step=270/global_step=2110, RunningAvgSamplesPerSec=171.71013161302932, CurrSamplesPerSec=171.99512403125362, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:13:46,261] [INFO] [logging.py:96:log_dist] [Rank 0] step=2120, skipped=38, lr=[7.866803011602546e-06, 7.866803011602546e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:13:46,290] [INFO] [timer.py:199:stop] epoch=4/micro_step=280/global_step=2120, RunningAvgSamplesPerSec=171.710370537094, CurrSamplesPerSec=171.47494713980007, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:13:53,731] [INFO] [logging.py:96:log_dist] [Rank 0] step=2130, skipped=38, lr=[7.850788206732214e-06, 7.850788206732214e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:13:53,760] [INFO] [timer.py:199:stop] epoch=4/micro_step=290/global_step=2130, RunningAvgSamplesPerSec=171.7094931220681, CurrSamplesPerSec=171.89104546090525, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:14:01,192] [INFO] [logging.py:96:log_dist] [Rank 0] step=2140, skipped=38, lr=[7.834718272624709e-06, 7.834718272624709e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:14:01,221] [INFO] [timer.py:199:stop] epoch=4/micro_step=300/global_step=2140, RunningAvgSamplesPerSec=171.70955085538404, CurrSamplesPerSec=171.31085334913473, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:14:01,925] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:14:02,630] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:14:08,565] [INFO] [logging.py:96:log_dist] [Rank 0] step=2150, skipped=40, lr=[7.821822829009803e-06, 7.821822829009803e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:14:08,593] [INFO] [timer.py:199:stop] epoch=4/micro_step=310/global_step=2150, RunningAvgSamplesPerSec=171.71907502314986, CurrSamplesPerSec=171.679697208271, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:14:16,030] [INFO] [logging.py:96:log_dist] [Rank 0] step=2160, skipped=40, lr=[7.805654400779998e-06, 7.805654400779998e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:14:16,058] [INFO] [timer.py:199:stop] epoch=4/micro_step=320/global_step=2160, RunningAvgSamplesPerSec=171.71872027712777, CurrSamplesPerSec=171.51888279536297, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:14:23,499] [INFO] [logging.py:96:log_dist] [Rank 0] step=2170, skipped=40, lr=[7.789431665641658e-06, 7.789431665641658e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:14:23,528] [INFO] [timer.py:199:stop] epoch=4/micro_step=330/global_step=2170, RunningAvgSamplesPerSec=171.71789006920181, CurrSamplesPerSec=171.6180672565738, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:14:30,958] [INFO] [logging.py:96:log_dist] [Rank 0] step=2180, skipped=40, lr=[7.77315491916967e-06, 7.77315491916967e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:14:30,986] [INFO] [timer.py:199:stop] epoch=4/micro_step=340/global_step=2180, RunningAvgSamplesPerSec=171.71819544633112, CurrSamplesPerSec=171.63639246728295, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:14:38,434] [INFO] [logging.py:96:log_dist] [Rank 0] step=2190, skipped=40, lr=[7.756824457923e-06, 7.756824457923e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:14:38,463] [INFO] [timer.py:199:stop] epoch=4/micro_step=350/global_step=2190, RunningAvgSamplesPerSec=171.71665680964261, CurrSamplesPerSec=171.28664070481142, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:14:45,898] [INFO] [logging.py:96:log_dist] [Rank 0] step=2200, skipped=40, lr=[7.740440579439282e-06, 7.740440579439282e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:14:45,926] [INFO] [timer.py:199:stop] epoch=4/micro_step=360/global_step=2200, RunningAvgSamplesPerSec=171.7164116104708, CurrSamplesPerSec=171.78192592402982, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:14:53,355] [INFO] [logging.py:96:log_dist] [Rank 0] step=2210, skipped=40, lr=[7.724003582229405e-06, 7.724003582229405e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:14:53,383] [INFO] [timer.py:199:stop] epoch=4/micro_step=370/global_step=2210, RunningAvgSamplesPerSec=171.7169120090668, CurrSamplesPerSec=171.5147731451265, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:15:00,812] [INFO] [logging.py:96:log_dist] [Rank 0] step=2220, skipped=40, lr=[7.70751376577207e-06, 7.70751376577207e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:15:00,841] [INFO] [timer.py:199:stop] epoch=4/micro_step=380/global_step=2220, RunningAvgSamplesPerSec=171.71731065548266, CurrSamplesPerSec=171.92913791551925, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:15:08,278] [INFO] [logging.py:96:log_dist] [Rank 0] step=2230, skipped=40, lr=[7.69097143050833e-06, 7.69097143050833e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:15:08,307] [INFO] [timer.py:199:stop] epoch=4/micro_step=390/global_step=2230, RunningAvgSamplesPerSec=171.7168499492179, CurrSamplesPerSec=171.73950353750084, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:15:15,742] [INFO] [logging.py:96:log_dist] [Rank 0] step=2240, skipped=40, lr=[7.674376877836124e-06, 7.674376877836124e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:15:15,770] [INFO] [timer.py:199:stop] epoch=4/micro_step=400/global_step=2240, RunningAvgSamplesPerSec=171.71667509450546, CurrSamplesPerSec=171.74082205260115, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:15:17,967] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:15:18,672] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:15:23,119] [INFO] [logging.py:96:log_dist] [Rank 0] step=2250, skipped=42, lr=[7.661063842311183e-06, 7.661063842311183e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:15:23,147] [INFO] [timer.py:199:stop] epoch=4/micro_step=410/global_step=2250, RunningAvgSamplesPerSec=171.7253655244842, CurrSamplesPerSec=171.68551674736014, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:15:30,578] [INFO] [logging.py:96:log_dist] [Rank 0] step=2260, skipped=42, lr=[7.64437606085986e-06, 7.64437606085986e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:15:30,607] [INFO] [timer.py:199:stop] epoch=4/micro_step=420/global_step=2260, RunningAvgSamplesPerSec=171.72553453548377, CurrSamplesPerSec=171.85847109058415, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:15:38,045] [INFO] [logging.py:96:log_dist] [Rank 0] step=2270, skipped=42, lr=[7.6276369109580975e-06, 7.6276369109580975e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:15:38,074] [INFO] [timer.py:199:stop] epoch=4/micro_step=430/global_step=2270, RunningAvgSamplesPerSec=171.724944971251, CurrSamplesPerSec=171.87046494616152, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:15:45,513] [INFO] [logging.py:96:log_dist] [Rank 0] step=2280, skipped=42, lr=[7.610846697589754e-06, 7.610846697589754e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:15:45,542] [INFO] [timer.py:199:stop] epoch=4/micro_step=440/global_step=2280, RunningAvgSamplesPerSec=171.72428434383306, CurrSamplesPerSec=171.36011960428957, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:15:52,998] [INFO] [logging.py:96:log_dist] [Rank 0] step=2290, skipped=42, lr=[7.594005726669053e-06, 7.594005726669053e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:15:53,027] [INFO] [timer.py:199:stop] epoch=4/micro_step=450/global_step=2290, RunningAvgSamplesPerSec=171.72191962904188, CurrSamplesPerSec=171.999862879624, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:16:00,449] [INFO] [logging.py:96:log_dist] [Rank 0] step=2300, skipped=42, lr=[7.577114305035016e-06, 7.577114305035016e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:16:00,478] [INFO] [timer.py:199:stop] epoch=4/micro_step=460/global_step=2300, RunningAvgSamplesPerSec=171.72292370460053, CurrSamplesPerSec=171.87580219989897, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -***** Evaluating perplexity, Epoch 5/16 ***** -ppl: 1.8821245431900024 -Beginning of Epoch 6/16, Total Micro Batches 460 -[2023-04-18 02:16:16,110] [INFO] [logging.py:96:log_dist] [Rank 0] step=2310, skipped=42, lr=[7.560172740445858e-06, 7.560172740445858e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:16:16,139] [INFO] [timer.py:199:stop] epoch=5/micro_step=10/global_step=2310, RunningAvgSamplesPerSec=171.72319356235812, CurrSamplesPerSec=171.6490687797261, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:16:23,566] [INFO] [logging.py:96:log_dist] [Rank 0] step=2320, skipped=42, lr=[7.543181341573394e-06, 7.543181341573394e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:16:23,594] [INFO] [timer.py:199:stop] epoch=5/micro_step=20/global_step=2320, RunningAvgSamplesPerSec=171.7237390003117, CurrSamplesPerSec=171.87926884123277, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:16:31,027] [INFO] [logging.py:96:log_dist] [Rank 0] step=2330, skipped=42, lr=[7.526140417997409e-06, 7.526140417997409e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:16:31,055] [INFO] [timer.py:199:stop] epoch=5/micro_step=30/global_step=2330, RunningAvgSamplesPerSec=171.7237975801905, CurrSamplesPerSec=171.77747388642555, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:16:38,479] [INFO] [logging.py:96:log_dist] [Rank 0] step=2340, skipped=42, lr=[7.509050280200013e-06, 7.509050280200013e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:16:38,508] [INFO] [timer.py:199:stop] epoch=5/micro_step=40/global_step=2340, RunningAvgSamplesPerSec=171.72469125476056, CurrSamplesPerSec=171.83338843779336, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:16:42,197] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:16:42,903] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:16:45,860] [INFO] [logging.py:96:log_dist] [Rank 0] step=2350, skipped=44, lr=[7.495342944939244e-06, 7.495342944939244e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:16:45,889] [INFO] [timer.py:199:stop] epoch=5/micro_step=50/global_step=2350, RunningAvgSamplesPerSec=171.73248742723328, CurrSamplesPerSec=171.81188699292105, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:16:53,341] [INFO] [logging.py:96:log_dist] [Rank 0] step=2360, skipped=44, lr=[7.478165006816799e-06, 7.478165006816799e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:16:53,370] [INFO] [timer.py:199:stop] epoch=5/micro_step=60/global_step=2360, RunningAvgSamplesPerSec=171.73052765588454, CurrSamplesPerSec=171.29123128983, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:17:00,807] [INFO] [logging.py:96:log_dist] [Rank 0] step=2370, skipped=44, lr=[7.460938728575041e-06, 7.460938728575041e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:17:00,834] [INFO] [timer.py:199:stop] epoch=5/micro_step=70/global_step=2370, RunningAvgSamplesPerSec=171.73013522065082, CurrSamplesPerSec=171.2600310894151, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:17:08,269] [INFO] [logging.py:96:log_dist] [Rank 0] step=2380, skipped=44, lr=[7.443664424073213e-06, 7.443664424073213e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:17:08,297] [INFO] [timer.py:199:stop] epoch=5/micro_step=80/global_step=2380, RunningAvgSamplesPerSec=171.72987433825026, CurrSamplesPerSec=171.6243215293262, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:17:15,742] [INFO] [logging.py:96:log_dist] [Rank 0] step=2390, skipped=44, lr=[7.426342408045578e-06, 7.426342408045578e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:17:15,771] [INFO] [timer.py:199:stop] epoch=5/micro_step=90/global_step=2390, RunningAvgSamplesPerSec=171.72868261826892, CurrSamplesPerSec=171.54727215792533, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:17:23,214] [INFO] [logging.py:96:log_dist] [Rank 0] step=2400, skipped=44, lr=[7.408972996095693e-06, 7.408972996095693e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:17:23,242] [INFO] [timer.py:199:stop] epoch=5/micro_step=100/global_step=2400, RunningAvgSamplesPerSec=171.7276275045925, CurrSamplesPerSec=171.41428502098336, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:17:30,682] [INFO] [logging.py:96:log_dist] [Rank 0] step=2410, skipped=44, lr=[7.391556504690667e-06, 7.391556504690667e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:17:30,710] [INFO] [timer.py:199:stop] epoch=5/micro_step=110/global_step=2410, RunningAvgSamplesPerSec=171.72703924064118, CurrSamplesPerSec=171.7663722793919, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:17:38,147] [INFO] [logging.py:96:log_dist] [Rank 0] step=2420, skipped=44, lr=[7.3740932511553785e-06, 7.3740932511553785e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:17:38,175] [INFO] [timer.py:199:stop] epoch=5/micro_step=120/global_step=2420, RunningAvgSamplesPerSec=171.72663762685866, CurrSamplesPerSec=171.22578406458263, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:17:45,616] [INFO] [logging.py:96:log_dist] [Rank 0] step=2430, skipped=44, lr=[7.356583553666708e-06, 7.356583553666708e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:17:45,645] [INFO] [timer.py:199:stop] epoch=5/micro_step=130/global_step=2430, RunningAvgSamplesPerSec=171.725792130739, CurrSamplesPerSec=171.63589862287418, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:17:53,080] [INFO] [logging.py:96:log_dist] [Rank 0] step=2440, skipped=44, lr=[7.339027731247732e-06, 7.339027731247732e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:17:53,108] [INFO] [timer.py:199:stop] epoch=5/micro_step=140/global_step=2440, RunningAvgSamplesPerSec=171.725566242189, CurrSamplesPerSec=171.59831991380304, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:17:58,298] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:17:59,005] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:18:00,471] [INFO] [logging.py:96:log_dist] [Rank 0] step=2450, skipped=46, lr=[7.324950078282891e-06, 7.324950078282891e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:18:00,500] [INFO] [timer.py:199:stop] epoch=5/micro_step=150/global_step=2450, RunningAvgSamplesPerSec=171.73211448692064, CurrSamplesPerSec=171.59393223254554, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:18:07,939] [INFO] [logging.py:96:log_dist] [Rank 0] step=2460, skipped=46, lr=[7.307312037606314e-06, 7.307312037606314e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:18:07,968] [INFO] [timer.py:199:stop] epoch=5/micro_step=160/global_step=2460, RunningAvgSamplesPerSec=171.73143985189677, CurrSamplesPerSec=171.7807167058039, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:18:15,416] [INFO] [logging.py:96:log_dist] [Rank 0] step=2470, skipped=46, lr=[7.289628769716295e-06, 7.289628769716295e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:18:15,445] [INFO] [timer.py:199:stop] epoch=5/micro_step=170/global_step=2470, RunningAvgSamplesPerSec=171.72998138855905, CurrSamplesPerSec=171.70050662229312, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:18:22,872] [INFO] [logging.py:96:log_dist] [Rank 0] step=2480, skipped=46, lr=[7.271900596798327e-06, 7.271900596798327e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:18:22,900] [INFO] [timer.py:199:stop] epoch=5/micro_step=180/global_step=2480, RunningAvgSamplesPerSec=171.73051898763003, CurrSamplesPerSec=171.4839844317708, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:18:30,341] [INFO] [logging.py:96:log_dist] [Rank 0] step=2490, skipped=46, lr=[7.254127841856065e-06, 7.254127841856065e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:18:30,369] [INFO] [timer.py:199:stop] epoch=5/micro_step=190/global_step=2490, RunningAvgSamplesPerSec=171.72980236903922, CurrSamplesPerSec=171.48606588245366, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:18:37,803] [INFO] [logging.py:96:log_dist] [Rank 0] step=2500, skipped=46, lr=[7.236310828705439e-06, 7.236310828705439e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:18:37,831] [INFO] [timer.py:199:stop] epoch=5/micro_step=200/global_step=2500, RunningAvgSamplesPerSec=171.72976922869412, CurrSamplesPerSec=171.90920883217402, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:18:45,257] [INFO] [logging.py:96:log_dist] [Rank 0] step=2510, skipped=46, lr=[7.218449881968754e-06, 7.218449881968754e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:18:45,285] [INFO] [timer.py:199:stop] epoch=5/micro_step=210/global_step=2510, RunningAvgSamplesPerSec=171.73039660590433, CurrSamplesPerSec=171.82827379561667, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:18:52,719] [INFO] [logging.py:96:log_dist] [Rank 0] step=2520, skipped=46, lr=[7.200545327068777e-06, 7.200545327068777e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:18:52,747] [INFO] [timer.py:199:stop] epoch=5/micro_step=220/global_step=2520, RunningAvgSamplesPerSec=171.73032842693985, CurrSamplesPerSec=171.8886790012307, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:19:00,180] [INFO] [logging.py:96:log_dist] [Rank 0] step=2530, skipped=46, lr=[7.182597490222809e-06, 7.182597490222809e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:19:00,209] [INFO] [timer.py:199:stop] epoch=5/micro_step=230/global_step=2530, RunningAvgSamplesPerSec=171.73027732345562, CurrSamplesPerSec=171.51866360904532, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:19:07,651] [INFO] [logging.py:96:log_dist] [Rank 0] step=2540, skipped=46, lr=[7.164606698436731e-06, 7.164606698436731e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:19:07,679] [INFO] [timer.py:199:stop] epoch=5/micro_step=240/global_step=2540, RunningAvgSamplesPerSec=171.7293567441894, CurrSamplesPerSec=171.7072611575788, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:19:14,352] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:19:15,059] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:19:15,059] [INFO] [logging.py:96:log_dist] [Rank 0] step=2550, skipped=48, lr=[7.150183357698731e-06, 7.150183357698731e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:19:15,061] [INFO] [timer.py:199:stop] epoch=5/micro_step=250/global_step=2550, RunningAvgSamplesPerSec=171.73654536108882, CurrSamplesPerSec=180.9504413623418, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:19:22,484] [INFO] [logging.py:96:log_dist] [Rank 0] step=2560, skipped=48, lr=[7.132116073569637e-06, 7.132116073569637e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:19:22,512] [INFO] [timer.py:199:stop] epoch=5/micro_step=260/global_step=2560, RunningAvgSamplesPerSec=171.73729650038277, CurrSamplesPerSec=171.38435302811922, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:19:29,944] [INFO] [logging.py:96:log_dist] [Rank 0] step=2570, skipped=48, lr=[7.114006754261436e-06, 7.114006754261436e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:19:29,973] [INFO] [timer.py:199:stop] epoch=5/micro_step=270/global_step=2570, RunningAvgSamplesPerSec=171.73726652373182, CurrSamplesPerSec=171.73411981095109, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:19:37,414] [INFO] [logging.py:96:log_dist] [Rank 0] step=2580, skipped=48, lr=[7.095855729722194e-06, 7.095855729722194e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:19:37,443] [INFO] [timer.py:199:stop] epoch=5/micro_step=280/global_step=2580, RunningAvgSamplesPerSec=171.73644235986814, CurrSamplesPerSec=171.56952985036946, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:19:44,878] [INFO] [logging.py:96:log_dist] [Rank 0] step=2590, skipped=48, lr=[7.077663330659833e-06, 7.077663330659833e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:19:44,907] [INFO] [timer.py:199:stop] epoch=5/micro_step=290/global_step=2590, RunningAvgSamplesPerSec=171.73614223499243, CurrSamplesPerSec=171.77109853073202, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:19:52,347] [INFO] [logging.py:96:log_dist] [Rank 0] step=2600, skipped=48, lr=[7.059429888536115e-06, 7.059429888536115e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:19:52,375] [INFO] [timer.py:199:stop] epoch=5/micro_step=300/global_step=2600, RunningAvgSamplesPerSec=171.73545951541237, CurrSamplesPerSec=171.58976414317925, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:19:59,804] [INFO] [logging.py:96:log_dist] [Rank 0] step=2610, skipped=48, lr=[7.041155735560591e-06, 7.041155735560591e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:19:59,833] [INFO] [timer.py:199:stop] epoch=5/micro_step=310/global_step=2610, RunningAvgSamplesPerSec=171.73573442584987, CurrSamplesPerSec=172.18642812631197, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:20:07,279] [INFO] [logging.py:96:log_dist] [Rank 0] step=2620, skipped=48, lr=[7.022841204684563e-06, 7.022841204684563e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:20:07,307] [INFO] [timer.py:199:stop] epoch=5/micro_step=320/global_step=2620, RunningAvgSamplesPerSec=171.73471821460757, CurrSamplesPerSec=171.5881189004853, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:20:14,744] [INFO] [logging.py:96:log_dist] [Rank 0] step=2630, skipped=48, lr=[7.004486629595007e-06, 7.004486629595007e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:20:14,772] [INFO] [timer.py:199:stop] epoch=5/micro_step=330/global_step=2630, RunningAvgSamplesPerSec=171.73433131230502, CurrSamplesPerSec=171.5222254561208, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:20:22,217] [INFO] [logging.py:96:log_dist] [Rank 0] step=2640, skipped=48, lr=[6.9860923447084966e-06, 6.9860923447084966e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:20:22,246] [INFO] [timer.py:199:stop] epoch=5/micro_step=340/global_step=2640, RunningAvgSamplesPerSec=171.7332247078215, CurrSamplesPerSec=171.45556125437284, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:20:29,683] [INFO] [logging.py:96:log_dist] [Rank 0] step=2650, skipped=48, lr=[6.967658685165106e-06, 6.967658685165106e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:20:29,712] [INFO] [timer.py:199:stop] epoch=5/micro_step=350/global_step=2650, RunningAvgSamplesPerSec=171.73273061599858, CurrSamplesPerSec=171.55313754104924, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:20:30,415] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:20:31,121] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:20:37,064] [INFO] [logging.py:96:log_dist] [Rank 0] step=2660, skipped=50, lr=[6.952883633449719e-06, 6.952883633449719e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:20:37,093] [INFO] [timer.py:199:stop] epoch=5/micro_step=360/global_step=2660, RunningAvgSamplesPerSec=171.73961762942946, CurrSamplesPerSec=171.60594404365017, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:20:44,526] [INFO] [logging.py:96:log_dist] [Rank 0] step=2670, skipped=50, lr=[6.934379946362973e-06, 6.934379946362973e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:20:44,555] [INFO] [timer.py:199:stop] epoch=5/micro_step=370/global_step=2670, RunningAvgSamplesPerSec=171.7394729799509, CurrSamplesPerSec=171.5312129278089, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:20:51,987] [INFO] [logging.py:96:log_dist] [Rank 0] step=2680, skipped=50, lr=[6.915837826808583e-06, 6.915837826808583e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:20:52,015] [INFO] [timer.py:199:stop] epoch=5/micro_step=380/global_step=2680, RunningAvgSamplesPerSec=171.7395304934974, CurrSamplesPerSec=172.22912687388322, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:20:59,446] [INFO] [logging.py:96:log_dist] [Rank 0] step=2690, skipped=50, lr=[6.897257612620147e-06, 6.897257612620147e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:20:59,474] [INFO] [timer.py:199:stop] epoch=5/micro_step=390/global_step=2690, RunningAvgSamplesPerSec=171.73961157355143, CurrSamplesPerSec=171.8536300084411, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:21:06,898] [INFO] [logging.py:96:log_dist] [Rank 0] step=2700, skipped=50, lr=[6.878639642325329e-06, 6.878639642325329e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:21:06,926] [INFO] [timer.py:199:stop] epoch=5/micro_step=400/global_step=2700, RunningAvgSamplesPerSec=171.7403183013685, CurrSamplesPerSec=172.11361845450332, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:21:14,353] [INFO] [logging.py:96:log_dist] [Rank 0] step=2710, skipped=50, lr=[6.859984255139716e-06, 6.859984255139716e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:21:14,382] [INFO] [timer.py:199:stop] epoch=5/micro_step=410/global_step=2710, RunningAvgSamplesPerSec=171.74067758543265, CurrSamplesPerSec=171.59716812575311, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:21:21,816] [INFO] [logging.py:96:log_dist] [Rank 0] step=2720, skipped=50, lr=[6.841291790960609e-06, 6.841291790960609e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:21:21,844] [INFO] [timer.py:199:stop] epoch=5/micro_step=420/global_step=2720, RunningAvgSamplesPerSec=171.74049209852356, CurrSamplesPerSec=171.453863829067, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:21:29,282] [INFO] [logging.py:96:log_dist] [Rank 0] step=2730, skipped=50, lr=[6.822562590360855e-06, 6.822562590360855e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:21:29,311] [INFO] [timer.py:199:stop] epoch=5/micro_step=430/global_step=2730, RunningAvgSamplesPerSec=171.73997093219333, CurrSamplesPerSec=171.9536427065157, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:21:36,749] [INFO] [logging.py:96:log_dist] [Rank 0] step=2740, skipped=50, lr=[6.803796994582627e-06, 6.803796994582627e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:21:36,777] [INFO] [timer.py:199:stop] epoch=5/micro_step=440/global_step=2740, RunningAvgSamplesPerSec=171.73944314216075, CurrSamplesPerSec=171.14270868332517, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:21:44,209] [INFO] [logging.py:96:log_dist] [Rank 0] step=2750, skipped=50, lr=[6.784995345531208e-06, 6.784995345531208e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:21:44,237] [INFO] [timer.py:199:stop] epoch=5/micro_step=450/global_step=2750, RunningAvgSamplesPerSec=171.7396919212653, CurrSamplesPerSec=171.9103097659248, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:21:46,434] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:21:47,139] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:21:51,586] [INFO] [logging.py:96:log_dist] [Rank 0] step=2760, skipped=52, lr=[6.769928298113241e-06, 6.769928298113241e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:21:51,615] [INFO] [timer.py:199:stop] epoch=5/micro_step=460/global_step=2760, RunningAvgSamplesPerSec=171.7468126153379, CurrSamplesPerSec=172.0399882459142, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -***** Evaluating perplexity, Epoch 6/16 ***** -ppl: 1.8625335693359375 -Beginning of Epoch 7/16, Total Micro Batches 460 -[2023-04-18 02:22:07,261] [INFO] [logging.py:96:log_dist] [Rank 0] step=2770, skipped=52, lr=[6.751062616864186e-06, 6.751062616864186e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:22:07,289] [INFO] [timer.py:199:stop] epoch=6/micro_step=10/global_step=2770, RunningAvgSamplesPerSec=171.7464567932106, CurrSamplesPerSec=171.69627844383382, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:22:14,718] [INFO] [logging.py:96:log_dist] [Rank 0] step=2780, skipped=52, lr=[6.732161843151428e-06, 6.732161843151428e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:22:14,747] [INFO] [timer.py:199:stop] epoch=6/micro_step=20/global_step=2780, RunningAvgSamplesPerSec=171.74665416175, CurrSamplesPerSec=172.02521464264143, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:22:22,188] [INFO] [logging.py:96:log_dist] [Rank 0] step=2790, skipped=52, lr=[6.713226321343168e-06, 6.713226321343168e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:22:22,217] [INFO] [timer.py:199:stop] epoch=6/micro_step=30/global_step=2790, RunningAvgSamplesPerSec=171.74583723354053, CurrSamplesPerSec=171.42194754057323, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:22:29,648] [INFO] [logging.py:96:log_dist] [Rank 0] step=2800, skipped=52, lr=[6.694256396440708e-06, 6.694256396440708e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:22:29,676] [INFO] [timer.py:199:stop] epoch=6/micro_step=40/global_step=2800, RunningAvgSamplesPerSec=171.74584361832433, CurrSamplesPerSec=172.17334103008145, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:22:37,116] [INFO] [logging.py:96:log_dist] [Rank 0] step=2810, skipped=52, lr=[6.675252414072166e-06, 6.675252414072166e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:22:37,145] [INFO] [timer.py:199:stop] epoch=6/micro_step=50/global_step=2810, RunningAvgSamplesPerSec=171.74514056998873, CurrSamplesPerSec=171.7947336893358, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:22:44,585] [INFO] [logging.py:96:log_dist] [Rank 0] step=2820, skipped=52, lr=[6.656214720486185e-06, 6.656214720486185e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:22:44,613] [INFO] [timer.py:199:stop] epoch=6/micro_step=60/global_step=2820, RunningAvgSamplesPerSec=171.74443294431106, CurrSamplesPerSec=171.52228025492357, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:22:52,049] [INFO] [logging.py:96:log_dist] [Rank 0] step=2830, skipped=52, lr=[6.6371436625456135e-06, 6.6371436625456135e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:22:52,077] [INFO] [timer.py:199:stop] epoch=6/micro_step=70/global_step=2830, RunningAvgSamplesPerSec=171.74410210992167, CurrSamplesPerSec=171.461475127637, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:22:59,516] [INFO] [logging.py:96:log_dist] [Rank 0] step=2840, skipped=52, lr=[6.618039587721198e-06, 6.618039587721198e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:22:59,544] [INFO] [timer.py:199:stop] epoch=6/micro_step=80/global_step=2840, RunningAvgSamplesPerSec=171.74353892113865, CurrSamplesPerSec=171.8289887272924, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:23:06,984] [INFO] [logging.py:96:log_dist] [Rank 0] step=2850, skipped=52, lr=[6.598902844085239e-06, 6.598902844085239e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:23:07,013] [INFO] [timer.py:199:stop] epoch=6/micro_step=90/global_step=2850, RunningAvgSamplesPerSec=171.74292328498132, CurrSamplesPerSec=171.4900646005397, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:23:10,702] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:23:11,407] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:23:14,363] [INFO] [logging.py:96:log_dist] [Rank 0] step=2860, skipped=54, lr=[6.583570161916899e-06, 6.583570161916899e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:23:14,392] [INFO] [timer.py:199:stop] epoch=6/micro_step=100/global_step=2860, RunningAvgSamplesPerSec=171.74943980240954, CurrSamplesPerSec=171.64308708297406, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:23:21,840] [INFO] [logging.py:96:log_dist] [Rank 0] step=2870, skipped=54, lr=[6.564375493458251e-06, 6.564375493458251e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:23:21,869] [INFO] [timer.py:199:stop] epoch=6/micro_step=110/global_step=2870, RunningAvgSamplesPerSec=171.74803296354523, CurrSamplesPerSec=171.5521508130236, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:23:29,307] [INFO] [logging.py:96:log_dist] [Rank 0] step=2880, skipped=54, lr=[6.545149133936794e-06, 6.545149133936794e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:23:29,336] [INFO] [timer.py:199:stop] epoch=6/micro_step=120/global_step=2880, RunningAvgSamplesPerSec=171.74752273652751, CurrSamplesPerSec=171.04527162753504, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:23:36,776] [INFO] [logging.py:96:log_dist] [Rank 0] step=2890, skipped=54, lr=[6.525891433652832e-06, 6.525891433652832e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:23:36,804] [INFO] [timer.py:199:stop] epoch=6/micro_step=130/global_step=2890, RunningAvgSamplesPerSec=171.7467975888452, CurrSamplesPerSec=171.74757976026333, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:23:44,244] [INFO] [logging.py:96:log_dist] [Rank 0] step=2900, skipped=54, lr=[6.506602743477693e-06, 6.506602743477693e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:23:44,273] [INFO] [timer.py:199:stop] epoch=6/micro_step=140/global_step=2900, RunningAvgSamplesPerSec=171.74611927429135, CurrSamplesPerSec=171.3937090727063, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:23:51,708] [INFO] [logging.py:96:log_dist] [Rank 0] step=2910, skipped=54, lr=[6.487283414847333e-06, 6.487283414847333e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:23:51,737] [INFO] [timer.py:199:stop] epoch=6/micro_step=150/global_step=2910, RunningAvgSamplesPerSec=171.74578935540018, CurrSamplesPerSec=171.53066488427842, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:23:59,170] [INFO] [logging.py:96:log_dist] [Rank 0] step=2920, skipped=54, lr=[6.467933799755936e-06, 6.467933799755936e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:23:59,199] [INFO] [timer.py:199:stop] epoch=6/micro_step=160/global_step=2920, RunningAvgSamplesPerSec=171.7456555196746, CurrSamplesPerSec=171.65247139876368, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:24:06,639] [INFO] [logging.py:96:log_dist] [Rank 0] step=2930, skipped=54, lr=[6.448554250749502e-06, 6.448554250749502e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:24:06,668] [INFO] [timer.py:199:stop] epoch=6/micro_step=170/global_step=2930, RunningAvgSamplesPerSec=171.7449501609932, CurrSamplesPerSec=171.11265330137186, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:24:14,095] [INFO] [logging.py:96:log_dist] [Rank 0] step=2940, skipped=54, lr=[6.429145120919414e-06, 6.429145120919414e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:24:14,124] [INFO] [timer.py:199:stop] epoch=6/micro_step=180/global_step=2940, RunningAvgSamplesPerSec=171.74524060578483, CurrSamplesPerSec=171.7540083171962, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:24:21,557] [INFO] [logging.py:96:log_dist] [Rank 0] step=2950, skipped=54, lr=[6.409706763896017e-06, 6.409706763896017e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:24:21,586] [INFO] [timer.py:199:stop] epoch=6/micro_step=190/global_step=2950, RunningAvgSamplesPerSec=171.74512324505852, CurrSamplesPerSec=171.7392288494032, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:24:26,770] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:24:27,474] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:24:28,940] [INFO] [logging.py:96:log_dist] [Rank 0] step=2960, skipped=56, lr=[6.394135272677828e-06, 6.394135272677828e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:24:28,968] [INFO] [timer.py:199:stop] epoch=6/micro_step=200/global_step=2960, RunningAvgSamplesPerSec=171.75113603838597, CurrSamplesPerSec=171.26287196824526, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:24:36,396] [INFO] [logging.py:96:log_dist] [Rank 0] step=2970, skipped=56, lr=[6.374645199550603e-06, 6.374645199550603e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:24:36,424] [INFO] [timer.py:199:stop] epoch=6/micro_step=210/global_step=2970, RunningAvgSamplesPerSec=171.75141107456975, CurrSamplesPerSec=171.93244152491107, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:24:43,863] [INFO] [logging.py:96:log_dist] [Rank 0] step=2980, skipped=56, lr=[6.355126892207394e-06, 6.355126892207394e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:24:43,890] [INFO] [timer.py:199:stop] epoch=6/micro_step=220/global_step=2980, RunningAvgSamplesPerSec=171.7509466916708, CurrSamplesPerSec=171.18489129377704, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:24:51,329] [INFO] [logging.py:96:log_dist] [Rank 0] step=2990, skipped=56, lr=[6.335580706267743e-06, 6.335580706267743e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:24:51,358] [INFO] [timer.py:199:stop] epoch=6/micro_step=230/global_step=2990, RunningAvgSamplesPerSec=171.75035166198646, CurrSamplesPerSec=171.50211665970272, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:24:58,794] [INFO] [logging.py:96:log_dist] [Rank 0] step=3000, skipped=56, lr=[6.316006997859122e-06, 6.316006997859122e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:24:58,823] [INFO] [timer.py:199:stop] epoch=6/micro_step=240/global_step=3000, RunningAvgSamplesPerSec=171.74997230103935, CurrSamplesPerSec=171.66762017864122, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:25:06,256] [INFO] [logging.py:96:log_dist] [Rank 0] step=3010, skipped=56, lr=[6.296406123610463e-06, 6.296406123610463e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:25:06,285] [INFO] [timer.py:199:stop] epoch=6/micro_step=250/global_step=3010, RunningAvgSamplesPerSec=171.7498058888146, CurrSamplesPerSec=171.24954250705179, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:25:13,715] [INFO] [logging.py:96:log_dist] [Rank 0] step=3020, skipped=56, lr=[6.276778440645655e-06, 6.276778440645655e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:25:13,744] [INFO] [timer.py:199:stop] epoch=6/micro_step=260/global_step=3020, RunningAvgSamplesPerSec=171.74988163904496, CurrSamplesPerSec=171.5178416653435, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:25:21,180] [INFO] [logging.py:96:log_dist] [Rank 0] step=3030, skipped=56, lr=[6.257124306577029e-06, 6.257124306577029e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:25:21,209] [INFO] [timer.py:199:stop] epoch=6/micro_step=270/global_step=3030, RunningAvgSamplesPerSec=171.74950187513588, CurrSamplesPerSec=171.93833327675404, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:25:28,639] [INFO] [logging.py:96:log_dist] [Rank 0] step=3040, skipped=56, lr=[6.23744407949886e-06, 6.23744407949886e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:25:28,667] [INFO] [timer.py:199:stop] epoch=6/micro_step=280/global_step=3040, RunningAvgSamplesPerSec=171.74958774867451, CurrSamplesPerSec=171.9236321819793, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:25:36,101] [INFO] [logging.py:96:log_dist] [Rank 0] step=3050, skipped=56, lr=[6.217738117980825e-06, 6.217738117980825e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:25:36,129] [INFO] [timer.py:199:stop] epoch=6/micro_step=290/global_step=3050, RunningAvgSamplesPerSec=171.74943332717, CurrSamplesPerSec=171.71209399675556, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:25:42,806] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:25:43,512] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:25:43,513] [INFO] [logging.py:96:log_dist] [Rank 0] step=3060, skipped=58, lr=[6.201955061228015e-06, 6.201955061228015e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:25:43,514] [INFO] [timer.py:199:stop] epoch=6/micro_step=300/global_step=3060, RunningAvgSamplesPerSec=171.7550774542347, CurrSamplesPerSec=181.2254297796957, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:25:50,953] [INFO] [logging.py:96:log_dist] [Rank 0] step=3070, skipped=58, lr=[6.182203682806289e-06, 6.182203682806289e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:25:50,982] [INFO] [timer.py:199:stop] epoch=6/micro_step=310/global_step=3070, RunningAvgSamplesPerSec=171.75446872332333, CurrSamplesPerSec=172.16616329894526, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:25:58,428] [INFO] [logging.py:96:log_dist] [Rank 0] step=3080, skipped=58, lr=[6.162427576413335e-06, 6.162427576413335e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:25:58,457] [INFO] [timer.py:199:stop] epoch=6/micro_step=320/global_step=3080, RunningAvgSamplesPerSec=171.75330070359314, CurrSamplesPerSec=171.6604296635318, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:26:05,895] [INFO] [logging.py:96:log_dist] [Rank 0] step=3090, skipped=58, lr=[6.14262710236573e-06, 6.14262710236573e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:26:05,924] [INFO] [timer.py:199:stop] epoch=6/micro_step=330/global_step=3090, RunningAvgSamplesPerSec=171.75274771301358, CurrSamplesPerSec=171.20345167920175, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:26:13,355] [INFO] [logging.py:96:log_dist] [Rank 0] step=3100, skipped=58, lr=[6.122802621424032e-06, 6.122802621424032e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:26:13,384] [INFO] [timer.py:199:stop] epoch=6/micro_step=340/global_step=3100, RunningAvgSamplesPerSec=171.75270070235757, CurrSamplesPerSec=171.9897793644651, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:26:20,819] [INFO] [logging.py:96:log_dist] [Rank 0] step=3110, skipped=58, lr=[6.102954494786192e-06, 6.102954494786192e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:26:20,848] [INFO] [timer.py:199:stop] epoch=6/micro_step=350/global_step=3110, RunningAvgSamplesPerSec=171.7523978799328, CurrSamplesPerSec=171.4532067702297, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:26:28,284] [INFO] [logging.py:96:log_dist] [Rank 0] step=3120, skipped=58, lr=[6.0830830840809885e-06, 6.0830830840809885e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:26:28,312] [INFO] [timer.py:199:stop] epoch=6/micro_step=360/global_step=3120, RunningAvgSamplesPerSec=171.75205298094, CurrSamplesPerSec=172.03524718629103, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:26:35,741] [INFO] [logging.py:96:log_dist] [Rank 0] step=3130, skipped=58, lr=[6.063188751361424e-06, 6.063188751361424e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:26:35,770] [INFO] [timer.py:199:stop] epoch=6/micro_step=370/global_step=3130, RunningAvgSamplesPerSec=171.7522204858993, CurrSamplesPerSec=172.12321984362197, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:26:43,206] [INFO] [logging.py:96:log_dist] [Rank 0] step=3140, skipped=58, lr=[6.043271859098137e-06, 6.043271859098137e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:26:43,234] [INFO] [timer.py:199:stop] epoch=6/micro_step=380/global_step=3140, RunningAvgSamplesPerSec=171.75185145613926, CurrSamplesPerSec=171.89737468485907, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:26:50,664] [INFO] [logging.py:96:log_dist] [Rank 0] step=3150, skipped=58, lr=[6.023332770172801e-06, 6.023332770172801e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:26:50,692] [INFO] [timer.py:199:stop] epoch=6/micro_step=390/global_step=3150, RunningAvgSamplesPerSec=171.75197535508013, CurrSamplesPerSec=171.99606075964292, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:26:58,130] [INFO] [logging.py:96:log_dist] [Rank 0] step=3160, skipped=58, lr=[6.003371847871503e-06, 6.003371847871503e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:26:58,159] [INFO] [timer.py:199:stop] epoch=6/micro_step=400/global_step=3160, RunningAvgSamplesPerSec=171.7514796446397, CurrSamplesPerSec=172.04627330914278, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:26:58,864] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:26:59,570] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:27:05,514] [INFO] [logging.py:96:log_dist] [Rank 0] step=3170, skipped=60, lr=[5.987387634382147e-06, 5.987387634382147e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:27:05,543] [INFO] [timer.py:199:stop] epoch=6/micro_step=410/global_step=3170, RunningAvgSamplesPerSec=171.7569808133522, CurrSamplesPerSec=171.50041831247094, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:27:12,985] [INFO] [logging.py:96:log_dist] [Rank 0] step=3180, skipped=60, lr=[5.967388328750606e-06, 5.967388328750606e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:27:13,014] [INFO] [timer.py:199:stop] epoch=6/micro_step=420/global_step=3180, RunningAvgSamplesPerSec=171.7561703129999, CurrSamplesPerSec=171.81925516845857, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:27:20,443] [INFO] [logging.py:96:log_dist] [Rank 0] step=3190, skipped=60, lr=[5.94736820903929e-06, 5.94736820903929e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:27:20,472] [INFO] [timer.py:199:stop] epoch=6/micro_step=430/global_step=3190, RunningAvgSamplesPerSec=171.75628571733458, CurrSamplesPerSec=172.1341468654113, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:27:27,906] [INFO] [logging.py:96:log_dist] [Rank 0] step=3200, skipped=60, lr=[5.927327640010648e-06, 5.927327640010648e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:27:27,934] [INFO] [timer.py:199:stop] epoch=6/micro_step=440/global_step=3200, RunningAvgSamplesPerSec=171.7561119638139, CurrSamplesPerSec=172.00597968623947, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:27:35,371] [INFO] [logging.py:96:log_dist] [Rank 0] step=3210, skipped=60, lr=[5.9072669867997216e-06, 5.9072669867997216e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:27:35,397] [INFO] [timer.py:199:stop] epoch=6/micro_step=450/global_step=3210, RunningAvgSamplesPerSec=171.75590834270167, CurrSamplesPerSec=171.8264040024375, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:27:42,823] [INFO] [logging.py:96:log_dist] [Rank 0] step=3220, skipped=60, lr=[5.88718661490747e-06, 5.88718661490747e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:27:42,852] [INFO] [timer.py:199:stop] epoch=6/micro_step=460/global_step=3220, RunningAvgSamplesPerSec=171.75628043422017, CurrSamplesPerSec=171.6331550945744, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -***** Evaluating perplexity, Epoch 7/16 ***** -ppl: 1.849153995513916 -Beginning of Epoch 8/16, Total Micro Batches 460 -[2023-04-18 02:27:58,482] [INFO] [logging.py:96:log_dist] [Rank 0] step=3230, skipped=60, lr=[5.86708689019413e-06, 5.86708689019413e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:27:58,511] [INFO] [timer.py:199:stop] epoch=7/micro_step=10/global_step=3230, RunningAvgSamplesPerSec=171.75549819692344, CurrSamplesPerSec=171.0462525332561, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:28:05,948] [INFO] [logging.py:96:log_dist] [Rank 0] step=3240, skipped=60, lr=[5.846968178872542e-06, 5.846968178872542e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:28:05,976] [INFO] [timer.py:199:stop] epoch=7/micro_step=20/global_step=3240, RunningAvgSamplesPerSec=171.75512394513737, CurrSamplesPerSec=171.9902752474036, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:28:13,418] [INFO] [logging.py:96:log_dist] [Rank 0] step=3250, skipped=60, lr=[5.826830847501475e-06, 5.826830847501475e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:28:13,447] [INFO] [timer.py:199:stop] epoch=7/micro_step=30/global_step=3250, RunningAvgSamplesPerSec=171.75434820085187, CurrSamplesPerSec=171.2836897443718, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:28:20,872] [INFO] [logging.py:96:log_dist] [Rank 0] step=3260, skipped=60, lr=[5.806675262978959e-06, 5.806675262978959e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:28:20,901] [INFO] [timer.py:199:stop] epoch=7/micro_step=40/global_step=3260, RunningAvgSamplesPerSec=171.75469573071163, CurrSamplesPerSec=172.23161322299734, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:28:23,098] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:28:23,803] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:28:28,252] [INFO] [logging.py:96:log_dist] [Rank 0] step=3270, skipped=62, lr=[5.790537899859855e-06, 5.790537899859855e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:28:28,281] [INFO] [timer.py:199:stop] epoch=7/micro_step=50/global_step=3270, RunningAvgSamplesPerSec=171.76036103258383, CurrSamplesPerSec=171.64495288859675, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:28:35,713] [INFO] [logging.py:96:log_dist] [Rank 0] step=3280, skipped=62, lr=[5.770350385305034e-06, 5.770350385305034e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:28:35,742] [INFO] [timer.py:199:stop] epoch=7/micro_step=60/global_step=3280, RunningAvgSamplesPerSec=171.7602267879924, CurrSamplesPerSec=171.3623074423596, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:28:43,176] [INFO] [logging.py:96:log_dist] [Rank 0] step=3290, skipped=62, lr=[5.7501456466611355e-06, 5.7501456466611355e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:28:43,205] [INFO] [timer.py:199:stop] epoch=7/micro_step=70/global_step=3290, RunningAvgSamplesPerSec=171.7600145438082, CurrSamplesPerSec=170.7072579157896, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:28:50,640] [INFO] [logging.py:96:log_dist] [Rank 0] step=3300, skipped=62, lr=[5.72992405205433e-06, 5.72992405205433e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:28:50,668] [INFO] [timer.py:199:stop] epoch=7/micro_step=80/global_step=3300, RunningAvgSamplesPerSec=171.75969665173204, CurrSamplesPerSec=171.97473894075887, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:28:58,100] [INFO] [logging.py:96:log_dist] [Rank 0] step=3310, skipped=62, lr=[5.709685969917904e-06, 5.709685969917904e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:28:58,129] [INFO] [timer.py:199:stop] epoch=7/micro_step=90/global_step=3310, RunningAvgSamplesPerSec=171.759638475081, CurrSamplesPerSec=171.73961341298593, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:29:05,566] [INFO] [logging.py:96:log_dist] [Rank 0] step=3320, skipped=62, lr=[5.689431768985538e-06, 5.689431768985538e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:29:05,595] [INFO] [timer.py:199:stop] epoch=7/micro_step=100/global_step=3320, RunningAvgSamplesPerSec=171.75914250323373, CurrSamplesPerSec=171.78340998784748, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:29:13,039] [INFO] [logging.py:96:log_dist] [Rank 0] step=3330, skipped=62, lr=[5.669161818284596e-06, 5.669161818284596e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:29:13,068] [INFO] [timer.py:199:stop] epoch=7/micro_step=110/global_step=3330, RunningAvgSamplesPerSec=171.75819317381408, CurrSamplesPerSec=171.3142425722844, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:29:20,507] [INFO] [logging.py:96:log_dist] [Rank 0] step=3340, skipped=62, lr=[5.648876487129402e-06, 5.648876487129402e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:29:20,536] [INFO] [timer.py:199:stop] epoch=7/micro_step=120/global_step=3340, RunningAvgSamplesPerSec=171.75760718984068, CurrSamplesPerSec=171.15269311510633, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:29:27,972] [INFO] [logging.py:96:log_dist] [Rank 0] step=3350, skipped=62, lr=[5.628576145114505e-06, 5.628576145114505e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:29:28,001] [INFO] [timer.py:199:stop] epoch=7/micro_step=130/global_step=3350, RunningAvgSamplesPerSec=171.7572514323598, CurrSamplesPerSec=171.68447359340777, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:29:35,435] [INFO] [logging.py:96:log_dist] [Rank 0] step=3360, skipped=62, lr=[5.608261162107952e-06, 5.608261162107952e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:29:35,464] [INFO] [timer.py:199:stop] epoch=7/micro_step=140/global_step=3360, RunningAvgSamplesPerSec=171.7570361328574, CurrSamplesPerSec=171.6710784271404, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:29:39,159] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:29:39,864] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:29:42,819] [INFO] [logging.py:96:log_dist] [Rank 0] step=3370, skipped=64, lr=[5.591998882910512e-06, 5.591998882910512e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:29:42,847] [INFO] [timer.py:199:stop] epoch=7/micro_step=150/global_step=3370, RunningAvgSamplesPerSec=171.76226014257458, CurrSamplesPerSec=171.479438280986, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:29:50,277] [INFO] [logging.py:96:log_dist] [Rank 0] step=3380, skipped=64, lr=[5.571658479033629e-06, 5.571658479033629e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:29:50,306] [INFO] [timer.py:199:stop] epoch=7/micro_step=160/global_step=3380, RunningAvgSamplesPerSec=171.76232041916143, CurrSamplesPerSec=171.6497822209277, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:29:57,747] [INFO] [logging.py:96:log_dist] [Rank 0] step=3390, skipped=64, lr=[5.551304471193227e-06, 5.551304471193227e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:29:57,775] [INFO] [timer.py:199:stop] epoch=7/micro_step=170/global_step=3390, RunningAvgSamplesPerSec=171.7617001046509, CurrSamplesPerSec=171.41483232109343, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:30:05,212] [INFO] [logging.py:96:log_dist] [Rank 0] step=3400, skipped=64, lr=[5.530937230235134e-06, 5.530937230235134e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:30:05,241] [INFO] [timer.py:199:stop] epoch=7/micro_step=180/global_step=3400, RunningAvgSamplesPerSec=171.76125994400383, CurrSamplesPerSec=171.4210717893766, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:30:12,679] [INFO] [logging.py:96:log_dist] [Rank 0] step=3410, skipped=64, lr=[5.5105571272462785e-06, 5.5105571272462785e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:30:12,708] [INFO] [timer.py:199:stop] epoch=7/micro_step=190/global_step=3410, RunningAvgSamplesPerSec=171.76076001635252, CurrSamplesPerSec=171.76780111864682, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:30:20,146] [INFO] [logging.py:96:log_dist] [Rank 0] step=3420, skipped=64, lr=[5.490164533547934e-06, 5.490164533547934e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:30:20,175] [INFO] [timer.py:199:stop] epoch=7/micro_step=200/global_step=3420, RunningAvgSamplesPerSec=171.76024784021763, CurrSamplesPerSec=171.92247602273636, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:30:27,611] [INFO] [logging.py:96:log_dist] [Rank 0] step=3430, skipped=64, lr=[5.469759820688954e-06, 5.469759820688954e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:30:27,640] [INFO] [timer.py:199:stop] epoch=7/micro_step=210/global_step=3430, RunningAvgSamplesPerSec=171.75986288747248, CurrSamplesPerSec=171.93998524227268, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:30:35,077] [INFO] [logging.py:96:log_dist] [Rank 0] step=3440, skipped=64, lr=[5.449343360438996e-06, 5.449343360438996e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:30:35,106] [INFO] [timer.py:199:stop] epoch=7/micro_step=220/global_step=3440, RunningAvgSamplesPerSec=171.75945274854695, CurrSamplesPerSec=171.5340628106208, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:30:42,540] [INFO] [logging.py:96:log_dist] [Rank 0] step=3450, skipped=64, lr=[5.4289155247817595e-06, 5.4289155247817595e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:30:42,567] [INFO] [timer.py:199:stop] epoch=7/micro_step=230/global_step=3450, RunningAvgSamplesPerSec=171.75932079388494, CurrSamplesPerSec=171.66734572044237, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:30:50,000] [INFO] [logging.py:96:log_dist] [Rank 0] step=3460, skipped=64, lr=[5.4084766859081955e-06, 5.4084766859081955e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:30:50,029] [INFO] [timer.py:199:stop] epoch=7/micro_step=240/global_step=3460, RunningAvgSamplesPerSec=171.75919702360142, CurrSamplesPerSec=171.88686292520612, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:30:55,215] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:30:55,919] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:30:57,387] [INFO] [logging.py:96:log_dist] [Rank 0] step=3470, skipped=66, lr=[5.392117942734067e-06, 5.392117942734067e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:30:57,416] [INFO] [timer.py:199:stop] epoch=7/micro_step=250/global_step=3470, RunningAvgSamplesPerSec=171.7640136563967, CurrSamplesPerSec=171.33370586896808, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:31:04,863] [INFO] [logging.py:96:log_dist] [Rank 0] step=3480, skipped=66, lr=[5.371660236627906e-06, 5.371660236627906e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:31:04,893] [INFO] [timer.py:199:stop] epoch=7/micro_step=260/global_step=3480, RunningAvgSamplesPerSec=171.76285030826102, CurrSamplesPerSec=169.34468415300202, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:31:12,328] [INFO] [logging.py:96:log_dist] [Rank 0] step=3490, skipped=66, lr=[5.3511925704849616e-06, 5.3511925704849616e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:31:12,357] [INFO] [timer.py:199:stop] epoch=7/micro_step=270/global_step=3490, RunningAvgSamplesPerSec=171.7624966127333, CurrSamplesPerSec=171.67245078658956, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:31:19,807] [INFO] [logging.py:96:log_dist] [Rank 0] step=3500, skipped=66, lr=[5.330715317221888e-06, 5.330715317221888e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:31:19,836] [INFO] [timer.py:199:stop] epoch=7/micro_step=280/global_step=3500, RunningAvgSamplesPerSec=171.761219241802, CurrSamplesPerSec=171.92693557978768, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:31:27,274] [INFO] [logging.py:96:log_dist] [Rank 0] step=3510, skipped=66, lr=[5.310228849930021e-06, 5.310228849930021e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:31:27,303] [INFO] [timer.py:199:stop] epoch=7/micro_step=290/global_step=3510, RunningAvgSamplesPerSec=171.7607055008938, CurrSamplesPerSec=171.53636470817906, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:31:34,743] [INFO] [logging.py:96:log_dist] [Rank 0] step=3520, skipped=66, lr=[5.289733541868569e-06, 5.289733541868569e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:31:34,771] [INFO] [timer.py:199:stop] epoch=7/micro_step=300/global_step=3520, RunningAvgSamplesPerSec=171.76007797156842, CurrSamplesPerSec=171.1470187596094, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:31:42,216] [INFO] [logging.py:96:log_dist] [Rank 0] step=3530, skipped=66, lr=[5.2692297664578155e-06, 5.2692297664578155e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:31:42,244] [INFO] [timer.py:199:stop] epoch=7/micro_step=310/global_step=3530, RunningAvgSamplesPerSec=171.75938586129254, CurrSamplesPerSec=171.4246843207364, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:31:49,678] [INFO] [logging.py:96:log_dist] [Rank 0] step=3540, skipped=66, lr=[5.248717897272325e-06, 5.248717897272325e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:31:49,707] [INFO] [timer.py:199:stop] epoch=7/micro_step=320/global_step=3540, RunningAvgSamplesPerSec=171.75920338742225, CurrSamplesPerSec=171.90111740179267, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:31:57,142] [INFO] [logging.py:96:log_dist] [Rank 0] step=3550, skipped=66, lr=[5.228198308034119e-06, 5.228198308034119e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:31:57,171] [INFO] [timer.py:199:stop] epoch=7/micro_step=330/global_step=3550, RunningAvgSamplesPerSec=171.75888171743492, CurrSamplesPerSec=171.42747592652967, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:32:04,604] [INFO] [logging.py:96:log_dist] [Rank 0] step=3560, skipped=66, lr=[5.207671372605887e-06, 5.207671372605887e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:32:04,632] [INFO] [timer.py:199:stop] epoch=7/micro_step=340/global_step=3560, RunningAvgSamplesPerSec=171.7587813477973, CurrSamplesPerSec=171.74279986321253, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:32:11,308] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:32:12,014] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:32:12,014] [INFO] [logging.py:96:log_dist] [Rank 0] step=3570, skipped=68, lr=[5.1912447863278255e-06, 5.1912447863278255e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:32:12,015] [INFO] [timer.py:199:stop] epoch=7/micro_step=350/global_step=3570, RunningAvgSamplesPerSec=171.76370411264085, CurrSamplesPerSec=181.3064606548054, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:32:19,447] [INFO] [logging.py:96:log_dist] [Rank 0] step=3580, skipped=68, lr=[5.170705570314394e-06, 5.170705570314394e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:32:19,475] [INFO] [timer.py:199:stop] epoch=7/micro_step=360/global_step=3580, RunningAvgSamplesPerSec=171.76363420826107, CurrSamplesPerSec=171.72846176827525, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:32:26,906] [INFO] [logging.py:96:log_dist] [Rank 0] step=3590, skipped=68, lr=[5.15016005561676e-06, 5.15016005561676e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:32:26,934] [INFO] [timer.py:199:stop] epoch=7/micro_step=370/global_step=3590, RunningAvgSamplesPerSec=171.763668320382, CurrSamplesPerSec=171.76010764941765, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:32:34,372] [INFO] [logging.py:96:log_dist] [Rank 0] step=3600, skipped=68, lr=[5.129608616569963e-06, 5.129608616569963e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:32:34,401] [INFO] [timer.py:199:stop] epoch=7/micro_step=380/global_step=3600, RunningAvgSamplesPerSec=171.7632202965248, CurrSamplesPerSec=171.89104546090525, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:32:41,839] [INFO] [logging.py:96:log_dist] [Rank 0] step=3610, skipped=68, lr=[5.109051627616987e-06, 5.109051627616987e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:32:41,867] [INFO] [timer.py:199:stop] epoch=7/micro_step=390/global_step=3610, RunningAvgSamplesPerSec=171.7627633658772, CurrSamplesPerSec=171.57693208681644, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:32:49,312] [INFO] [logging.py:96:log_dist] [Rank 0] step=3620, skipped=68, lr=[5.088489463301932e-06, 5.088489463301932e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:32:49,341] [INFO] [timer.py:199:stop] epoch=7/micro_step=400/global_step=3620, RunningAvgSamplesPerSec=171.76182572486906, CurrSamplesPerSec=171.6056697826285, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:32:56,783] [INFO] [logging.py:96:log_dist] [Rank 0] step=3630, skipped=68, lr=[5.067922498263188e-06, 5.067922498263188e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:32:56,812] [INFO] [timer.py:199:stop] epoch=7/micro_step=410/global_step=3630, RunningAvgSamplesPerSec=171.76116842928775, CurrSamplesPerSec=171.8286587580874, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:33:04,245] [INFO] [logging.py:96:log_dist] [Rank 0] step=3640, skipped=68, lr=[5.04735110722662e-06, 5.04735110722662e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:33:04,274] [INFO] [timer.py:199:stop] epoch=7/micro_step=420/global_step=3640, RunningAvgSamplesPerSec=171.76099047822248, CurrSamplesPerSec=171.74708527581632, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:33:11,712] [INFO] [logging.py:96:log_dist] [Rank 0] step=3650, skipped=68, lr=[5.026775664998731e-06, 5.026775664998731e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:33:11,741] [INFO] [timer.py:199:stop] epoch=7/micro_step=430/global_step=3650, RunningAvgSamplesPerSec=171.7605148413002, CurrSamplesPerSec=171.8174405591138, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:33:19,191] [INFO] [logging.py:96:log_dist] [Rank 0] step=3660, skipped=68, lr=[5.006196546459833e-06, 5.006196546459833e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:33:19,220] [INFO] [timer.py:199:stop] epoch=7/micro_step=440/global_step=3660, RunningAvgSamplesPerSec=171.7592808963722, CurrSamplesPerSec=171.62212699574678, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:33:26,660] [INFO] [logging.py:96:log_dist] [Rank 0] step=3670, skipped=68, lr=[4.985614126557223e-06, 4.985614126557223e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:33:26,689] [INFO] [timer.py:199:stop] epoch=7/micro_step=450/global_step=3670, RunningAvgSamplesPerSec=171.75870046572425, CurrSamplesPerSec=171.51964995188627, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:33:27,394] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:33:28,100] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:33:34,042] [INFO] [logging.py:96:log_dist] [Rank 0] step=3680, skipped=70, lr=[4.969146065656421e-06, 4.969146065656421e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:33:34,071] [INFO] [timer.py:199:stop] epoch=7/micro_step=460/global_step=3680, RunningAvgSamplesPerSec=171.76351062317892, CurrSamplesPerSec=171.68848156991064, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -***** Evaluating perplexity, Epoch 8/16 ***** -ppl: 1.8435580730438232 -Beginning of Epoch 9/16, Total Micro Batches 460 -[2023-04-18 02:33:49,721] [INFO] [logging.py:96:log_dist] [Rank 0] step=3690, skipped=70, lr=[4.948558648353954e-06, 4.948558648353954e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:33:49,749] [INFO] [timer.py:199:stop] epoch=8/micro_step=10/global_step=3690, RunningAvgSamplesPerSec=171.76166229092658, CurrSamplesPerSec=171.4182256597836, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:33:57,189] [INFO] [logging.py:96:log_dist] [Rank 0] step=3700, skipped=70, lr=[4.9279689798383935e-06, 4.9279689798383935e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:33:57,217] [INFO] [timer.py:199:stop] epoch=8/micro_step=20/global_step=3700, RunningAvgSamplesPerSec=171.76109910412663, CurrSamplesPerSec=171.8306935883739, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:34:04,657] [INFO] [logging.py:96:log_dist] [Rank 0] step=3710, skipped=70, lr=[4.907377435249254e-06, 4.907377435249254e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:34:04,685] [INFO] [timer.py:199:stop] epoch=8/micro_step=30/global_step=3710, RunningAvgSamplesPerSec=171.76056548580638, CurrSamplesPerSec=171.51219786468684, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:34:12,121] [INFO] [logging.py:96:log_dist] [Rank 0] step=3720, skipped=70, lr=[4.88678438976023e-06, 4.88678438976023e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:34:12,150] [INFO] [timer.py:199:stop] epoch=8/micro_step=40/global_step=3720, RunningAvgSamplesPerSec=171.76025380590403, CurrSamplesPerSec=171.6958391645069, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:34:19,595] [INFO] [logging.py:96:log_dist] [Rank 0] step=3730, skipped=70, lr=[4.866190218572368e-06, 4.866190218572368e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:34:19,623] [INFO] [timer.py:199:stop] epoch=8/micro_step=50/global_step=3730, RunningAvgSamplesPerSec=171.75937893413118, CurrSamplesPerSec=169.93995657100893, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:34:27,062] [INFO] [logging.py:96:log_dist] [Rank 0] step=3740, skipped=70, lr=[4.845595296907214e-06, 4.845595296907214e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:34:27,091] [INFO] [timer.py:199:stop] epoch=8/micro_step=60/global_step=3740, RunningAvgSamplesPerSec=171.75889043261145, CurrSamplesPerSec=171.22359971117808, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:34:34,530] [INFO] [logging.py:96:log_dist] [Rank 0] step=3750, skipped=70, lr=[4.825e-06, 4.825e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:34:34,558] [INFO] [timer.py:199:stop] epoch=8/micro_step=70/global_step=3750, RunningAvgSamplesPerSec=171.75837384024322, CurrSamplesPerSec=171.27106733911245, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:34:41,994] [INFO] [logging.py:96:log_dist] [Rank 0] step=3760, skipped=70, lr=[4.804404703092785e-06, 4.804404703092785e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:34:42,022] [INFO] [timer.py:199:stop] epoch=8/micro_step=80/global_step=3760, RunningAvgSamplesPerSec=171.75808469672214, CurrSamplesPerSec=171.1732658083202, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:34:49,469] [INFO] [logging.py:96:log_dist] [Rank 0] step=3770, skipped=70, lr=[4.783809781427634e-06, 4.783809781427634e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:34:49,498] [INFO] [timer.py:199:stop] epoch=8/micro_step=90/global_step=3770, RunningAvgSamplesPerSec=171.7571373922436, CurrSamplesPerSec=171.4071704375733, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:34:51,695] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:34:52,401] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:34:56,856] [INFO] [logging.py:96:log_dist] [Rank 0] step=3780, skipped=72, lr=[4.767334366428274e-06, 4.767334366428274e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:34:56,884] [INFO] [timer.py:199:stop] epoch=8/micro_step=100/global_step=3780, RunningAvgSamplesPerSec=171.76161903037394, CurrSamplesPerSec=171.50397940179596, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:35:04,331] [INFO] [logging.py:96:log_dist] [Rank 0] step=3790, skipped=72, lr=[4.746741065782644e-06, 4.746741065782644e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:35:04,359] [INFO] [timer.py:199:stop] epoch=8/micro_step=110/global_step=3790, RunningAvgSamplesPerSec=171.76070954365687, CurrSamplesPerSec=171.89132063486372, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:35:11,805] [INFO] [logging.py:96:log_dist] [Rank 0] step=3800, skipped=72, lr=[4.72614919099866e-06, 4.72614919099866e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:35:11,834] [INFO] [timer.py:199:stop] epoch=8/micro_step=120/global_step=3800, RunningAvgSamplesPerSec=171.75977560084104, CurrSamplesPerSec=171.3094321019169, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:35:19,286] [INFO] [logging.py:96:log_dist] [Rank 0] step=3810, skipped=72, lr=[4.705559117256029e-06, 4.705559117256029e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:35:19,315] [INFO] [timer.py:199:stop] epoch=8/micro_step=130/global_step=3810, RunningAvgSamplesPerSec=171.7584905384431, CurrSamplesPerSec=171.9169707158525, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:35:26,750] [INFO] [logging.py:96:log_dist] [Rank 0] step=3820, skipped=72, lr=[4.684971219701652e-06, 4.684971219701652e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:35:26,779] [INFO] [timer.py:199:stop] epoch=8/micro_step=140/global_step=3820, RunningAvgSamplesPerSec=171.7582848334549, CurrSamplesPerSec=171.77016425112117, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:35:34,217] [INFO] [logging.py:96:log_dist] [Rank 0] step=3830, skipped=72, lr=[4.6643858734427785e-06, 4.6643858734427785e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:35:34,246] [INFO] [timer.py:199:stop] epoch=8/micro_step=150/global_step=3830, RunningAvgSamplesPerSec=171.75780735097615, CurrSamplesPerSec=171.3339792613698, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:35:41,689] [INFO] [logging.py:96:log_dist] [Rank 0] step=3840, skipped=72, lr=[4.643803453540169e-06, 4.643803453540169e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:35:41,715] [INFO] [timer.py:199:stop] epoch=8/micro_step=160/global_step=3840, RunningAvgSamplesPerSec=171.75722476621038, CurrSamplesPerSec=171.5630602705021, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:35:49,154] [INFO] [logging.py:96:log_dist] [Rank 0] step=3850, skipped=72, lr=[4.6232243350012705e-06, 4.6232243350012705e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:35:49,183] [INFO] [timer.py:199:stop] epoch=8/micro_step=170/global_step=3850, RunningAvgSamplesPerSec=171.75677111845238, CurrSamplesPerSec=172.08068389831786, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:35:56,619] [INFO] [logging.py:96:log_dist] [Rank 0] step=3860, skipped=72, lr=[4.60264889277338e-06, 4.60264889277338e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:35:56,648] [INFO] [timer.py:199:stop] epoch=8/micro_step=180/global_step=3860, RunningAvgSamplesPerSec=171.75646121253106, CurrSamplesPerSec=171.59151910349522, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:36:04,087] [INFO] [logging.py:96:log_dist] [Rank 0] step=3870, skipped=72, lr=[4.582077501736813e-06, 4.582077501736813e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:36:04,115] [INFO] [timer.py:199:stop] epoch=8/micro_step=190/global_step=3870, RunningAvgSamplesPerSec=171.75599124731917, CurrSamplesPerSec=171.71720172474951, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:36:07,806] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:36:08,512] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:36:11,475] [INFO] [logging.py:96:log_dist] [Rank 0] step=3880, skipped=74, lr=[4.565623557637935e-06, 4.565623557637935e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:36:11,504] [INFO] [timer.py:199:stop] epoch=8/micro_step=200/global_step=3880, RunningAvgSamplesPerSec=171.76041267132706, CurrSamplesPerSec=170.5574159008534, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:36:18,939] [INFO] [logging.py:96:log_dist] [Rank 0] step=3890, skipped=74, lr=[4.5450604032041726e-06, 4.5450604032041726e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:36:18,968] [INFO] [timer.py:199:stop] epoch=8/micro_step=210/global_step=3890, RunningAvgSamplesPerSec=171.76016267099286, CurrSamplesPerSec=171.42928230813985, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:36:26,405] [INFO] [logging.py:96:log_dist] [Rank 0] step=3900, skipped=74, lr=[4.524502349212137e-06, 4.524502349212137e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:36:26,434] [INFO] [timer.py:199:stop] epoch=8/micro_step=220/global_step=3900, RunningAvgSamplesPerSec=171.75978358356787, CurrSamplesPerSec=171.66092364911617, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:36:33,869] [INFO] [logging.py:96:log_dist] [Rank 0] step=3910, skipped=74, lr=[4.503949770225332e-06, 4.503949770225332e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:36:33,898] [INFO] [timer.py:199:stop] epoch=8/micro_step=230/global_step=3910, RunningAvgSamplesPerSec=171.75950431663415, CurrSamplesPerSec=171.55949672728295, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:36:41,329] [INFO] [logging.py:96:log_dist] [Rank 0] step=3920, skipped=74, lr=[4.483403040707509e-06, 4.483403040707509e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:36:41,358] [INFO] [timer.py:199:stop] epoch=8/micro_step=240/global_step=3920, RunningAvgSamplesPerSec=171.7594737529853, CurrSamplesPerSec=171.9956750467175, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:36:48,793] [INFO] [logging.py:96:log_dist] [Rank 0] step=3930, skipped=74, lr=[4.462862535015845e-06, 4.462862535015845e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:36:48,821] [INFO] [timer.py:199:stop] epoch=8/micro_step=250/global_step=3930, RunningAvgSamplesPerSec=171.7592203398167, CurrSamplesPerSec=171.1728292011353, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:36:56,256] [INFO] [logging.py:96:log_dist] [Rank 0] step=3940, skipped=74, lr=[4.442328627394115e-06, 4.442328627394115e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:36:56,284] [INFO] [timer.py:199:stop] epoch=8/micro_step=260/global_step=3940, RunningAvgSamplesPerSec=171.75902722514923, CurrSamplesPerSec=171.29172315281906, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:37:03,723] [INFO] [logging.py:96:log_dist] [Rank 0] step=3950, skipped=74, lr=[4.421801691965882e-06, 4.421801691965882e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:37:03,751] [INFO] [timer.py:199:stop] epoch=8/micro_step=270/global_step=3950, RunningAvgSamplesPerSec=171.7586160546943, CurrSamplesPerSec=171.69820031731828, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:37:11,188] [INFO] [logging.py:96:log_dist] [Rank 0] step=3960, skipped=74, lr=[4.401282102727679e-06, 4.401282102727679e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:37:11,217] [INFO] [timer.py:199:stop] epoch=8/micro_step=280/global_step=3960, RunningAvgSamplesPerSec=171.75825579106115, CurrSamplesPerSec=171.6496724603568, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:37:18,656] [INFO] [logging.py:96:log_dist] [Rank 0] step=3970, skipped=74, lr=[4.380770233542185e-06, 4.380770233542185e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:37:18,684] [INFO] [timer.py:199:stop] epoch=8/micro_step=290/global_step=3970, RunningAvgSamplesPerSec=171.75778456547872, CurrSamplesPerSec=171.78956637823399, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:37:23,870] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:37:24,574] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:37:26,047] [INFO] [logging.py:96:log_dist] [Rank 0] step=3980, skipped=76, lr=[4.364366547777846e-06, 4.364366547777846e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:37:26,075] [INFO] [timer.py:199:stop] epoch=8/micro_step=300/global_step=3980, RunningAvgSamplesPerSec=171.76173513947134, CurrSamplesPerSec=171.48935248641092, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:37:33,505] [INFO] [logging.py:96:log_dist] [Rank 0] step=3990, skipped=76, lr=[4.3438695163678766e-06, 4.3438695163678766e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:37:33,534] [INFO] [timer.py:199:stop] epoch=8/micro_step=310/global_step=3990, RunningAvgSamplesPerSec=171.76176964057, CurrSamplesPerSec=171.4092500239456, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:37:40,973] [INFO] [logging.py:96:log_dist] [Rank 0] step=4000, skipped=76, lr=[4.323381251056103e-06, 4.323381251056103e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:37:41,001] [INFO] [timer.py:199:stop] epoch=8/micro_step=320/global_step=4000, RunningAvgSamplesPerSec=171.76132164933978, CurrSamplesPerSec=172.17356189283805, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:37:48,439] [INFO] [logging.py:96:log_dist] [Rank 0] step=4010, skipped=76, lr=[4.302902125134494e-06, 4.302902125134494e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:37:48,467] [INFO] [timer.py:199:stop] epoch=8/micro_step=330/global_step=4010, RunningAvgSamplesPerSec=171.76094267941843, CurrSamplesPerSec=171.87673762902259, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:37:55,906] [INFO] [logging.py:96:log_dist] [Rank 0] step=4020, skipped=76, lr=[4.282432511728506e-06, 4.282432511728506e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:37:55,934] [INFO] [timer.py:199:stop] epoch=8/micro_step=340/global_step=4020, RunningAvgSamplesPerSec=171.76049982844393, CurrSamplesPerSec=171.88884410095406, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:38:03,372] [INFO] [logging.py:96:log_dist] [Rank 0] step=4030, skipped=76, lr=[4.2619727837902666e-06, 4.2619727837902666e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:38:03,400] [INFO] [timer.py:199:stop] epoch=8/micro_step=350/global_step=4030, RunningAvgSamplesPerSec=171.7601331740281, CurrSamplesPerSec=171.64670897800252, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:38:10,832] [INFO] [logging.py:96:log_dist] [Rank 0] step=4040, skipped=76, lr=[4.241523314091805e-06, 4.241523314091805e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:38:10,861] [INFO] [timer.py:199:stop] epoch=8/micro_step=360/global_step=4040, RunningAvgSamplesPerSec=171.76006932805248, CurrSamplesPerSec=171.86644846864428, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:38:18,299] [INFO] [logging.py:96:log_dist] [Rank 0] step=4050, skipped=76, lr=[4.221084475218243e-06, 4.221084475218243e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:38:18,327] [INFO] [timer.py:199:stop] epoch=8/micro_step=370/global_step=4050, RunningAvgSamplesPerSec=171.7596772606798, CurrSamplesPerSec=171.78885177457036, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:38:25,765] [INFO] [logging.py:96:log_dist] [Rank 0] step=4060, skipped=76, lr=[4.200656639561005e-06, 4.200656639561005e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:38:25,793] [INFO] [timer.py:199:stop] epoch=8/micro_step=380/global_step=4060, RunningAvgSamplesPerSec=171.7593048122676, CurrSamplesPerSec=172.1023630174971, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:38:33,229] [INFO] [logging.py:96:log_dist] [Rank 0] step=4070, skipped=76, lr=[4.180240179311048e-06, 4.180240179311048e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:38:33,258] [INFO] [timer.py:199:stop] epoch=8/micro_step=390/global_step=4070, RunningAvgSamplesPerSec=171.75899788634663, CurrSamplesPerSec=171.74159119575924, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:38:39,936] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:38:40,642] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:38:40,642] [INFO] [logging.py:96:log_dist] [Rank 0] step=4080, skipped=78, lr=[4.163915451384506e-06, 4.163915451384506e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:38:40,643] [INFO] [timer.py:199:stop] epoch=8/micro_step=400/global_step=4080, RunningAvgSamplesPerSec=171.76319995257634, CurrSamplesPerSec=181.48462807157148, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:38:48,074] [INFO] [logging.py:96:log_dist] [Rank 0] step=4090, skipped=78, lr=[4.143520404122961e-06, 4.143520404122961e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:38:48,102] [INFO] [timer.py:199:stop] epoch=8/micro_step=410/global_step=4090, RunningAvgSamplesPerSec=171.763181419034, CurrSamplesPerSec=171.76659209926848, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:38:55,538] [INFO] [logging.py:96:log_dist] [Rank 0] step=4100, skipped=78, lr=[4.123137773279126e-06, 4.123137773279126e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:38:55,567] [INFO] [timer.py:199:stop] epoch=8/micro_step=420/global_step=4100, RunningAvgSamplesPerSec=171.76289018337627, CurrSamplesPerSec=171.72846176827525, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:39:03,003] [INFO] [logging.py:96:log_dist] [Rank 0] step=4110, skipped=78, lr=[4.1027679302203305e-06, 4.1027679302203305e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:39:03,031] [INFO] [timer.py:199:stop] epoch=8/micro_step=430/global_step=4110, RunningAvgSamplesPerSec=171.76258400659336, CurrSamplesPerSec=171.58000283159097, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:39:10,468] [INFO] [logging.py:96:log_dist] [Rank 0] step=4120, skipped=78, lr=[4.0824112460809116e-06, 4.0824112460809116e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:39:10,496] [INFO] [timer.py:199:stop] epoch=8/micro_step=440/global_step=4120, RunningAvgSamplesPerSec=171.7622408726747, CurrSamplesPerSec=171.7331859335575, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:39:17,937] [INFO] [logging.py:96:log_dist] [Rank 0] step=4130, skipped=78, lr=[4.0620680917554585e-06, 4.0620680917554585e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:39:17,966] [INFO] [timer.py:199:stop] epoch=8/micro_step=450/global_step=4130, RunningAvgSamplesPerSec=171.76169981563766, CurrSamplesPerSec=171.57868678465724, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:39:25,401] [INFO] [logging.py:96:log_dist] [Rank 0] step=4140, skipped=78, lr=[4.041738837892049e-06, 4.041738837892049e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:39:25,429] [INFO] [timer.py:199:stop] epoch=8/micro_step=460/global_step=4140, RunningAvgSamplesPerSec=171.7614836503873, CurrSamplesPerSec=171.72983504534022, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -***** Evaluating perplexity, Epoch 9/16 ***** -ppl: 1.8337234258651733 -Beginning of Epoch 10/16, Total Micro Batches 460 -[2023-04-18 02:39:41,069] [INFO] [logging.py:96:log_dist] [Rank 0] step=4150, skipped=78, lr=[4.021423854885496e-06, 4.021423854885496e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:39:41,098] [INFO] [timer.py:199:stop] epoch=9/micro_step=10/global_step=4150, RunningAvgSamplesPerSec=171.75717992171772, CurrSamplesPerSec=171.6180672565738, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:39:48,537] [INFO] [logging.py:96:log_dist] [Rank 0] step=4160, skipped=78, lr=[4.001123512870599e-06, 4.001123512870599e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:39:48,565] [INFO] [timer.py:199:stop] epoch=9/micro_step=20/global_step=4160, RunningAvgSamplesPerSec=171.7567280511448, CurrSamplesPerSec=171.80732343992963, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:39:56,016] [INFO] [logging.py:96:log_dist] [Rank 0] step=4170, skipped=78, lr=[3.980838181715404e-06, 3.980838181715404e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:39:56,044] [INFO] [timer.py:199:stop] epoch=9/micro_step=30/global_step=4170, RunningAvgSamplesPerSec=171.7556628201534, CurrSamplesPerSec=171.37975745050738, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:40:03,481] [INFO] [logging.py:96:log_dist] [Rank 0] step=4180, skipped=78, lr=[3.960568231014464e-06, 3.960568231014464e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:40:03,509] [INFO] [timer.py:199:stop] epoch=9/micro_step=40/global_step=4180, RunningAvgSamplesPerSec=171.755358369245, CurrSamplesPerSec=171.66767507038628, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:40:04,213] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:40:04,918] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:40:10,866] [INFO] [logging.py:96:log_dist] [Rank 0] step=4190, skipped=80, lr=[3.944363592569584e-06, 3.944363592569584e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:40:10,895] [INFO] [timer.py:199:stop] epoch=9/micro_step=50/global_step=4190, RunningAvgSamplesPerSec=171.75943664236163, CurrSamplesPerSec=170.39350432005372, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:40:18,335] [INFO] [logging.py:96:log_dist] [Rank 0] step=4200, skipped=80, lr=[3.9241222571657804e-06, 3.9241222571657804e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:40:18,364] [INFO] [timer.py:199:stop] epoch=9/micro_step=60/global_step=4200, RunningAvgSamplesPerSec=171.75892363475222, CurrSamplesPerSec=171.55801652977073, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:40:25,807] [INFO] [logging.py:96:log_dist] [Rank 0] step=4210, skipped=80, lr=[3.90389733556868e-06, 3.90389733556868e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:40:25,836] [INFO] [timer.py:199:stop] epoch=9/micro_step=70/global_step=4210, RunningAvgSamplesPerSec=171.7582540693332, CurrSamplesPerSec=171.28478268082705, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:40:33,282] [INFO] [logging.py:96:log_dist] [Rank 0] step=4220, skipped=80, lr=[3.883689196272182e-06, 3.883689196272182e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:40:33,311] [INFO] [timer.py:199:stop] epoch=9/micro_step=80/global_step=4220, RunningAvgSamplesPerSec=171.75740479340698, CurrSamplesPerSec=171.6689375902118, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:40:40,752] [INFO] [logging.py:96:log_dist] [Rank 0] step=4230, skipped=80, lr=[3.863498207464418e-06, 3.863498207464418e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:40:40,780] [INFO] [timer.py:199:stop] epoch=9/micro_step=90/global_step=4230, RunningAvgSamplesPerSec=171.7568721322845, CurrSamplesPerSec=171.34480630189853, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:40:48,215] [INFO] [logging.py:96:log_dist] [Rank 0] step=4240, skipped=80, lr=[3.843324737021043e-06, 3.843324737021043e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:40:48,244] [INFO] [timer.py:199:stop] epoch=9/micro_step=100/global_step=4240, RunningAvgSamplesPerSec=171.75662311015128, CurrSamplesPerSec=171.67014523524784, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:40:55,687] [INFO] [logging.py:96:log_dist] [Rank 0] step=4250, skipped=80, lr=[3.8231691524985255e-06, 3.8231691524985255e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:40:55,715] [INFO] [timer.py:199:stop] epoch=9/micro_step=110/global_step=4250, RunningAvgSamplesPerSec=171.75598165848638, CurrSamplesPerSec=171.7146752836772, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:41:03,152] [INFO] [logging.py:96:log_dist] [Rank 0] step=4260, skipped=80, lr=[3.803031821127459e-06, 3.803031821127459e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:41:03,180] [INFO] [timer.py:199:stop] epoch=9/micro_step=120/global_step=4260, RunningAvgSamplesPerSec=171.75569854085055, CurrSamplesPerSec=171.5170197295194, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:41:10,620] [INFO] [logging.py:96:log_dist] [Rank 0] step=4270, skipped=80, lr=[3.7829131098058725e-06, 3.7829131098058725e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:41:10,649] [INFO] [timer.py:199:stop] epoch=9/micro_step=130/global_step=4270, RunningAvgSamplesPerSec=171.75519475805214, CurrSamplesPerSec=171.96526427358597, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:41:18,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=4280, skipped=80, lr=[3.762813385092532e-06, 3.762813385092532e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:41:18,107] [INFO] [timer.py:199:stop] epoch=9/micro_step=140/global_step=4280, RunningAvgSamplesPerSec=171.75527960826656, CurrSamplesPerSec=171.84559881465245, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:41:20,306] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:41:21,013] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:41:25,465] [INFO] [logging.py:96:log_dist] [Rank 0] step=4290, skipped=82, lr=[3.74674752178662e-06, 3.74674752178662e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:41:25,494] [INFO] [timer.py:199:stop] epoch=9/micro_step=150/global_step=4290, RunningAvgSamplesPerSec=171.75914706504318, CurrSamplesPerSec=171.57199719155074, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:41:32,931] [INFO] [logging.py:96:log_dist] [Rank 0] step=4300, skipped=82, lr=[3.726682895587759e-06, 3.726682895587759e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:41:32,960] [INFO] [timer.py:199:stop] epoch=9/micro_step=160/global_step=4300, RunningAvgSamplesPerSec=171.75880016571188, CurrSamplesPerSec=171.23921906347582, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:41:40,408] [INFO] [logging.py:96:log_dist] [Rank 0] step=4310, skipped=82, lr=[3.706638280500053e-06, 3.706638280500053e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:41:40,437] [INFO] [timer.py:199:stop] epoch=9/micro_step=170/global_step=4310, RunningAvgSamplesPerSec=171.75787975646148, CurrSamplesPerSec=171.57265516118446, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:41:47,878] [INFO] [logging.py:96:log_dist] [Rank 0] step=4320, skipped=82, lr=[3.686614041732256e-06, 3.686614041732256e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:41:47,907] [INFO] [timer.py:199:stop] epoch=9/micro_step=180/global_step=4320, RunningAvgSamplesPerSec=171.757334901271, CurrSamplesPerSec=171.26729737110074, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:41:55,342] [INFO] [logging.py:96:log_dist] [Rank 0] step=4330, skipped=82, lr=[3.66661054412187e-06, 3.66661054412187e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:41:55,370] [INFO] [timer.py:199:stop] epoch=9/micro_step=190/global_step=4330, RunningAvgSamplesPerSec=171.7571642467872, CurrSamplesPerSec=171.47801423513914, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:42:02,809] [INFO] [logging.py:96:log_dist] [Rank 0] step=4340, skipped=82, lr=[3.6466281521284987e-06, 3.6466281521284987e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:42:02,838] [INFO] [timer.py:199:stop] epoch=9/micro_step=200/global_step=4340, RunningAvgSamplesPerSec=171.75673548158696, CurrSamplesPerSec=171.85313491314048, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:42:10,274] [INFO] [logging.py:96:log_dist] [Rank 0] step=4350, skipped=82, lr=[3.6266672298272e-06, 3.6266672298272e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:42:10,302] [INFO] [timer.py:199:stop] epoch=9/micro_step=210/global_step=4350, RunningAvgSamplesPerSec=171.75647248736433, CurrSamplesPerSec=171.77219769622732, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:42:17,735] [INFO] [logging.py:96:log_dist] [Rank 0] step=4360, skipped=82, lr=[3.606728140901863e-06, 3.606728140901863e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:42:17,763] [INFO] [timer.py:199:stop] epoch=9/micro_step=220/global_step=4360, RunningAvgSamplesPerSec=171.75641438089414, CurrSamplesPerSec=172.19311049116664, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:42:25,202] [INFO] [logging.py:96:log_dist] [Rank 0] step=4370, skipped=82, lr=[3.586811248638579e-06, 3.586811248638579e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:42:25,231] [INFO] [timer.py:199:stop] epoch=9/micro_step=230/global_step=4370, RunningAvgSamplesPerSec=171.7559803177859, CurrSamplesPerSec=171.33676791370465, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:42:32,667] [INFO] [logging.py:96:log_dist] [Rank 0] step=4380, skipped=82, lr=[3.5669169159190127e-06, 3.5669169159190127e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:42:32,695] [INFO] [timer.py:199:stop] epoch=9/micro_step=240/global_step=4380, RunningAvgSamplesPerSec=171.7557139247764, CurrSamplesPerSec=171.5702426305326, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:42:36,385] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:42:37,089] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:42:40,050] [INFO] [logging.py:96:log_dist] [Rank 0] step=4390, skipped=84, lr=[3.5510179362090878e-06, 3.5510179362090878e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:42:40,079] [INFO] [timer.py:199:stop] epoch=9/micro_step=250/global_step=4390, RunningAvgSamplesPerSec=171.7596848742377, CurrSamplesPerSec=171.12072521635085, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:42:47,516] [INFO] [logging.py:96:log_dist] [Rank 0] step=4400, skipped=84, lr=[3.531165123813805e-06, 3.531165123813805e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:42:47,545] [INFO] [timer.py:199:stop] epoch=9/micro_step=260/global_step=4400, RunningAvgSamplesPerSec=171.75934927300548, CurrSamplesPerSec=171.3279101553587, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:42:54,983] [INFO] [logging.py:96:log_dist] [Rank 0] step=4410, skipped=84, lr=[3.511335884823161e-06, 3.511335884823161e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:42:55,011] [INFO] [timer.py:199:stop] epoch=9/micro_step=270/global_step=4410, RunningAvgSamplesPerSec=171.75916496259447, CurrSamplesPerSec=171.58367690279957, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:43:02,445] [INFO] [logging.py:96:log_dist] [Rank 0] step=4420, skipped=84, lr=[3.4915305805218014e-06, 3.4915305805218014e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:43:02,474] [INFO] [timer.py:199:stop] epoch=9/micro_step=280/global_step=4420, RunningAvgSamplesPerSec=171.75901422321516, CurrSamplesPerSec=171.50239058982578, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:43:09,917] [INFO] [logging.py:96:log_dist] [Rank 0] step=4430, skipped=84, lr=[3.47174957175829e-06, 3.47174957175829e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:43:09,946] [INFO] [timer.py:199:stop] epoch=9/micro_step=290/global_step=4430, RunningAvgSamplesPerSec=171.7583552451835, CurrSamplesPerSec=171.92456813180337, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:43:17,380] [INFO] [logging.py:96:log_dist] [Rank 0] step=4440, skipped=84, lr=[3.451993218938522e-06, 3.451993218938522e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:43:17,408] [INFO] [timer.py:199:stop] epoch=9/micro_step=300/global_step=4440, RunningAvgSamplesPerSec=171.75817211960222, CurrSamplesPerSec=171.47549482734942, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:43:24,848] [INFO] [logging.py:96:log_dist] [Rank 0] step=4450, skipped=84, lr=[3.432261882019177e-06, 3.432261882019177e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:43:24,877] [INFO] [timer.py:199:stop] epoch=9/micro_step=310/global_step=4450, RunningAvgSamplesPerSec=171.75770564468874, CurrSamplesPerSec=171.55160263569175, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:43:32,314] [INFO] [logging.py:96:log_dist] [Rank 0] step=4460, skipped=84, lr=[3.412555920501142e-06, 3.412555920501142e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:43:32,342] [INFO] [timer.py:199:stop] epoch=9/micro_step=320/global_step=4460, RunningAvgSamplesPerSec=171.7574040292847, CurrSamplesPerSec=171.0253834661196, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:43:39,778] [INFO] [logging.py:96:log_dist] [Rank 0] step=4470, skipped=84, lr=[3.3928756934229727e-06, 3.3928756934229727e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:43:39,807] [INFO] [timer.py:199:stop] epoch=9/micro_step=330/global_step=4470, RunningAvgSamplesPerSec=171.75713971309213, CurrSamplesPerSec=171.63216744585733, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:43:47,242] [INFO] [logging.py:96:log_dist] [Rank 0] step=4480, skipped=84, lr=[3.3732215593543475e-06, 3.3732215593543475e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:43:47,271] [INFO] [timer.py:199:stop] epoch=9/micro_step=340/global_step=4480, RunningAvgSamplesPerSec=171.75690183161072, CurrSamplesPerSec=171.5691460481193, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:43:52,455] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:43:53,161] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:43:54,631] [INFO] [logging.py:96:log_dist] [Rank 0] step=4490, skipped=86, lr=[3.357517279721811e-06, 3.357517279721811e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:43:54,660] [INFO] [timer.py:199:stop] epoch=9/micro_step=350/global_step=4490, RunningAvgSamplesPerSec=171.76057079067888, CurrSamplesPerSec=171.6784345301803, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:44:02,102] [INFO] [logging.py:96:log_dist] [Rank 0] step=4500, skipped=86, lr=[3.337911015144286e-06, 3.337911015144286e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:44:02,131] [INFO] [timer.py:199:stop] epoch=9/micro_step=360/global_step=4500, RunningAvgSamplesPerSec=171.75999558534747, CurrSamplesPerSec=171.64231882186053, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:44:09,571] [INFO] [logging.py:96:log_dist] [Rank 0] step=4510, skipped=86, lr=[3.318331845021411e-06, 3.318331845021411e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:44:09,599] [INFO] [timer.py:199:stop] epoch=9/micro_step=370/global_step=4510, RunningAvgSamplesPerSec=171.759689094821, CurrSamplesPerSec=171.81128217075747, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:44:17,047] [INFO] [logging.py:96:log_dist] [Rank 0] step=4520, skipped=86, lr=[3.2987801260816275e-06, 3.2987801260816275e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:44:17,075] [INFO] [timer.py:199:stop] epoch=9/micro_step=380/global_step=4520, RunningAvgSamplesPerSec=171.75886762670393, CurrSamplesPerSec=171.6185609983761, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:44:24,509] [INFO] [logging.py:96:log_dist] [Rank 0] step=4530, skipped=86, lr=[3.279256214553221e-06, 3.279256214553221e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:44:24,537] [INFO] [timer.py:199:stop] epoch=9/micro_step=390/global_step=4530, RunningAvgSamplesPerSec=171.75872701939713, CurrSamplesPerSec=171.59459037042978, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:44:31,968] [INFO] [logging.py:96:log_dist] [Rank 0] step=4540, skipped=86, lr=[3.259760466157834e-06, 3.259760466157834e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:44:31,997] [INFO] [timer.py:199:stop] epoch=9/micro_step=400/global_step=4540, RunningAvgSamplesPerSec=171.75873683060152, CurrSamplesPerSec=172.29633880128654, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:44:39,432] [INFO] [logging.py:96:log_dist] [Rank 0] step=4550, skipped=86, lr=[3.2402932361039845e-06, 3.2402932361039845e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:44:39,461] [INFO] [timer.py:199:stop] epoch=9/micro_step=410/global_step=4550, RunningAvgSamplesPerSec=171.75852459990386, CurrSamplesPerSec=171.7334606023248, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:44:46,895] [INFO] [logging.py:96:log_dist] [Rank 0] step=4560, skipped=86, lr=[3.2208548790805874e-06, 3.2208548790805874e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:44:46,924] [INFO] [timer.py:199:stop] epoch=9/micro_step=420/global_step=4560, RunningAvgSamplesPerSec=171.75831308397005, CurrSamplesPerSec=171.74582160627673, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:44:54,367] [INFO] [logging.py:96:log_dist] [Rank 0] step=4570, skipped=86, lr=[3.2014457492505e-06, 3.2014457492505e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:44:54,395] [INFO] [timer.py:199:stop] epoch=9/micro_step=430/global_step=4570, RunningAvgSamplesPerSec=171.75772721898412, CurrSamplesPerSec=171.5996911102133, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:45:01,842] [INFO] [logging.py:96:log_dist] [Rank 0] step=4580, skipped=86, lr=[3.1820662002440643e-06, 3.1820662002440643e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:45:01,870] [INFO] [timer.py:199:stop] epoch=9/micro_step=440/global_step=4580, RunningAvgSamplesPerSec=171.7569817472531, CurrSamplesPerSec=171.75296433077656, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:45:08,555] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:45:09,258] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:45:09,258] [INFO] [logging.py:96:log_dist] [Rank 0] step=4590, skipped=88, lr=[3.166584096527632e-06, 3.166584096527632e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:45:09,260] [INFO] [timer.py:199:stop] epoch=9/micro_step=450/global_step=4590, RunningAvgSamplesPerSec=171.76051580567068, CurrSamplesPerSec=182.07481988233172, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:45:16,695] [INFO] [logging.py:96:log_dist] [Rank 0] step=4600, skipped=88, lr=[3.1472586824277793e-06, 3.1472586824277793e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:45:16,723] [INFO] [timer.py:199:stop] epoch=9/micro_step=460/global_step=4600, RunningAvgSamplesPerSec=171.76029532878343, CurrSamplesPerSec=171.4761520570268, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -***** Evaluating perplexity, Epoch 10/16 ***** -ppl: 1.8186482191085815 -Beginning of Epoch 11/16, Total Micro Batches 460 -[2023-04-18 02:45:32,379] [INFO] [logging.py:96:log_dist] [Rank 0] step=4610, skipped=88, lr=[3.1279638364287635e-06, 3.1279638364287635e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:45:32,408] [INFO] [timer.py:199:stop] epoch=10/micro_step=10/global_step=4610, RunningAvgSamplesPerSec=171.75976999069377, CurrSamplesPerSec=171.4195939795224, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:45:39,850] [INFO] [logging.py:96:log_dist] [Rank 0] step=4620, skipped=88, lr=[3.1086999100787e-06, 3.1086999100787e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:45:39,879] [INFO] [timer.py:199:stop] epoch=10/micro_step=20/global_step=4620, RunningAvgSamplesPerSec=171.75917538132904, CurrSamplesPerSec=171.66849845077493, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:45:47,320] [INFO] [logging.py:96:log_dist] [Rank 0] step=4630, skipped=88, lr=[3.089467254362352e-06, 3.089467254362352e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:45:47,349] [INFO] [timer.py:199:stop] epoch=10/micro_step=30/global_step=4630, RunningAvgSamplesPerSec=171.7586251755327, CurrSamplesPerSec=171.34885312287358, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:45:54,785] [INFO] [logging.py:96:log_dist] [Rank 0] step=4640, skipped=88, lr=[3.0702662196947415e-06, 3.0702662196947415e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:45:54,813] [INFO] [timer.py:199:stop] epoch=10/micro_step=40/global_step=4640, RunningAvgSamplesPerSec=171.75838031552846, CurrSamplesPerSec=171.6453919075795, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:46:02,257] [INFO] [logging.py:96:log_dist] [Rank 0] step=4650, skipped=88, lr=[3.051097155914764e-06, 3.051097155914764e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:46:02,286] [INFO] [timer.py:199:stop] epoch=10/micro_step=50/global_step=4650, RunningAvgSamplesPerSec=171.75774270111972, CurrSamplesPerSec=170.92791666136037, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:46:09,727] [INFO] [logging.py:96:log_dist] [Rank 0] step=4660, skipped=88, lr=[3.0319604122788038e-06, 3.0319604122788038e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:46:09,756] [INFO] [timer.py:199:stop] epoch=10/micro_step=60/global_step=4660, RunningAvgSamplesPerSec=171.7572470065651, CurrSamplesPerSec=171.30363803038324, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:46:17,199] [INFO] [logging.py:96:log_dist] [Rank 0] step=4670, skipped=88, lr=[3.0128563374543873e-06, 3.0128563374543873e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:46:17,227] [INFO] [timer.py:199:stop] epoch=10/micro_step=70/global_step=4670, RunningAvgSamplesPerSec=171.75664142239043, CurrSamplesPerSec=171.461475127637, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:46:24,667] [INFO] [logging.py:96:log_dist] [Rank 0] step=4680, skipped=88, lr=[2.9937852795138175e-06, 2.9937852795138175e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:46:24,696] [INFO] [timer.py:199:stop] epoch=10/micro_step=80/global_step=4680, RunningAvgSamplesPerSec=171.75621350742884, CurrSamplesPerSec=171.428899133102, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:46:31,380] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384 -[2023-04-18 02:46:32,099] [INFO] [logging.py:96:log_dist] [Rank 0] step=4690, skipped=89, lr=[2.976649844000092e-06, 2.976649844000092e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:46:32,127] [INFO] [timer.py:199:stop] epoch=10/micro_step=90/global_step=4690, RunningAvgSamplesPerSec=171.7575728308589, CurrSamplesPerSec=171.601226876209, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:46:39,562] [INFO] [logging.py:96:log_dist] [Rank 0] step=4700, skipped=89, lr=[2.957642474918433e-06, 2.957642474918433e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:46:39,591] [INFO] [timer.py:199:stop] epoch=10/micro_step=100/global_step=4700, RunningAvgSamplesPerSec=171.75737247673578, CurrSamplesPerSec=171.76406420465634, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:46:47,025] [INFO] [logging.py:96:log_dist] [Rank 0] step=4710, skipped=89, lr=[2.9386691287058124e-06, 2.9386691287058124e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:46:47,054] [INFO] [timer.py:199:stop] epoch=10/micro_step=110/global_step=4710, RunningAvgSamplesPerSec=171.75720018026718, CurrSamplesPerSec=171.3507125371429, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:46:54,498] [INFO] [logging.py:96:log_dist] [Rank 0] step=4720, skipped=89, lr=[2.9197301510526816e-06, 2.9197301510526816e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:46:54,527] [INFO] [timer.py:199:stop] epoch=10/micro_step=120/global_step=4720, RunningAvgSamplesPerSec=171.75656785077663, CurrSamplesPerSec=170.85959284266698, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:47:01,976] [INFO] [logging.py:96:log_dist] [Rank 0] step=4730, skipped=89, lr=[2.900825887023309e-06, 2.900825887023309e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:47:02,004] [INFO] [timer.py:199:stop] epoch=10/micro_step=130/global_step=4730, RunningAvgSamplesPerSec=171.75567386161325, CurrSamplesPerSec=171.51181431889765, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:47:09,438] [INFO] [logging.py:96:log_dist] [Rank 0] step=4740, skipped=89, lr=[2.8819566810494875e-06, 2.8819566810494875e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:47:09,467] [INFO] [timer.py:199:stop] epoch=10/micro_step=140/global_step=4740, RunningAvgSamplesPerSec=171.75553410120858, CurrSamplesPerSec=171.47779515326198, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:47:16,906] [INFO] [logging.py:96:log_dist] [Rank 0] step=4750, skipped=89, lr=[2.86312287692426e-06, 2.86312287692426e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:47:16,934] [INFO] [timer.py:199:stop] epoch=10/micro_step=150/global_step=4750, RunningAvgSamplesPerSec=171.75517744151503, CurrSamplesPerSec=172.13127700818413, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:47:24,373] [INFO] [logging.py:96:log_dist] [Rank 0] step=4760, skipped=89, lr=[2.8443248177956512e-06, 2.8443248177956512e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:47:24,402] [INFO] [timer.py:199:stop] epoch=10/micro_step=160/global_step=4760, RunningAvgSamplesPerSec=171.75479455467791, CurrSamplesPerSec=171.5229378432881, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:47:31,838] [INFO] [logging.py:96:log_dist] [Rank 0] step=4770, skipped=89, lr=[2.825562846160425e-06, 2.825562846160425e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:47:31,867] [INFO] [timer.py:199:stop] epoch=10/micro_step=170/global_step=4770, RunningAvgSamplesPerSec=171.7545174541741, CurrSamplesPerSec=171.49806259619137, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:47:39,304] [INFO] [logging.py:96:log_dist] [Rank 0] step=4780, skipped=89, lr=[2.8068373038578333e-06, 2.8068373038578333e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:47:39,333] [INFO] [timer.py:199:stop] epoch=10/micro_step=180/global_step=4780, RunningAvgSamplesPerSec=171.75420135636978, CurrSamplesPerSec=171.57890612441088, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:47:46,766] [INFO] [logging.py:96:log_dist] [Rank 0] step=4790, skipped=89, lr=[2.7881485320633965e-06, 2.7881485320633965e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:47:46,795] [INFO] [timer.py:199:stop] epoch=10/micro_step=190/global_step=4790, RunningAvgSamplesPerSec=171.75407202037948, CurrSamplesPerSec=171.7371412485765, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:47:54,241] [INFO] [logging.py:96:log_dist] [Rank 0] step=4800, skipped=89, lr=[2.769496871282684e-06, 2.769496871282684e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:47:54,270] [INFO] [timer.py:199:stop] epoch=10/micro_step=200/global_step=4800, RunningAvgSamplesPerSec=171.75333663355372, CurrSamplesPerSec=171.63803863583686, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:48:01,709] [INFO] [logging.py:96:log_dist] [Rank 0] step=4810, skipped=89, lr=[2.750882661345108e-06, 2.750882661345108e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:48:01,736] [INFO] [timer.py:199:stop] epoch=10/micro_step=210/global_step=4810, RunningAvgSamplesPerSec=171.75301913997652, CurrSamplesPerSec=171.538611858245, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:48:09,179] [INFO] [logging.py:96:log_dist] [Rank 0] step=4820, skipped=89, lr=[2.732306241397732e-06, 2.732306241397732e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:48:09,208] [INFO] [timer.py:199:stop] epoch=10/micro_step=220/global_step=4820, RunningAvgSamplesPerSec=171.75245786359903, CurrSamplesPerSec=171.5596063725592, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:48:16,651] [INFO] [logging.py:96:log_dist] [Rank 0] step=4830, skipped=89, lr=[2.7137679498991008e-06, 2.7137679498991008e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:48:16,679] [INFO] [timer.py:199:stop] epoch=10/micro_step=230/global_step=4830, RunningAvgSamplesPerSec=171.7519110476986, CurrSamplesPerSec=171.2473029241527, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:48:24,133] [INFO] [logging.py:96:log_dist] [Rank 0] step=4840, skipped=89, lr=[2.6952681246130607e-06, 2.6952681246130607e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:48:24,159] [INFO] [timer.py:199:stop] epoch=10/micro_step=240/global_step=4840, RunningAvgSamplesPerSec=171.75096306103464, CurrSamplesPerSec=171.73444941716204, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:48:31,604] [INFO] [logging.py:96:log_dist] [Rank 0] step=4850, skipped=89, lr=[2.676807102602617e-06, 2.676807102602617e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:48:31,633] [INFO] [timer.py:199:stop] epoch=10/micro_step=250/global_step=4850, RunningAvgSamplesPerSec=171.75027891672556, CurrSamplesPerSec=171.3269260165209, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:48:39,074] [INFO] [logging.py:96:log_dist] [Rank 0] step=4860, skipped=89, lr=[2.6583852202237785e-06, 2.6583852202237785e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:48:39,103] [INFO] [timer.py:199:stop] epoch=10/micro_step=260/global_step=4860, RunningAvgSamplesPerSec=171.74979520361575, CurrSamplesPerSec=171.99528933552207, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:48:46,551] [INFO] [logging.py:96:log_dist] [Rank 0] step=4870, skipped=89, lr=[2.6400028131194465e-06, 2.6400028131194465e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:48:46,580] [INFO] [timer.py:199:stop] epoch=10/micro_step=270/global_step=4870, RunningAvgSamplesPerSec=171.74896941540274, CurrSamplesPerSec=171.39939979823004, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:48:54,027] [INFO] [logging.py:96:log_dist] [Rank 0] step=4880, skipped=89, lr=[2.6216602162132887e-06, 2.6216602162132887e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:48:54,055] [INFO] [timer.py:199:stop] epoch=10/micro_step=280/global_step=4880, RunningAvgSamplesPerSec=171.74823489853458, CurrSamplesPerSec=171.650934715397, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:49:01,482] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:49:01,482] [INFO] [logging.py:96:log_dist] [Rank 0] step=4890, skipped=90, lr=[2.605186192947191e-06, 2.605186192947191e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:49:01,483] [INFO] [timer.py:199:stop] epoch=10/micro_step=290/global_step=4890, RunningAvgSamplesPerSec=171.7497402743535, CurrSamplesPerSec=181.66320746832227, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:49:02,187] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:49:08,882] [INFO] [logging.py:96:log_dist] [Rank 0] step=4900, skipped=91, lr=[2.5887449297758933e-06, 2.5887449297758933e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:49:08,908] [INFO] [timer.py:199:stop] epoch=10/micro_step=300/global_step=4900, RunningAvgSamplesPerSec=171.75137582694958, CurrSamplesPerSec=171.843838653291, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:49:16,341] [INFO] [logging.py:96:log_dist] [Rank 0] step=4910, skipped=91, lr=[2.570515577021172e-06, 2.570515577021172e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:49:16,369] [INFO] [timer.py:199:stop] epoch=10/micro_step=310/global_step=4910, RunningAvgSamplesPerSec=171.751348982386, CurrSamplesPerSec=171.72071689687147, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:49:23,816] [INFO] [logging.py:96:log_dist] [Rank 0] step=4920, skipped=91, lr=[2.5523273005077336e-06, 2.5523273005077336e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:49:23,845] [INFO] [timer.py:199:stop] epoch=10/micro_step=320/global_step=4920, RunningAvgSamplesPerSec=171.75064633794096, CurrSamplesPerSec=171.9350845038352, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:49:31,313] [INFO] [logging.py:96:log_dist] [Rank 0] step=4930, skipped=91, lr=[2.534180431622229e-06, 2.534180431622229e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:49:31,341] [INFO] [timer.py:199:stop] epoch=10/micro_step=330/global_step=4930, RunningAvgSamplesPerSec=171.74904645097163, CurrSamplesPerSec=171.43886224082, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:49:38,790] [INFO] [logging.py:96:log_dist] [Rank 0] step=4940, skipped=91, lr=[2.5160753009968673e-06, 2.5160753009968673e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:49:38,818] [INFO] [timer.py:199:stop] epoch=10/micro_step=340/global_step=4940, RunningAvgSamplesPerSec=171.748238075892, CurrSamplesPerSec=171.40870275948163, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:49:46,269] [INFO] [logging.py:96:log_dist] [Rank 0] step=4950, skipped=91, lr=[2.4980122385033927e-06, 2.4980122385033927e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:49:46,297] [INFO] [timer.py:199:stop] epoch=10/micro_step=350/global_step=4950, RunningAvgSamplesPerSec=171.74735634692513, CurrSamplesPerSec=170.91676135142663, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:49:53,741] [INFO] [logging.py:96:log_dist] [Rank 0] step=4960, skipped=91, lr=[2.47999157324708e-06, 2.47999157324708e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:49:53,769] [INFO] [timer.py:199:stop] epoch=10/micro_step=360/global_step=4960, RunningAvgSamplesPerSec=171.7468022744573, CurrSamplesPerSec=171.8643027562202, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:50:01,218] [INFO] [logging.py:96:log_dist] [Rank 0] step=4970, skipped=91, lr=[2.462013633560736e-06, 2.462013633560736e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:50:01,246] [INFO] [timer.py:199:stop] epoch=10/micro_step=370/global_step=4970, RunningAvgSamplesPerSec=171.74602717360057, CurrSamplesPerSec=171.6989141669993, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:50:08,687] [INFO] [logging.py:96:log_dist] [Rank 0] step=4980, skipped=91, lr=[2.4440787469987114e-06, 2.4440787469987114e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:50:08,715] [INFO] [timer.py:199:stop] epoch=10/micro_step=380/global_step=4980, RunningAvgSamplesPerSec=171.74563891246103, CurrSamplesPerSec=172.03353826375462, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:50:16,161] [INFO] [logging.py:96:log_dist] [Rank 0] step=4990, skipped=91, lr=[2.4261872403309417e-06, 2.4261872403309417e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:50:16,190] [INFO] [timer.py:199:stop] epoch=10/micro_step=390/global_step=4990, RunningAvgSamplesPerSec=171.74497939788444, CurrSamplesPerSec=171.3281835292646, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:50:17,641] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:50:18,345] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:50:23,552] [INFO] [logging.py:96:log_dist] [Rank 0] step=5000, skipped=93, lr=[2.411905487605566e-06, 2.411905487605566e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:50:23,580] [INFO] [timer.py:199:stop] epoch=10/micro_step=400/global_step=5000, RunningAvgSamplesPerSec=171.74818138574756, CurrSamplesPerSec=171.30139703235918, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:50:31,026] [INFO] [logging.py:96:log_dist] [Rank 0] step=5010, skipped=93, lr=[2.394092885681033e-06, 2.394092885681033e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:50:31,055] [INFO] [timer.py:199:stop] epoch=10/micro_step=410/global_step=5010, RunningAvgSamplesPerSec=171.74751613933458, CurrSamplesPerSec=171.2261663321585, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:50:38,497] [INFO] [logging.py:96:log_dist] [Rank 0] step=5020, skipped=93, lr=[2.376324574382839e-06, 2.376324574382839e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:50:38,525] [INFO] [timer.py:199:stop] epoch=10/micro_step=420/global_step=5020, RunningAvgSamplesPerSec=171.74706296156384, CurrSamplesPerSec=171.71654264599815, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:50:45,967] [INFO] [logging.py:96:log_dist] [Rank 0] step=5030, skipped=93, lr=[2.3586008774459555e-06, 2.3586008774459555e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:50:45,996] [INFO] [timer.py:199:stop] epoch=10/micro_step=430/global_step=5030, RunningAvgSamplesPerSec=171.74658093783256, CurrSamplesPerSec=171.50408897612039, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:50:53,439] [INFO] [logging.py:96:log_dist] [Rank 0] step=5040, skipped=93, lr=[2.340922117792483e-06, 2.340922117792483e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:50:53,467] [INFO] [timer.py:199:stop] epoch=10/micro_step=440/global_step=5040, RunningAvgSamplesPerSec=171.74606576915517, CurrSamplesPerSec=171.58510270415388, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:51:00,906] [INFO] [logging.py:96:log_dist] [Rank 0] step=5050, skipped=93, lr=[2.3232886175257783e-06, 2.3232886175257783e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:51:00,935] [INFO] [timer.py:199:stop] epoch=10/micro_step=450/global_step=5050, RunningAvgSamplesPerSec=171.74571660969693, CurrSamplesPerSec=171.96102304575518, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:51:08,373] [INFO] [logging.py:96:log_dist] [Rank 0] step=5060, skipped=93, lr=[2.3057006979245793e-06, 2.3057006979245793e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:51:08,399] [INFO] [timer.py:199:stop] epoch=10/micro_step=460/global_step=5060, RunningAvgSamplesPerSec=171.74550572981545, CurrSamplesPerSec=171.30194361183666, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -***** Evaluating perplexity, Epoch 11/16 ***** -ppl: 1.8158378601074219 -Beginning of Epoch 12/16, Total Micro Batches 460 -[2023-04-18 02:51:24,050] [INFO] [logging.py:96:log_dist] [Rank 0] step=5070, skipped=93, lr=[2.288158679437157e-06, 2.288158679437157e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:51:24,079] [INFO] [timer.py:199:stop] epoch=11/micro_step=10/global_step=5070, RunningAvgSamplesPerSec=171.7451260127198, CurrSamplesPerSec=171.3781162325161, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:51:31,522] [INFO] [logging.py:96:log_dist] [Rank 0] step=5080, skipped=93, lr=[2.2706628816754673e-06, 2.2706628816754673e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:51:31,550] [INFO] [timer.py:199:stop] epoch=11/micro_step=20/global_step=5080, RunningAvgSamplesPerSec=171.7446036456941, CurrSamplesPerSec=171.57643858451516, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:51:38,995] [INFO] [logging.py:96:log_dist] [Rank 0] step=5090, skipped=93, lr=[2.2532136234093383e-06, 2.2532136234093383e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:51:39,023] [INFO] [timer.py:199:stop] epoch=11/micro_step=30/global_step=5090, RunningAvgSamplesPerSec=171.7440112148022, CurrSamplesPerSec=170.75302761934356, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:51:41,967] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:51:42,675] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:51:46,380] [INFO] [logging.py:96:log_dist] [Rank 0] step=5100, skipped=95, lr=[2.239287938905389e-06, 2.239287938905389e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:51:46,409] [INFO] [timer.py:199:stop] epoch=11/micro_step=40/global_step=5100, RunningAvgSamplesPerSec=171.74739146957907, CurrSamplesPerSec=171.60303692142367, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:51:53,858] [INFO] [logging.py:96:log_dist] [Rank 0] step=5110, skipped=95, lr=[2.2219232523209144e-06, 2.2219232523209144e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:51:53,887] [INFO] [timer.py:199:stop] epoch=11/micro_step=50/global_step=5110, RunningAvgSamplesPerSec=171.74655451384425, CurrSamplesPerSec=171.36876189016994, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:52:01,327] [INFO] [logging.py:96:log_dist] [Rank 0] step=5120, skipped=95, lr=[2.2046059932579966e-06, 2.2046059932579966e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:52:01,355] [INFO] [timer.py:199:stop] epoch=11/micro_step=60/global_step=5120, RunningAvgSamplesPerSec=171.74616377447077, CurrSamplesPerSec=171.4143397508371, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:52:08,792] [INFO] [logging.py:96:log_dist] [Rank 0] step=5130, skipped=95, lr=[2.1873364772335264e-06, 2.1873364772335264e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:52:08,821] [INFO] [timer.py:199:stop] epoch=11/micro_step=70/global_step=5130, RunningAvgSamplesPerSec=171.74592201859707, CurrSamplesPerSec=171.59310956728962, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:52:16,261] [INFO] [logging.py:96:log_dist] [Rank 0] step=5140, skipped=95, lr=[2.1701150188945225e-06, 2.1701150188945225e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:52:16,289] [INFO] [timer.py:199:stop] epoch=11/micro_step=80/global_step=5140, RunningAvgSamplesPerSec=171.7455336384221, CurrSamplesPerSec=171.71105051981468, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:52:23,737] [INFO] [logging.py:96:log_dist] [Rank 0] step=5150, skipped=95, lr=[2.1529419320124055e-06, 2.1529419320124055e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:52:23,765] [INFO] [timer.py:199:stop] epoch=11/micro_step=90/global_step=5150, RunningAvgSamplesPerSec=171.74491945852154, CurrSamplesPerSec=171.43683668008262, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:52:31,217] [INFO] [logging.py:96:log_dist] [Rank 0] step=5160, skipped=95, lr=[2.1358175294772792e-06, 2.1358175294772792e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:52:31,245] [INFO] [timer.py:199:stop] epoch=11/micro_step=100/global_step=5160, RunningAvgSamplesPerSec=171.7441740406532, CurrSamplesPerSec=171.31746792529657, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:52:38,695] [INFO] [logging.py:96:log_dist] [Rank 0] step=5170, skipped=95, lr=[2.1187421232922227e-06, 2.1187421232922227e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:52:38,722] [INFO] [timer.py:199:stop] epoch=11/micro_step=110/global_step=5170, RunningAvgSamplesPerSec=171.74345394505346, CurrSamplesPerSec=171.6698158758257, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:52:46,173] [INFO] [logging.py:96:log_dist] [Rank 0] step=5180, skipped=95, lr=[2.101716024567618e-06, 2.101716024567618e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:52:46,202] [INFO] [timer.py:199:stop] epoch=11/micro_step=120/global_step=5180, RunningAvgSamplesPerSec=171.7425637751142, CurrSamplesPerSec=171.40837440248063, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:52:53,651] [INFO] [logging.py:96:log_dist] [Rank 0] step=5190, skipped=95, lr=[2.084739543515474e-06, 2.084739543515474e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:52:53,679] [INFO] [timer.py:199:stop] epoch=11/micro_step=130/global_step=5190, RunningAvgSamplesPerSec=171.74181864031, CurrSamplesPerSec=171.22288980832138, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:52:58,115] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:52:58,821] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:53:01,038] [INFO] [logging.py:96:log_dist] [Rank 0] step=5200, skipped=97, lr=[2.071194291283412e-06, 2.071194291283412e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:53:01,067] [INFO] [timer.py:199:stop] epoch=11/micro_step=140/global_step=5200, RunningAvgSamplesPerSec=171.74502497574156, CurrSamplesPerSec=171.16278785057753, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:53:08,505] [INFO] [logging.py:96:log_dist] [Rank 0] step=5210, skipped=97, lr=[2.0543079008867116e-06, 2.0543079008867116e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:53:08,534] [INFO] [timer.py:199:stop] epoch=11/micro_step=150/global_step=5210, RunningAvgSamplesPerSec=171.7447215651382, CurrSamplesPerSec=171.8517046538516, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:53:15,974] [INFO] [logging.py:96:log_dist] [Rank 0] step=5220, skipped=97, lr=[2.037471991928708e-06, 2.037471991928708e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:53:16,003] [INFO] [timer.py:199:stop] epoch=11/micro_step=160/global_step=5220, RunningAvgSamplesPerSec=171.74432560958536, CurrSamplesPerSec=171.61225228911312, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:53:23,446] [INFO] [logging.py:96:log_dist] [Rank 0] step=5230, skipped=97, lr=[2.0206868711561885e-06, 2.0206868711561885e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:53:23,475] [INFO] [timer.py:199:stop] epoch=11/micro_step=170/global_step=5230, RunningAvgSamplesPerSec=171.74377481929443, CurrSamplesPerSec=172.04191782173967, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:53:30,914] [INFO] [logging.py:96:log_dist] [Rank 0] step=5240, skipped=97, lr=[2.0039528443905942e-06, 2.0039528443905942e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:53:30,940] [INFO] [timer.py:199:stop] epoch=11/micro_step=180/global_step=5240, RunningAvgSamplesPerSec=171.7435565633984, CurrSamplesPerSec=171.49800781286186, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:53:38,380] [INFO] [logging.py:96:log_dist] [Rank 0] step=5250, skipped=97, lr=[1.9872702165224435e-06, 1.9872702165224435e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:53:38,408] [INFO] [timer.py:199:stop] epoch=11/micro_step=190/global_step=5250, RunningAvgSamplesPerSec=171.74324179068486, CurrSamplesPerSec=171.83960341278052, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:53:45,839] [INFO] [logging.py:96:log_dist] [Rank 0] step=5260, skipped=97, lr=[1.9706392915057724e-06, 1.9706392915057724e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:53:45,867] [INFO] [timer.py:199:stop] epoch=11/micro_step=200/global_step=5260, RunningAvgSamplesPerSec=171.74327459552813, CurrSamplesPerSec=171.93794782270055, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:53:53,305] [INFO] [logging.py:96:log_dist] [Rank 0] step=5270, skipped=97, lr=[1.9540603723526074e-06, 1.9540603723526074e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:53:53,334] [INFO] [timer.py:199:stop] epoch=11/micro_step=210/global_step=5270, RunningAvgSamplesPerSec=171.74299707527024, CurrSamplesPerSec=171.8256340994268, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:54:00,777] [INFO] [logging.py:96:log_dist] [Rank 0] step=5280, skipped=97, lr=[1.937533761127437e-06, 1.937533761127437e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:54:00,805] [INFO] [timer.py:199:stop] epoch=11/micro_step=220/global_step=5280, RunningAvgSamplesPerSec=171.74251272560358, CurrSamplesPerSec=171.36608159433672, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:54:08,257] [INFO] [logging.py:96:log_dist] [Rank 0] step=5290, skipped=97, lr=[1.9210597589417105e-06, 1.9210597589417105e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:54:08,284] [INFO] [timer.py:199:stop] epoch=11/micro_step=230/global_step=5290, RunningAvgSamplesPerSec=171.74170804519903, CurrSamplesPerSec=171.44997630419536, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:54:14,219] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:54:14,925] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:54:15,645] [INFO] [logging.py:96:log_dist] [Rank 0] step=5300, skipped=99, lr=[1.9079186374366367e-06, 1.9079186374366367e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:54:15,673] [INFO] [timer.py:199:stop] epoch=11/micro_step=240/global_step=5300, RunningAvgSamplesPerSec=171.74478225113623, CurrSamplesPerSec=171.43174561714142, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:54:23,114] [INFO] [logging.py:96:log_dist] [Rank 0] step=5310, skipped=99, lr=[1.8915400872596514e-06, 1.8915400872596514e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:54:23,142] [INFO] [timer.py:199:stop] epoch=11/micro_step=250/global_step=5310, RunningAvgSamplesPerSec=171.74438407591725, CurrSamplesPerSec=171.24435331898405, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:54:30,577] [INFO] [logging.py:96:log_dist] [Rank 0] step=5320, skipped=99, lr=[1.8752149841173617e-06, 1.8752149841173617e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:54:30,606] [INFO] [timer.py:199:stop] epoch=11/micro_step=260/global_step=5320, RunningAvgSamplesPerSec=171.744247691443, CurrSamplesPerSec=171.68074030418143, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:54:38,050] [INFO] [logging.py:96:log_dist] [Rank 0] step=5330, skipped=99, lr=[1.8589436254497796e-06, 1.8589436254497796e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:54:38,078] [INFO] [timer.py:199:stop] epoch=11/micro_step=270/global_step=5330, RunningAvgSamplesPerSec=171.7437062924739, CurrSamplesPerSec=171.592945035185, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:54:45,521] [INFO] [logging.py:96:log_dist] [Rank 0] step=5340, skipped=99, lr=[1.8427263077177062e-06, 1.8427263077177062e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:54:45,550] [INFO] [timer.py:199:stop] epoch=11/micro_step=280/global_step=5340, RunningAvgSamplesPerSec=171.74325016915992, CurrSamplesPerSec=172.04803761996186, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:54:53,002] [INFO] [logging.py:96:log_dist] [Rank 0] step=5350, skipped=99, lr=[1.8265633263973277e-06, 1.8265633263973277e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:54:53,031] [INFO] [timer.py:199:stop] epoch=11/micro_step=290/global_step=5350, RunningAvgSamplesPerSec=171.74249988288813, CurrSamplesPerSec=171.65280069163617, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:55:00,478] [INFO] [logging.py:96:log_dist] [Rank 0] step=5360, skipped=99, lr=[1.8104549759748275e-06, 1.8104549759748275e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:55:00,507] [INFO] [timer.py:199:stop] epoch=11/micro_step=300/global_step=5360, RunningAvgSamplesPerSec=171.74182017026575, CurrSamplesPerSec=171.0427649195826, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:55:07,943] [INFO] [logging.py:96:log_dist] [Rank 0] step=5370, skipped=99, lr=[1.7944015499410302e-06, 1.7944015499410302e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:55:07,972] [INFO] [timer.py:199:stop] epoch=11/micro_step=310/global_step=5370, RunningAvgSamplesPerSec=171.7416275898126, CurrSamplesPerSec=171.45161889883073, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:55:15,416] [INFO] [logging.py:96:log_dist] [Rank 0] step=5380, skipped=99, lr=[1.77840334078605e-06, 1.77840334078605e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:55:15,444] [INFO] [timer.py:199:stop] epoch=11/micro_step=320/global_step=5380, RunningAvgSamplesPerSec=171.74112158978986, CurrSamplesPerSec=171.82601905006973, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:55:22,889] [INFO] [logging.py:96:log_dist] [Rank 0] step=5390, skipped=99, lr=[1.7624606399939543e-06, 1.7624606399939543e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:55:22,917] [INFO] [timer.py:199:stop] epoch=11/micro_step=330/global_step=5390, RunningAvgSamplesPerSec=171.74055837243287, CurrSamplesPerSec=171.3641124508247, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:55:30,343] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:55:30,344] [INFO] [logging.py:96:log_dist] [Rank 0] step=5400, skipped=100, lr=[1.7481599090280232e-06, 1.7481599090280232e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:55:30,344] [INFO] [timer.py:199:stop] epoch=11/micro_step=340/global_step=5400, RunningAvgSamplesPerSec=171.74204745157692, CurrSamplesPerSec=181.50874156136402, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:55:31,048] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:55:37,756] [INFO] [logging.py:96:log_dist] [Rank 0] step=5410, skipped=101, lr=[1.7339045861826927e-06, 1.7339045861826927e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:55:37,784] [INFO] [timer.py:199:stop] epoch=11/micro_step=350/global_step=5410, RunningAvgSamplesPerSec=171.7429245141067, CurrSamplesPerSec=171.27396322434882, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:55:45,230] [INFO] [logging.py:96:log_dist] [Rank 0] step=5420, skipped=101, lr=[1.7181188508730639e-06, 1.7181188508730639e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:55:45,259] [INFO] [timer.py:199:stop] epoch=11/micro_step=360/global_step=5420, RunningAvgSamplesPerSec=171.74231183516392, CurrSamplesPerSec=171.74334926313344, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:55:52,702] [INFO] [logging.py:96:log_dist] [Rank 0] step=5430, skipped=101, lr=[1.702389722297295e-06, 1.702389722297295e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:55:52,730] [INFO] [timer.py:199:stop] epoch=11/micro_step=370/global_step=5430, RunningAvgSamplesPerSec=171.7418352639182, CurrSamplesPerSec=171.20694584892345, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:56:00,173] [INFO] [logging.py:96:log_dist] [Rank 0] step=5440, skipped=101, lr=[1.6867174870368687e-06, 1.6867174870368687e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:56:00,201] [INFO] [timer.py:199:stop] epoch=11/micro_step=380/global_step=5440, RunningAvgSamplesPerSec=171.7413847149185, CurrSamplesPerSec=171.73895416218423, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:56:07,649] [INFO] [logging.py:96:log_dist] [Rank 0] step=5450, skipped=101, lr=[1.671102430636676e-06, 1.671102430636676e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:56:07,677] [INFO] [timer.py:199:stop] epoch=11/micro_step=390/global_step=5450, RunningAvgSamplesPerSec=171.74071252851496, CurrSamplesPerSec=171.35547063387725, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:56:15,122] [INFO] [logging.py:96:log_dist] [Rank 0] step=5460, skipped=101, lr=[1.655544837599826e-06, 1.655544837599826e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:56:15,150] [INFO] [timer.py:199:stop] epoch=11/micro_step=400/global_step=5460, RunningAvgSamplesPerSec=171.74017463975053, CurrSamplesPerSec=171.46262509481417, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:56:22,595] [INFO] [logging.py:96:log_dist] [Rank 0] step=5470, skipped=101, lr=[1.6400449913824576e-06, 1.6400449913824576e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:56:22,624] [INFO] [timer.py:199:stop] epoch=11/micro_step=410/global_step=5470, RunningAvgSamplesPerSec=171.73963173897664, CurrSamplesPerSec=171.12983430521476, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:56:30,066] [INFO] [logging.py:96:log_dist] [Rank 0] step=5480, skipped=101, lr=[1.6246031743885691e-06, 1.6246031743885691e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:56:30,094] [INFO] [timer.py:199:stop] epoch=11/micro_step=420/global_step=5480, RunningAvgSamplesPerSec=171.73919283614381, CurrSamplesPerSec=171.85671066552067, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:56:37,546] [INFO] [logging.py:96:log_dist] [Rank 0] step=5490, skipped=101, lr=[1.6092196679648839e-06, 1.6092196679648839e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:56:37,574] [INFO] [timer.py:199:stop] epoch=11/micro_step=430/global_step=5490, RunningAvgSamplesPerSec=171.7383833981878, CurrSamplesPerSec=172.0203641620738, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:56:45,011] [INFO] [logging.py:96:log_dist] [Rank 0] step=5500, skipped=101, lr=[1.5938947523957166e-06, 1.5938947523957166e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:56:45,039] [INFO] [timer.py:199:stop] epoch=11/micro_step=440/global_step=5500, RunningAvgSamplesPerSec=171.73818285436403, CurrSamplesPerSec=171.65741092451267, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:56:46,488] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:56:47,194] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:56:52,390] [INFO] [logging.py:96:log_dist] [Rank 0] step=5510, skipped=103, lr=[1.581677193025326e-06, 1.581677193025326e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:56:52,418] [INFO] [timer.py:199:stop] epoch=11/micro_step=450/global_step=5510, RunningAvgSamplesPerSec=171.74157379041785, CurrSamplesPerSec=171.82194965910585, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:56:59,848] [INFO] [logging.py:96:log_dist] [Rank 0] step=5520, skipped=103, lr=[1.5664584439000145e-06, 1.5664584439000145e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:56:59,877] [INFO] [timer.py:199:stop] epoch=11/micro_step=460/global_step=5520, RunningAvgSamplesPerSec=171.74164768800873, CurrSamplesPerSec=171.40667791134962, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -***** Evaluating perplexity, Epoch 12/16 ***** -ppl: 1.8115999698638916 -Beginning of Epoch 13/16, Total Micro Batches 460 -[2023-04-18 02:57:15,510] [INFO] [logging.py:96:log_dist] [Rank 0] step=5530, skipped=103, lr=[1.5512990647299045e-06, 1.5512990647299045e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:57:15,539] [INFO] [timer.py:199:stop] epoch=12/micro_step=10/global_step=5530, RunningAvgSamplesPerSec=171.73661686192554, CurrSamplesPerSec=171.23561434463824, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:57:22,978] [INFO] [logging.py:96:log_dist] [Rank 0] step=5540, skipped=103, lr=[1.5361993317157606e-06, 1.5361993317157606e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:57:23,007] [INFO] [timer.py:199:stop] epoch=12/micro_step=20/global_step=5540, RunningAvgSamplesPerSec=171.73629350134996, CurrSamplesPerSec=171.97760358055692, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:57:30,459] [INFO] [logging.py:96:log_dist] [Rank 0] step=5550, skipped=103, lr=[1.5211595199716022e-06, 1.5211595199716022e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:57:30,489] [INFO] [timer.py:199:stop] epoch=12/micro_step=30/global_step=5550, RunningAvgSamplesPerSec=171.73555290614007, CurrSamplesPerSec=171.272761146891, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:57:37,930] [INFO] [logging.py:96:log_dist] [Rank 0] step=5560, skipped=103, lr=[1.5061799035196989e-06, 1.5061799035196989e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:57:37,958] [INFO] [timer.py:199:stop] epoch=12/micro_step=40/global_step=5560, RunningAvgSamplesPerSec=171.73528471581386, CurrSamplesPerSec=171.81078731942728, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:57:45,390] [INFO] [logging.py:96:log_dist] [Rank 0] step=5570, skipped=103, lr=[1.491260755285575e-06, 1.491260755285575e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:57:45,419] [INFO] [timer.py:199:stop] epoch=12/micro_step=50/global_step=5570, RunningAvgSamplesPerSec=171.73527152418197, CurrSamplesPerSec=171.8908253223728, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:57:52,852] [INFO] [logging.py:96:log_dist] [Rank 0] step=5580, skipped=103, lr=[1.4764023470930319e-06, 1.4764023470930319e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:57:52,880] [INFO] [timer.py:199:stop] epoch=12/micro_step=60/global_step=5580, RunningAvgSamplesPerSec=171.73522719947235, CurrSamplesPerSec=171.6390263521262, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:58:00,316] [INFO] [logging.py:96:log_dist] [Rank 0] step=5590, skipped=103, lr=[1.4616049496592044e-06, 1.4616049496592044e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:58:00,344] [INFO] [timer.py:199:stop] epoch=12/micro_step=70/global_step=5590, RunningAvgSamplesPerSec=171.7351173828534, CurrSamplesPerSec=171.6412213179141, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:58:07,769] [INFO] [logging.py:96:log_dist] [Rank 0] step=5600, skipped=103, lr=[1.446868832589624e-06, 1.446868832589624e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:58:07,798] [INFO] [timer.py:199:stop] epoch=12/micro_step=80/global_step=5600, RunningAvgSamplesPerSec=171.73539568291852, CurrSamplesPerSec=172.00912091816645, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:58:10,737] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:58:11,441] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:58:15,143] [INFO] [logging.py:96:log_dist] [Rank 0] step=5610, skipped=105, lr=[1.4351242412584727e-06, 1.4351242412584727e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:58:15,171] [INFO] [timer.py:199:stop] epoch=12/micro_step=90/global_step=5610, RunningAvgSamplesPerSec=171.73897705601766, CurrSamplesPerSec=171.72445192494382, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:58:22,604] [INFO] [logging.py:96:log_dist] [Rank 0] step=5620, skipped=105, lr=[1.4204991046834278e-06, 1.4204991046834278e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:58:22,632] [INFO] [timer.py:199:stop] epoch=12/micro_step=100/global_step=5620, RunningAvgSamplesPerSec=171.73894681829805, CurrSamplesPerSec=171.76082201396235, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:58:30,077] [INFO] [logging.py:96:log_dist] [Rank 0] step=5630, skipped=105, lr=[1.4059359974126463e-06, 1.4059359974126463e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:58:30,106] [INFO] [timer.py:199:stop] epoch=12/micro_step=110/global_step=5630, RunningAvgSamplesPerSec=171.73838318846776, CurrSamplesPerSec=171.66262517677987, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:58:37,548] [INFO] [logging.py:96:log_dist] [Rank 0] step=5640, skipped=105, lr=[1.3914351847829423e-06, 1.3914351847829423e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:58:37,577] [INFO] [timer.py:199:stop] epoch=12/micro_step=120/global_step=5640, RunningAvgSamplesPerSec=171.73793313050354, CurrSamplesPerSec=171.67832473296323, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:58:45,018] [INFO] [logging.py:96:log_dist] [Rank 0] step=5650, skipped=105, lr=[1.376996930996128e-06, 1.376996930996128e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:58:45,046] [INFO] [timer.py:199:stop] epoch=12/micro_step=130/global_step=5650, RunningAvgSamplesPerSec=171.73756910612133, CurrSamplesPerSec=171.8122168977142, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:58:52,479] [INFO] [logging.py:96:log_dist] [Rank 0] step=5660, skipped=105, lr=[1.362621499114214e-06, 1.362621499114214e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:58:52,507] [INFO] [timer.py:199:stop] epoch=12/micro_step=140/global_step=5660, RunningAvgSamplesPerSec=171.73756198047468, CurrSamplesPerSec=171.72549556477048, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:58:59,949] [INFO] [logging.py:96:log_dist] [Rank 0] step=5670, skipped=105, lr=[1.3483091510546007e-06, 1.3483091510546007e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:58:59,978] [INFO] [timer.py:199:stop] epoch=12/micro_step=150/global_step=5670, RunningAvgSamplesPerSec=171.7371414811927, CurrSamplesPerSec=171.7521950857464, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:59:07,410] [INFO] [logging.py:96:log_dist] [Rank 0] step=5680, skipped=105, lr=[1.334060147585321e-06, 1.334060147585321e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:59:07,438] [INFO] [timer.py:199:stop] epoch=12/micro_step=160/global_step=5680, RunningAvgSamplesPerSec=171.73713778482912, CurrSamplesPerSec=171.80605888380495, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:59:14,874] [INFO] [logging.py:96:log_dist] [Rank 0] step=5690, skipped=105, lr=[1.3198747483202794e-06, 1.3198747483202794e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:59:14,903] [INFO] [timer.py:199:stop] epoch=12/micro_step=170/global_step=5690, RunningAvgSamplesPerSec=171.7369886095345, CurrSamplesPerSec=171.73780048545987, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:59:22,347] [INFO] [logging.py:96:log_dist] [Rank 0] step=5700, skipped=105, lr=[1.3057532117145263e-06, 1.3057532117145263e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:59:22,376] [INFO] [timer.py:199:stop] epoch=12/micro_step=180/global_step=5700, RunningAvgSamplesPerSec=171.73649604475673, CurrSamplesPerSec=171.59371285437268, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:59:26,815] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 02:59:27,522] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 02:59:29,733] [INFO] [logging.py:96:log_dist] [Rank 0] step=5710, skipped=107, lr=[1.2945021364837032e-06, 1.2945021364837032e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:59:29,761] [INFO] [timer.py:199:stop] epoch=12/micro_step=190/global_step=5710, RunningAvgSamplesPerSec=171.73951657895128, CurrSamplesPerSec=171.9639423100022, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:59:37,194] [INFO] [logging.py:96:log_dist] [Rank 0] step=5720, skipped=107, lr=[1.2804962002543011e-06, 1.2804962002543011e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:59:37,222] [INFO] [timer.py:199:stop] epoch=12/micro_step=200/global_step=5720, RunningAvgSamplesPerSec=171.7394687955089, CurrSamplesPerSec=171.49357047940305, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:59:44,667] [INFO] [logging.py:96:log_dist] [Rank 0] step=5730, skipped=107, lr=[1.2665548441532109e-06, 1.2665548441532109e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:59:44,696] [INFO] [timer.py:199:stop] epoch=12/micro_step=210/global_step=5730, RunningAvgSamplesPerSec=171.73894261459833, CurrSamplesPerSec=171.84543379799305, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:59:52,131] [INFO] [logging.py:96:log_dist] [Rank 0] step=5740, skipped=107, lr=[1.2526783221890675e-06, 1.2526783221890675e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:59:52,159] [INFO] [timer.py:199:stop] epoch=12/micro_step=220/global_step=5740, RunningAvgSamplesPerSec=171.73880511912438, CurrSamplesPerSec=171.39748460802952, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 02:59:59,600] [INFO] [logging.py:96:log_dist] [Rank 0] step=5750, skipped=107, lr=[1.2388668871892381e-06, 1.2388668871892381e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 02:59:59,629] [INFO] [timer.py:199:stop] epoch=12/micro_step=230/global_step=5750, RunningAvgSamplesPerSec=171.73846631258564, CurrSamplesPerSec=171.35880692826288, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:00:07,071] [INFO] [logging.py:96:log_dist] [Rank 0] step=5760, skipped=107, lr=[1.2251207907952224e-06, 1.2251207907952224e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:00:07,100] [INFO] [timer.py:199:stop] epoch=12/micro_step=240/global_step=5760, RunningAvgSamplesPerSec=171.73802810349193, CurrSamplesPerSec=171.1740298762532, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:00:14,542] [INFO] [logging.py:96:log_dist] [Rank 0] step=5770, skipped=107, lr=[1.2114402834580596e-06, 1.2114402834580596e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:00:14,570] [INFO] [timer.py:199:stop] epoch=12/micro_step=250/global_step=5770, RunningAvgSamplesPerSec=171.73763550432605, CurrSamplesPerSec=171.08184522197232, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:00:22,004] [INFO] [logging.py:96:log_dist] [Rank 0] step=5780, skipped=107, lr=[1.1978256144337731e-06, 1.1978256144337731e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:00:22,033] [INFO] [timer.py:199:stop] epoch=12/micro_step=260/global_step=5780, RunningAvgSamplesPerSec=171.73755551489234, CurrSamplesPerSec=171.79583315733626, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:00:29,468] [INFO] [logging.py:96:log_dist] [Rank 0] step=5790, skipped=107, lr=[1.1842770317788278e-06, 1.1842770317788278e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:00:29,497] [INFO] [timer.py:199:stop] epoch=12/micro_step=270/global_step=5790, RunningAvgSamplesPerSec=171.7374071219536, CurrSamplesPerSec=171.77412126969358, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:00:36,936] [INFO] [logging.py:96:log_dist] [Rank 0] step=5800, skipped=107, lr=[1.170794782345601e-06, 1.170794782345601e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:00:36,964] [INFO] [timer.py:199:stop] epoch=12/micro_step=280/global_step=5800, RunningAvgSamplesPerSec=171.73713270210422, CurrSamplesPerSec=171.76406420465634, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:00:42,899] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 03:00:43,612] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 03:00:44,332] [INFO] [logging.py:96:log_dist] [Rank 0] step=5810, skipped=109, lr=[1.1600569078320184e-06, 1.1600569078320184e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:00:44,360] [INFO] [timer.py:199:stop] epoch=12/micro_step=290/global_step=5810, RunningAvgSamplesPerSec=171.73967151954284, CurrSamplesPerSec=171.45495893832046, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:00:51,806] [INFO] [logging.py:96:log_dist] [Rank 0] step=5820, skipped=109, lr=[1.1466946764052567e-06, 1.1466946764052567e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:00:51,835] [INFO] [timer.py:199:stop] epoch=12/micro_step=300/global_step=5820, RunningAvgSamplesPerSec=171.73908999485695, CurrSamplesPerSec=171.3458453322882, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:00:59,275] [INFO] [logging.py:96:log_dist] [Rank 0] step=5830, skipped=109, lr=[1.1333994629429795e-06, 1.1333994629429795e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:00:59,304] [INFO] [timer.py:199:stop] epoch=12/micro_step=310/global_step=5830, RunningAvgSamplesPerSec=171.7387592139742, CurrSamplesPerSec=171.52754110302433, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:01:06,739] [INFO] [logging.py:96:log_dist] [Rank 0] step=5840, skipped=109, lr=[1.1201715096812372e-06, 1.1201715096812372e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:01:06,767] [INFO] [timer.py:199:stop] epoch=12/micro_step=320/global_step=5840, RunningAvgSamplesPerSec=171.73862158385722, CurrSamplesPerSec=171.9255040918181, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:01:14,208] [INFO] [logging.py:96:log_dist] [Rank 0] step=5850, skipped=109, lr=[1.1070110576306094e-06, 1.1070110576306094e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:01:14,237] [INFO] [timer.py:199:stop] epoch=12/micro_step=330/global_step=5850, RunningAvgSamplesPerSec=171.73826768283453, CurrSamplesPerSec=171.5985393037561, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:01:21,673] [INFO] [logging.py:96:log_dist] [Rank 0] step=5860, skipped=109, lr=[1.0939183465718164e-06, 1.0939183465718164e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:01:21,700] [INFO] [timer.py:199:stop] epoch=12/micro_step=340/global_step=5860, RunningAvgSamplesPerSec=171.7381494621715, CurrSamplesPerSec=171.81551601532823, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:01:29,140] [INFO] [logging.py:96:log_dist] [Rank 0] step=5870, skipped=109, lr=[1.0808936150513568e-06, 1.0808936150513568e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:01:29,169] [INFO] [timer.py:199:stop] epoch=12/micro_step=350/global_step=5870, RunningAvgSamplesPerSec=171.73784282074396, CurrSamplesPerSec=171.25992182672687, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:01:36,606] [INFO] [logging.py:96:log_dist] [Rank 0] step=5880, skipped=109, lr=[1.0679371003771527e-06, 1.0679371003771527e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:01:36,634] [INFO] [timer.py:199:stop] epoch=12/micro_step=360/global_step=5880, RunningAvgSamplesPerSec=171.7376374608741, CurrSamplesPerSec=171.79225993778792, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:01:44,075] [INFO] [logging.py:96:log_dist] [Rank 0] step=5890, skipped=109, lr=[1.055049038614228e-06, 1.055049038614228e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:01:44,103] [INFO] [timer.py:199:stop] epoch=12/micro_step=370/global_step=5890, RunningAvgSamplesPerSec=171.73728862848444, CurrSamplesPerSec=171.6551606608618, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:01:51,553] [INFO] [logging.py:96:log_dist] [Rank 0] step=5900, skipped=109, lr=[1.0422296645804113e-06, 1.0422296645804113e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:01:51,581] [INFO] [timer.py:199:stop] epoch=12/micro_step=380/global_step=5900, RunningAvgSamplesPerSec=171.73657262636672, CurrSamplesPerSec=171.37045763083546, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:01:59,011] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 03:01:59,012] [INFO] [logging.py:96:log_dist] [Rank 0] step=5910, skipped=110, lr=[1.030751149026882e-06, 1.030751149026882e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:01:59,013] [INFO] [timer.py:199:stop] epoch=12/micro_step=390/global_step=5910, RunningAvgSamplesPerSec=171.73769601482246, CurrSamplesPerSec=181.47922949121204, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:01:59,717] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 03:02:06,419] [INFO] [logging.py:96:log_dist] [Rank 0] step=5920, skipped=111, lr=[1.0193286291391376e-06, 1.0193286291391376e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:02:06,448] [INFO] [timer.py:199:stop] epoch=12/micro_step=400/global_step=5920, RunningAvgSamplesPerSec=171.73867918806894, CurrSamplesPerSec=171.3992356373929, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:02:13,886] [INFO] [logging.py:96:log_dist] [Rank 0] step=5930, skipped=111, lr=[1.006702819308814e-06, 1.006702819308814e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:02:13,914] [INFO] [timer.py:199:stop] epoch=12/micro_step=410/global_step=5930, RunningAvgSamplesPerSec=171.73842823684666, CurrSamplesPerSec=171.73999797829094, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:02:21,354] [INFO] [logging.py:96:log_dist] [Rank 0] step=5940, skipped=111, lr=[9.941465780657464e-07, 9.941465780657464e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:02:21,382] [INFO] [timer.py:199:stop] epoch=12/micro_step=420/global_step=5940, RunningAvgSamplesPerSec=171.73815009881073, CurrSamplesPerSec=171.5046916374075, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:02:28,829] [INFO] [logging.py:96:log_dist] [Rank 0] step=5950, skipped=111, lr=[9.816601341820594e-07, 9.816601341820594e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:02:28,857] [INFO] [timer.py:199:stop] epoch=12/micro_step=430/global_step=5950, RunningAvgSamplesPerSec=171.73756900015445, CurrSamplesPerSec=171.59321925553465, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:02:36,298] [INFO] [logging.py:96:log_dist] [Rank 0] step=5960, skipped=111, lr=[9.69243715158184e-07, 9.69243715158184e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:02:36,326] [INFO] [timer.py:199:stop] epoch=12/micro_step=440/global_step=5960, RunningAvgSamplesPerSec=171.7372392910247, CurrSamplesPerSec=170.82153797986302, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:02:43,768] [INFO] [logging.py:96:log_dist] [Rank 0] step=5970, skipped=111, lr=[9.568975472187164e-07, 9.568975472187164e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:02:43,796] [INFO] [timer.py:199:stop] epoch=12/micro_step=450/global_step=5970, RunningAvgSamplesPerSec=171.7368557980724, CurrSamplesPerSec=171.69688245657855, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:02:51,233] [INFO] [logging.py:96:log_dist] [Rank 0] step=5980, skipped=111, lr=[9.446218553082909e-07, 9.446218553082909e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:02:51,261] [INFO] [timer.py:199:stop] epoch=12/micro_step=460/global_step=5980, RunningAvgSamplesPerSec=171.73669628308278, CurrSamplesPerSec=171.40071309624756, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -***** Evaluating perplexity, Epoch 13/16 ***** -ppl: 1.8052407503128052 -Beginning of Epoch 14/16, Total Micro Batches 460 -[2023-04-18 03:03:06,930] [INFO] [logging.py:96:log_dist] [Rank 0] step=5990, skipped=111, lr=[9.32416863087481e-07, 9.32416863087481e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:03:06,959] [INFO] [timer.py:199:stop] epoch=13/micro_step=10/global_step=5990, RunningAvgSamplesPerSec=171.7346975125402, CurrSamplesPerSec=171.5644857292413, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:03:14,402] [INFO] [logging.py:96:log_dist] [Rank 0] step=6000, skipped=111, lr=[9.202827929287289e-07, 9.202827929287289e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:03:14,430] [INFO] [timer.py:199:stop] epoch=13/micro_step=20/global_step=6000, RunningAvgSamplesPerSec=171.73428977945105, CurrSamplesPerSec=171.67541515788344, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:03:21,872] [INFO] [logging.py:96:log_dist] [Rank 0] step=6010, skipped=111, lr=[9.082198659122924e-07, 9.082198659122924e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:03:21,901] [INFO] [timer.py:199:stop] epoch=13/micro_step=30/global_step=6010, RunningAvgSamplesPerSec=171.7339046916455, CurrSamplesPerSec=171.60671197917466, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:03:23,356] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 03:03:24,061] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 03:03:29,257] [INFO] [logging.py:96:log_dist] [Rank 0] step=6020, skipped=113, lr=[8.986208951001399e-07, 8.986208951001399e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:03:29,285] [INFO] [timer.py:199:stop] epoch=13/micro_step=40/global_step=6020, RunningAvgSamplesPerSec=171.7367992079931, CurrSamplesPerSec=171.9776586707191, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:03:36,724] [INFO] [logging.py:96:log_dist] [Rank 0] step=6030, skipped=113, lr=[8.866865787220428e-07, 8.866865787220428e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:03:36,752] [INFO] [timer.py:199:stop] epoch=13/micro_step=50/global_step=6030, RunningAvgSamplesPerSec=171.73655883391564, CurrSamplesPerSec=171.78423447882625, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:03:44,199] [INFO] [logging.py:96:log_dist] [Rank 0] step=6040, skipped=113, lr=[8.74824017602356e-07, 8.74824017602356e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:03:44,227] [INFO] [timer.py:199:stop] epoch=13/micro_step=60/global_step=6040, RunningAvgSamplesPerSec=171.73600225355077, CurrSamplesPerSec=171.13245266587742, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:03:51,664] [INFO] [logging.py:96:log_dist] [Rank 0] step=6050, skipped=113, lr=[8.630334278744954e-07, 8.630334278744954e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:03:51,692] [INFO] [timer.py:199:stop] epoch=13/micro_step=70/global_step=6050, RunningAvgSamplesPerSec=171.7358173582239, CurrSamplesPerSec=171.76087696532733, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:03:59,138] [INFO] [logging.py:96:log_dist] [Rank 0] step=6060, skipped=113, lr=[8.513150243605734e-07, 8.513150243605734e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:03:59,163] [INFO] [timer.py:199:stop] epoch=13/micro_step=80/global_step=6060, RunningAvgSamplesPerSec=171.73542933842222, CurrSamplesPerSec=171.70522924961486, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:04:06,607] [INFO] [logging.py:96:log_dist] [Rank 0] step=6070, skipped=113, lr=[8.396690205674879e-07, 8.396690205674879e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:04:06,636] [INFO] [timer.py:199:stop] epoch=13/micro_step=90/global_step=6070, RunningAvgSamplesPerSec=171.73496955803617, CurrSamplesPerSec=171.93805795225379, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:04:14,083] [INFO] [logging.py:96:log_dist] [Rank 0] step=6080, skipped=113, lr=[8.280956286830244e-07, 8.280956286830244e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:04:14,111] [INFO] [timer.py:199:stop] epoch=13/micro_step=100/global_step=6080, RunningAvgSamplesPerSec=171.7344119807452, CurrSamplesPerSec=171.20776481558372, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:04:21,559] [INFO] [logging.py:96:log_dist] [Rank 0] step=6090, skipped=113, lr=[8.165950595719979e-07, 8.165950595719979e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:04:21,587] [INFO] [timer.py:199:stop] epoch=13/micro_step=110/global_step=6090, RunningAvgSamplesPerSec=171.73382230200735, CurrSamplesPerSec=171.51526629275654, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:04:29,031] [INFO] [logging.py:96:log_dist] [Rank 0] step=6100, skipped=113, lr=[8.051675227724063e-07, 8.051675227724063e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:04:29,060] [INFO] [timer.py:199:stop] epoch=13/micro_step=120/global_step=6100, RunningAvgSamplesPerSec=171.73340654232513, CurrSamplesPerSec=171.0015782431144, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:04:36,505] [INFO] [logging.py:96:log_dist] [Rank 0] step=6110, skipped=113, lr=[7.938132264916119e-07, 7.938132264916119e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:04:36,534] [INFO] [timer.py:199:stop] epoch=13/micro_step=130/global_step=6110, RunningAvgSamplesPerSec=171.733011059768, CurrSamplesPerSec=171.62871076485848, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:04:39,479] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 03:04:40,183] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 03:04:43,894] [INFO] [logging.py:96:log_dist] [Rank 0] step=6120, skipped=115, lr=[7.847826617040572e-07, 7.847826617040572e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:04:43,922] [INFO] [timer.py:199:stop] epoch=13/micro_step=140/global_step=6120, RunningAvgSamplesPerSec=171.73576145242205, CurrSamplesPerSec=171.2751653186803, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:04:51,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=6130, skipped=115, lr=[7.735607187777599e-07, 7.735607187777599e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:04:51,389] [INFO] [timer.py:199:stop] epoch=13/micro_step=150/global_step=6130, RunningAvgSamplesPerSec=171.73551913106564, CurrSamplesPerSec=171.6807952043173, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:04:58,830] [INFO] [logging.py:96:log_dist] [Rank 0] step=6140, skipped=115, lr=[7.624125922397105e-07, 7.624125922397105e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:04:58,859] [INFO] [timer.py:199:stop] epoch=13/micro_step=160/global_step=6140, RunningAvgSamplesPerSec=171.73519190391022, CurrSamplesPerSec=170.97875977790983, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:05:06,305] [INFO] [logging.py:96:log_dist] [Rank 0] step=6150, skipped=115, lr=[7.513384852064781e-07, 7.513384852064781e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:05:06,334] [INFO] [timer.py:199:stop] epoch=13/micro_step=170/global_step=6150, RunningAvgSamplesPerSec=171.73466179259972, CurrSamplesPerSec=171.59042224909095, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:05:13,774] [INFO] [logging.py:96:log_dist] [Rank 0] step=6160, skipped=115, lr=[7.403385994460072e-07, 7.403385994460072e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:05:13,802] [INFO] [timer.py:199:stop] epoch=13/micro_step=180/global_step=6160, RunningAvgSamplesPerSec=171.73436820196, CurrSamplesPerSec=171.9448862600938, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:05:21,249] [INFO] [logging.py:96:log_dist] [Rank 0] step=6170, skipped=115, lr=[7.294131353739503e-07, 7.294131353739503e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:05:21,277] [INFO] [timer.py:199:stop] epoch=13/micro_step=190/global_step=6170, RunningAvgSamplesPerSec=171.7338223310318, CurrSamplesPerSec=171.66586366133342, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:05:28,711] [INFO] [logging.py:96:log_dist] [Rank 0] step=6180, skipped=115, lr=[7.185622920500073e-07, 7.185622920500073e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:05:28,740] [INFO] [timer.py:199:stop] epoch=13/micro_step=200/global_step=6180, RunningAvgSamplesPerSec=171.73374898324641, CurrSamplesPerSec=171.71006197441127, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:05:36,180] [INFO] [logging.py:96:log_dist] [Rank 0] step=6190, skipped=115, lr=[7.077862671743073e-07, 7.077862671743073e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:05:36,208] [INFO] [timer.py:199:stop] epoch=13/micro_step=210/global_step=6190, RunningAvgSamplesPerSec=171.7334633543434, CurrSamplesPerSec=171.80974264264677, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:05:43,646] [INFO] [logging.py:96:log_dist] [Rank 0] step=6200, skipped=115, lr=[6.970852570838024e-07, 6.970852570838024e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:05:43,674] [INFO] [timer.py:199:stop] epoch=13/micro_step=220/global_step=6200, RunningAvgSamplesPerSec=171.73326930904045, CurrSamplesPerSec=171.48431307851942, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:05:51,111] [INFO] [logging.py:96:log_dist] [Rank 0] step=6210, skipped=115, lr=[6.864594567486877e-07, 6.864594567486877e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:05:51,140] [INFO] [timer.py:199:stop] epoch=13/micro_step=230/global_step=6210, RunningAvgSamplesPerSec=171.73308865819607, CurrSamplesPerSec=171.32020137038057, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:05:55,580] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 03:05:56,286] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 03:05:58,499] [INFO] [logging.py:96:log_dist] [Rank 0] step=6220, skipped=117, lr=[6.780130976497363e-07, 6.780130976497363e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:05:58,527] [INFO] [timer.py:199:stop] epoch=13/micro_step=240/global_step=6220, RunningAvgSamplesPerSec=171.73578340405777, CurrSamplesPerSec=171.21098616046615, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:06:05,978] [INFO] [logging.py:96:log_dist] [Rank 0] step=6230, skipped=117, lr=[6.675231618229537e-07, 6.675231618229537e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:06:06,006] [INFO] [timer.py:199:stop] epoch=13/micro_step=250/global_step=6230, RunningAvgSamplesPerSec=171.73508496762736, CurrSamplesPerSec=171.1572219572597, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:06:13,440] [INFO] [logging.py:96:log_dist] [Rank 0] step=6240, skipped=117, lr=[6.571089743668406e-07, 6.571089743668406e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:06:13,469] [INFO] [timer.py:199:stop] epoch=13/micro_step=260/global_step=6240, RunningAvgSamplesPerSec=171.73501401648412, CurrSamplesPerSec=171.35093129441594, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:06:20,909] [INFO] [logging.py:96:log_dist] [Rank 0] step=6250, skipped=117, lr=[6.467707250257478e-07, 6.467707250257478e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:06:20,937] [INFO] [timer.py:199:stop] epoch=13/micro_step=270/global_step=6250, RunningAvgSamplesPerSec=171.7347295588175, CurrSamplesPerSec=171.6286558980287, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:06:28,368] [INFO] [logging.py:96:log_dist] [Rank 0] step=6260, skipped=117, lr=[6.365086021604447e-07, 6.365086021604447e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:06:28,396] [INFO] [timer.py:199:stop] epoch=13/micro_step=280/global_step=6260, RunningAvgSamplesPerSec=171.7347786471707, CurrSamplesPerSec=171.911851096866, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:06:35,852] [INFO] [logging.py:96:log_dist] [Rank 0] step=6270, skipped=117, lr=[6.263227927446931e-07, 6.263227927446931e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:06:35,880] [INFO] [timer.py:199:stop] epoch=13/micro_step=290/global_step=6270, RunningAvgSamplesPerSec=171.73397608628198, CurrSamplesPerSec=171.06014990638818, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:06:43,321] [INFO] [logging.py:96:log_dist] [Rank 0] step=6280, skipped=117, lr=[6.162134823618406e-07, 6.162134823618406e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:06:43,349] [INFO] [timer.py:199:stop] epoch=13/micro_step=300/global_step=6280, RunningAvgSamplesPerSec=171.73368823068452, CurrSamplesPerSec=171.78896171320855, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:06:50,777] [INFO] [logging.py:96:log_dist] [Rank 0] step=6290, skipped=117, lr=[6.061808552014389e-07, 6.061808552014389e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:06:50,806] [INFO] [timer.py:199:stop] epoch=13/micro_step=310/global_step=6290, RunningAvgSamplesPerSec=171.73383949640157, CurrSamplesPerSec=171.7310984796384, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:06:58,242] [INFO] [logging.py:96:log_dist] [Rank 0] step=6300, skipped=117, lr=[5.962250940558841e-07, 5.962250940558841e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:06:58,270] [INFO] [timer.py:199:stop] epoch=13/micro_step=320/global_step=6300, RunningAvgSamplesPerSec=171.73369913485024, CurrSamplesPerSec=171.50699274673636, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:07:05,717] [INFO] [logging.py:96:log_dist] [Rank 0] step=6310, skipped=117, lr=[5.863463803170926e-07, 5.863463803170926e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:07:05,743] [INFO] [timer.py:199:stop] epoch=13/micro_step=330/global_step=6310, RunningAvgSamplesPerSec=171.73326851697558, CurrSamplesPerSec=171.36203395954564, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:07:11,668] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 03:07:12,375] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 03:07:13,094] [INFO] [logging.py:96:log_dist] [Rank 0] step=6320, skipped=119, lr=[5.784990044582024e-07, 5.784990044582024e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:07:13,122] [INFO] [timer.py:199:stop] epoch=13/micro_step=340/global_step=6320, RunningAvgSamplesPerSec=171.73623909280812, CurrSamplesPerSec=171.8561605400846, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:07:20,563] [INFO] [logging.py:96:log_dist] [Rank 0] step=6330, skipped=119, lr=[5.687594286762253e-07, 5.687594286762253e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:07:20,592] [INFO] [timer.py:199:stop] epoch=13/micro_step=350/global_step=6330, RunningAvgSamplesPerSec=171.73587066745097, CurrSamplesPerSec=171.30593374769344, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:07:28,031] [INFO] [logging.py:96:log_dist] [Rank 0] step=6340, skipped=119, lr=[5.590974007197709e-07, 5.590974007197709e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:07:28,060] [INFO] [timer.py:199:stop] epoch=13/micro_step=360/global_step=6340, RunningAvgSamplesPerSec=171.73559018615515, CurrSamplesPerSec=171.822609547249, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:07:35,496] [INFO] [logging.py:96:log_dist] [Rank 0] step=6350, skipped=119, lr=[5.495130966289967e-07, 5.495130966289967e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:07:35,525] [INFO] [timer.py:199:stop] epoch=13/micro_step=370/global_step=6350, RunningAvgSamplesPerSec=171.7354375164566, CurrSamplesPerSec=171.75917348936358, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:07:42,969] [INFO] [logging.py:96:log_dist] [Rank 0] step=6360, skipped=119, lr=[5.400066910279462e-07, 5.400066910279462e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:07:42,997] [INFO] [timer.py:199:stop] epoch=13/micro_step=380/global_step=6360, RunningAvgSamplesPerSec=171.73500829387493, CurrSamplesPerSec=171.56201860409527, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:07:50,443] [INFO] [logging.py:96:log_dist] [Rank 0] step=6370, skipped=119, lr=[5.305783571213679e-07, 5.305783571213679e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:07:50,471] [INFO] [timer.py:199:stop] epoch=13/micro_step=390/global_step=6370, RunningAvgSamplesPerSec=171.7345397985669, CurrSamplesPerSec=171.4693609266299, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:07:57,916] [INFO] [logging.py:96:log_dist] [Rank 0] step=6380, skipped=119, lr=[5.212282666915628e-07, 5.212282666915628e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:07:57,945] [INFO] [timer.py:199:stop] epoch=13/micro_step=400/global_step=6380, RunningAvgSamplesPerSec=171.73408192199375, CurrSamplesPerSec=172.163016328634, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:08:05,382] [INFO] [logging.py:96:log_dist] [Rank 0] step=6390, skipped=119, lr=[5.119565900952507e-07, 5.119565900952507e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:08:05,410] [INFO] [timer.py:199:stop] epoch=13/micro_step=410/global_step=6390, RunningAvgSamplesPerSec=171.733908063191, CurrSamplesPerSec=171.93123018672634, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:08:12,857] [INFO] [logging.py:96:log_dist] [Rank 0] step=6400, skipped=119, lr=[5.027634962604662e-07, 5.027634962604662e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:08:12,886] [INFO] [timer.py:199:stop] epoch=13/micro_step=420/global_step=6400, RunningAvgSamplesPerSec=171.73336492751193, CurrSamplesPerSec=171.23282898402624, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:08:20,329] [INFO] [logging.py:96:log_dist] [Rank 0] step=6410, skipped=119, lr=[4.936491526834829e-07, 4.936491526834829e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:08:20,358] [INFO] [timer.py:199:stop] epoch=13/micro_step=430/global_step=6410, RunningAvgSamplesPerSec=171.7329633273394, CurrSamplesPerSec=172.0033345272872, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:08:27,783] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 03:08:27,784] [INFO] [logging.py:96:log_dist] [Rank 0] step=6420, skipped=120, lr=[4.855137122141133e-07, 4.855137122141133e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:08:27,785] [INFO] [timer.py:199:stop] epoch=13/micro_step=440/global_step=6420, RunningAvgSamplesPerSec=171.73418251717703, CurrSamplesPerSec=181.03360717456613, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:08:28,488] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 03:08:35,174] [INFO] [logging.py:96:log_dist] [Rank 0] step=6430, skipped=121, lr=[4.774423140449176e-07, 4.774423140449176e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:08:35,202] [INFO] [timer.py:199:stop] epoch=13/micro_step=450/global_step=6430, RunningAvgSamplesPerSec=171.73573853246734, CurrSamplesPerSec=172.07059090492433, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:08:42,632] [INFO] [logging.py:96:log_dist] [Rank 0] step=6440, skipped=121, lr=[4.685493500452524e-07, 4.685493500452524e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:08:42,661] [INFO] [timer.py:199:stop] epoch=13/micro_step=460/global_step=6440, RunningAvgSamplesPerSec=171.7358202727627, CurrSamplesPerSec=171.58367690279957, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -***** Evaluating perplexity, Epoch 14/16 ***** -ppl: 1.8033154010772705 -Beginning of Epoch 15/16, Total Micro Batches 460 -[2023-04-18 03:08:58,302] [INFO] [logging.py:96:log_dist] [Rank 0] step=6450, skipped=121, lr=[4.5973575967807865e-07, 4.5973575967807865e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:08:58,331] [INFO] [timer.py:199:stop] epoch=14/micro_step=10/global_step=6450, RunningAvgSamplesPerSec=171.7353133785698, CurrSamplesPerSec=171.74323938286807, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:09:05,761] [INFO] [logging.py:96:log_dist] [Rank 0] step=6460, skipped=121, lr=[4.510017035251946e-07, 4.510017035251946e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:09:05,789] [INFO] [timer.py:199:stop] epoch=14/micro_step=20/global_step=6460, RunningAvgSamplesPerSec=171.7353784510775, CurrSamplesPerSec=172.01341963199502, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:09:13,226] [INFO] [logging.py:96:log_dist] [Rank 0] step=6470, skipped=121, lr=[4.4234734071930136e-07, 4.4234734071930136e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:09:13,254] [INFO] [timer.py:199:stop] epoch=14/micro_step=30/global_step=6470, RunningAvgSamplesPerSec=171.73521709334568, CurrSamplesPerSec=171.71242351842434, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:09:20,686] [INFO] [logging.py:96:log_dist] [Rank 0] step=6480, skipped=121, lr=[4.337728289411066e-07, 4.337728289411066e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:09:20,715] [INFO] [timer.py:199:stop] epoch=14/micro_step=40/global_step=6480, RunningAvgSamplesPerSec=171.73520708036466, CurrSamplesPerSec=171.46054421312465, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:09:28,152] [INFO] [logging.py:96:log_dist] [Rank 0] step=6490, skipped=121, lr=[4.2527832441644477e-07, 4.2527832441644477e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:09:28,181] [INFO] [timer.py:199:stop] epoch=14/micro_step=50/global_step=6490, RunningAvgSamplesPerSec=171.73501927954905, CurrSamplesPerSec=171.61971307362978, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:09:35,631] [INFO] [logging.py:96:log_dist] [Rank 0] step=6500, skipped=121, lr=[4.1686398191343745e-07, 4.1686398191343745e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:09:35,659] [INFO] [timer.py:199:stop] epoch=14/micro_step=60/global_step=6500, RunningAvgSamplesPerSec=171.73436921354389, CurrSamplesPerSec=171.5585647480934, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:09:43,097] [INFO] [logging.py:96:log_dist] [Rank 0] step=6510, skipped=121, lr=[4.085299547396713e-07, 4.085299547396713e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:09:43,125] [INFO] [timer.py:199:stop] epoch=14/micro_step=70/global_step=6510, RunningAvgSamplesPerSec=171.73418014749845, CurrSamplesPerSec=171.0206981615792, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:09:50,562] [INFO] [logging.py:96:log_dist] [Rank 0] step=6520, skipped=121, lr=[4.002763947394002e-07, 4.002763947394002e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:09:50,589] [INFO] [timer.py:199:stop] epoch=14/micro_step=80/global_step=6520, RunningAvgSamplesPerSec=171.7340655172397, CurrSamplesPerSec=171.770439038187, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:09:52,040] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 03:09:52,744] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 03:09:57,946] [INFO] [logging.py:96:log_dist] [Rank 0] step=6530, skipped=123, lr=[3.937315842075559e-07, 3.937315842075559e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:09:57,974] [INFO] [timer.py:199:stop] epoch=14/micro_step=90/global_step=6530, RunningAvgSamplesPerSec=171.7367348580179, CurrSamplesPerSec=170.94696567820492, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:10:05,421] [INFO] [logging.py:96:log_dist] [Rank 0] step=6540, skipped=123, lr=[3.8562324308558007e-07, 3.8562324308558007e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:10:05,449] [INFO] [timer.py:199:stop] epoch=14/micro_step=100/global_step=6540, RunningAvgSamplesPerSec=171.7363299730167, CurrSamplesPerSec=171.49088627803727, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:10:12,886] [INFO] [logging.py:96:log_dist] [Rank 0] step=6550, skipped=123, lr=[3.775957864926619e-07, 3.775957864926619e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:10:12,915] [INFO] [timer.py:199:stop] epoch=14/micro_step=110/global_step=6550, RunningAvgSamplesPerSec=171.73617366216908, CurrSamplesPerSec=171.4696895173293, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:10:20,355] [INFO] [logging.py:96:log_dist] [Rank 0] step=6560, skipped=123, lr=[3.6964936068740814e-07, 3.6964936068740814e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:10:20,383] [INFO] [timer.py:199:stop] epoch=14/micro_step=120/global_step=6560, RunningAvgSamplesPerSec=171.73590484123358, CurrSamplesPerSec=170.922148368081, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:10:27,827] [INFO] [logging.py:96:log_dist] [Rank 0] step=6570, skipped=123, lr=[3.617841104520558e-07, 3.617841104520558e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:10:27,855] [INFO] [timer.py:199:stop] epoch=14/micro_step=130/global_step=6570, RunningAvgSamplesPerSec=171.73549040164266, CurrSamplesPerSec=171.76071211133794, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:10:35,287] [INFO] [logging.py:96:log_dist] [Rank 0] step=6580, skipped=123, lr=[3.540001790898436e-07, 3.540001790898436e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:10:35,315] [INFO] [timer.py:199:stop] epoch=14/micro_step=140/global_step=6580, RunningAvgSamplesPerSec=171.73550602287855, CurrSamplesPerSec=171.96289576991276, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:10:42,751] [INFO] [logging.py:96:log_dist] [Rank 0] step=6590, skipped=123, lr=[3.4629770842239534e-07, 3.4629770842239534e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:10:42,779] [INFO] [timer.py:199:stop] epoch=14/micro_step=150/global_step=6590, RunningAvgSamplesPerSec=171.73541443650893, CurrSamplesPerSec=171.78395964762063, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:10:50,216] [INFO] [logging.py:96:log_dist] [Rank 0] step=6600, skipped=123, lr=[3.3867683878713817e-07, 3.3867683878713817e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:10:50,244] [INFO] [timer.py:199:stop] epoch=14/micro_step=160/global_step=6600, RunningAvgSamplesPerSec=171.73525182391054, CurrSamplesPerSec=171.57232617573678, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:10:57,681] [INFO] [logging.py:96:log_dist] [Rank 0] step=6610, skipped=123, lr=[3.311377090347465e-07, 3.311377090347465e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:10:57,709] [INFO] [timer.py:199:stop] epoch=14/micro_step=170/global_step=6610, RunningAvgSamplesPerSec=171.7350987570702, CurrSamplesPerSec=171.30500452136977, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:11:05,148] [INFO] [logging.py:96:log_dist] [Rank 0] step=6620, skipped=123, lr=[3.2368045652660754e-07, 3.2368045652660754e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:11:05,176] [INFO] [timer.py:199:stop] epoch=14/micro_step=180/global_step=6620, RunningAvgSamplesPerSec=171.73489319389734, CurrSamplesPerSec=171.71555403735726, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:11:08,125] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 03:11:08,830] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 03:11:12,531] [INFO] [logging.py:96:log_dist] [Rank 0] step=6630, skipped=125, lr=[3.1777369749052005e-07, 3.1777369749052005e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:11:12,559] [INFO] [timer.py:199:stop] epoch=14/micro_step=190/global_step=6630, RunningAvgSamplesPerSec=171.73759254215886, CurrSamplesPerSec=171.98168034789015, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:11:19,993] [INFO] [logging.py:96:log_dist] [Rank 0] step=6640, skipped=125, lr=[3.104641654093771e-07, 3.104641654093771e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:11:20,021] [INFO] [timer.py:199:stop] epoch=14/micro_step=200/global_step=6640, RunningAvgSamplesPerSec=171.73754952198223, CurrSamplesPerSec=171.2910126849638, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:11:27,455] [INFO] [logging.py:96:log_dist] [Rank 0] step=6650, skipped=125, lr=[3.0323688724018915e-07, 3.0323688724018915e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:11:27,483] [INFO] [timer.py:199:stop] epoch=14/micro_step=210/global_step=6650, RunningAvgSamplesPerSec=171.73748549295723, CurrSamplesPerSec=172.00327942067412, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:11:34,908] [INFO] [logging.py:96:log_dist] [Rank 0] step=6660, skipped=125, lr=[2.9609199466247525e-07, 2.9609199466247525e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:11:34,936] [INFO] [timer.py:199:stop] epoch=14/micro_step=220/global_step=6660, RunningAvgSamplesPerSec=171.73774697631964, CurrSamplesPerSec=172.16009030156104, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:11:42,379] [INFO] [logging.py:96:log_dist] [Rank 0] step=6670, skipped=125, lr=[2.890296178547039e-07, 2.890296178547039e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:11:42,407] [INFO] [timer.py:199:stop] epoch=14/micro_step=230/global_step=6670, RunningAvgSamplesPerSec=171.73741243674166, CurrSamplesPerSec=171.3351821983026, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:11:49,851] [INFO] [logging.py:96:log_dist] [Rank 0] step=6680, skipped=125, lr=[2.8204988549192515e-07, 2.8204988549192515e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:11:49,879] [INFO] [timer.py:199:stop] epoch=14/micro_step=240/global_step=6680, RunningAvgSamplesPerSec=171.73698874688463, CurrSamplesPerSec=171.5165265718062, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:11:57,310] [INFO] [logging.py:96:log_dist] [Rank 0] step=6690, skipped=125, lr=[2.751529247434222e-07, 2.751529247434222e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:11:57,338] [INFO] [timer.py:199:stop] epoch=14/micro_step=250/global_step=6690, RunningAvgSamplesPerSec=171.73704306496762, CurrSamplesPerSec=171.47571390334878, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:12:04,780] [INFO] [logging.py:96:log_dist] [Rank 0] step=6700, skipped=125, lr=[2.6833886127039926e-07, 2.6833886127039926e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:12:04,808] [INFO] [timer.py:199:stop] epoch=14/micro_step=260/global_step=6700, RunningAvgSamplesPerSec=171.7367445327316, CurrSamplesPerSec=171.7134120910191, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:12:12,247] [INFO] [logging.py:96:log_dist] [Rank 0] step=6710, skipped=125, lr=[2.616078192236859e-07, 2.616078192236859e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:12:12,275] [INFO] [timer.py:199:stop] epoch=14/micro_step=270/global_step=6710, RunningAvgSamplesPerSec=171.736517502258, CurrSamplesPerSec=171.87519692765645, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:12:19,728] [INFO] [logging.py:96:log_dist] [Rank 0] step=6720, skipped=125, lr=[2.549599212414806e-07, 2.549599212414806e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:12:19,756] [INFO] [timer.py:199:stop] epoch=14/micro_step=280/global_step=6720, RunningAvgSamplesPerSec=171.73582920920313, CurrSamplesPerSec=169.86720657559974, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:12:24,192] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 03:12:24,898] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 03:12:27,111] [INFO] [logging.py:96:log_dist] [Rank 0] step=6730, skipped=127, lr=[2.497015480279977e-07, 2.497015480279977e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:12:27,139] [INFO] [timer.py:199:stop] epoch=14/micro_step=290/global_step=6730, RunningAvgSamplesPerSec=171.73846396526474, CurrSamplesPerSec=171.35574409574227, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:12:34,575] [INFO] [logging.py:96:log_dist] [Rank 0] step=6740, skipped=127, lr=[2.432036135732997e-07, 2.432036135732997e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:12:34,604] [INFO] [timer.py:199:stop] epoch=14/micro_step=300/global_step=6740, RunningAvgSamplesPerSec=171.73830993482335, CurrSamplesPerSec=171.89021994431596, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:12:42,036] [INFO] [logging.py:96:log_dist] [Rank 0] step=6750, skipped=127, lr=[2.367891585039419e-07, 2.367891585039419e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:12:42,065] [INFO] [timer.py:199:stop] epoch=14/micro_step=310/global_step=6750, RunningAvgSamplesPerSec=171.73829916615932, CurrSamplesPerSec=171.51428000033223, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:12:49,508] [INFO] [logging.py:96:log_dist] [Rank 0] step=6760, skipped=127, lr=[2.3045829968997375e-07, 2.3045829968997375e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:12:49,536] [INFO] [timer.py:199:stop] epoch=14/micro_step=320/global_step=6760, RunningAvgSamplesPerSec=171.73791856919163, CurrSamplesPerSec=171.41286205705302, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:12:56,978] [INFO] [logging.py:96:log_dist] [Rank 0] step=6770, skipped=127, lr=[2.2421115247833633e-07, 2.2421115247833633e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:12:57,006] [INFO] [timer.py:199:stop] epoch=14/micro_step=330/global_step=6770, RunningAvgSamplesPerSec=171.73758817452617, CurrSamplesPerSec=171.34622813613458, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:13:04,451] [INFO] [logging.py:96:log_dist] [Rank 0] step=6780, skipped=127, lr=[2.1804783069076385e-07, 2.1804783069076385e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:13:04,479] [INFO] [timer.py:199:stop] epoch=14/micro_step=340/global_step=6780, RunningAvgSamplesPerSec=171.73716609873904, CurrSamplesPerSec=171.61005806417586, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:13:11,917] [INFO] [logging.py:96:log_dist] [Rank 0] step=6790, skipped=127, lr=[2.11968446621708e-07, 2.11968446621708e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:13:11,946] [INFO] [timer.py:199:stop] epoch=14/micro_step=350/global_step=6790, RunningAvgSamplesPerSec=171.7369677643235, CurrSamplesPerSec=171.91300711321, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:13:19,394] [INFO] [logging.py:96:log_dist] [Rank 0] step=6800, skipped=127, lr=[2.0597311103629377e-07, 2.0597311103629377e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:13:19,422] [INFO] [timer.py:199:stop] epoch=14/micro_step=360/global_step=6800, RunningAvgSamplesPerSec=171.73644811626474, CurrSamplesPerSec=171.76615246007796, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:13:26,857] [INFO] [logging.py:96:log_dist] [Rank 0] step=6810, skipped=127, lr=[2.0006193316829777e-07, 2.0006193316829777e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:13:26,886] [INFO] [timer.py:199:stop] epoch=14/micro_step=370/global_step=6810, RunningAvgSamplesPerSec=171.73632778841824, CurrSamplesPerSec=171.61691520341654, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:13:34,323] [INFO] [logging.py:96:log_dist] [Rank 0] step=6820, skipped=127, lr=[1.94235020718163e-07, 1.94235020718163e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:13:34,352] [INFO] [timer.py:199:stop] epoch=14/micro_step=380/global_step=6820, RunningAvgSamplesPerSec=171.73614230678078, CurrSamplesPerSec=171.89478790222887, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:13:40,280] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 03:13:40,987] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 03:13:41,706] [INFO] [logging.py:96:log_dist] [Rank 0] step=6830, skipped=129, lr=[1.89634233253527e-07, 1.89634233253527e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:13:41,734] [INFO] [timer.py:199:stop] epoch=14/micro_step=390/global_step=6830, RunningAvgSamplesPerSec=171.73879178012962, CurrSamplesPerSec=171.8159009206362, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:13:49,163] [INFO] [logging.py:96:log_dist] [Rank 0] step=6840, skipped=129, lr=[1.839592650587469e-07, 1.839592650587469e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:13:49,191] [INFO] [timer.py:199:stop] epoch=14/micro_step=400/global_step=6840, RunningAvgSamplesPerSec=171.73888621760463, CurrSamplesPerSec=171.4751114456975, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:13:56,627] [INFO] [logging.py:96:log_dist] [Rank 0] step=6850, skipped=129, lr=[1.783688556691196e-07, 1.783688556691196e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:13:56,656] [INFO] [timer.py:199:stop] epoch=14/micro_step=410/global_step=6850, RunningAvgSamplesPerSec=171.73874259586333, CurrSamplesPerSec=171.2383997958679, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:14:04,093] [INFO] [logging.py:96:log_dist] [Rank 0] step=6860, skipped=129, lr=[1.7286310694075282e-07, 1.7286310694075282e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:14:04,121] [INFO] [timer.py:199:stop] epoch=14/micro_step=420/global_step=6860, RunningAvgSamplesPerSec=171.73855397275662, CurrSamplesPerSec=171.5890512008049, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:14:11,559] [INFO] [logging.py:96:log_dist] [Rank 0] step=6870, skipped=129, lr=[1.6744211918725136e-07, 1.6744211918725136e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:14:11,588] [INFO] [timer.py:199:stop] epoch=14/micro_step=430/global_step=6870, RunningAvgSamplesPerSec=171.7383545247978, CurrSamplesPerSec=171.93293707726994, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:14:19,027] [INFO] [logging.py:96:log_dist] [Rank 0] step=6880, skipped=129, lr=[1.6210599117789524e-07, 1.6210599117789524e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:14:19,056] [INFO] [timer.py:199:stop] epoch=14/micro_step=440/global_step=6880, RunningAvgSamplesPerSec=171.7381153212602, CurrSamplesPerSec=171.20159545954763, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:14:26,497] [INFO] [logging.py:96:log_dist] [Rank 0] step=6890, skipped=129, lr=[1.568548201358361e-07, 1.568548201358361e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:14:26,525] [INFO] [timer.py:199:stop] epoch=14/micro_step=450/global_step=6890, RunningAvgSamplesPerSec=171.73781559143598, CurrSamplesPerSec=171.71511465939298, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:14:33,957] [INFO] [logging.py:96:log_dist] [Rank 0] step=6900, skipped=129, lr=[1.5168870173632736e-07, 1.5168870173632736e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:14:33,985] [INFO] [timer.py:199:stop] epoch=14/micro_step=460/global_step=6900, RunningAvgSamplesPerSec=171.73782218773627, CurrSamplesPerSec=172.13905895849746, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -***** Evaluating perplexity, Epoch 15/16 ***** -ppl: 1.805456280708313 -Beginning of Epoch 16/16, Total Micro Batches 460 -[2023-04-18 03:14:49,644] [INFO] [logging.py:96:log_dist] [Rank 0] step=6910, skipped=129, lr=[1.4660773010498093e-07, 1.4660773010498093e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:14:49,672] [INFO] [timer.py:199:stop] epoch=15/micro_step=10/global_step=6910, RunningAvgSamplesPerSec=171.73433678088838, CurrSamplesPerSec=171.78434441155474, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:14:57,108] [INFO] [logging.py:96:log_dist] [Rank 0] step=6920, skipped=129, lr=[1.4161199781605266e-07, 1.4161199781605266e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:14:57,136] [INFO] [timer.py:199:stop] epoch=15/micro_step=20/global_step=6920, RunningAvgSamplesPerSec=171.73421596581719, CurrSamplesPerSec=171.52556824716868, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:15:04,569] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 03:15:04,570] [INFO] [logging.py:96:log_dist] [Rank 0] step=6930, skipped=130, lr=[1.371887936549325e-07, 1.371887936549325e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:15:04,571] [INFO] [timer.py:199:stop] epoch=15/micro_step=30/global_step=6930, RunningAvgSamplesPerSec=171.73508886538318, CurrSamplesPerSec=181.34161287598587, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:15:05,277] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 03:15:11,960] [INFO] [logging.py:96:log_dist] [Rank 0] step=6940, skipped=131, lr=[1.328347723861031e-07, 1.328347723861031e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:15:11,989] [INFO] [timer.py:199:stop] epoch=15/micro_step=40/global_step=6940, RunningAvgSamplesPerSec=171.73649562005025, CurrSamplesPerSec=171.92341195997224, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:15:19,426] [INFO] [logging.py:96:log_dist] [Rank 0] step=6950, skipped=131, lr=[1.2807818952514453e-07, 1.2807818952514453e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:15:19,455] [INFO] [timer.py:199:stop] epoch=15/micro_step=50/global_step=6950, RunningAvgSamplesPerSec=171.7363009534403, CurrSamplesPerSec=171.74027266884897, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:15:26,898] [INFO] [logging.py:96:log_dist] [Rank 0] step=6960, skipped=131, lr=[1.2340718361101157e-07, 1.2340718361101157e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:15:26,927] [INFO] [timer.py:199:stop] epoch=15/micro_step=60/global_step=6960, RunningAvgSamplesPerSec=171.73602746745453, CurrSamplesPerSec=171.74065723710643, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:15:34,362] [INFO] [logging.py:96:log_dist] [Rank 0] step=6970, skipped=131, lr=[1.1882183974846924e-07, 1.1882183974846924e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:15:34,391] [INFO] [timer.py:199:stop] epoch=15/micro_step=70/global_step=6970, RunningAvgSamplesPerSec=171.7359056976346, CurrSamplesPerSec=171.3259418889892, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:15:41,827] [INFO] [logging.py:96:log_dist] [Rank 0] step=6980, skipped=131, lr=[1.1432224148153811e-07, 1.1432224148153811e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:15:41,857] [INFO] [timer.py:199:stop] epoch=15/micro_step=80/global_step=6980, RunningAvgSamplesPerSec=171.73571124248588, CurrSamplesPerSec=171.69820031731828, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:15:49,291] [INFO] [logging.py:96:log_dist] [Rank 0] step=6990, skipped=131, lr=[1.0990847079196909e-07, 1.0990847079196909e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:15:49,319] [INFO] [timer.py:199:stop] epoch=15/micro_step=90/global_step=6990, RunningAvgSamplesPerSec=171.73565833785514, CurrSamplesPerSec=171.78665303135557, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:15:56,762] [INFO] [logging.py:96:log_dist] [Rank 0] step=7000, skipped=131, lr=[1.0558060809775491e-07, 1.0558060809775491e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:15:56,790] [INFO] [timer.py:199:stop] epoch=15/micro_step=100/global_step=7000, RunningAvgSamplesPerSec=171.73531330889284, CurrSamplesPerSec=171.78731264841016, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:16:04,233] [INFO] [logging.py:96:log_dist] [Rank 0] step=7010, skipped=131, lr=[1.0133873225166119e-07, 1.0133873225166119e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:16:04,261] [INFO] [timer.py:199:stop] epoch=15/micro_step=110/global_step=7010, RunningAvgSamplesPerSec=171.73496753359225, CurrSamplesPerSec=171.63183823214425, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:16:11,697] [INFO] [logging.py:96:log_dist] [Rank 0] step=7020, skipped=131, lr=[9.718292053979136e-08, 9.718292053979136e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:16:11,726] [INFO] [timer.py:199:stop] epoch=15/micro_step=120/global_step=7020, RunningAvgSamplesPerSec=171.73484856580907, CurrSamplesPerSec=171.58400593177817, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:16:19,166] [INFO] [logging.py:96:log_dist] [Rank 0] step=7030, skipped=131, lr=[9.311324868017731e-08, 9.311324868017731e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:16:19,195] [INFO] [timer.py:199:stop] epoch=15/micro_step=130/global_step=7030, RunningAvgSamplesPerSec=171.73457279313803, CurrSamplesPerSec=171.84477373452464, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:16:20,647] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 03:16:21,353] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 03:16:26,548] [INFO] [logging.py:96:log_dist] [Rank 0] step=7040, skipped=133, lr=[8.991958176672623e-08, 8.991958176672623e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:16:26,576] [INFO] [timer.py:199:stop] epoch=15/micro_step=140/global_step=7040, RunningAvgSamplesPerSec=171.73715547828576, CurrSamplesPerSec=171.8523097606453, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:16:34,012] [INFO] [logging.py:96:log_dist] [Rank 0] step=7050, skipped=133, lr=[8.600514744006625e-08, 8.600514744006625e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:16:34,040] [INFO] [timer.py:199:stop] epoch=15/micro_step=150/global_step=7050, RunningAvgSamplesPerSec=171.73704700436792, CurrSamplesPerSec=172.0330972569985, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:16:41,480] [INFO] [logging.py:96:log_dist] [Rank 0] step=7060, skipped=133, lr=[8.21770566223834e-08, 8.21770566223834e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:16:41,508] [INFO] [timer.py:199:stop] epoch=15/micro_step=160/global_step=7060, RunningAvgSamplesPerSec=171.7368085964655, CurrSamplesPerSec=171.28603957499362, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:16:48,947] [INFO] [logging.py:96:log_dist] [Rank 0] step=7070, skipped=133, lr=[7.843537906070244e-08, 7.843537906070244e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:16:48,976] [INFO] [timer.py:199:stop] epoch=15/micro_step=170/global_step=7070, RunningAvgSamplesPerSec=171.73658459779307, CurrSamplesPerSec=171.8236543805084, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:16:56,410] [INFO] [logging.py:96:log_dist] [Rank 0] step=7080, skipped=133, lr=[7.478018292761859e-08, 7.478018292761859e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:16:56,438] [INFO] [timer.py:199:stop] epoch=15/micro_step=180/global_step=7080, RunningAvgSamplesPerSec=171.7365286943237, CurrSamplesPerSec=171.45961330872066, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:17:03,876] [INFO] [logging.py:96:log_dist] [Rank 0] step=7090, skipped=133, lr=[7.121153482004985e-08, 7.121153482004985e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:17:03,905] [INFO] [timer.py:199:stop] epoch=15/micro_step=190/global_step=7090, RunningAvgSamplesPerSec=171.73632361926778, CurrSamplesPerSec=172.22967938970478, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:17:11,332] [INFO] [logging.py:96:log_dist] [Rank 0] step=7100, skipped=133, lr=[6.77294997580291e-08, 6.77294997580291e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:17:11,361] [INFO] [timer.py:199:stop] epoch=15/micro_step=200/global_step=7100, RunningAvgSamplesPerSec=171.7364766454629, CurrSamplesPerSec=171.7557116907065, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:17:18,789] [INFO] [logging.py:96:log_dist] [Rank 0] step=7110, skipped=133, lr=[6.433414118351754e-08, 6.433414118351754e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:17:18,818] [INFO] [timer.py:199:stop] epoch=15/micro_step=210/global_step=7110, RunningAvgSamplesPerSec=171.7365812381777, CurrSamplesPerSec=171.78775239592744, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:17:26,260] [INFO] [logging.py:96:log_dist] [Rank 0] step=7120, skipped=133, lr=[6.102552095924865e-08, 6.102552095924865e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:17:26,288] [INFO] [timer.py:199:stop] epoch=15/micro_step=220/global_step=7120, RunningAvgSamplesPerSec=171.73626644157557, CurrSamplesPerSec=171.83674336845152, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:17:33,721] [INFO] [logging.py:96:log_dist] [Rank 0] step=7130, skipped=133, lr=[5.780369936759957e-08, 5.780369936759957e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:17:33,749] [INFO] [timer.py:199:stop] epoch=15/micro_step=230/global_step=7130, RunningAvgSamplesPerSec=171.7362535341787, CurrSamplesPerSec=171.05622571341544, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:17:36,693] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 03:17:37,396] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 03:17:41,101] [INFO] [logging.py:96:log_dist] [Rank 0] step=7140, skipped=135, lr=[5.5288776609953284e-08, 5.5288776609953284e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:17:41,129] [INFO] [timer.py:199:stop] epoch=15/micro_step=240/global_step=7140, RunningAvgSamplesPerSec=171.73886342862485, CurrSamplesPerSec=172.03359338975812, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:17:48,569] [INFO] [logging.py:96:log_dist] [Rank 0] step=7150, skipped=135, lr=[5.222333941993161e-08, 5.222333941993161e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:17:48,598] [INFO] [timer.py:199:stop] epoch=15/micro_step=250/global_step=7150, RunningAvgSamplesPerSec=171.7386009744307, CurrSamplesPerSec=171.41494178153485, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:17:56,026] [INFO] [logging.py:96:log_dist] [Rank 0] step=7160, skipped=135, lr=[4.9244861236463733e-08, 4.9244861236463733e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:17:56,055] [INFO] [timer.py:199:stop] epoch=15/micro_step=260/global_step=7160, RunningAvgSamplesPerSec=171.73872327670557, CurrSamplesPerSec=171.9935261063733, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:18:03,491] [INFO] [logging.py:96:log_dist] [Rank 0] step=7170, skipped=135, lr=[4.635339632680675e-08, 4.635339632680675e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:18:03,519] [INFO] [timer.py:199:stop] epoch=15/micro_step=270/global_step=7170, RunningAvgSamplesPerSec=171.73859128362747, CurrSamplesPerSec=171.60490185643158, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:18:10,955] [INFO] [logging.py:96:log_dist] [Rank 0] step=7180, skipped=135, lr=[4.354899737285545e-08, 4.354899737285545e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:18:10,983] [INFO] [timer.py:199:stop] epoch=15/micro_step=280/global_step=7180, RunningAvgSamplesPerSec=171.73848004752944, CurrSamplesPerSec=171.67865412503576, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:18:18,419] [INFO] [logging.py:96:log_dist] [Rank 0] step=7190, skipped=135, lr=[4.08317154701802e-08, 4.08317154701802e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:18:18,447] [INFO] [timer.py:199:stop] epoch=15/micro_step=290/global_step=7190, RunningAvgSamplesPerSec=171.7383869671936, CurrSamplesPerSec=171.47445422399693, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:18:25,883] [INFO] [logging.py:96:log_dist] [Rank 0] step=7200, skipped=135, lr=[3.8201600127097544e-08, 3.8201600127097544e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:18:25,911] [INFO] [timer.py:199:stop] epoch=15/micro_step=300/global_step=7200, RunningAvgSamplesPerSec=171.73825710628654, CurrSamplesPerSec=171.54623068322812, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:18:33,345] [INFO] [logging.py:96:log_dist] [Rank 0] step=7210, skipped=135, lr=[3.5658699263765976e-08, 3.5658699263765976e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:18:33,373] [INFO] [timer.py:199:stop] epoch=15/micro_step=310/global_step=7210, RunningAvgSamplesPerSec=171.73821271686677, CurrSamplesPerSec=171.07950099055586, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:18:40,807] [INFO] [logging.py:96:log_dist] [Rank 0] step=7220, skipped=135, lr=[3.320305921131547e-08, 3.320305921131547e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:18:40,835] [INFO] [timer.py:199:stop] epoch=15/micro_step=320/global_step=7220, RunningAvgSamplesPerSec=171.7381882240824, CurrSamplesPerSec=172.13729278039074, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:18:48,272] [INFO] [logging.py:96:log_dist] [Rank 0] step=7230, skipped=135, lr=[3.0834724711000524e-08, 3.0834724711000524e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:18:48,300] [INFO] [timer.py:199:stop] epoch=15/micro_step=330/global_step=7230, RunningAvgSamplesPerSec=171.73804561044471, CurrSamplesPerSec=171.35115005224753, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:18:52,738] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 03:18:53,442] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 03:18:55,652] [INFO] [logging.py:96:log_dist] [Rank 0] step=7240, skipped=137, lr=[2.900294615893491e-08, 2.900294615893491e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:18:55,681] [INFO] [timer.py:199:stop] epoch=15/micro_step=340/global_step=7240, RunningAvgSamplesPerSec=171.74058606308182, CurrSamplesPerSec=171.9060162040365, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:19:03,114] [INFO] [logging.py:96:log_dist] [Rank 0] step=7250, skipped=137, lr=[2.679186932244656e-08, 2.679186932244656e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:19:03,142] [INFO] [timer.py:199:stop] epoch=15/micro_step=350/global_step=7250, RunningAvgSamplesPerSec=171.74054294884345, CurrSamplesPerSec=171.83646836920627, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:19:10,577] [INFO] [logging.py:96:log_dist] [Rank 0] step=7260, skipped=137, lr=[2.4668214848651996e-08, 2.4668214848651996e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:19:10,606] [INFO] [timer.py:199:stop] epoch=15/micro_step=360/global_step=7260, RunningAvgSamplesPerSec=171.74045438595428, CurrSamplesPerSec=171.95243106957068, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:19:18,048] [INFO] [logging.py:96:log_dist] [Rank 0] step=7270, skipped=137, lr=[2.2632021430097857e-08, 2.2632021430097857e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:19:18,073] [INFO] [timer.py:199:stop] epoch=15/micro_step=370/global_step=7270, RunningAvgSamplesPerSec=171.74021723933785, CurrSamplesPerSec=171.96063748996255, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:19:25,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=7280, skipped=137, lr=[2.0683326165808425e-08, 2.0683326165808425e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:19:25,531] [INFO] [timer.py:199:stop] epoch=15/micro_step=380/global_step=7280, RunningAvgSamplesPerSec=171.74028296504414, CurrSamplesPerSec=171.9831678651789, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:19:32,968] [INFO] [logging.py:96:log_dist] [Rank 0] step=7290, skipped=137, lr=[1.882216456060956e-08, 1.882216456060956e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:19:32,996] [INFO] [timer.py:199:stop] epoch=15/micro_step=390/global_step=7290, RunningAvgSamplesPerSec=171.74015135712838, CurrSamplesPerSec=171.97358209404635, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:19:40,441] [INFO] [logging.py:96:log_dist] [Rank 0] step=7300, skipped=137, lr=[1.7048570524481608e-08, 1.7048570524481608e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:19:40,469] [INFO] [timer.py:199:stop] epoch=15/micro_step=400/global_step=7300, RunningAvgSamplesPerSec=171.7397499445379, CurrSamplesPerSec=171.5590033252742, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:19:47,894] [INFO] [logging.py:96:log_dist] [Rank 0] step=7310, skipped=137, lr=[1.53625763719423e-08, 1.53625763719423e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:19:47,922] [INFO] [timer.py:199:stop] epoch=15/micro_step=410/global_step=7310, RunningAvgSamplesPerSec=171.7400083073617, CurrSamplesPerSec=171.75263465349133, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:19:55,356] [INFO] [logging.py:96:log_dist] [Rank 0] step=7320, skipped=137, lr=[1.3764212821457492e-08, 1.3764212821457492e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:19:55,384] [INFO] [timer.py:199:stop] epoch=15/micro_step=420/global_step=7320, RunningAvgSamplesPerSec=171.73998650722035, CurrSamplesPerSec=171.56086730330946, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:20:02,816] [INFO] [logging.py:96:log_dist] [Rank 0] step=7330, skipped=137, lr=[1.2253508994881918e-08, 1.2253508994881918e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:20:02,845] [INFO] [timer.py:199:stop] epoch=15/micro_step=430/global_step=7330, RunningAvgSamplesPerSec=171.73997785634248, CurrSamplesPerSec=171.8436186356561, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:20:08,779] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 -[2023-04-18 03:20:09,485] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 -[2023-04-18 03:20:10,202] [INFO] [logging.py:96:log_dist] [Rank 0] step=7340, skipped=139, lr=[1.110807948511953e-08, 1.110807948511953e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:20:10,230] [INFO] [timer.py:199:stop] epoch=15/micro_step=440/global_step=7340, RunningAvgSamplesPerSec=171.74232409380699, CurrSamplesPerSec=172.107659510353, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:20:17,663] [INFO] [logging.py:96:log_dist] [Rank 0] step=7350, skipped=139, lr=[9.755231450253808e-09, 9.755231450253808e-09], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:20:17,692] [INFO] [timer.py:199:stop] epoch=15/micro_step=450/global_step=7350, RunningAvgSamplesPerSec=171.74228634046028, CurrSamplesPerSec=171.9823414635084, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -[2023-04-18 03:20:25,116] [INFO] [logging.py:96:log_dist] [Rank 0] step=7360, skipped=139, lr=[8.490116182110961e-09, 8.490116182110961e-09], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:20:25,144] [INFO] [timer.py:199:stop] epoch=15/micro_step=460/global_step=7360, RunningAvgSamplesPerSec=171.7425285021646, CurrSamplesPerSec=171.46295365969803, MemAllocated=3.96GB, MaxMemAllocated=22.95GB -***** Evaluating perplexity, Epoch 16/16 ***** -ppl: 1.8085135221481323 -saving the final model ... -[2023-04-18 03:20:41,455] [INFO] [launch.py:460:main] Process 73593 exits successfully. -[2023-04-18 03:20:44,459] [INFO] [launch.py:460:main] Process 73584 exits successfully. -[2023-04-18 03:20:45,461] [INFO] [launch.py:460:main] Process 73589 exits successfully. -[2023-04-18 03:20:46,463] [INFO] [launch.py:460:main] Process 73582 exits successfully. -[2023-04-18 03:20:46,463] [INFO] [launch.py:460:main] Process 73580 exits successfully. -[2023-04-18 03:20:47,465] [INFO] [launch.py:460:main] Process 73592 exits successfully. -[2023-04-18 03:20:47,466] [INFO] [launch.py:460:main] Process 73583 exits successfully. -[2023-04-18 03:20:47,466] [INFO] [launch.py:460:main] Process 73587 exits successfully. -[2023-04-18 03:20:47,466] [INFO] [launch.py:460:main] Process 73591 exits successfully. -[2023-04-18 03:20:47,467] [INFO] [launch.py:460:main] Process 73590 exits successfully. -[2023-04-18 03:20:47,467] [INFO] [launch.py:460:main] Process 73579 exits successfully. -[2023-04-18 03:20:47,467] [INFO] [launch.py:460:main] Process 73588 exits successfully. -[2023-04-18 03:20:47,468] [INFO] [launch.py:460:main] Process 73581 exits successfully. -[2023-04-18 03:20:47,468] [INFO] [launch.py:460:main] Process 73585 exits successfully. -[2023-04-18 03:20:47,468] [INFO] [launch.py:460:main] Process 73586 exits successfully. -[2023-04-18 03:20:49,471] [INFO] [launch.py:460:main] Process 73578 exits successfully. diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_1.3b.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_1.3b.sh index 3eeeefe02..9f105bcd4 100644 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_1.3b.sh +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_1.3b.sh @@ -13,8 +13,17 @@ if [ "$ZERO_STAGE" == "" ]; then fi mkdir -p $OUTPUT -deepspeed main.py \ - --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ +if [[ $0 =~ ^\/.* ]] +then + script=$0 +else + script=$(pwd)/$0 +fi +path_dir=${script%%training_scripts*} +echo $path_dir + +nohup ds --num_gpus 2 $path_dir'main.py' \ + --data_path $HOME/.cache/huggingface/hub/datasets--Dahoas--full-hh-rlhf \ --data_split 2,4,4 \ --model_name_or_path facebook/opt-1.3b \ --per_device_train_batch_size 8 \ @@ -32,4 +41,6 @@ deepspeed main.py \ --enable_tensorboard \ --tensorboard_path $OUTPUT \ --output_dir $OUTPUT \ - &> $OUTPUT/training.log + > $OUTPUT/training.log 2>&1 & + +# Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ No newline at end of file diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_560m.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_560m.sh new file mode 100644 index 000000000..38bdbdaed --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_560m.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +OUTPUT=$1 +ZERO_STAGE=$2 +if [ "$OUTPUT" == "" ]; then + OUTPUT=./output +fi +if [ "$ZERO_STAGE" == "" ]; then + ZERO_STAGE=2 +fi +mkdir -p $OUTPUT + +if [[ $0 =~ ^\/.* ]] +then + script=$0 +else + script=$(pwd)/$0 +fi +path_dir=${script%%training_scripts*} +echo $path_dir + +nohup ds --num_gpus 2 $path_dir'main.py' \ + --data_path $HOME/.cache/huggingface/hub/datasets--Dahoas--full-hh-rlhf \ + --data_split 2,4,4 \ + --model_name_or_path bigscience/bloom-560m \ + --tokenizer_name_or_path bigscience/tokenizer \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --max_seq_len 512 \ + --learning_rate 9.65e-6 \ + --weight_decay 0. \ + --num_train_epochs 16 \ + --gradient_accumulation_steps 1 \ + --lr_scheduler_type cosine \ + --num_warmup_steps 0 \ + --seed 1234 \ + --zero_stage $ZERO_STAGE \ + --deepspeed \ + --enable_tensorboard \ + --tensorboard_path $OUTPUT \ + --output_dir $OUTPUT \ + > $OUTPUT/training_step1_bloom_560m_dahoas_full_hh_rlhf.log 2>&1 & + +# Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ No newline at end of file diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py index d648f53b9..5932ca6b6 100644 --- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py +++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py @@ -58,6 +58,13 @@ def parse_args(): "Path to pretrained model or model identifier from huggingface.co/models.", required=True, ) + parser.add_argument( + "--tokenizer_name_or_path", + type=str, + default=None, + help= + "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models, if None, use `model_name_or_path`" + ) parser.add_argument( "--num_padding_at_beginning", type=int, @@ -204,7 +211,10 @@ def main(): torch.distributed.barrier() # load_hf_tokenizer will get the correct tokenizer and set padding tokens based on the model family - tokenizer = load_hf_tokenizer(args.model_name_or_path, fast_tokenizer=True) + # Occasionally , some repo owners of huggingface hub, such as bigscience, would like to separate model and tokenizer + tokenizer_name_or_path = args.model_name_or_path if not args.tokenizer_name_or_path else args.tokenizer_name_or_path + + tokenizer = load_hf_tokenizer(tokenizer_name_or_path, fast_tokenizer=True) rm_model = create_critic_model(args.model_name_or_path, tokenizer, ds_config, diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_log_output/opt-350m_globalBatchSize-64.log b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_log_output/opt-350m_globalBatchSize-64.log deleted file mode 100644 index ea44d1bd5..000000000 --- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_log_output/opt-350m_globalBatchSize-64.log +++ /dev/null @@ -1,1853 +0,0 @@ -cmd = deepspeed main.py --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets --data_split 2,4,4 --model_name_or_path facebook/opt-350m --num_padding_at_beginning 1 --per_device_train_batch_size 4 --per_device_eval_batch_size 4 --max_seq_len 512 --learning_rate 5e-5 --weight_decay 0.1 --num_train_epochs 1 --gradient_accumulation_steps 1 --lr_scheduler_type cosine --num_warmup_steps 0 --seed 1234 --zero_stage 0 --deepspeed --output_dir ./output_fourDatasets -***** Running training ***** -***** Evaluating reward, Epoch 0/1 ***** -chosen_last_scores (higher is better) : 2.8115079402923584, acc (higher is better) : 0.4924241304397583 -Beginning of Epoch 1/1, Total Micro Batches 1840 -[2023-04-18 03:36:00,501] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 -[2023-04-18 03:36:00,502] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 65536, reducing to 32768.0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 -[2023-04-18 03:36:00,502] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 0 -[2023-04-18 03:36:00,503] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1 -[2023-04-18 03:36:00,746] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1 -[2023-04-18 03:36:00,746] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 -[2023-04-18 03:36:00,745] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1 -[2023-04-18 03:36:00,746] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 -[2023-04-18 03:36:00,746] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1 -[2023-04-18 03:36:00,746] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 -[2023-04-18 03:36:00,746] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 -[2023-04-18 03:36:00,746] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 32768.0, reducing to 16384.0 -[2023-04-18 03:36:00,746] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1 -[2023-04-18 03:36:00,746] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 -[2023-04-18 03:36:00,746] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 -[2023-04-18 03:36:00,988] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 2 -[2023-04-18 03:36:00,988] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 2 -[2023-04-18 03:36:00,988] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 2 -[2023-04-18 03:36:00,988] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 2 -[2023-04-18 03:36:00,988] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 -[2023-04-18 03:36:00,988] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 -[2023-04-18 03:36:00,988] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 -[2023-04-18 03:36:00,988] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 -[2023-04-18 03:36:00,988] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 2 -[2023-04-18 03:36:00,988] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 2 -[2023-04-18 03:36:00,988] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 2 -[2023-04-18 03:36:00,988] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 -[2023-04-18 03:36:00,988] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 -[2023-04-18 03:36:00,988] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 -[2023-04-18 03:36:00,988] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 2 -[2023-04-18 03:36:00,988] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 2 -[2023-04-18 03:36:00,989] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 -[2023-04-18 03:36:00,988] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 2 -[2023-04-18 03:36:00,989] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 -[2023-04-18 03:36:00,989] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 2 -[2023-04-18 03:36:00,989] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 2 -[2023-04-18 03:36:00,989] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 -[2023-04-18 03:36:00,989] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 16384.0, reducing to 8192.0 -[2023-04-18 03:36:00,989] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 2 -[2023-04-18 03:36:00,989] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 -[2023-04-18 03:36:00,989] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 2 -[2023-04-18 03:36:00,989] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 -[2023-04-18 03:36:00,989] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 2 -[2023-04-18 03:36:00,989] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 -[2023-04-18 03:36:00,989] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 2 -[2023-04-18 03:36:00,989] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 -[2023-04-18 03:36:00,989] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 -[2023-04-18 03:36:00,989] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 -[2023-04-18 03:36:01,227] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 3 -[2023-04-18 03:36:01,227] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 3 -[2023-04-18 03:36:01,227] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 -[2023-04-18 03:36:01,227] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 3 -[2023-04-18 03:36:01,227] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 -[2023-04-18 03:36:01,227] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 3 -[2023-04-18 03:36:01,227] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 3 -[2023-04-18 03:36:01,227] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 -[2023-04-18 03:36:01,227] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 -[2023-04-18 03:36:01,227] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 3 -[2023-04-18 03:36:01,227] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 -[2023-04-18 03:36:01,227] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 -[2023-04-18 03:36:01,227] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 3 -[2023-04-18 03:36:01,227] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 3 -[2023-04-18 03:36:01,227] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 3 -[2023-04-18 03:36:01,228] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 -[2023-04-18 03:36:01,227] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 3 -[2023-04-18 03:36:01,228] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 -[2023-04-18 03:36:01,228] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 -[2023-04-18 03:36:01,228] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 3 -[2023-04-18 03:36:01,228] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 3 -[2023-04-18 03:36:01,228] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 -[2023-04-18 03:36:01,228] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 -[2023-04-18 03:36:01,228] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 -[2023-04-18 03:36:01,228] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 3 -[2023-04-18 03:36:01,228] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 3 -[2023-04-18 03:36:01,228] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 -[2023-04-18 03:36:01,228] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 -[2023-04-18 03:36:01,228] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 3 -[2023-04-18 03:36:01,228] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 3 -[2023-04-18 03:36:01,228] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 -[2023-04-18 03:36:01,228] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 -[2023-04-18 03:36:01,228] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 8192.0, reducing to 4096.0 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 4 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 4 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 4 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 4 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 4 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 4 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 4 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 4 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:36:01,469] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 4 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 4 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 4 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 4 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 4 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 4 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 4 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 4 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:36:01,469] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 5 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 5 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 5 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 5 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 5 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:36:01,721] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 2048.0, reducing to 1024.0 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 5 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 5 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 5 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 5 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 5 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 5 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 5 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 5 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 5 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 5 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 5 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:36:01,721] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:36:01,722] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 6 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 6 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 6 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 6 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 6 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 6 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 6 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 6 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 6 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 6 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 6 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 6 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 6 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 6 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 6 -[2023-04-18 03:36:01,960] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 6 -[2023-04-18 03:36:01,961] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 -[2023-04-18 03:36:01,961] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 -[2023-04-18 03:36:01,961] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 -[2023-04-18 03:36:01,961] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 1024.0, reducing to 512.0 -[2023-04-18 03:36:02,198] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 7 -[2023-04-18 03:36:02,198] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 7 -[2023-04-18 03:36:02,198] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 7 -[2023-04-18 03:36:02,198] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:36:02,198] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:36:02,198] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 7 -[2023-04-18 03:36:02,198] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 7 -[2023-04-18 03:36:02,198] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:36:02,198] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:36:02,198] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 7 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 7 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 7 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 7 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 7 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 7 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 7 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 7 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 7 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 7 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 7 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:36:02,199] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:36:02,199] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 512.0, reducing to 256.0 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 8 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 8 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 8 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 8 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 8 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 8 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 8 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 8 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 8 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 8 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 8 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 8 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 8 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 8 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 8 -[2023-04-18 03:36:02,439] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 256.0, reducing to 128.0 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 8 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:36:02,439] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 9 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 9 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 9 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 9 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 9 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 9 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 9 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 9 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 9 -[2023-04-18 03:36:02,681] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 128.0, reducing to 64.0 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 9 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 9 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 9 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 9 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 9 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 9 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 9 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:02,681] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:02,682] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=10, lr=[5e-05, 5e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:02,683] [INFO] [timer.py:199:stop] epoch=0/micro_step=10/global_step=10, RunningAvgSamplesPerSec=265.320034984211, CurrSamplesPerSec=265.4349789974132, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:36:05,686] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=10, lr=[4.999635612423198e-05, 4.999635612423198e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:05,695] [INFO] [timer.py:199:stop] epoch=0/micro_step=20/global_step=20, RunningAvgSamplesPerSec=233.55939410769662, CurrSamplesPerSec=213.49183724272422, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:36:08,632] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=10, lr=[4.998542555915435e-05, 4.998542555915435e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:08,642] [INFO] [timer.py:199:stop] epoch=0/micro_step=30/global_step=30, RunningAvgSamplesPerSec=227.66776915298374, CurrSamplesPerSec=217.4545430979453, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:36:11,572] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=10, lr=[4.996721149113682e-05, 4.996721149113682e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:11,582] [INFO] [timer.py:199:stop] epoch=0/micro_step=40/global_step=40, RunningAvgSamplesPerSec=225.11184596131628, CurrSamplesPerSec=218.65992852947812, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:36:14,512] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=10, lr=[4.994171922976348e-05, 4.994171922976348e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:14,522] [INFO] [timer.py:199:stop] epoch=0/micro_step=50/global_step=50, RunningAvgSamplesPerSec=223.64021397905867, CurrSamplesPerSec=218.39876918771108, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:36:17,467] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=10, lr=[4.9908956206285e-05, 4.9908956206285e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:17,477] [INFO] [timer.py:199:stop] epoch=0/micro_step=60/global_step=60, RunningAvgSamplesPerSec=222.50605211780783, CurrSamplesPerSec=217.6126698050419, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:36:20,411] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=10, lr=[4.986893197145237e-05, 4.986893197145237e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:20,421] [INFO] [timer.py:199:stop] epoch=0/micro_step=70/global_step=70, RunningAvgSamplesPerSec=221.82950715661497, CurrSamplesPerSec=216.39103240519432, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:36:23,344] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=10, lr=[4.982165819273275e-05, 4.982165819273275e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:23,354] [INFO] [timer.py:199:stop] epoch=0/micro_step=80/global_step=80, RunningAvgSamplesPerSec=221.42476663458936, CurrSamplesPerSec=220.97345211419955, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:36:26,270] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=10, lr=[4.976714865090827e-05, 4.976714865090827e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:26,280] [INFO] [timer.py:199:stop] epoch=0/micro_step=90/global_step=90, RunningAvgSamplesPerSec=221.18608845437842, CurrSamplesPerSec=219.40429956010507, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:36:29,219] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=10, lr=[4.9705419236058825e-05, 4.9705419236058825e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:29,229] [INFO] [timer.py:199:stop] epoch=0/micro_step=100/global_step=100, RunningAvgSamplesPerSec=220.81515390033704, CurrSamplesPerSec=216.64933270488575, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:36:32,148] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=10, lr=[4.963648794292992e-05, 4.963648794292992e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:32,157] [INFO] [timer.py:199:stop] epoch=0/micro_step=110/global_step=110, RunningAvgSamplesPerSec=220.65748563686077, CurrSamplesPerSec=220.54227501538415, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:36:32,419] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:36:32,421] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:36:32,420] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:36:32,421] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:36:32,421] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:36:32,421] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:36:32,423] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:36:32,423] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:36:35,084] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=10, lr=[4.956037486568706e-05, 4.956037486568706e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:35,094] [INFO] [timer.py:199:stop] epoch=0/micro_step=120/global_step=120, RunningAvgSamplesPerSec=220.4744540789782, CurrSamplesPerSec=219.14636945940694, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:36:38,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=10, lr=[4.947710219205808e-05, 4.947710219205808e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:38,042] [INFO] [timer.py:199:stop] epoch=0/micro_step=130/global_step=130, RunningAvgSamplesPerSec=220.24658400459697, CurrSamplesPerSec=214.50108794581607, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:36:40,972] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=10, lr=[4.938669419686516e-05, 4.938669419686516e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:40,982] [INFO] [timer.py:199:stop] epoch=0/micro_step=140/global_step=140, RunningAvgSamplesPerSec=220.10165949901187, CurrSamplesPerSec=217.11281014763176, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 145 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 145 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 145 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 145 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 145 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 145 -[2023-04-18 03:36:42,687] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 128.0, reducing to 64.0 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 145 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 145 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 145 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 145 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 145 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 145 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 145 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 145 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 145 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 145 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:42,687] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:36:43,845] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=11, lr=[4.929924804067349e-05, 4.929924804067349e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:43,855] [INFO] [timer.py:199:stop] epoch=0/micro_step=150/global_step=150, RunningAvgSamplesPerSec=220.320079789043, CurrSamplesPerSec=220.6193093653841, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:36:46,759] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=11, lr=[4.919535725504757e-05, 4.919535725504757e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:46,768] [INFO] [timer.py:199:stop] epoch=0/micro_step=160/global_step=160, RunningAvgSamplesPerSec=220.31818819882218, CurrSamplesPerSec=220.48847352434333, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:36:49,736] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=11, lr=[4.908441327934164e-05, 4.908441327934164e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:49,745] [INFO] [timer.py:199:stop] epoch=0/micro_step=170/global_step=170, RunningAvgSamplesPerSec=220.02737580499908, CurrSamplesPerSec=216.1091609647629, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:36:52,737] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=11, lr=[4.8966448454840854e-05, 4.8966448454840854e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:52,747] [INFO] [timer.py:199:stop] epoch=0/micro_step=180/global_step=180, RunningAvgSamplesPerSec=219.67021770257563, CurrSamplesPerSec=215.22133520624925, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:36:55,722] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=11, lr=[4.884149716947845e-05, 4.884149716947845e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:55,730] [INFO] [timer.py:199:stop] epoch=0/micro_step=190/global_step=190, RunningAvgSamplesPerSec=219.419188688239, CurrSamplesPerSec=215.64631606007416, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:36:58,633] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=11, lr=[4.8709595847811294e-05, 4.8709595847811294e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:36:58,643] [INFO] [timer.py:199:stop] epoch=0/micro_step=200/global_step=200, RunningAvgSamplesPerSec=219.46665944666333, CurrSamplesPerSec=221.22695183922121, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:01,552] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=11, lr=[4.8570782940401785e-05, 4.8570782940401785e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:01,562] [INFO] [timer.py:199:stop] epoch=0/micro_step=210/global_step=210, RunningAvgSamplesPerSec=219.4836312865479, CurrSamplesPerSec=216.87810781878042, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:04,467] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=11, lr=[4.8425098912609085e-05, 4.8425098912609085e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:04,477] [INFO] [timer.py:199:stop] epoch=0/micro_step=220/global_step=220, RunningAvgSamplesPerSec=219.51293370647764, CurrSamplesPerSec=220.4520773057153, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:07,410] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=11, lr=[4.8272586232793085e-05, 4.8272586232793085e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:07,419] [INFO] [timer.py:199:stop] epoch=0/micro_step=230/global_step=230, RunningAvgSamplesPerSec=219.44858308200799, CurrSamplesPerSec=219.03836255363038, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:10,353] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=11, lr=[4.8113289359934456e-05, 4.8113289359934456e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:10,363] [INFO] [timer.py:199:stop] epoch=0/micro_step=240/global_step=240, RunningAvgSamplesPerSec=219.38869905137545, CurrSamplesPerSec=219.2817070999005, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:12,393] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:37:12,394] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:37:12,397] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:12,397] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:37:12,397] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:12,398] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:37:13,303] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=11, lr=[4.794725473067437e-05, 4.794725473067437e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:13,313] [INFO] [timer.py:199:stop] epoch=0/micro_step=250/global_step=250, RunningAvgSamplesPerSec=219.31216463612918, CurrSamplesPerSec=213.66516786499164, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:16,247] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=11, lr=[4.777453074577784e-05, 4.777453074577784e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:16,257] [INFO] [timer.py:199:stop] epoch=0/micro_step=260/global_step=260, RunningAvgSamplesPerSec=219.25938356180865, CurrSamplesPerSec=218.80625814097934, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:19,172] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=11, lr=[4.759516775602428e-05, 4.759516775602428e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:19,182] [INFO] [timer.py:199:stop] epoch=0/micro_step=270/global_step=270, RunningAvgSamplesPerSec=219.26356247333504, CurrSamplesPerSec=221.033496479082, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:22,113] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=11, lr=[4.740921804752989e-05, 4.740921804752989e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:22,123] [INFO] [timer.py:199:stop] epoch=0/micro_step=280/global_step=280, RunningAvgSamplesPerSec=219.22444278910163, CurrSamplesPerSec=215.34547473233783, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:25,047] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=11, lr=[4.721673582650558e-05, 4.721673582650558e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:25,056] [INFO] [timer.py:199:stop] epoch=0/micro_step=290/global_step=290, RunningAvgSamplesPerSec=219.20790670651408, CurrSamplesPerSec=220.0526909036954, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:27,983] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=11, lr=[4.701777720345546e-05, 4.701777720345546e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:27,993] [INFO] [timer.py:199:stop] epoch=0/micro_step=300/global_step=300, RunningAvgSamplesPerSec=219.18577943358747, CurrSamplesPerSec=217.16339548854904, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:30,930] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=11, lr=[4.681240017681993e-05, 4.681240017681993e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:30,940] [INFO] [timer.py:199:stop] epoch=0/micro_step=310/global_step=310, RunningAvgSamplesPerSec=219.13717817881815, CurrSamplesPerSec=216.56543542942705, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:33,909] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=11, lr=[4.660066461606867e-05, 4.660066461606867e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:33,918] [INFO] [timer.py:199:stop] epoch=0/micro_step=320/global_step=320, RunningAvgSamplesPerSec=219.01961423882202, CurrSamplesPerSec=215.87262945419747, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:36,869] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=11, lr=[4.638263224424798e-05, 4.638263224424798e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:36,879] [INFO] [timer.py:199:stop] epoch=0/micro_step=330/global_step=330, RunningAvgSamplesPerSec=218.95032322695113, CurrSamplesPerSec=220.3776925792442, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:39,801] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=11, lr=[4.615836661998799e-05, 4.615836661998799e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:39,811] [INFO] [timer.py:199:stop] epoch=0/micro_step=340/global_step=340, RunningAvgSamplesPerSec=218.94708216062656, CurrSamplesPerSec=220.17975909741503, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:41,822] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:41,822] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:41,822] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:41,822] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:37:41,822] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:41,822] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:37:41,822] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:41,822] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:37:41,822] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:37:41,822] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:37:41,822] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:41,822] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:41,822] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:37:41,822] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:41,823] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:37:41,822] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:41,823] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:37:41,823] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:41,823] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:37:41,823] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:37:41,823] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:41,823] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:41,823] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:41,823] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:37:41,823] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:41,823] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:37:41,823] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:37:41,823] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:37:41,823] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:41,823] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:37:41,823] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:37:41,823] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:37:42,716] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=11, lr=[4.5927933118974595e-05, 4.5927933118974595e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:42,726] [INFO] [timer.py:199:stop] epoch=0/micro_step=350/global_step=350, RunningAvgSamplesPerSec=218.9798057221148, CurrSamplesPerSec=220.57634475045873, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:45,640] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=11, lr=[4.569139891489183e-05, 4.569139891489183e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:45,649] [INFO] [timer.py:199:stop] epoch=0/micro_step=360/global_step=360, RunningAvgSamplesPerSec=218.99475182710077, CurrSamplesPerSec=217.46564148346417, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:48,582] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=11, lr=[4.544883295984006e-05, 4.544883295984006e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:48,592] [INFO] [timer.py:199:stop] epoch=0/micro_step=370/global_step=370, RunningAvgSamplesPerSec=218.96971387840748, CurrSamplesPerSec=217.18641259320628, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:51,532] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=11, lr=[4.520030596423575e-05, 4.520030596423575e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:51,541] [INFO] [timer.py:199:stop] epoch=0/micro_step=380/global_step=380, RunningAvgSamplesPerSec=218.93275165132903, CurrSamplesPerSec=216.3172707009398, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:54,483] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=11, lr=[4.494589037619867e-05, 4.494589037619867e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:54,493] [INFO] [timer.py:199:stop] epoch=0/micro_step=390/global_step=390, RunningAvgSamplesPerSec=218.8924833295146, CurrSamplesPerSec=216.60597766445034, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:37:57,442] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=11, lr=[4.468566036043251e-05, 4.468566036043251e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:37:57,450] [INFO] [timer.py:199:stop] epoch=0/micro_step=400/global_step=400, RunningAvgSamplesPerSec=218.84628950410894, CurrSamplesPerSec=218.59048605529674, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:00,364] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=11, lr=[4.4419691776605146e-05, 4.4419691776605146e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:00,374] [INFO] [timer.py:199:stop] epoch=0/micro_step=410/global_step=410, RunningAvgSamplesPerSec=218.8611148975697, CurrSamplesPerSec=218.8708408618002, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:03,283] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=11, lr=[4.41480621572348e-05, 4.41480621572348e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:03,293] [INFO] [timer.py:199:stop] epoch=0/micro_step=420/global_step=420, RunningAvgSamplesPerSec=218.88460348315985, CurrSamplesPerSec=219.06463919906705, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:06,223] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=11, lr=[4.387085068508852e-05, 4.387085068508852e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:06,230] [INFO] [timer.py:199:stop] epoch=0/micro_step=430/global_step=430, RunningAvgSamplesPerSec=218.87527237966157, CurrSamplesPerSec=217.19906529351985, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:09,159] [INFO] [logging.py:96:log_dist] [Rank 0] step=440, skipped=11, lr=[4.358813817009955e-05, 4.358813817009955e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:09,169] [INFO] [timer.py:199:stop] epoch=0/micro_step=440/global_step=440, RunningAvgSamplesPerSec=218.86277469612367, CurrSamplesPerSec=217.1229955780379, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:11,190] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:38:11,192] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:38:11,192] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:38:11,191] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:38:11,192] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:38:12,098] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=11, lr=[4.330000702581053e-05, 4.330000702581053e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:12,107] [INFO] [timer.py:199:stop] epoch=0/micro_step=450/global_step=450, RunningAvgSamplesPerSec=218.8508557931614, CurrSamplesPerSec=217.3390462310744, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:15,015] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=11, lr=[4.300654124534902e-05, 4.300654124534902e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:15,025] [INFO] [timer.py:199:stop] epoch=0/micro_step=460/global_step=460, RunningAvgSamplesPerSec=218.87510562266604, CurrSamplesPerSec=220.14310364057445, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:17,939] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=11, lr=[4.270782637694273e-05, 4.270782637694273e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:17,949] [INFO] [timer.py:199:stop] epoch=0/micro_step=470/global_step=470, RunningAvgSamplesPerSec=218.88637924733757, CurrSamplesPerSec=219.62473675512132, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:20,883] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=11, lr=[4.2403949498981285e-05, 4.2403949498981285e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:20,893] [INFO] [timer.py:199:stop] epoch=0/micro_step=480/global_step=480, RunningAvgSamplesPerSec=218.8670040828307, CurrSamplesPerSec=217.1164978655188, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 482 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 482 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 482 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 482 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 482 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 482 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 482 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 482 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 482 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 482 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 482 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 482 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 482 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 482 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:38:21,727] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 512.0, reducing to 256.0 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 482 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:38:21,727] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 482 -[2023-04-18 03:38:21,728] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 512.0 to 256.0 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 485 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 485 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 485 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 485 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 485 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 485 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 485 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 485 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 485 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 485 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 485 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 485 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 485 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 485 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 485 -[2023-04-18 03:38:22,561] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 256.0, reducing to 128.0 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 485 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:38:22,561] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 256.0 to 128.0 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 486 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 486 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 486 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 486 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 486 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 486 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 486 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 486 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 486 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 486 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 486 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 486 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 486 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:38:22,803] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 486 -[2023-04-18 03:38:22,804] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:38:22,804] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:38:22,804] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:38:22,804] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:38:22,804] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 128.0, reducing to 64.0 -[2023-04-18 03:38:22,804] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 486 -[2023-04-18 03:38:22,804] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 486 -[2023-04-18 03:38:22,804] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:38:22,804] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 128.0 to 64.0 -[2023-04-18 03:38:23,672] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=14, lr=[4.2188211665338126e-05, 4.2188211665338126e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:23,681] [INFO] [timer.py:199:stop] epoch=0/micro_step=490/global_step=490, RunningAvgSamplesPerSec=219.08700473431057, CurrSamplesPerSec=221.22695183922121, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:26,591] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=14, lr=[4.187576346253234e-05, 4.187576346253234e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:26,600] [INFO] [timer.py:199:stop] epoch=0/micro_step=500/global_step=500, RunningAvgSamplesPerSec=219.10175871073957, CurrSamplesPerSec=218.45937794502777, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:29,504] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=14, lr=[4.1558395804882695e-05, 4.1558395804882695e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:29,514] [INFO] [timer.py:199:stop] epoch=0/micro_step=510/global_step=510, RunningAvgSamplesPerSec=219.1241906891609, CurrSamplesPerSec=220.76536908755514, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:32,432] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=14, lr=[4.123620120825459e-05, 4.123620120825459e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:32,442] [INFO] [timer.py:199:stop] epoch=0/micro_step=520/global_step=520, RunningAvgSamplesPerSec=219.1237731404074, CurrSamplesPerSec=217.3276088619739, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:35,391] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=14, lr=[4.0909273595614694e-05, 4.0909273595614694e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:35,401] [INFO] [timer.py:199:stop] epoch=0/micro_step=530/global_step=530, RunningAvgSamplesPerSec=219.08096439007844, CurrSamplesPerSec=216.8374636598848, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:38,314] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=14, lr=[4.057770826965143e-05, 4.057770826965143e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:38,324] [INFO] [timer.py:199:stop] epoch=0/micro_step=540/global_step=540, RunningAvgSamplesPerSec=219.08926594426876, CurrSamplesPerSec=218.83676069806555, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:41,256] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=14, lr=[4.0241601884993366e-05, 4.0241601884993366e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:41,266] [INFO] [timer.py:199:stop] epoch=0/micro_step=550/global_step=550, RunningAvgSamplesPerSec=219.07058718904423, CurrSamplesPerSec=214.91617134658742, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 557 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 557 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 64.0 to 32.0 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 557 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 64.0 to 32.0 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 557 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 557 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 64.0 to 32.0 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 557 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 64.0 to 32.0 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 64.0 to 32.0 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 557 -[2023-04-18 03:38:43,548] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 64.0, reducing to 32.0 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 64.0 to 32.0 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 64.0 to 32.0 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 557 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 557 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 557 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 557 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 64.0 to 32.0 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 557 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 64.0 to 32.0 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 64.0 to 32.0 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 64.0 to 32.0 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 64.0 to 32.0 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 557 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 557 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 557 -[2023-04-18 03:38:43,549] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 64.0 to 32.0 -[2023-04-18 03:38:43,548] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 557 -[2023-04-18 03:38:43,549] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 64.0 to 32.0 -[2023-04-18 03:38:43,549] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 64.0 to 32.0 -[2023-04-18 03:38:43,549] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 64.0 to 32.0 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 558 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 558 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 558 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 558 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32.0 to 16.0 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 558 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32.0 to 16.0 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32.0 to 16.0 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 558 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32.0 to 16.0 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32.0 to 16.0 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32.0 to 16.0 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 558 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32.0 to 16.0 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 558 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 558 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 558 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 558 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32.0 to 16.0 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32.0 to 16.0 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 558 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32.0 to 16.0 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32.0 to 16.0 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 558 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32.0 to 16.0 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 558 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 558 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32.0 to 16.0 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 558 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32.0 to 16.0 -[2023-04-18 03:38:43,787] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32.0 to 16.0 -[2023-04-18 03:38:43,788] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32.0 to 16.0 -[2023-04-18 03:38:43,788] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 32.0, reducing to 16.0 -[2023-04-18 03:38:44,074] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=16, lr=[3.996951301273557e-05, 3.996951301273557e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:44,084] [INFO] [timer.py:199:stop] epoch=0/micro_step=560/global_step=560, RunningAvgSamplesPerSec=219.2193344837397, CurrSamplesPerSec=217.6075539731642, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 565 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 565 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 565 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16.0 to 8.0 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 565 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 565 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16.0 to 8.0 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16.0 to 8.0 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16.0 to 8.0 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 565 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16.0 to 8.0 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 565 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16.0 to 8.0 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 565 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 565 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16.0 to 8.0 -[2023-04-18 03:38:45,785] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 16.0, reducing to 8.0 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 565 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16.0 to 8.0 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16.0 to 8.0 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16.0 to 8.0 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 565 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 565 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16.0 to 8.0 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16.0 to 8.0 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 565 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 565 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 565 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 565 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16.0 to 8.0 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16.0 to 8.0 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16.0 to 8.0 -[2023-04-18 03:38:45,785] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16.0 to 8.0 -[2023-04-18 03:38:46,947] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=17, lr=[3.9660077271631113e-05, 3.9660077271631113e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:46,957] [INFO] [timer.py:199:stop] epoch=0/micro_step=570/global_step=570, RunningAvgSamplesPerSec=219.29114140173343, CurrSamplesPerSec=220.04619703993345, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:49,870] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=17, lr=[3.931220308231662e-05, 3.931220308231662e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:49,880] [INFO] [timer.py:199:stop] epoch=0/micro_step=580/global_step=580, RunningAvgSamplesPerSec=219.2953114903082, CurrSamplesPerSec=220.74067958426633, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:52,786] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=17, lr=[3.896015674180224e-05, 3.896015674180224e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:52,796] [INFO] [timer.py:199:stop] epoch=0/micro_step=590/global_step=590, RunningAvgSamplesPerSec=219.3079739104511, CurrSamplesPerSec=222.318579204198, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:55,707] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=17, lr=[3.8604040875138315e-05, 3.8604040875138315e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:55,716] [INFO] [timer.py:199:stop] epoch=0/micro_step=600/global_step=600, RunningAvgSamplesPerSec=219.31456684425606, CurrSamplesPerSec=220.3527279413696, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:38:58,634] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=17, lr=[3.8243959293683016e-05, 3.8243959293683016e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:38:58,643] [INFO] [timer.py:199:stop] epoch=0/micro_step=610/global_step=610, RunningAvgSamplesPerSec=219.31388985936573, CurrSamplesPerSec=218.47911415328144, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:01,550] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=17, lr=[3.788001696484028e-05, 3.788001696484028e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:01,559] [INFO] [timer.py:199:stop] epoch=0/micro_step=620/global_step=620, RunningAvgSamplesPerSec=219.3251462727111, CurrSamplesPerSec=219.21365935561005, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:04,467] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=17, lr=[3.751231998146076e-05, 3.751231998146076e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:04,477] [INFO] [timer.py:199:stop] epoch=0/micro_step=630/global_step=630, RunningAvgSamplesPerSec=219.33432077676605, CurrSamplesPerSec=220.1095285653608, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:07,392] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=17, lr=[3.714097553091465e-05, 3.714097553091465e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:07,402] [INFO] [timer.py:199:stop] epoch=0/micro_step=640/global_step=640, RunningAvgSamplesPerSec=219.33550379682038, CurrSamplesPerSec=220.08624864001547, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:10,323] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=17, lr=[3.6766091863845564e-05, 3.6766091863845564e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:10,332] [INFO] [timer.py:199:stop] epoch=0/micro_step=650/global_step=650, RunningAvgSamplesPerSec=219.32889220493504, CurrSamplesPerSec=215.10319903713165, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:13,290] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=17, lr=[3.6387778262614316e-05, 3.6387778262614316e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:13,300] [INFO] [timer.py:199:stop] epoch=0/micro_step=660/global_step=660, RunningAvgSamplesPerSec=219.2816918549144, CurrSamplesPerSec=215.42739193939298, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 8.0 to 16.0 -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 8.0 to 16.0 -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 8.0 to 16.0 -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 8.0 to 16.0 -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 8.0 to 16.0 -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 8.0 to 16.0 -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 8.0 to 16.0 -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 8.0 to 16.0 -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 8.0 to 16.0 -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 8.0 to 16.0 -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 8.0 to 16.0 -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 8.0 to 16.0 -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:15,342] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 8.0 to 16.0 -[2023-04-18 03:39:15,342] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 8.0 to 16.0 -[2023-04-18 03:39:15,341] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:15,342] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 8.0 to 16.0 -[2023-04-18 03:39:15,343] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:15,344] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 8.0 to 16.0 -[2023-04-18 03:39:16,237] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=17, lr=[3.600614500944205e-05, 3.600614500944205e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:16,247] [INFO] [timer.py:199:stop] epoch=0/micro_step=670/global_step=670, RunningAvgSamplesPerSec=219.2586504715837, CurrSamplesPerSec=218.93403322072714, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:19,171] [INFO] [logging.py:96:log_dist] [Rank 0] step=680, skipped=17, lr=[3.562130335426184e-05, 3.562130335426184e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:19,181] [INFO] [timer.py:199:stop] epoch=0/micro_step=680/global_step=680, RunningAvgSamplesPerSec=219.24968712306404, CurrSamplesPerSec=219.3980232266375, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:22,111] [INFO] [logging.py:96:log_dist] [Rank 0] step=690, skipped=17, lr=[3.5233365482288225e-05, 3.5233365482288225e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:22,121] [INFO] [timer.py:199:stop] epoch=0/micro_step=690/global_step=690, RunningAvgSamplesPerSec=219.23533015388068, CurrSamplesPerSec=219.24105244673856, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:25,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=17, lr=[3.4842444481314116e-05, 3.4842444481314116e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:25,050] [INFO] [timer.py:199:stop] epoch=0/micro_step=700/global_step=700, RunningAvgSamplesPerSec=219.2323164586272, CurrSamplesPerSec=216.52054213325223, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:27,967] [INFO] [logging.py:96:log_dist] [Rank 0] step=710, skipped=17, lr=[3.444865430874453e-05, 3.444865430874453e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:27,977] [INFO] [timer.py:199:stop] epoch=0/micro_step=710/global_step=710, RunningAvgSamplesPerSec=219.23212802703992, CurrSamplesPerSec=221.20288811282165, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:30,901] [INFO] [logging.py:96:log_dist] [Rank 0] step=720, skipped=17, lr=[3.405210975837685e-05, 3.405210975837685e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:30,910] [INFO] [timer.py:199:stop] epoch=0/micro_step=720/global_step=720, RunningAvgSamplesPerSec=219.22525321803266, CurrSamplesPerSec=220.43433624278077, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:33,821] [INFO] [logging.py:96:log_dist] [Rank 0] step=730, skipped=17, lr=[3.365292642693732e-05, 3.365292642693732e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:33,831] [INFO] [timer.py:199:stop] epoch=0/micro_step=730/global_step=730, RunningAvgSamplesPerSec=219.23136998837057, CurrSamplesPerSec=220.43180203831426, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:36,746] [INFO] [logging.py:96:log_dist] [Rank 0] step=740, skipped=17, lr=[3.3251220680383436e-05, 3.3251220680383436e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:36,755] [INFO] [timer.py:199:stop] epoch=0/micro_step=740/global_step=740, RunningAvgSamplesPerSec=219.23390280938014, CurrSamplesPerSec=219.93135508651073, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:39,696] [INFO] [logging.py:96:log_dist] [Rank 0] step=750, skipped=17, lr=[3.284710961998203e-05, 3.284710961998203e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:39,705] [INFO] [timer.py:199:stop] epoch=0/micro_step=750/global_step=750, RunningAvgSamplesPerSec=219.21035715581846, CurrSamplesPerSec=216.05576142352604, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:42,665] [INFO] [logging.py:96:log_dist] [Rank 0] step=760, skipped=17, lr=[3.244071104817317e-05, 3.244071104817317e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:42,675] [INFO] [timer.py:199:stop] epoch=0/micro_step=760/global_step=760, RunningAvgSamplesPerSec=219.16820189640498, CurrSamplesPerSec=215.81223278272572, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 16.0 to 32.0 -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 16.0 to 32.0 -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 16.0 to 32.0 -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 16.0 to 32.0 -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 16.0 to 32.0 -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 16.0 to 32.0 -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 16.0 to 32.0 -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 16.0 to 32.0 -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 16.0 to 32.0 -[2023-04-18 03:39:44,723] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 16.0 to 32.0 -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:44,723] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 16.0 to 32.0 -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:44,722] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:44,723] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 16.0 to 32.0 -[2023-04-18 03:39:44,723] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 16.0 to 32.0 -[2023-04-18 03:39:44,723] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:44,723] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 16.0 to 32.0 -[2023-04-18 03:39:44,723] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:44,723] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 16.0 to 32.0 -[2023-04-18 03:39:44,725] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:39:44,725] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 16.0 to 32.0 -[2023-04-18 03:39:45,629] [INFO] [logging.py:96:log_dist] [Rank 0] step=770, skipped=17, lr=[3.203214343422948e-05, 3.203214343422948e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:45,639] [INFO] [timer.py:199:stop] epoch=0/micro_step=770/global_step=770, RunningAvgSamplesPerSec=219.13204800694217, CurrSamplesPerSec=218.72816755428371, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:48,567] [INFO] [logging.py:96:log_dist] [Rank 0] step=780, skipped=17, lr=[3.1621525879721206e-05, 3.1621525879721206e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:48,576] [INFO] [timer.py:199:stop] epoch=0/micro_step=780/global_step=780, RunningAvgSamplesPerSec=219.12295093958474, CurrSamplesPerSec=219.23782937480553, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:51,495] [INFO] [logging.py:96:log_dist] [Rank 0] step=790, skipped=17, lr=[3.12089780837969e-05, 3.12089780837969e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:51,505] [INFO] [timer.py:199:stop] epoch=0/micro_step=790/global_step=790, RunningAvgSamplesPerSec=219.12234353762653, CurrSamplesPerSec=220.65612359056433, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:54,412] [INFO] [logging.py:96:log_dist] [Rank 0] step=800, skipped=17, lr=[3.079462030828989e-05, 3.079462030828989e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:54,422] [INFO] [timer.py:199:stop] epoch=0/micro_step=800/global_step=800, RunningAvgSamplesPerSec=219.13262358523212, CurrSamplesPerSec=221.18447925103843, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:39:57,347] [INFO] [logging.py:96:log_dist] [Rank 0] step=810, skipped=17, lr=[3.0378573342660782e-05, 3.0378573342660782e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:39:57,357] [INFO] [timer.py:199:stop] epoch=0/micro_step=810/global_step=810, RunningAvgSamplesPerSec=219.12705844488337, CurrSamplesPerSec=218.9522479608483, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:00,270] [INFO] [logging.py:96:log_dist] [Rank 0] step=820, skipped=17, lr=[2.9960958468786083e-05, 2.9960958468786083e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:00,280] [INFO] [timer.py:199:stop] epoch=0/micro_step=820/global_step=820, RunningAvgSamplesPerSec=219.13208928029044, CurrSamplesPerSec=219.1562098168034, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:03,204] [INFO] [logging.py:96:log_dist] [Rank 0] step=830, skipped=17, lr=[2.9541897425603337e-05, 2.9541897425603337e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:03,213] [INFO] [timer.py:199:stop] epoch=0/micro_step=830/global_step=830, RunningAvgSamplesPerSec=219.12754408149513, CurrSamplesPerSec=216.3753342130145, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:06,155] [INFO] [logging.py:96:log_dist] [Rank 0] step=840, skipped=17, lr=[2.912151237362299e-05, 2.912151237362299e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:06,165] [INFO] [timer.py:199:stop] epoch=0/micro_step=840/global_step=840, RunningAvgSamplesPerSec=219.10674347846955, CurrSamplesPerSec=216.18034339464614, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:09,080] [INFO] [logging.py:96:log_dist] [Rank 0] step=850, skipped=17, lr=[2.8699925859317366e-05, 2.8699925859317366e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:09,090] [INFO] [timer.py:199:stop] epoch=0/micro_step=850/global_step=850, RunningAvgSamplesPerSec=219.1095320888808, CurrSamplesPerSec=219.04765697293928, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:12,002] [INFO] [logging.py:96:log_dist] [Rank 0] step=860, skipped=17, lr=[2.827726077939718e-05, 2.827726077939718e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:12,012] [INFO] [timer.py:199:stop] epoch=0/micro_step=860/global_step=860, RunningAvgSamplesPerSec=219.11561001854562, CurrSamplesPerSec=218.65422902389972, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 32.0 to 64.0 -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 32.0 to 64.0 -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 32.0 to 64.0 -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 32.0 to 64.0 -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 32.0 to 64.0 -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 32.0 to 64.0 -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 32.0 to 64.0 -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 32.0 to 64.0 -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 32.0 to 64.0 -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 32.0 to 64.0 -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 32.0 to 64.0 -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 32.0 to 64.0 -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 32.0 to 64.0 -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 32.0 to 64.0 -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 32.0 to 64.0 -[2023-04-18 03:40:14,031] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 32.0 to 64.0 -[2023-04-18 03:40:14,922] [INFO] [logging.py:96:log_dist] [Rank 0] step=870, skipped=17, lr=[2.785364034498582e-05, 2.785364034498582e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:14,932] [INFO] [timer.py:199:stop] epoch=0/micro_step=870/global_step=870, RunningAvgSamplesPerSec=219.12268985209582, CurrSamplesPerSec=221.35866582225037, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:17,853] [INFO] [logging.py:96:log_dist] [Rank 0] step=880, skipped=17, lr=[2.742918804570216e-05, 2.742918804570216e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:17,863] [INFO] [timer.py:199:stop] epoch=0/micro_step=880/global_step=880, RunningAvgSamplesPerSec=219.120244124262, CurrSamplesPerSec=216.30472052512235, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:20,793] [INFO] [logging.py:96:log_dist] [Rank 0] step=890, skipped=17, lr=[2.7004027613662043e-05, 2.7004027613662043e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:20,803] [INFO] [timer.py:199:stop] epoch=0/micro_step=890/global_step=890, RunningAvgSamplesPerSec=219.11010578229926, CurrSamplesPerSec=216.54604343417313, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:26,665] [INFO] [logging.py:96:log_dist] [Rank 0] step=900, skipped=17, lr=[2.6578282987409136e-05, 2.6578282987409136e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:26,675] [INFO] [timer.py:199:stop] epoch=0/micro_step=900/global_step=900, RunningAvgSamplesPerSec=216.67885779656578, CurrSamplesPerSec=218.4060546789188, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:29,586] [INFO] [logging.py:96:log_dist] [Rank 0] step=910, skipped=17, lr=[2.6152078275785596e-05, 2.6152078275785596e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:29,595] [INFO] [timer.py:199:stop] epoch=0/micro_step=910/global_step=910, RunningAvgSamplesPerSec=216.71176865058735, CurrSamplesPerSec=215.8580478523996, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:32,519] [INFO] [logging.py:96:log_dist] [Rank 0] step=920, skipped=17, lr=[2.5725537721753102e-05, 2.5725537721753102e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:32,528] [INFO] [timer.py:199:stop] epoch=0/micro_step=920/global_step=920, RunningAvgSamplesPerSec=216.7336629064403, CurrSamplesPerSec=215.78447553774203, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:35,480] [INFO] [logging.py:96:log_dist] [Rank 0] step=930, skipped=17, lr=[2.529878566617475e-05, 2.529878566617475e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:35,490] [INFO] [timer.py:199:stop] epoch=0/micro_step=930/global_step=930, RunningAvgSamplesPerSec=216.73266086337313, CurrSamplesPerSec=215.84103847316655, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:38,427] [INFO] [logging.py:96:log_dist] [Rank 0] step=940, skipped=17, lr=[2.4871946511568504e-05, 2.4871946511568504e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:38,437] [INFO] [timer.py:199:stop] epoch=0/micro_step=940/global_step=940, RunningAvgSamplesPerSec=216.74402305310278, CurrSamplesPerSec=217.51092758912395, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:41,361] [INFO] [logging.py:96:log_dist] [Rank 0] step=950, skipped=17, lr=[2.444514468584253e-05, 2.444514468584253e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:41,371] [INFO] [timer.py:199:stop] epoch=0/micro_step=950/global_step=950, RunningAvgSamplesPerSec=216.76441470792236, CurrSamplesPerSec=222.16273505461882, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:44,293] [INFO] [logging.py:96:log_dist] [Rank 0] step=960, skipped=17, lr=[2.4018504606023293e-05, 2.4018504606023293e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:44,303] [INFO] [timer.py:199:stop] epoch=0/micro_step=960/global_step=960, RunningAvgSamplesPerSec=216.78565702628165, CurrSamplesPerSec=217.97827975814428, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:46,327] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:46,327] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:40:46,327] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:40:46,328] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:40:46,330] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:40:46,330] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 64.0 to 128.0 -[2023-04-18 03:40:47,230] [INFO] [logging.py:96:log_dist] [Rank 0] step=970, skipped=17, lr=[2.3592150641986648e-05, 2.3592150641986648e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:47,240] [INFO] [timer.py:199:stop] epoch=0/micro_step=970/global_step=970, RunningAvgSamplesPerSec=216.80311380259832, CurrSamplesPerSec=218.30925064918904, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:50,171] [INFO] [logging.py:96:log_dist] [Rank 0] step=980, skipped=17, lr=[2.316620708020285e-05, 2.316620708020285e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:50,181] [INFO] [timer.py:199:stop] epoch=0/micro_step=980/global_step=980, RunningAvgSamplesPerSec=216.81646094209336, CurrSamplesPerSec=218.44924452097658, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:53,100] [INFO] [logging.py:96:log_dist] [Rank 0] step=990, skipped=17, lr=[2.2740798087505783e-05, 2.2740798087505783e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:53,109] [INFO] [timer.py:199:stop] epoch=0/micro_step=990/global_step=990, RunningAvgSamplesPerSec=216.83947568519696, CurrSamplesPerSec=214.93458412368477, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:56,047] [INFO] [logging.py:96:log_dist] [Rank 0] step=1000, skipped=17, lr=[2.2316047674897034e-05, 2.2316047674897034e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:56,057] [INFO] [timer.py:199:stop] epoch=0/micro_step=1000/global_step=1000, RunningAvgSamplesPerSec=216.84785356037364, CurrSamplesPerSec=219.07894206355877, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:40:58,983] [INFO] [logging.py:96:log_dist] [Rank 0] step=1010, skipped=17, lr=[2.1892079661395495e-05, 2.1892079661395495e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:40:58,993] [INFO] [timer.py:199:stop] epoch=0/micro_step=1010/global_step=1010, RunningAvgSamplesPerSec=216.86448472481848, CurrSamplesPerSec=221.3320184891715, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:01,927] [INFO] [logging.py:96:log_dist] [Rank 0] step=1020, skipped=17, lr=[2.1469017637942804e-05, 2.1469017637942804e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:01,937] [INFO] [timer.py:199:stop] epoch=0/micro_step=1020/global_step=1020, RunningAvgSamplesPerSec=216.87509326467367, CurrSamplesPerSec=217.03802919277206, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:04,864] [INFO] [logging.py:96:log_dist] [Rank 0] step=1030, skipped=17, lr=[2.1046984931375433e-05, 2.1046984931375433e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:04,874] [INFO] [timer.py:199:stop] epoch=0/micro_step=1030/global_step=1030, RunningAvgSamplesPerSec=216.8904083994008, CurrSamplesPerSec=219.79305499832148, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:07,785] [INFO] [logging.py:96:log_dist] [Rank 0] step=1040, skipped=17, lr=[2.0626104568473596e-05, 2.0626104568473596e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:07,795] [INFO] [timer.py:199:stop] epoch=0/micro_step=1040/global_step=1040, RunningAvgSamplesPerSec=216.91672794109778, CurrSamplesPerSec=219.5940141539262, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:10,731] [INFO] [logging.py:96:log_dist] [Rank 0] step=1050, skipped=17, lr=[2.0206499240097755e-05, 2.0206499240097755e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:10,741] [INFO] [timer.py:199:stop] epoch=0/micro_step=1050/global_step=1050, RunningAvgSamplesPerSec=216.925013568977, CurrSamplesPerSec=217.987838469934, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:13,654] [INFO] [logging.py:96:log_dist] [Rank 0] step=1060, skipped=17, lr=[1.9788291265422945e-05, 1.9788291265422945e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:13,664] [INFO] [timer.py:199:stop] epoch=0/micro_step=1060/global_step=1060, RunningAvgSamplesPerSec=216.94935165227335, CurrSamplesPerSec=218.1923122566591, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:15,679] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:15,678] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:15,679] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:41:15,679] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:41:15,679] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:15,679] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:15,679] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:41:15,679] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:41:15,680] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:15,680] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 128.0 to 256.0 -[2023-04-18 03:41:16,577] [INFO] [logging.py:96:log_dist] [Rank 0] step=1070, skipped=17, lr=[1.937160255628156e-05, 1.937160255628156e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:16,586] [INFO] [timer.py:199:stop] epoch=0/micro_step=1070/global_step=1070, RunningAvgSamplesPerSec=216.97340233687692, CurrSamplesPerSec=219.05963363769678, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:19,500] [INFO] [logging.py:96:log_dist] [Rank 0] step=1080, skipped=17, lr=[1.8956554581624824e-05, 1.8956554581624824e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:19,509] [INFO] [timer.py:199:stop] epoch=0/micro_step=1080/global_step=1080, RunningAvgSamplesPerSec=216.9969208728228, CurrSamplesPerSec=220.15501983921948, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:22,424] [INFO] [logging.py:96:log_dist] [Rank 0] step=1090, skipped=17, lr=[1.8543268332113316e-05, 1.8543268332113316e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:22,434] [INFO] [timer.py:199:stop] epoch=0/micro_step=1090/global_step=1090, RunningAvgSamplesPerSec=217.01894481846637, CurrSamplesPerSec=217.5128663258551, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:25,366] [INFO] [logging.py:96:log_dist] [Rank 0] step=1100, skipped=17, lr=[1.8131864284847043e-05, 1.8131864284847043e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:25,376] [INFO] [timer.py:199:stop] epoch=0/micro_step=1100/global_step=1100, RunningAvgSamplesPerSec=217.0289353713407, CurrSamplesPerSec=219.151200069884, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:28,299] [INFO] [logging.py:96:log_dist] [Rank 0] step=1110, skipped=17, lr=[1.7722462368245068e-05, 1.7722462368245068e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:28,308] [INFO] [timer.py:199:stop] epoch=0/micro_step=1110/global_step=1110, RunningAvgSamplesPerSec=217.0449729781252, CurrSamplesPerSec=216.28746641098053, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:31,239] [INFO] [logging.py:96:log_dist] [Rank 0] step=1120, skipped=17, lr=[1.7315181927085277e-05, 1.7315181927085277e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:31,249] [INFO] [timer.py:199:stop] epoch=0/micro_step=1120/global_step=1120, RunningAvgSamplesPerSec=217.0554655935377, CurrSamplesPerSec=217.53084534902848, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:34,170] [INFO] [logging.py:96:log_dist] [Rank 0] step=1130, skipped=17, lr=[1.691014168771409e-05, 1.691014168771409e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:34,180] [INFO] [timer.py:199:stop] epoch=0/micro_step=1130/global_step=1130, RunningAvgSamplesPerSec=217.07169920758096, CurrSamplesPerSec=220.80677270682065, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:37,096] [INFO] [logging.py:96:log_dist] [Rank 0] step=1140, skipped=17, lr=[1.6507459723436585e-05, 1.6507459723436585e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:37,106] [INFO] [timer.py:199:stop] epoch=0/micro_step=1140/global_step=1140, RunningAvgSamplesPerSec=217.09069511023836, CurrSamplesPerSec=220.00561907337743, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:40,018] [INFO] [logging.py:96:log_dist] [Rank 0] step=1150, skipped=17, lr=[1.6107253420096892e-05, 1.6107253420096892e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:40,028] [INFO] [timer.py:199:stop] epoch=0/micro_step=1150/global_step=1150, RunningAvgSamplesPerSec=217.11223821687588, CurrSamplesPerSec=220.45298253855364, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:42,939] [INFO] [logging.py:96:log_dist] [Rank 0] step=1160, skipped=17, lr=[1.5709639441859087e-05, 1.5709639441859087e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:42,949] [INFO] [timer.py:199:stop] epoch=0/micro_step=1160/global_step=1160, RunningAvgSamplesPerSec=217.1341331002338, CurrSamplesPerSec=219.69339959275337, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:41:44,983] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:41:44,984] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:44,984] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:44,984] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:41:44,984] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:41:44,984] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:44,984] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:41:44,984] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:44,984] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:41:44,986] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:41:44,987] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 256.0 to 512.0 -[2023-04-18 03:41:45,890] [INFO] [logging.py:96:log_dist] [Rank 0] step=1170, skipped=17, lr=[1.5314733697198407e-05, 1.5314733697198407e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:45,900] [INFO] [timer.py:199:stop] epoch=0/micro_step=1170/global_step=1170, RunningAvgSamplesPerSec=217.13627939001756, CurrSamplesPerSec=221.81888616013143, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:48,809] [INFO] [logging.py:96:log_dist] [Rank 0] step=1180, skipped=17, lr=[1.4922651305112744e-05, 1.4922651305112744e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:48,819] [INFO] [timer.py:199:stop] epoch=0/micro_step=1180/global_step=1180, RunningAvgSamplesPerSec=217.15881063831398, CurrSamplesPerSec=219.60227786817586, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:51,768] [INFO] [logging.py:96:log_dist] [Rank 0] step=1190, skipped=17, lr=[1.4533506561564306e-05, 1.4533506561564306e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:51,777] [INFO] [timer.py:199:stop] epoch=0/micro_step=1190/global_step=1190, RunningAvgSamplesPerSec=217.1567445267269, CurrSamplesPerSec=220.43524132992596, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:54,689] [INFO] [logging.py:96:log_dist] [Rank 0] step=1200, skipped=17, lr=[1.4147412906161172e-05, 1.4147412906161172e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:54,698] [INFO] [timer.py:199:stop] epoch=0/micro_step=1200/global_step=1200, RunningAvgSamplesPerSec=217.17757919615823, CurrSamplesPerSec=219.42582114416595, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:41:57,632] [INFO] [logging.py:96:log_dist] [Rank 0] step=1210, skipped=17, lr=[1.3764482889088581e-05, 1.3764482889088581e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:41:57,641] [INFO] [timer.py:199:stop] epoch=0/micro_step=1210/global_step=1210, RunningAvgSamplesPerSec=217.18479910748903, CurrSamplesPerSec=215.16975324475453, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:00,564] [INFO] [logging.py:96:log_dist] [Rank 0] step=1220, skipped=17, lr=[1.338482813829931e-05, 1.338482813829931e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:00,574] [INFO] [timer.py:199:stop] epoch=0/micro_step=1220/global_step=1220, RunningAvgSamplesPerSec=217.19835756559223, CurrSamplesPerSec=216.71772208412284, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:03,509] [INFO] [logging.py:96:log_dist] [Rank 0] step=1230, skipped=17, lr=[1.3008559326973116e-05, 1.3008559326973116e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:03,519] [INFO] [timer.py:199:stop] epoch=0/micro_step=1230/global_step=1230, RunningAvgSamplesPerSec=217.203575721669, CurrSamplesPerSec=217.4679317675118, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:06,461] [INFO] [logging.py:96:log_dist] [Rank 0] step=1240, skipped=17, lr=[1.2635786141254291e-05, 1.2635786141254291e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:06,470] [INFO] [timer.py:199:stop] epoch=0/micro_step=1240/global_step=1240, RunningAvgSamplesPerSec=217.20517683613562, CurrSamplesPerSec=217.2358016229029, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:09,405] [INFO] [logging.py:96:log_dist] [Rank 0] step=1250, skipped=17, lr=[1.2266617248277102e-05, 1.2266617248277102e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:09,415] [INFO] [timer.py:199:stop] epoch=0/micro_step=1250/global_step=1250, RunningAvgSamplesPerSec=217.21076953706302, CurrSamplesPerSec=220.21136842796085, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:12,329] [INFO] [logging.py:96:log_dist] [Rank 0] step=1260, skipped=17, lr=[1.1901160264488243e-05, 1.1901160264488243e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:12,339] [INFO] [timer.py:199:stop] epoch=0/micro_step=1260/global_step=1260, RunningAvgSamplesPerSec=217.22827776893337, CurrSamplesPerSec=218.91546444887726, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:14,371] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 -[2023-04-18 03:42:14,372] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 -[2023-04-18 03:42:14,374] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:14,374] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 -[2023-04-18 03:42:15,270] [INFO] [logging.py:96:log_dist] [Rank 0] step=1270, skipped=17, lr=[1.153952172427549e-05, 1.153952172427549e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:15,280] [INFO] [timer.py:199:stop] epoch=0/micro_step=1270/global_step=1270, RunningAvgSamplesPerSec=217.23560765896335, CurrSamplesPerSec=220.08678997769906, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:18,201] [INFO] [logging.py:96:log_dist] [Rank 0] step=1280, skipped=17, lr=[1.118180704891194e-05, 1.118180704891194e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:18,211] [INFO] [timer.py:199:stop] epoch=0/micro_step=1280/global_step=1280, RunningAvgSamplesPerSec=217.24872271148755, CurrSamplesPerSec=220.0830006698363, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:21,168] [INFO] [logging.py:96:log_dist] [Rank 0] step=1290, skipped=17, lr=[1.082812051582458e-05, 1.082812051582458e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:21,178] [INFO] [timer.py:199:stop] epoch=0/micro_step=1290/global_step=1290, RunningAvgSamplesPerSec=217.24095677346082, CurrSamplesPerSec=216.26359404531914, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:24,103] [INFO] [logging.py:96:log_dist] [Rank 0] step=1300, skipped=17, lr=[1.0478565228196391e-05, 1.0478565228196391e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:24,112] [INFO] [timer.py:199:stop] epoch=0/micro_step=1300/global_step=1300, RunningAvgSamplesPerSec=217.2519142662411, CurrSamplesPerSec=212.3059966149417, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:27,030] [INFO] [logging.py:96:log_dist] [Rank 0] step=1310, skipped=17, lr=[1.0133243084910764e-05, 1.0133243084910764e-05], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:27,040] [INFO] [timer.py:199:stop] epoch=0/micro_step=1310/global_step=1310, RunningAvgSamplesPerSec=217.26660770167967, CurrSamplesPerSec=219.68387050838967, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:29,962] [INFO] [logging.py:96:log_dist] [Rank 0] step=1320, skipped=17, lr=[9.792254750846891e-06, 9.792254750846891e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:29,972] [INFO] [timer.py:199:stop] epoch=0/micro_step=1320/global_step=1320, RunningAvgSamplesPerSec=217.27841480091433, CurrSamplesPerSec=216.23084346278083, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:32,943] [INFO] [logging.py:96:log_dist] [Rank 0] step=1330, skipped=17, lr=[9.455699627535e-06, 9.455699627535e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:32,951] [INFO] [timer.py:199:stop] epoch=0/micro_step=1330/global_step=1330, RunningAvgSamplesPerSec=217.26411056701554, CurrSamplesPerSec=217.46564148346417, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:35,897] [INFO] [logging.py:96:log_dist] [Rank 0] step=1340, skipped=17, lr=[9.123675824179758e-06, 9.123675824179758e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:35,905] [INFO] [timer.py:199:stop] epoch=0/micro_step=1340/global_step=1340, RunningAvgSamplesPerSec=217.26390829428763, CurrSamplesPerSec=217.6611939620781, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:38,856] [INFO] [logging.py:96:log_dist] [Rank 0] step=1350, skipped=17, lr=[8.796280129060475e-06, 8.796280129060475e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:38,864] [INFO] [timer.py:199:stop] epoch=0/micro_step=1350/global_step=1350, RunningAvgSamplesPerSec=217.26124941313205, CurrSamplesPerSec=211.574576887033, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:41,797] [INFO] [logging.py:96:log_dist] [Rank 0] step=1360, skipped=17, lr=[8.473607981316364e-06, 8.473607981316364e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:41,805] [INFO] [timer.py:199:stop] epoch=0/micro_step=1360/global_step=1360, RunningAvgSamplesPerSec=217.26811742461774, CurrSamplesPerSec=217.4402754095705, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:43,833] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:42:43,834] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:43,834] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:42:43,835] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:42:43,836] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:43,836] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:42:43,836] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:42:43,836] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:42:44,741] [INFO] [logging.py:96:log_dist] [Rank 0] step=1370, skipped=17, lr=[8.155753443125036e-06, 8.155753443125036e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:44,749] [INFO] [timer.py:199:stop] epoch=0/micro_step=1370/global_step=1370, RunningAvgSamplesPerSec=217.27282407195733, CurrSamplesPerSec=218.48960602183305, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:47,680] [INFO] [logging.py:96:log_dist] [Rank 0] step=1380, skipped=17, lr=[7.842809172282436e-06, 7.842809172282436e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:47,690] [INFO] [timer.py:199:stop] epoch=0/micro_step=1380/global_step=1380, RunningAvgSamplesPerSec=217.27975786360386, CurrSamplesPerSec=221.48889195096532, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:50,622] [INFO] [logging.py:96:log_dist] [Rank 0] step=1390, skipped=17, lr=[7.534866395192203e-06, 7.534866395192203e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:50,632] [INFO] [timer.py:199:stop] epoch=0/micro_step=1390/global_step=1390, RunningAvgSamplesPerSec=217.2853138593823, CurrSamplesPerSec=218.33322163263048, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:53,558] [INFO] [logging.py:96:log_dist] [Rank 0] step=1400, skipped=17, lr=[7.2320148802721925e-06, 7.2320148802721925e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:53,560] [INFO] [timer.py:199:stop] epoch=0/micro_step=1400/global_step=1400, RunningAvgSamplesPerSec=217.29870563856866, CurrSamplesPerSec=217.69702749805566, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:56,501] [INFO] [logging.py:96:log_dist] [Rank 0] step=1410, skipped=17, lr=[6.934342911786143e-06, 6.934342911786143e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:56,511] [INFO] [timer.py:199:stop] epoch=0/micro_step=1410/global_step=1410, RunningAvgSamplesPerSec=217.29962425224832, CurrSamplesPerSec=219.6407302826069, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:42:59,438] [INFO] [logging.py:96:log_dist] [Rank 0] step=1420, skipped=17, lr=[6.641937264107867e-06, 6.641937264107867e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:42:59,448] [INFO] [timer.py:199:stop] epoch=0/micro_step=1420/global_step=1420, RunningAvgSamplesPerSec=217.30762557862326, CurrSamplesPerSec=219.63947228031589, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:02,387] [INFO] [logging.py:96:log_dist] [Rank 0] step=1430, skipped=17, lr=[6.35488317642568e-06, 6.35488317642568e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:02,396] [INFO] [timer.py:199:stop] epoch=0/micro_step=1430/global_step=1430, RunningAvgSamplesPerSec=217.3096154943934, CurrSamplesPerSec=216.95155119267181, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:05,334] [INFO] [logging.py:96:log_dist] [Rank 0] step=1440, skipped=17, lr=[6.073264327894332e-06, 6.073264327894332e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:05,343] [INFO] [timer.py:199:stop] epoch=0/micro_step=1440/global_step=1440, RunningAvgSamplesPerSec=217.31228257026677, CurrSamplesPerSec=215.56838694393474, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:08,284] [INFO] [logging.py:96:log_dist] [Rank 0] step=1450, skipped=17, lr=[5.79716281324165e-06, 5.79716281324165e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:08,293] [INFO] [timer.py:199:stop] epoch=0/micro_step=1450/global_step=1450, RunningAvgSamplesPerSec=217.31479805624969, CurrSamplesPerSec=219.14744290999815, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:11,208] [INFO] [logging.py:96:log_dist] [Rank 0] step=1460, skipped=17, lr=[5.526659118837144e-06, 5.526659118837144e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:11,217] [INFO] [timer.py:199:stop] epoch=0/micro_step=1460/global_step=1460, RunningAvgSamplesPerSec=217.32909250211313, CurrSamplesPerSec=219.80673318954436, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:43:13,252] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:43:13,252] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 -[2023-04-18 03:43:13,251] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:43:13,252] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:43:13,252] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 -[2023-04-18 03:43:13,252] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 -[2023-04-18 03:43:13,252] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:43:13,252] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 -[2023-04-18 03:43:13,252] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:43:13,252] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 -[2023-04-18 03:43:13,255] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:43:13,255] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 -[2023-04-18 03:43:14,156] [INFO] [logging.py:96:log_dist] [Rank 0] step=1470, skipped=17, lr=[5.261832099229388e-06, 5.261832099229388e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:14,166] [INFO] [timer.py:199:stop] epoch=0/micro_step=1470/global_step=1470, RunningAvgSamplesPerSec=217.33108525374053, CurrSamplesPerSec=217.54917996250933, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:17,111] [INFO] [logging.py:96:log_dist] [Rank 0] step=1480, skipped=17, lr=[5.0027589541591284e-06, 5.0027589541591284e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:17,121] [INFO] [timer.py:199:stop] epoch=0/micro_step=1480/global_step=1480, RunningAvgSamplesPerSec=217.32970469031258, CurrSamplesPerSec=216.65457844297265, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:20,061] [INFO] [logging.py:96:log_dist] [Rank 0] step=1490, skipped=17, lr=[4.749515206054822e-06, 4.749515206054822e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:20,070] [INFO] [timer.py:199:stop] epoch=0/micro_step=1490/global_step=1490, RunningAvgSamplesPerSec=217.33090348603076, CurrSamplesPerSec=218.09481158902196, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:23,010] [INFO] [logging.py:96:log_dist] [Rank 0] step=1500, skipped=17, lr=[4.502174678017018e-06, 4.502174678017018e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:23,020] [INFO] [timer.py:199:stop] epoch=0/micro_step=1500/global_step=1500, RunningAvgSamplesPerSec=217.331954380102, CurrSamplesPerSec=217.49577138412425, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:25,968] [INFO] [logging.py:96:log_dist] [Rank 0] step=1510, skipped=17, lr=[4.26080947229826e-06, 4.26080947229826e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:25,977] [INFO] [timer.py:199:stop] epoch=0/micro_step=1510/global_step=1510, RunningAvgSamplesPerSec=217.32946125601498, CurrSamplesPerSec=216.20228190535715, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:28,917] [INFO] [logging.py:96:log_dist] [Rank 0] step=1520, skipped=17, lr=[4.025489949284492e-06, 4.025489949284492e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:28,927] [INFO] [timer.py:199:stop] epoch=0/micro_step=1520/global_step=1520, RunningAvgSamplesPerSec=217.33111018956131, CurrSamplesPerSec=219.93477877926102, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:31,853] [INFO] [logging.py:96:log_dist] [Rank 0] step=1530, skipped=17, lr=[3.7962847069843126e-06, 3.7962847069843126e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:31,863] [INFO] [timer.py:199:stop] epoch=0/micro_step=1530/global_step=1530, RunningAvgSamplesPerSec=217.33859122373565, CurrSamplesPerSec=217.16058457344178, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:34,798] [INFO] [logging.py:96:log_dist] [Rank 0] step=1540, skipped=17, lr=[3.5732605610320074e-06, 3.5732605610320074e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:34,808] [INFO] [timer.py:199:stop] epoch=0/micro_step=1540/global_step=1540, RunningAvgSamplesPerSec=217.34334004578542, CurrSamplesPerSec=220.29829570401205, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1541 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1541 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1541 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1541 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:43:35,342] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1541 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1541 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1541 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1541 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1541 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1541 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1541 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1541 -[2023-04-18 03:43:35,343] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1541 -[2023-04-18 03:43:35,343] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1541 -[2023-04-18 03:43:35,342] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1541 -[2023-04-18 03:43:35,343] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:43:35,343] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:43:35,343] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:43:35,345] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1541 -[2023-04-18 03:43:35,345] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 -[2023-04-18 03:43:37,667] [INFO] [logging.py:96:log_dist] [Rank 0] step=1550, skipped=18, lr=[3.3778774384813557e-06, 3.3778774384813557e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:37,676] [INFO] [timer.py:199:stop] epoch=0/micro_step=1550/global_step=1550, RunningAvgSamplesPerSec=217.38321677804396, CurrSamplesPerSec=220.2500344608033, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:40,585] [INFO] [logging.py:96:log_dist] [Rank 0] step=1560, skipped=18, lr=[3.166774984049342e-06, 3.166774984049342e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:40,595] [INFO] [timer.py:199:stop] epoch=0/micro_step=1560/global_step=1560, RunningAvgSamplesPerSec=217.39874179250074, CurrSamplesPerSec=221.5183190597773, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:43,525] [INFO] [logging.py:96:log_dist] [Rank 0] step=1570, skipped=18, lr=[2.962037134383211e-06, 2.962037134383211e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:43,534] [INFO] [timer.py:199:stop] epoch=0/micro_step=1570/global_step=1570, RunningAvgSamplesPerSec=217.404440774079, CurrSamplesPerSec=220.7784422371344, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:46,443] [INFO] [logging.py:96:log_dist] [Rank 0] step=1580, skipped=18, lr=[2.763723572626087e-06, 2.763723572626087e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:46,452] [INFO] [timer.py:199:stop] epoch=0/micro_step=1580/global_step=1580, RunningAvgSamplesPerSec=217.4199392258332, CurrSamplesPerSec=219.78603658741278, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:49,374] [INFO] [logging.py:96:log_dist] [Rank 0] step=1590, skipped=18, lr=[2.5718921091765517e-06, 2.5718921091765517e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:49,383] [INFO] [timer.py:199:stop] epoch=0/micro_step=1590/global_step=1590, RunningAvgSamplesPerSec=217.42940829622682, CurrSamplesPerSec=220.160978422317, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:52,322] [INFO] [logging.py:96:log_dist] [Rank 0] step=1600, skipped=18, lr=[2.386598664836298e-06, 2.386598664836298e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:52,332] [INFO] [timer.py:199:stop] epoch=0/micro_step=1600/global_step=1600, RunningAvgSamplesPerSec=217.43048317864347, CurrSamplesPerSec=217.32303425138298, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:55,250] [INFO] [logging.py:96:log_dist] [Rank 0] step=1610, skipped=18, lr=[2.2078972545086645e-06, 2.2078972545086645e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:55,260] [INFO] [timer.py:199:stop] epoch=0/micro_step=1610/global_step=1610, RunningAvgSamplesPerSec=217.44130438893876, CurrSamplesPerSec=220.9345316872428, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1614 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1614 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1614 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1614 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1614 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1614 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1614 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1614 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1614 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1614 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1614 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1614 -[2023-04-18 03:43:56,668] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 2048.0, reducing to 1024.0 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1614 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1614 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1614 -[2023-04-18 03:43:56,669] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:43:56,668] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1614 -[2023-04-18 03:43:56,669] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:43:58,131] [INFO] [logging.py:96:log_dist] [Rank 0] step=1620, skipped=19, lr=[2.0527452693256287e-06, 2.0527452693256287e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:43:58,141] [INFO] [timer.py:199:stop] epoch=0/micro_step=1620/global_step=1620, RunningAvgSamplesPerSec=217.47328778409326, CurrSamplesPerSec=216.91350867822254, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:01,060] [INFO] [logging.py:96:log_dist] [Rank 0] step=1630, skipped=19, lr=[1.8867106400655533e-06, 1.8867106400655533e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:01,070] [INFO] [timer.py:199:stop] epoch=0/micro_step=1630/global_step=1630, RunningAvgSamplesPerSec=217.482976618043, CurrSamplesPerSec=218.0386553243059, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:04,001] [INFO] [logging.py:96:log_dist] [Rank 0] step=1640, skipped=19, lr=[1.7274137672069145e-06, 1.7274137672069145e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:04,011] [INFO] [timer.py:199:stop] epoch=0/micro_step=1640/global_step=1640, RunningAvgSamplesPerSec=217.48714198012703, CurrSamplesPerSec=218.5235781358754, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:06,956] [INFO] [logging.py:96:log_dist] [Rank 0] step=1650, skipped=19, lr=[1.5749010873909175e-06, 1.5749010873909175e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:06,966] [INFO] [timer.py:199:stop] epoch=0/micro_step=1650/global_step=1650, RunningAvgSamplesPerSec=217.48503430813517, CurrSamplesPerSec=214.09716685728688, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:09,894] [INFO] [logging.py:96:log_dist] [Rank 0] step=1660, skipped=19, lr=[1.4292170595982146e-06, 1.4292170595982146e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:09,904] [INFO] [timer.py:199:stop] epoch=0/micro_step=1660/global_step=1660, RunningAvgSamplesPerSec=217.49014632793464, CurrSamplesPerSec=218.5379883483511, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:12,829] [INFO] [logging.py:96:log_dist] [Rank 0] step=1670, skipped=19, lr=[1.2904041521887122e-06, 1.2904041521887122e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:12,839] [INFO] [timer.py:199:stop] epoch=0/micro_step=1670/global_step=1670, RunningAvgSamplesPerSec=217.49696279530096, CurrSamplesPerSec=218.54919760539494, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:15,771] [INFO] [logging.py:96:log_dist] [Rank 0] step=1680, skipped=19, lr=[1.15850283052156e-06, 1.15850283052156e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:15,781] [INFO] [timer.py:199:stop] epoch=0/micro_step=1680/global_step=1680, RunningAvgSamplesPerSec=217.50026662517845, CurrSamplesPerSec=218.11536811451006, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:18,713] [INFO] [logging.py:96:log_dist] [Rank 0] step=1690, skipped=19, lr=[1.0335515451591503e-06, 1.0335515451591503e-06], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:18,722] [INFO] [timer.py:199:stop] epoch=0/micro_step=1690/global_step=1690, RunningAvgSamplesPerSec=217.50400749550178, CurrSamplesPerSec=218.01705083983143, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:21,655] [INFO] [logging.py:96:log_dist] [Rank 0] step=1700, skipped=19, lr=[9.155867206583624e-07, 9.155867206583624e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:21,665] [INFO] [timer.py:199:stop] epoch=0/micro_step=1700/global_step=1700, RunningAvgSamplesPerSec=217.50699718100392, CurrSamplesPerSec=217.72863434694094, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:24,603] [INFO] [logging.py:96:log_dist] [Rank 0] step=1710, skipped=19, lr=[8.046427449524274e-07, 8.046427449524274e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:24,613] [INFO] [timer.py:199:stop] epoch=0/micro_step=1710/global_step=1710, RunningAvgSamplesPerSec=217.50792668103418, CurrSamplesPerSec=217.65307568688692, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:26,349] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:26,349] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:26,349] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:26,349] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:26,350] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:26,353] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:26,353] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:27,554] [INFO] [logging.py:96:log_dist] [Rank 0] step=1720, skipped=19, lr=[7.007519593265204e-07, 7.007519593265204e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:27,564] [INFO] [timer.py:199:stop] epoch=0/micro_step=1720/global_step=1720, RunningAvgSamplesPerSec=217.50750395350983, CurrSamplesPerSec=217.20802851490276, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:28,979] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1724 -[2023-04-18 03:44:28,979] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1724 -[2023-04-18 03:44:28,979] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:44:28,979] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1724 -[2023-04-18 03:44:28,979] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:44:28,979] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1724 -[2023-04-18 03:44:28,979] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:44:28,979] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1724 -[2023-04-18 03:44:28,979] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1724 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1724 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1724 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1724 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1724 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1724 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1724 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1724 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1724 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1724 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:44:28,980] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 2048.0, reducing to 1024.0 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:362:_update_scale] -Grad overflow on iteration 1724 -[2023-04-18 03:44:28,980] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 -[2023-04-18 03:44:30,445] [INFO] [logging.py:96:log_dist] [Rank 0] step=1730, skipped=20, lr=[6.133058031348476e-07, 6.133058031348476e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:30,454] [INFO] [timer.py:199:stop] epoch=0/micro_step=1730/global_step=1730, RunningAvgSamplesPerSec=217.53283922313705, CurrSamplesPerSec=215.76574343343734, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:33,402] [INFO] [logging.py:96:log_dist] [Rank 0] step=1740, skipped=20, lr=[5.228978079419272e-07, 5.228978079419272e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:33,412] [INFO] [timer.py:199:stop] epoch=0/micro_step=1740/global_step=1740, RunningAvgSamplesPerSec=217.52932811848308, CurrSamplesPerSec=214.24924974778756, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:36,356] [INFO] [logging.py:96:log_dist] [Rank 0] step=1750, skipped=20, lr=[4.396251343129376e-07, 4.396251343129376e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:36,365] [INFO] [timer.py:199:stop] epoch=0/micro_step=1750/global_step=1750, RunningAvgSamplesPerSec=217.52763757327713, CurrSamplesPerSec=220.37515967700094, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:39,278] [INFO] [logging.py:96:log_dist] [Rank 0] step=1760, skipped=20, lr=[3.635120570700784e-07, 3.635120570700784e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:39,288] [INFO] [timer.py:199:stop] epoch=0/micro_step=1760/global_step=1760, RunningAvgSamplesPerSec=217.53900324463726, CurrSamplesPerSec=219.31072977362672, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:42,199] [INFO] [logging.py:96:log_dist] [Rank 0] step=1770, skipped=20, lr=[2.9458076394117684e-07, 2.9458076394117684e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:42,209] [INFO] [timer.py:199:stop] epoch=0/micro_step=1770/global_step=1770, RunningAvgSamplesPerSec=217.5508972956739, CurrSamplesPerSec=221.0833762977873, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:45,111] [INFO] [logging.py:96:log_dist] [Rank 0] step=1780, skipped=20, lr=[2.3285134909173112e-07, 2.3285134909173112e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:45,121] [INFO] [timer.py:199:stop] epoch=0/micro_step=1780/global_step=1780, RunningAvgSamplesPerSec=217.56657478330348, CurrSamplesPerSec=214.361142831589, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:48,026] [INFO] [logging.py:96:log_dist] [Rank 0] step=1790, skipped=20, lr=[1.7834180726725158e-07, 1.7834180726725158e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:48,036] [INFO] [timer.py:199:stop] epoch=0/micro_step=1790/global_step=1790, RunningAvgSamplesPerSec=217.5806861898962, CurrSamplesPerSec=220.5272369385267, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:50,974] [INFO] [logging.py:96:log_dist] [Rank 0] step=1800, skipped=20, lr=[1.31068028547629e-07, 1.31068028547629e-07], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:50,983] [INFO] [timer.py:199:stop] epoch=0/micro_step=1800/global_step=1800, RunningAvgSamplesPerSec=217.58123717220835, CurrSamplesPerSec=217.04610165051298, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:53,922] [INFO] [logging.py:96:log_dist] [Rank 0] step=1810, skipped=20, lr=[9.104379371500105e-08, 9.104379371500105e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:53,931] [INFO] [timer.py:199:stop] epoch=0/micro_step=1810/global_step=1810, RunningAvgSamplesPerSec=217.58161057766557, CurrSamplesPerSec=211.20186311457996, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:56,876] [INFO] [logging.py:96:log_dist] [Rank 0] step=1820, skipped=20, lr=[5.8280770236518456e-08, 5.8280770236518456e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:56,885] [INFO] [timer.py:199:stop] epoch=0/micro_step=1820/global_step=1820, RunningAvgSamplesPerSec=217.57991981259295, CurrSamplesPerSec=220.0718139507429, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:44:58,608] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:58,608] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:58,615] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:58,615] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:58,615] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:58,615] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:58,615] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:58,615] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:58,615] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:58,615] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:58,615] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:58,615] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:58,615] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:58,615] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:58,615] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:58,615] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:58,616] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:58,616] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:58,616] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:58,616] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:58,615] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:58,616] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:58,616] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:58,616] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:58,616] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:58,616] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:58,616] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:58,616] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:58,616] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:58,616] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:58,616] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations -[2023-04-18 03:44:58,617] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 -[2023-04-18 03:44:59,804] [INFO] [logging.py:96:log_dist] [Rank 0] step=1830, skipped=20, lr=[3.278850886317686e-08, 3.278850886317686e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:44:59,813] [INFO] [timer.py:199:stop] epoch=0/micro_step=1830/global_step=1830, RunningAvgSamplesPerSec=217.5884886843686, CurrSamplesPerSec=221.54592550470556, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -[2023-04-18 03:45:02,742] [INFO] [logging.py:96:log_dist] [Rank 0] step=1840, skipped=20, lr=[1.4574440845649407e-08, 1.4574440845649407e-08], mom=[(0.9, 0.95), (0.9, 0.95)] -[2023-04-18 03:45:02,752] [INFO] [timer.py:199:stop] epoch=0/micro_step=1840/global_step=1840, RunningAvgSamplesPerSec=217.5927207451398, CurrSamplesPerSec=218.17688361995747, MemAllocated=4.34GB, MaxMemAllocated=12.81GB -Epoch 1/1 with loss 0.5754789632299672 -***** Evaluating reward, Epoch 1/1 ***** -chosen_last_scores (higher is better) : 2.243525981903076, acc (higher is better) : 0.6994950175285339 -saving model ... -[2023-04-18 03:45:33,424] [INFO] [launch.py:460:main] Process 10393 exits successfully. -[2023-04-18 03:45:34,426] [INFO] [launch.py:460:main] Process 10388 exits successfully. -[2023-04-18 03:45:34,426] [INFO] [launch.py:460:main] Process 10396 exits successfully. -[2023-04-18 03:45:34,426] [INFO] [launch.py:460:main] Process 10389 exits successfully. -[2023-04-18 03:45:35,428] [INFO] [launch.py:460:main] Process 10401 exits successfully. -[2023-04-18 03:45:36,429] [INFO] [launch.py:460:main] Process 10397 exits successfully. -[2023-04-18 03:45:36,430] [INFO] [launch.py:460:main] Process 10390 exits successfully. -[2023-04-18 03:45:37,431] [INFO] [launch.py:460:main] Process 10399 exits successfully. -[2023-04-18 03:45:38,433] [INFO] [launch.py:460:main] Process 10387 exits successfully. -[2023-04-18 03:45:38,433] [INFO] [launch.py:460:main] Process 10391 exits successfully. -[2023-04-18 03:45:38,433] [INFO] [launch.py:460:main] Process 10398 exits successfully. -[2023-04-18 03:45:38,433] [INFO] [launch.py:460:main] Process 10392 exits successfully. -[2023-04-18 03:45:38,434] [INFO] [launch.py:460:main] Process 10402 exits successfully. -[2023-04-18 03:45:38,434] [INFO] [launch.py:460:main] Process 10400 exits successfully. -[2023-04-18 03:45:38,434] [INFO] [launch.py:460:main] Process 10395 exits successfully. -[2023-04-18 03:45:40,436] [INFO] [launch.py:460:main] Process 10394 exits successfully. diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_node/run_560m.sh b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_node/run_560m.sh new file mode 100644 index 000000000..ed04b553f --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_node/run_560m.sh @@ -0,0 +1,40 @@ +OUTPUT=$1 +ZERO_STAGE=$2 +if [ "$OUTPUT" == "" ]; then + OUTPUT=./output +fi +if [ "$ZERO_STAGE" == "" ]; then + ZERO_STAGE=2 +fi +mkdir -p $OUTPUT + +if [[ $0 =~ ^\/.* ]] +then + script=$0 +else + script=$(pwd)/$0 +fi +path_dir=${script%%training_scripts*} +echo $path_dir + +ds --num_gpus 2 $path_dir'main.py' \ + --data_path $HOME/.cache/huggingface/hub/datasets--Dahoas--full-hh-rlhf \ + --data_split 2,4,4 \ + --model_name_or_path bigscience/bloom-560m \ + --tokenizer_name_or_path bigscience/tokenizer \ + --num_padding_at_beginning 1 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --max_seq_len 512 \ + --learning_rate 5e-5 \ + --weight_decay 0.1 \ + --num_train_epochs 1 \ + --disable_dropout \ + --gradient_accumulation_steps 1 \ + --lr_scheduler_type cosine \ + --num_warmup_steps 0 \ + --seed 1234 \ + --zero_stage $ZERO_STAGE \ + --deepspeed \ + --output_dir $OUTPUT \ + > $OUTPUT/training_step2_bloom_560m_dahoas_full_hh_rlhf.log 2>&1 & diff --git a/applications/DeepSpeed-Chat/training/utils/data/data_utils.py b/applications/DeepSpeed-Chat/training/utils/data/data_utils.py index 91887b7bd..97b1bd3b0 100644 --- a/applications/DeepSpeed-Chat/training/utils/data/data_utils.py +++ b/applications/DeepSpeed-Chat/training/utils/data/data_utils.py @@ -15,40 +15,78 @@ import hashlib from itertools import chain from . import raw_datasets +import re +def get_dataset_name(dataset_name:str): + """if `dataset_name` is a not a path, return it directly; + else, get the repo-id by analyse the `dataset_name` -def get_raw_dataset(dataset_name, output_path, seed, local_rank): + @param dataset_name: str, the path or repo-id of dataset + return : str, the repo-id in huggingface_hub of the dataset + """ + if not os.path.exists(dataset_name): + return dataset_name + # downloaded and cached by huggingface_hub.snapshot_download + if dataset_name[-1] == "/": + dataset_name = dataset_name[:-1] + if "datasets--" in dataset_name: + splitted = re.split(r"/|\\",dataset_name)[-1].split("--") + print(splitted) + if len(splitted) == 3: + dataset_name = "/".join(splitted[-2:]) + else: + dataset_name = splitted[-1] + + # downloaded and cached by datasets.load_dataset + elif "__" in dataset_name: + dataset_name = re.split(r"/|\\",dataset_name)[-1].split("__") + if len(splitted) == 2: + dataset_name = "/".join(splitted[-2:]) + else: + dataset_name = splitted[-1] + + # the user build a local data dir just same as repo-id + elif len(re.split(r"/",dataset_name)) <= 2: + pass + # the user build a local data dir that contains repo-id + else: + dataset_name = "/".join(re.split(r"/|\\",dataset_name)[-2:]) + return dataset_name + +def get_raw_dataset(dataset_name_or_path, output_path, seed, local_rank): + dataset_name = get_dataset_name(dataset_name_or_path) + print(f"the datasetname is {dataset_name}") if "Dahoas/rm-static" in dataset_name: return raw_datasets.DahoasRmstaticDataset(output_path, seed, - local_rank, dataset_name) + local_rank, dataset_name_or_path) elif "Dahoas/full-hh-rlhf" in dataset_name: return raw_datasets.DahoasFullhhrlhfDataset(output_path, seed, - local_rank, dataset_name) + local_rank, dataset_name_or_path) elif "Dahoas/synthetic-instruct-gptj-pairwise" in dataset_name: return raw_datasets.DahoasSyntheticinstructgptjpairwiseDataset( - output_path, seed, local_rank, dataset_name) + output_path, seed, local_rank, dataset_name_or_path) elif "yitingxie/rlhf-reward-datasets" in dataset_name: return raw_datasets.YitingxieRlhfrewarddatasetsDataset( - output_path, seed, local_rank, dataset_name) + output_path, seed, local_rank, dataset_name_or_path) elif "openai/webgpt_comparisons" in dataset_name: return raw_datasets.OpenaiWebgptcomparisonsDataset( - output_path, seed, local_rank, dataset_name) + output_path, seed, local_rank, dataset_name_or_path) elif "stanfordnlp/SHP" in dataset_name: return raw_datasets.StanfordnlpSHPDataset(output_path, seed, - local_rank, dataset_name) + local_rank, dataset_name_or_path) elif "pvduy/sharegpt_alpaca_oa_vicuna_format" in dataset_name: return raw_datasets.PvduySharegptalpacaoavicunaformatDataset( - output_path, seed, local_rank, dataset_name) + output_path, seed, local_rank, dataset_name_or_path) elif "wangrui6/Zhihu-KOL" in dataset_name: return raw_datasets.Wangrui6ZhihuKOLDataset(output_path, seed, - local_rank, dataset_name) + local_rank, dataset_name_or_path) elif "Cohere/miracl-zh-queries-22-12" in dataset_name: return raw_datasets.CohereMiraclzhqueries2212Dataset( - output_path, seed, local_rank, dataset_name) + output_path, seed, local_rank, dataset_name_or_path) elif "Hello-SimpleAI/HC3-Chinese" in dataset_name: return raw_datasets.HelloSimpleAIHC3ChineseDataset( - output_path, seed, local_rank, dataset_name) + output_path, seed, local_rank, dataset_name_or_path) elif "mkqa-Chinese" in dataset_name: return raw_datasets.MkqaChineseDataset(output_path, seed, local_rank, "mkqa") @@ -57,13 +95,13 @@ def get_raw_dataset(dataset_name, output_path, seed, local_rank): "mkqa") elif "Cohere/miracl-ja-queries-22-12" in dataset_name: return raw_datasets.CohereMiracljaqueries2212Dataset( - output_path, seed, local_rank, dataset_name) + output_path, seed, local_rank, dataset_name_or_path) elif "lmqg/qg_jaquad" in dataset_name: return raw_datasets.LmqgQgjaquadDataset(output_path, seed, local_rank, - dataset_name) + dataset_name_or_path) elif "lmqg/qag_jaquad" in dataset_name: return raw_datasets.LmqgQagjaquadDataset(output_path, seed, local_rank, - dataset_name) + dataset_name_or_path) elif "local/jsonfile" in dataset_name: chat_path = os.path.abspath( os.path.join(os.path.dirname(__file__), os.path.pardir, diff --git a/applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py b/applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py index 3c84f4b07..374b16b7a 100644 --- a/applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py +++ b/applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 # DeepSpeed Team -from datasets import load_dataset +from datasets import load_dataset,DownloadConfig from torch.utils.data import Subset import re @@ -16,7 +16,8 @@ def __init__(self, output_path, seed, local_rank, dataset_name): self.seed = seed self.local_rank = local_rank if not dataset_name == 'local/jsonfile': - self.raw_datasets = load_dataset(dataset_name) + config = DownloadConfig(resume_download=True, max_retries=300) + self.raw_datasets = load_dataset(dataset_name,download_config=config) def get_train_data(self): return diff --git a/applications/DeepSpeed-Chat/training/utils/utils.py b/applications/DeepSpeed-Chat/training/utils/utils.py index 2a6e47891..efe5c3fe2 100644 --- a/applications/DeepSpeed-Chat/training/utils/utils.py +++ b/applications/DeepSpeed-Chat/training/utils/utils.py @@ -92,7 +92,11 @@ def save_hf_format(model, tokenizer, args, sub_folder=""): del save_dict[key] torch.save(save_dict, output_model_file) model_to_save.config.to_json_file(output_config_file) - tokenizer.save_vocabulary(output_dir) + try: + tokenizer.save_vocabulary(output_dir) + except NotImplementedError: + print(f"{tokenizer.__class__} with repo-id: {tokenizer.name_or_path} " + "dose not support `save_vocabulary` method") def set_random_seed(seed):