diff --git a/megatron/arguments.py b/megatron/arguments.py
index 3a71f5a1c..071396944 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -648,6 +648,11 @@ def _add_data_args(parser):
                        '1) a single data path, 2) multiple datasets in the'
                        'form: dataset1-weight dataset1-path dataset2-weight '
                        'dataset2-path ...')
+    group.add_argument('--valid-data', nargs='*', default=None,
+                       help='Path to the validation dataset. If not provided, '
+                       'validation data will be selected from --data-path based on --split. '
+                       'Accepted format: dataset1-weight dataset1-path '
+                       'dataset2-weight dataset2-path ...')
     group.add_argument('--split', type=str, default='969, 30, 1',
                        help='Comma-separated list of proportions for training,'
                        ' validation, and test split. For example the split '
diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py
index e605c216e..ce7ed57ad 100644
--- a/megatron/data/gpt_dataset.py
+++ b/megatron/data/gpt_dataset.py
@@ -30,7 +30,8 @@
 
 def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
                                     train_valid_test_num_samples,
-                                    seq_length, seed, skip_warmup):
+                                    seq_length, seed, skip_warmup,
+                                    valid_data_prefix=None):
     """Build train, valid, and test datasets."""
 
     # Single dataset.
@@ -62,13 +63,27 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
         if test_ds:
             test_datasets.append(test_ds)
 
+    valid_weights = weights
+    if valid_data_prefix is not None:
+        valid_datasets = []
+        valid_output = get_datasets_weights_and_num_samples(valid_data_prefix,
+                            [0, train_valid_test_num_samples[1], 0])
+        valid_prefixes, valid_weights, valid_datasets_samples = valid_output
+        for i in range(len(valid_prefixes)):
+            train_ds, valid_ds, test_ds = _build_train_valid_test_datasets(
+                valid_prefixes[i], data_impl, '0,100,0',
+                valid_datasets_samples[i],
+                seq_length, seed, skip_warmup)
+            if valid_ds:
+                valid_datasets.append(valid_ds)
+
     # Blend.
     blending_train_dataset = None
     if train_datasets:
         blending_train_dataset = BlendableDataset(train_datasets, weights)
     blending_valid_dataset = None
     if valid_datasets:
-        blending_valid_dataset = BlendableDataset(valid_datasets, weights)
+        blending_valid_dataset = BlendableDataset(valid_datasets, valid_weights)
     blending_test_dataset = None
     if test_datasets:
         blending_test_dataset = BlendableDataset(test_datasets, weights)
diff --git a/pretrain_gpt.py b/pretrain_gpt.py
index 0137cad5e..3bdcedfe8 100644
--- a/pretrain_gpt.py
+++ b/pretrain_gpt.py
@@ -191,7 +191,8 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
         train_valid_test_num_samples=train_val_test_num_samples,
         seq_length=args.seq_length,
         seed=args.seed,
-        skip_warmup=(not args.mmap_warmup))
+        skip_warmup=(not args.mmap_warmup),
+        valid_data_prefix=args.valid_data)
     print_rank_0("> finished creating GPT datasets ...")
 
     return train_ds, valid_ds, test_ds
diff --git a/scripts/test_multiple_dataset_sampling/test_valid_sampling.sh b/scripts/test_multiple_dataset_sampling/test_valid_sampling.sh
new file mode 100644
index 000000000..b9904758a
--- /dev/null
+++ b/scripts/test_multiple_dataset_sampling/test_valid_sampling.sh
@@ -0,0 +1,184 @@
+EXP_PATH="./dumped/test/"
+mkdir -p $EXP_PATH
+BASE_DATA_PATH=$EXP_PATH
+INPUT_PATH=$EXP_PATH
+OUTPUT_PATH=$EXP_PATH
+
+wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -P ${BASE_DATA_PATH}
+wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -P ${BASE_DATA_PATH}
+
+python scripts/test_multiple_dataset_sampling/create_dummy_dataset.py --dir ${INPUT_PATH}
+
+
+python tools/preprocess_data.py \
+    --input ${INPUT_PATH}/dataset_0.json \
+    --output-prefix ${OUTPUT_PATH}/dataset-0 \
+    --vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
+    --dataset-impl mmap \
+    --tokenizer-type GPT2BPETokenizer \
+    --merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
+    --append-eod
+
+python tools/preprocess_data.py \
+    --input ${INPUT_PATH}/dataset_1.json \
+    --output-prefix ${OUTPUT_PATH}/dataset-1 \
+    --vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
+    --dataset-impl mmap \
+    --tokenizer-type GPT2BPETokenizer \
+    --merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
+    --append-eod
+
+python tools/preprocess_data.py \
+    --input ${INPUT_PATH}/dataset_2.json \
+    --output-prefix ${OUTPUT_PATH}/dataset-2 \
+    --vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
+    --dataset-impl mmap \
+    --tokenizer-type GPT2BPETokenizer \
+    --merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
+    --append-eod
+
+python tools/preprocess_data.py \
+    --input ${INPUT_PATH}/dataset_3.json \
+    --output-prefix ${OUTPUT_PATH}/dataset-3 \
+    --vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
+    --dataset-impl mmap \
+    --tokenizer-type GPT2BPETokenizer \
+    --merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
+    --append-eod
+
+python tools/preprocess_data.py \
+    --input ${INPUT_PATH}/dataset_4.json \
+    --output-prefix ${OUTPUT_PATH}/dataset-4 \
+    --vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
+    --dataset-impl mmap \
+    --tokenizer-type GPT2BPETokenizer \
+    --merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
+    --append-eod
+
+
+DIR=`pwd`
+DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
+mkdir -p ${BASE_DATA_PATH}/logs
+
+DATASET_0="${OUTPUT_PATH}/dataset-0_text_document"
+DATASET_1="${OUTPUT_PATH}/dataset-1_text_document"
+DATASET_2="${OUTPUT_PATH}/dataset-2_text_document"
+DATASET_3="${OUTPUT_PATH}/dataset-3_text_document"
+DATASET_4="${OUTPUT_PATH}/dataset-4_text_document"
+DATASET="0.1 ${DATASET_0} 0.25 ${DATASET_1} 0.2 ${DATASET_2} 0.15 ${DATASET_3} 0.3 ${DATASET_4}"
+VALID_DATASET="1.0 ${DATASET_0}"
+VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
+MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
+
+CONFIG_JSON="${EXP_PATH}/ds_config.json"
+touch $CONFIG_JSON
+
+USE_DEEPSPEED=1
+ZERO_STAGE=0
+
+# 52B
+TP=4
+PP=16
+HIDDEN=1024
+LAYERS=24
+SEQ=128
+GLOBAL_BATCH=16
+WORKER_STR=""
+
+MICRO_BATCH=8
+
+while [[ $# -gt 0 ]]
+do
+key="$1"
+case $key in
+    --no-deepspeed)
+    USE_DEEPSPEED=0;
+    shift
+    ;;
+    -z|--zero-stage)
+    ZERO_STAGE=$2;
+    shift
+    ;;
+    *)
+    echo "Unknown argument(s)"
+    usage
+    exit 1
+    shift
+    ;;
+esac
+done
+
+options=" \
+        --tensor-model-parallel-size $TP \
+        --pipeline-model-parallel-size $PP \
+        --num-layers $LAYERS \
+        --hidden-size $HIDDEN \
+        --num-attention-heads 32 \
+        --seq-length $SEQ \
+        --loss-scale 12 \
+        --max-position-embeddings $SEQ \
+        --micro-batch-size $MICRO_BATCH \
+        --global-batch-size $GLOBAL_BATCH \
+        --train-iters 1000 \
+        --lr 6.0e-5 \
+        --min-lr 6.0e-6 \
+        --lr-decay-style cosine \
+        --log-interval 1 \
+        --eval-iters 100 \
+        --eval-interval 40 \
+        --data-path ${DATASET} \
+        --valid-data ${VALID_DATASET} \
+        --vocab-file ${VOCAB_PATH} \
+        --merge-file ${MERGE_PATH} \
+        --save-interval 1000 \
+        --split 98,2,0 \
+        --clip-grad 1.0 \
+        --weight-decay 0.1 \
+        --adam-beta1 0.9 \
+        --adam-beta2 0.95 \
+        --init-method-std 0.006 \
+        --fp16 \
+        --checkpoint-activations
+        "
+
+
+if [[ ${USE_DEEPSPEED} -eq 1 ]]; then
+    echo "Using DeepSpeed"
+    options="${options} \
+        --deepspeed \
+        --deepspeed_config=${CONFIG_JSON} \
+        --zero-stage=${ZERO_STAGE} \
+        --deepspeed-activation-checkpointing \
+    "
+fi
+
+
+cat <<EOT > $CONFIG_JSON
+{
+  "train_batch_size" : $GLOBAL_BATCH,
+  "train_micro_batch_size_per_gpu": $MICRO_BATCH,
+  "steps_per_print": 1,
+  "zero_optimization": {
+    "stage": $ZERO_STAGE
+  },
+  "gradient_clipping": 1.0,
+  "prescale_gradients": true,
+  "fp16": {
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 500,
+    "hysteresis": 2,
+    "min_loss_scale": 1,
+    "initial_scale_power": 12
+  },
+  "wall_clock_breakdown" : true
+}
+EOT
+
+# run_cmd="deepspeed $WORKER_STR ${DIR}/test_sampling.py $@ ${options}"
+run_cmd="deepspeed $WORKER_STR pretrain_gpt.py $@ ${options}"
+
+echo ${run_cmd}
+eval ${run_cmd}
+
+set +x
\ No newline at end of file
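
For quick reference, below is a minimal sketch of how the new keyword is consumed, mirroring the pretrain_gpt.py change above. The dataset prefixes, weights, and sample counts are placeholders (outputs of tools/preprocess_data.py would normally go here), and the call assumes the usual Megatron setup has already run, as it does inside pretrain_gpt.py; this is an illustration of the patch's behavior, not additional code in the patch.

    from megatron.data.gpt_dataset import build_train_valid_test_datasets

    # Training data: blended 70/30 from two preprocessed prefixes (--data-path).
    # Validation data: taken entirely from a held-out prefix (--valid-data),
    # built internally with a '0,100,0' split so none of it overlaps training.
    train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
        data_prefix=['0.7', '/data/corpus-a_text_document',
                     '0.3', '/data/corpus-b_text_document'],
        data_impl='mmap',
        splits_string='98,2,0',
        train_valid_test_num_samples=[1000, 100, 0],
        seq_length=128,
        seed=1234,
        skip_warmup=True,
        valid_data_prefix=['1.0', '/data/heldout_text_document'])
    # valid_ds is now blended only from the --valid-data prefixes using
    # valid_weights; train_ds still follows --data-path and the training
    # share of --split, exactly as before the patch.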