5 changes: 5 additions & 0 deletions megatron/arguments.py
@@ -648,6 +648,11 @@ def _add_data_args(parser):
'1) a single data path, 2) multiple datasets in the'
'form: dataset1-weight dataset1-path dataset2-weight '
'dataset2-path ...')
group.add_argument('--valid-data', nargs='*', default=None,
help='Path to the validation dataset. If not provided, '
'data will be selected from --data-path based on --split. '
'Accepted format: dataset1-weight dataset1-path '
'dataset2-weight dataset2-path ...')
group.add_argument('--split', type=str, default='969, 30, 1',
help='Comma-separated list of proportions for training,'
' validation, and test split. For example the split '
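For reference, a minimal stand-alone sketch (not part of this diff) of how the new flag parses: nargs='*' collects the alternating weight/path tokens into a flat list on args.valid_data, following the same convention as --data-path, and leaves it as None when the flag is omitted so the existing --split behaviour applies.

import argparse

# Hypothetical throwaway parser, only to show the parsing behaviour of the new flag.
parser = argparse.ArgumentParser()
parser.add_argument('--valid-data', nargs='*', default=None)

args = parser.parse_args(['--valid-data', '1.0', 'dataset-0_text_document'])
print(args.valid_data)  # ['1.0', 'dataset-0_text_document']

args = parser.parse_args([])
print(args.valid_data)  # None -> validation data still comes from --data-path / --split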
19 changes: 17 additions & 2 deletions megatron/data/gpt_dataset.py
@@ -30,7 +30,8 @@

def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
train_valid_test_num_samples,
- seq_length, seed, skip_warmup):
+ seq_length, seed, skip_warmup,
+ valid_data_prefix=None):
"""Build train, valid, and test datasets."""

# Single dataset.
@@ -62,13 +63,27 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
if test_ds:
test_datasets.append(test_ds)

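# If --valid-data was given, build the validation set from those prefixes only:
# the '0,100,0' split below routes every document to the validation split, overriding --split.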
valid_weights = weights
if valid_data_prefix is not None:
valid_datasets = []
valid_output = get_datasets_weights_and_num_samples(valid_data_prefix,
[0, train_valid_test_num_samples[1], 0])
valid_prefixes, valid_weights, valid_datasets_samples = valid_output
for i in range(len(valid_prefixes)):
train_ds, valid_ds, test_ds = _build_train_valid_test_datasets(
valid_prefixes[i], data_impl, '0,100,0',
valid_datasets_samples[i],
seq_length, seed, skip_warmup)
if valid_ds:
valid_datasets.append(valid_ds)

# Blend.
blending_train_dataset = None
if train_datasets:
blending_train_dataset = BlendableDataset(train_datasets, weights)
blending_valid_dataset = None
if valid_datasets:
- blending_valid_dataset = BlendableDataset(valid_datasets, weights)
+ blending_valid_dataset = BlendableDataset(valid_datasets, valid_weights)
blending_test_dataset = None
if test_datasets:
blending_test_dataset = BlendableDataset(test_datasets, weights)
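A rough, self-contained sketch (not the PR's code) of what the new branch does with the validation budget: get_datasets_weights_and_num_samples is handed [0, train_valid_test_num_samples[1], 0], so only the validation share of the sample budget is spread across the --valid-data prefixes in proportion to their normalized weights, and each prefix is then built with the split string '0,100,0' so all of its documents land in the validation split. The helper below only mimics that weighting (the real helper also pads the counts slightly).

import math

def split_valid_budget(valid_data, num_valid_samples):
    # valid_data alternates weight/path, e.g. ['0.3', 'ds-a', '0.7', 'ds-b'],
    # exactly like the --valid-data / --data-path command-line format.
    weights = [float(w) for w in valid_data[0::2]]
    prefixes = list(valid_data[1::2])
    total = sum(weights)
    weights = [w / total for w in weights]                        # normalize
    samples = [math.ceil(w * num_valid_samples) for w in weights]
    return prefixes, weights, samples

print(split_valid_budget(['1.0', 'dataset-0_text_document'], 1000))
# (['dataset-0_text_document'], [1.0], [1000])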
3 changes: 2 additions & 1 deletion pretrain_gpt.py
@@ -191,7 +191,8 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
train_valid_test_num_samples=train_val_test_num_samples,
seq_length=args.seq_length,
seed=args.seed,
- skip_warmup=(not args.mmap_warmup))
+ skip_warmup=(not args.mmap_warmup),
+ valid_data_prefix=args.valid_data)
print_rank_0("> finished creating GPT datasets ...")

return train_ds, valid_ds, test_ds
184 changes: 184 additions & 0 deletions scripts/test_multiple_dataset_sampling/test_valid_sampling.sh
@@ -0,0 +1,184 @@
EXP_PATH="./dumped/test/"
mkdir -p $EXP_PATH
BASE_DATA_PATH=$EXP_PATH
INPUT_PATH=$EXP_PATH
OUTPUT_PATH=$EXP_PATH

wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -P ${BASE_DATA_PATH}
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -P ${BASE_DATA_PATH}

python scripts/test_multiple_dataset_sampling/create_dummy_dataset.py --dir ${INPUT_PATH}


python tools/preprocess_data.py \
--input ${INPUT_PATH}/dataset_0.json \
--output-prefix ${OUTPUT_PATH}/dataset-0 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod

python tools/preprocess_data.py \
--input ${INPUT_PATH}/dataset_1.json \
--output-prefix ${OUTPUT_PATH}/dataset-1 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod

python tools/preprocess_data.py \
--input ${INPUT_PATH}/dataset_2.json \
--output-prefix ${OUTPUT_PATH}/dataset-2 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod

python tools/preprocess_data.py \
--input ${INPUT_PATH}/dataset_3.json \
--output-prefix ${OUTPUT_PATH}/dataset-3 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod

python tools/preprocess_data.py \
--input ${INPUT_PATH}/dataset_4.json \
--output-prefix ${OUTPUT_PATH}/dataset-4 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod


DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p ${BASE_DATA_PATH}/logs

DATASET_0="${OUTPUT_PATH}/dataset-0_text_document"
DATASET_1="${OUTPUT_PATH}/dataset-1_text_document"
DATASET_2="${OUTPUT_PATH}/dataset-2_text_document"
DATASET_3="${OUTPUT_PATH}/dataset-3_text_document"
DATASET_4="${OUTPUT_PATH}/dataset-4_text_document"
DATASET="0.1 ${DATASET_0} 0.25 ${DATASET_1} 0.2 ${DATASET_2} 0.15 ${DATASET_3} 0.3 ${DATASET_4}"
VALID_DATASET="1.0 ${DATASET_0}"
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt

CONFIG_JSON="${EXP_PATH}/ds_config.json"
touch $CONFIG_JSON

USE_DEEPSPEED=1
ZERO_STAGE=0

# Model configuration for this sampling test
TP=4
PP=16
HIDDEN=1024
LAYERS=24
SEQ=128
GLOBAL_BATCH=16
WORKER_STR=""

MICRO_BATCH=8

while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--no-deepspeed)
USE_DEEPSPEED=0;
shift
;;
-z|--zero-stage)
ZERO_STAGE=$2;
shift 2
;;
*)
echo "Unknown argument(s): $1"
exit 1
;;
esac
done

options=" \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--num-layers $LAYERS \
--hidden-size $HIDDEN \
--num-attention-heads 32 \
--seq-length $SEQ \
--loss-scale 12 \
--max-position-embeddings $SEQ \
--micro-batch-size $MICRO_BATCH \
--global-batch-size $GLOBAL_BATCH \
--train-iters 1000 \
--lr 6.0e-5 \
--min-lr 6.0e-6 \
--lr-decay-style cosine \
--log-interval 1 \
--eval-iters 100 \
--eval-interval 40 \
--data-path ${DATASET} \
--valid-data ${VALID_DATASET} \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--save-interval 1000 \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.006 \
--fp16 \
--checkpoint-activations
"


if [[ ${USE_DEEPSPEED} -eq 1 ]]; then
echo "Using DeepSpeed"
options="${options} \
--deepspeed \
--deepspeed_config=${CONFIG_JSON} \
--zero-stage=${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
fi


cat <<EOT > $CONFIG_JSON
{
"train_batch_size" : $GLOBAL_BATCH,
"train_micro_batch_size_per_gpu": $MICRO_BATCH,
"steps_per_print": 1,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"gradient_clipping": 1.0,
"prescale_gradients": true,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"wall_clock_breakdown" : true
}
EOT

# run_cmd="deepspeed $WORKER_STR ${DIR}/test_sampling.py $@ ${options}"
run_cmd="deepspeed $WORKER_STR pretrain_gpt.py $@ ${options}"

echo ${run_cmd}
eval ${run_cmd}

set +x
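As a worked example of the validation budget implied by the settings above, assuming the usual Megatron-LM accounting of validation samples (periodic evaluations plus a final one; this formula is an assumption about the surrounding training loop, not something this diff changes):

# Hypothetical back-of-the-envelope check for the settings above
# (--train-iters 1000, --eval-interval 40, --eval-iters 100, GLOBAL_BATCH=16).
train_iters, eval_interval, eval_iters, global_batch = 1000, 40, 100, 16

eval_runs = train_iters // eval_interval + 1      # periodic evaluations + final evaluation
valid_samples = eval_runs * eval_iters * global_batch
print(valid_samples)  # 41600 samples, all drawn from dataset-0 since VALID_DATASET="1.0 ${DATASET_0}"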