diff --git a/scripts/fsmt/convert-allenai-wmt16.sh b/scripts/fsmt/convert-allenai-wmt16.sh new file mode 100755 index 000000000000..549919032c89 --- /dev/null +++ b/scripts/fsmt/convert-allenai-wmt16.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash + +# this script acquires data and converts it to fsmt model +# it covers: +# - allenai/wmt16-en-de-dist-12-1 +# - allenai/wmt16-en-de-dist-6-1 +# - allenai/wmt16-en-de-12-1 + +# this script needs to be run from the top level of the transformers repo +if [ ! -d "src/transformers" ]; then + echo "Error: This script needs to be run from the top of the transformers repo" + exit 1 +fi + +mkdir data + +# get data (run once) + +cd data +gdown 'https://drive.google.com/uc?id=1x_G2cjvM1nW5hjAB8-vWxRqtQTlmIaQU' +gdown 'https://drive.google.com/uc?id=1oA2aqZlVNj5FarxBlNXEHpBS4lRetTzU' +gdown 'https://drive.google.com/uc?id=1Wup2D318QYBFPW_NKI1mfP_hXOfmUI9r' +tar -xvzf trans_ende_12-1_0.2.tar.gz +tar -xvzf trans_ende-dist_12-1_0.2.tar.gz +tar -xvzf trans_ende-dist_6-1_0.2.tar.gz +gdown 'https://drive.google.com/uc?id=1mNufoynJ9-Zy1kJh2TA_lHm2squji0i9' +gdown 'https://drive.google.com/uc?id=1iO7um-HWoNoRKDtw27YUSgyeubn9uXqj' +tar -xvzf wmt16.en-de.deep-shallow.dist.tar.gz +tar -xvzf wmt16.en-de.deep-shallow.tar.gz +cp wmt16.en-de.deep-shallow/data-bin/dict.*.txt trans_ende_12-1_0.2 +cp wmt16.en-de.deep-shallow.dist/data-bin/dict.*.txt trans_ende-dist_12-1_0.2 +cp wmt16.en-de.deep-shallow.dist/data-bin/dict.*.txt trans_ende-dist_6-1_0.2 +cp wmt16.en-de.deep-shallow/bpecodes trans_ende_12-1_0.2 +cp wmt16.en-de.deep-shallow.dist/bpecodes trans_ende-dist_12-1_0.2 +cp wmt16.en-de.deep-shallow.dist/bpecodes trans_ende-dist_6-1_0.2 +cd - + +# run conversions and uploads + +PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/trans_ende-dist_12-1_0.2/checkpoint_top5_average.pt --pytorch_dump_folder_path data/wmt16-en-de-dist-12-1 + +PYTHONPATH="src" python 
src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/trans_ende-dist_6-1_0.2/checkpoint_top5_average.pt --pytorch_dump_folder_path data/wmt16-en-de-dist-6-1 + +PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/trans_ende_12-1_0.2/checkpoint_top5_average.pt --pytorch_dump_folder_path data/wmt16-en-de-12-1 + + +# upload +cd data +transformers-cli upload -y wmt16-en-de-dist-12-1 +transformers-cli upload -y wmt16-en-de-dist-6-1 +transformers-cli upload -y wmt16-en-de-12-1 +cd - + + +# if updating just small files and not the large models, here is a script to generate the right commands: +perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for ("wmt16-en-de-dist-12-1", "wmt16-en-de-dist-6-1", "wmt16-en-de-12-1")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json +# add/remove files as needed + +# Caching note: Unfortunately due to CDN caching the uploaded model may be unavailable for up to 24hs after upload +# So the only way to start using the new model sooner is either: +# 1. download it to a local path and use that path as model_name +# 2. make sure you use: from_pretrained(..., use_cdn=False) everywhere diff --git a/scripts/fsmt/convert-allenai-wmt19.sh b/scripts/fsmt/convert-allenai-wmt19.sh new file mode 100755 index 000000000000..3ece67d214b2 --- /dev/null +++ b/scripts/fsmt/convert-allenai-wmt19.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash + +# this script acquires data and converts it to fsmt model +# it covers: +# - allenai/wmt19-de-en-6-6-base +# - allenai/wmt19-de-en-6-6-big + +# this script needs to be run from the top level of the transformers repo +if [ ! 
-d "src/transformers" ]; then + echo "Error: This script needs to be run from the top of the transformers repo" + exit 1 +fi + +mkdir data + +# get data (run once) + +cd data +gdown 'https://drive.google.com/uc?id=1j6z9fYdlUyOYsh7KJoumRlr1yHczxR5T' +gdown 'https://drive.google.com/uc?id=1yT7ZjqfvUYOBXvMjeY8uGRHQFWoSo8Q5' +gdown 'https://drive.google.com/uc?id=15gAzHeRUCs-QV8vHeTReMPEh1j8excNE' +tar -xvzf wmt19.de-en.tar.gz +tar -xvzf wmt19_deen_base_dr0.1_1.tar.gz +tar -xvzf wmt19_deen_big_dr0.1_2.tar.gz +cp wmt19.de-en/data-bin/dict.*.txt wmt19_deen_base_dr0.1_1 +cp wmt19.de-en/data-bin/dict.*.txt wmt19_deen_big_dr0.1_2 +cd - + +# run conversions and uploads + +PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19_deen_base_dr0.1_1/checkpoint_last3_avg.pt --pytorch_dump_folder_path data/wmt19-de-en-6-6-base + +PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19_deen_big_dr0.1_2/checkpoint_last3_avg.pt --pytorch_dump_folder_path data/wmt19-de-en-6-6-big + + +# upload +cd data +transformers-cli upload -y wmt19-de-en-6-6-base +transformers-cli upload -y wmt19-de-en-6-6-big +cd - + + +# if updating just small files and not the large models, here is a script to generate the right commands: +perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for ("wmt19-de-en-6-6-base", "wmt19-de-en-6-6-big")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json +# add/remove files as needed + +# Caching note: Unfortunately due to CDN caching the uploaded model may be unavailable for up to 24hs after upload +# So the only way to start using the new model sooner is either: +# 1. download it to a local path and use that path as model_name +# 2. 
make sure you use: from_pretrained(..., use_cdn=False) everywhere diff --git a/scripts/fsmt/convert-facebook-wmt19.sh b/scripts/fsmt/convert-facebook-wmt19.sh new file mode 100755 index 000000000000..89621ae6ff8f --- /dev/null +++ b/scripts/fsmt/convert-facebook-wmt19.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash + +# this script acquires data and converts it to fsmt model +# it covers: +# - facebook/wmt19-ru-en +# - facebook/wmt19-en-ru +# - facebook/wmt19-de-en +# - facebook/wmt19-en-de + +# this script needs to be run from the top level of the transformers repo +if [ ! -d "src/transformers" ]; then + echo "Error: This script needs to be run from the top of the transformers repo" + exit 1 +fi + +mkdir data + +# get data (run once) + +cd data +wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz +wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz +wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz +wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz +tar -xvzf wmt19.en-de.joined-dict.ensemble.tar.gz +tar -xvzf wmt19.de-en.joined-dict.ensemble.tar.gz +tar -xvzf wmt19.en-ru.ensemble.tar.gz +tar -xvzf wmt19.ru-en.ensemble.tar.gz +cd - + +# run conversions and uploads + +export PAIR=ru-en +PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR + +export PAIR=en-ru +PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR + +export PAIR=de-en +PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.joined-dict.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR + +export 
PAIR=en-de +PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.joined-dict.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR + + +# upload +cd data +transformers-cli upload -y wmt19-ru-en +transformers-cli upload -y wmt19-en-ru +transformers-cli upload -y wmt19-de-en +transformers-cli upload -y wmt19-en-de +cd - + +# if updating just small files and not the large models, here is a script to generate the right commands: +perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for map { "wmt19-$_" } ("en-ru", "ru-en", "de-en", "en-de")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json +# add/remove files as needed + +# Caching note: Unfortunately due to CDN caching the uploaded model may be unavailable for up to 24hs after upload +# So the only way to start using the new model sooner is either: +# 1. download it to a local path and use that path as model_name +# 2. make sure you use: from_pretrained(..., use_cdn=False) everywhere diff --git a/scripts/fsmt/eval-allenai-wmt16.sh b/scripts/fsmt/eval-allenai-wmt16.sh new file mode 100755 index 000000000000..513245a485d1 --- /dev/null +++ b/scripts/fsmt/eval-allenai-wmt16.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash + +# this script evals the following fsmt models +# it covers: +# - allenai/wmt16-en-de-dist-12-1 +# - allenai/wmt16-en-de-dist-6-1 +# - allenai/wmt16-en-de-12-1 + +# this script needs to be run from the top level of the transformers repo +if [ ! 
-d "src/transformers" ]; then + echo "Error: This script needs to be run from the top of the transformers repo" + exit 1 +fi + +# In these scripts you may have to lower BS if you get CUDA OOM (or increase it if you have a large GPU) + +### Normal eval ### + +export PAIR=en-de +export DATA_DIR=data/$PAIR +export SAVE_DIR=data/$PAIR +export BS=64 +export NUM_BEAMS=5 +mkdir -p $DATA_DIR +sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source +sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target + +MODEL_PATH=allenai/wmt16-en-de-dist-12-1 +echo $PAIR $MODEL_PATH +PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS + +MODEL_PATH=allenai/wmt16-en-de-dist-6-1 +echo $PAIR $MODEL_PATH +PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS + +MODEL_PATH=allenai/wmt16-en-de-12-1 +echo $PAIR $MODEL_PATH +PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS + + + +### Searching hparams eval ### + + +export PAIR=en-de +export DATA_DIR=data/$PAIR +export SAVE_DIR=data/$PAIR +export BS=32 +export NUM_BEAMS=5 +mkdir -p $DATA_DIR +sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source +sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target + +MODEL_PATH=allenai/wmt16-en-de-dist-12-1 +echo $PAIR $MODEL_PATH +PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source 
$SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1" + + +MODEL_PATH=allenai/wmt16-en-de-dist-6-1 +echo $PAIR $MODEL_PATH +PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1" + + +MODEL_PATH=allenai/wmt16-en-de-12-1 +echo $PAIR $MODEL_PATH +PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1" diff --git a/scripts/fsmt/eval-allenai-wmt19.sh b/scripts/fsmt/eval-allenai-wmt19.sh new file mode 100755 index 000000000000..07da60b265a9 --- /dev/null +++ b/scripts/fsmt/eval-allenai-wmt19.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash + +# this script evals the following fsmt models +# it covers: +# - allenai/wmt19-de-en-6-6-base +# - allenai/wmt19-de-en-6-6-big + +# this script needs to be run from the top level of the transformers repo +if [ ! 
-d "src/transformers" ]; then + echo "Error: This script needs to be run from the top of the transformers repo" + exit 1 +fi + +# In these scripts you may have to lower BS if you get CUDA OOM (or increase it if you have a large GPU) + +### Normal eval ### + +export PAIR=de-en +export DATA_DIR=data/$PAIR +export SAVE_DIR=data/$PAIR +export BS=64 +export NUM_BEAMS=5 +mkdir -p $DATA_DIR +sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source +sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target + +MODEL_PATH=allenai/wmt19-de-en-6-6-base +echo $PAIR $MODEL_PATH +PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS + +MODEL_PATH=allenai/wmt19-de-en-6-6-big +echo $PAIR $MODEL_PATH +PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS + + + +### Searching hparams eval ### + +export PAIR=de-en +export DATA_DIR=data/$PAIR +export SAVE_DIR=data/$PAIR +export BS=16 +export NUM_BEAMS=5 +mkdir -p $DATA_DIR +sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source +sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target + +MODEL_PATH=allenai/wmt19-de-en-6-6-base +echo $PAIR $MODEL_PATH +PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1" + +MODEL_PATH=allenai/wmt19-de-en-6-6-big +echo $PAIR $MODEL_PATH +PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py 
$MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1" diff --git a/scripts/fsmt/eval-facebook-wmt19.sh b/scripts/fsmt/eval-facebook-wmt19.sh new file mode 100755 index 000000000000..a47051489f53 --- /dev/null +++ b/scripts/fsmt/eval-facebook-wmt19.sh @@ -0,0 +1,148 @@ +#!/usr/bin/env bash + +# this script evals the following fsmt models +# it covers: +# - facebook/wmt19-ru-en +# - facebook/wmt19-en-ru +# - facebook/wmt19-de-en +# - facebook/wmt19-en-de + + +# this script needs to be run from the top level of the transformers repo +if [ ! -d "src/transformers" ]; then + echo "Error: This script needs to be run from the top of the transformers repo" + exit 1 +fi + + +# In these scripts you may have to lower BS if you get CUDA OOM (or increase it if you have a large GPU) + +### a short estimate version for quick testing ### + +export PAIR=en-ru +export DATA_DIR=data/$PAIR +export SAVE_DIR=data/$PAIR +export BS=8 +export NUM_BEAMS=8 +mkdir -p $DATA_DIR +sacrebleu -t wmt19 -l $PAIR --echo src | head -10 > $DATA_DIR/val.source +sacrebleu -t wmt19 -l $PAIR --echo ref | head -10 > $DATA_DIR/val.target +echo $PAIR +PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS + + + +### Normal eval ### + +# ru-en + +export PAIR=ru-en +export DATA_DIR=data/$PAIR +export SAVE_DIR=data/$PAIR +export BS=8 +export NUM_BEAMS=50 +mkdir -p $DATA_DIR +sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source +sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target +PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source 
$SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS + + +# (target BLEU: 41.3 http://matrix.statmt.org/matrix/output/1907?run_id=6937) + + +# en-ru + +export PAIR=en-ru +export DATA_DIR=data/$PAIR +export SAVE_DIR=data/$PAIR +export BS=8 +export NUM_BEAMS=50 +mkdir -p $DATA_DIR +sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source +sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target +echo $PAIR +PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS + +# (target BLEU: 36.4 http://matrix.statmt.org/matrix/output/1914?score_id=37605) + + + +# en-de + +export PAIR=en-de +export DATA_DIR=data/$PAIR +export SAVE_DIR=data/$PAIR +export BS=8 +mkdir -p $DATA_DIR +sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source +sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target +echo $PAIR +PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS + +# (target BLEU: 43.1 http://matrix.statmt.org/matrix/output/1909?run_id=6862) + + +# de-en + +export PAIR=de-en +export DATA_DIR=data/$PAIR +export SAVE_DIR=data/$PAIR +export BS=8 +export NUM_BEAMS=50 +mkdir -p $DATA_DIR +sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source +sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target +echo $PAIR +PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json 
--bs $BS --task translation --num_beams $NUM_BEAMS + +# (target BLEU: 42.3 http://matrix.statmt.org/matrix/output/1902?run_id=6750) + + +### Searching hparams eval ### + +# ru-en + +export PAIR=ru-en +export DATA_DIR=data/$PAIR +export SAVE_DIR=data/$PAIR +export BS=32 +mkdir -p $DATA_DIR +sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source +sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target +CUDA_VISIBLE_DEVICES="0" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1" + + +# en-ru + +export PAIR=en-ru +export DATA_DIR=data/$PAIR +export SAVE_DIR=data/$PAIR +export BS=16 +mkdir -p $DATA_DIR +mkdir -p $DATA_DIR +sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source +sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target +CUDA_VISIBLE_DEVICES="0" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:8:11:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1 early_stopping=true:false" + +# en-de + +export PAIR=en-de +export DATA_DIR=data/$PAIR +export SAVE_DIR=data/$PAIR +export BS=16 +mkdir -p $DATA_DIR +sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source +sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target +CUDA_VISIBLE_DEVICES="1" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:8:11:15 
length_penalty=0.6:0.7:0.8:0.9:1.0:1.1 early_stopping=true:false" + +# de-en + +export PAIR=de-en +export DATA_DIR=data/$PAIR +export SAVE_DIR=data/$PAIR +export BS=16 +mkdir -p $DATA_DIR +mkdir -p $DATA_DIR +sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source +sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target +CUDA_VISIBLE_DEVICES="1" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:8:11:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1 early_stopping=true:false" diff --git a/scripts/fsmt/gen-card-allenai-wmt16.py b/scripts/fsmt/gen-card-allenai-wmt16.py new file mode 100755 index 000000000000..854d2a11fb34 --- /dev/null +++ b/scripts/fsmt/gen-card-allenai-wmt16.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python + +# Usage: +# ./gen-card-allenai-wmt16.py + +import os +from pathlib import Path + +def write_model_card(model_card_dir, src_lang, tgt_lang, model_name): + + texts = { + "en": "Machine learning is great, isn't it?", + "ru": "Машинное обучение - это здорово, не так ли?", + "de": "Maschinelles Lernen ist großartig, nicht wahr?", + } + + # BLUE scores as follows: + # "pair": [fairseq, transformers] + scores = { + "wmt16-en-de-dist-12-1": [28.3, 27.52], + "wmt16-en-de-dist-6-1": [27.4, 27.11], + "wmt16-en-de-12-1": [26.9, 25.75], + } + pair = f"{src_lang}-{tgt_lang}" + + readme = f""" +--- + +language: {src_lang}, {tgt_lang} +thumbnail: +tags: +- translation +- wmt16 +- allenai +license: Apache 2.0 +datasets: +- http://www.statmt.org/wmt16/ ([test-set](http://matrix.statmt.org/test_sets/newstest2016.tgz?1504722372)) + +metrics: +- http://www.statmt.org/wmt16/metrics-task.html +--- + +# FSMT + +## Model description + +This is a ported version of fairseq-based [wmt16 transformer](https://github.com/jungokasai/deep-shallow/) for 
{src_lang}-{tgt_lang}. + +For more details, please, see [Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation](https://arxiv.org/abs/2006.10369). + +All 3 models are available: + +* [wmt16-en-de-dist-12-1](https://huggingface.co/allenai/wmt16-en-de-dist-12-1) +* [wmt16-en-de-dist-6-1](https://huggingface.co/allenai/wmt16-en-de-dist-6-1) +* [wmt16-en-de-12-1](https://huggingface.co/allenai/wmt16-en-de-12-1) + +``` +@misc{{kasai2020deep, + title={{Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation}}, + author={{Jungo Kasai and Nikolaos Pappas and Hao Peng and James Cross and Noah A. Smith}}, + year={{2020}}, + eprint={{2006.10369}}, + archivePrefix={{arXiv}}, + primaryClass={{cs.CL}} +}} +``` + +## Intended uses & limitations + +#### How to use + +```python +from transformers.tokenization_fsmt import FSMTTokenizer +from transformers.modeling_fsmt import FSMTForConditionalGeneration +mname = "allenai/{model_name}" +tokenizer = FSMTTokenizer.from_pretrained(mname) +model = FSMTForConditionalGeneration.from_pretrained(mname) + +input = "{texts[src_lang]}" +input_ids = tokenizer.encode(input, return_tensors="pt") +outputs = model.generate(input_ids) +decoded = tokenizer.decode(outputs[0], skip_special_tokens=True) +print(decoded) # {texts[tgt_lang]} + +``` + +#### Limitations and bias + + +## Training data + +Pretrained weights were left identical to the original model released by allenai. For more details, please, see the [paper](https://arxiv.org/abs/2006.10369). + +## Eval results + +Here are the BLEU scores: + +model | fairseq | transformers +-------|---------|---------- +{model_name} | {scores[model_name][0]} | {scores[model_name][1]} + +The score is slightly below the score reported in the paper, as the researchers don't use `sacrebleu` and measure the score on tokenized outputs. `transformers` score was measured using `sacrebleu` on detokenized outputs. 
+ +The score was calculated using this code: + +```bash +git clone https://github.com/huggingface/transformers +cd transformers +export PAIR={pair} +export DATA_DIR=data/$PAIR +export SAVE_DIR=data/$PAIR +export BS=8 +export NUM_BEAMS=5 +mkdir -p $DATA_DIR +sacrebleu -t wmt16 -l $PAIR --echo src > $DATA_DIR/val.source +sacrebleu -t wmt16 -l $PAIR --echo ref > $DATA_DIR/val.target +echo $PAIR +PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py allenai/{model_name} $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS +``` + +""" + model_card_dir.mkdir(parents=True, exist_ok=True) + path = os.path.join(model_card_dir, "README.md") + print(f"Generating {path}") + with open(path, "w", encoding="utf-8") as f: + f.write(readme) + +# make sure we are under the root of the project +repo_dir = Path(__file__).resolve().parent.parent.parent +model_cards_dir = repo_dir / "model_cards" + +for model_name in ["wmt16-en-de-dist-12-1", "wmt16-en-de-dist-6-1", "wmt16-en-de-12-1"]: + model_card_dir = model_cards_dir / "allenai" / model_name + write_model_card(model_card_dir, src_lang="en", tgt_lang="de", model_name=model_name) diff --git a/scripts/fsmt/gen-card-allenai-wmt19.py b/scripts/fsmt/gen-card-allenai-wmt19.py new file mode 100755 index 000000000000..a4be917db135 --- /dev/null +++ b/scripts/fsmt/gen-card-allenai-wmt19.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python + +# Usage: +# ./gen-card-allenai-wmt19.py + +import os +from pathlib import Path + +def write_model_card(model_card_dir, src_lang, tgt_lang, model_name): + + texts = { + "en": "Machine learning is great, isn't it?", + "ru": "Машинное обучение - это здорово, не так ли?", + "de": "Maschinelles Lernen ist großartig, nicht wahr?", + } + + # BLUE scores as follows: + # "pair": [fairseq, transformers] + scores = { + "wmt19-de-en-6-6-base": [0, 38.37], + "wmt19-de-en-6-6-big": [0, 
39.90], + } + pair = f"{src_lang}-{tgt_lang}" + + readme = f""" +--- + +language: {src_lang}, {tgt_lang} +thumbnail: +tags: +- translation +- wmt19 +- allenai +license: Apache 2.0 +datasets: +- http://www.statmt.org/wmt19/ ([test-set](http://matrix.statmt.org/test_sets/newstest2019.tgz?1556572561)) +metrics: +- http://www.statmt.org/wmt19/metrics-task.html +--- + +# FSMT + +## Model description + +This is a ported version of fairseq-based wmt19 transformer created by [jungokasai](https://github.com/jungokasai/) @ allenai for {src_lang}-{tgt_lang}. + +2 models are available: + +* [wmt19-de-en-6-6-big](https://huggingface.co/allenai/wmt19-de-en-6-6-big) +* [wmt19-de-en-6-6-base](https://huggingface.co/allenai/wmt19-de-en-6-6-base) + +## Intended uses & limitations + +#### How to use + +```python +from transformers.tokenization_fsmt import FSMTTokenizer +from transformers.modeling_fsmt import FSMTForConditionalGeneration +mname = "allenai/{model_name}" +tokenizer = FSMTTokenizer.from_pretrained(mname) +model = FSMTForConditionalGeneration.from_pretrained(mname) + +input = "{texts[src_lang]}" +input_ids = tokenizer.encode(input, return_tensors="pt") +outputs = model.generate(input_ids) +decoded = tokenizer.decode(outputs[0], skip_special_tokens=True) +print(decoded) # {texts[tgt_lang]} + +``` + +#### Limitations and bias + + +## Training data + +Pretrained weights were left identical to the original model released by the researcher. 
+ +## Eval results + +Here are the BLEU scores: + +model | transformers +-------|--------- +{model_name} | {scores[model_name][1]} + +The score was calculated using this code: + +```bash +git clone https://github.com/huggingface/transformers +cd transformers +export PAIR={pair} +export DATA_DIR=data/$PAIR +export SAVE_DIR=data/$PAIR +export BS=8 +export NUM_BEAMS=5 +mkdir -p $DATA_DIR +sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source +sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target +echo $PAIR +PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py allenai/{model_name} $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS +``` + +""" + model_card_dir.mkdir(parents=True, exist_ok=True) + path = os.path.join(model_card_dir, "README.md") + print(f"Generating {path}") + with open(path, "w", encoding="utf-8") as f: + f.write(readme) + +# make sure we are under the root of the project +repo_dir = Path(__file__).resolve().parent.parent.parent +model_cards_dir = repo_dir / "model_cards" + +for model_name in ["wmt19-de-en-6-6-base", "wmt19-de-en-6-6-big"]: + model_card_dir = model_cards_dir / "allenai" / model_name + write_model_card(model_card_dir, src_lang="de", tgt_lang="en", model_name=model_name) diff --git a/scripts/fsmt/gen-card-facebook-wmt19.py b/scripts/fsmt/gen-card-facebook-wmt19.py new file mode 100755 index 000000000000..ceef013de33e --- /dev/null +++ b/scripts/fsmt/gen-card-facebook-wmt19.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python + +# Usage: +# ./gen-card-facebook-wmt19.py + +import os +from pathlib import Path + +def write_model_card(model_card_dir, src_lang, tgt_lang): + + texts = { + "en": "Machine learning is great, isn't it?", + "ru": "Машинное обучение - это здорово, не так ли?", + "de": "Maschinelles Lernen ist großartig, oder?", + } + + # BLEU scores as follows: + # "pair": 
[fairseq, transformers] + scores = { + "ru-en": ["[41.3](http://matrix.statmt.org/matrix/output/1907?run_id=6937)", "39.20"], + "en-ru": ["[36.4](http://matrix.statmt.org/matrix/output/1914?run_id=6724)", "33.47"], + "en-de": ["[43.1](http://matrix.statmt.org/matrix/output/1909?run_id=6862)", "42.83"], + "de-en": ["[42.3](http://matrix.statmt.org/matrix/output/1902?run_id=6750)", "41.35"], + } + pair = f"{src_lang}-{tgt_lang}" + + readme = f""" +--- + + + +language: {src_lang}, {tgt_lang} +thumbnail: +tags: +- translation +- wmt19 +license: Apache 2.0 +datasets: +- http://www.statmt.org/wmt19/ ([test-set](http://matrix.statmt.org/test_sets/newstest2019.tgz?1556572561)) +metrics: +- http://www.statmt.org/wmt19/metrics-task.html +--- + +# FSMT + +## Model description + +This is a ported version of [fairseq wmt19 transformer](https://github.com/pytorch/fairseq/blob/master/examples/wmt19/README.md) for {src_lang}-{tgt_lang}. + +For more details, please see, [Facebook FAIR's WMT19 News Translation Task Submission](https://arxiv.org/abs/1907.06616). 
+ +The abbreviation FSMT stands for FairSeqMachineTranslation + +All four models are available: + +* [wmt19-en-ru](https://huggingface.co/facebook/wmt19-en-ru) +* [wmt19-ru-en](https://huggingface.co/facebook/wmt19-ru-en) +* [wmt19-en-de](https://huggingface.co/facebook/wmt19-en-de) +* [wmt19-de-en](https://huggingface.co/facebook/wmt19-de-en) + +## Intended uses & limitations + +#### How to use + +```python +from transformers.tokenization_fsmt import FSMTTokenizer +from transformers.modeling_fsmt import FSMTForConditionalGeneration +mname = "facebook/wmt19-{src_lang}-{tgt_lang}" +tokenizer = FSMTTokenizer.from_pretrained(mname) +model = FSMTForConditionalGeneration.from_pretrained(mname) + +input = "{texts[src_lang]}" +input_ids = tokenizer.encode(input, return_tensors="pt") +outputs = model.generate(input_ids) +decoded = tokenizer.decode(outputs[0], skip_special_tokens=True) +print(decoded) # {texts[tgt_lang]} + +``` + +#### Limitations and bias + +- The original (and this ported model) doesn't seem to handle well inputs with repeated sub-phrases, [content gets truncated](https://discuss.huggingface.co/t/issues-with-translating-inputs-containing-repeated-phrases/981) + +## Training data + +Pretrained weights were left identical to the original model released by fairseq. For more details, please, see the [paper](https://arxiv.org/abs/1907.06616). + +## Eval results + +pair | fairseq | transformers +-------|---------|---------- +{pair} | {scores[pair][0]} | {scores[pair][1]} + +The score is slightly below the score reported by `fairseq`, since `transformers` currently doesn't support: +- model ensemble, therefore the best performing checkpoint was ported (`model4.pt`). 
+- re-ranking + +The score was calculated using this code: + +```bash +git clone https://github.com/huggingface/transformers +cd transformers +export PAIR={pair} +export DATA_DIR=data/$PAIR +export SAVE_DIR=data/$PAIR +export BS=8 +export NUM_BEAMS=15 +mkdir -p $DATA_DIR +sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source +sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target +echo $PAIR +PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS +``` +note: fairseq reports using a beam of 50, so you should get a slightly higher score if re-run with `--num_beams 50`. + + +## TODO + +- port model ensemble (fairseq uses 4 model checkpoints) + +""" + os.makedirs(model_card_dir, exist_ok=True) + path = os.path.join(model_card_dir, "README.md") + print(f"Generating {path}") + with open(path, "w", encoding="utf-8") as f: + f.write(readme) + +# make sure we are under the root of the project +repo_dir = Path(__file__).resolve().parent.parent.parent +model_cards_dir = repo_dir / "model_cards" + +for model_name in ["wmt19-ru-en", "wmt19-en-ru", "wmt19-en-de", "wmt19-de-en"]: + base, src_lang, tgt_lang = model_name.split("-") + model_card_dir = model_cards_dir / "facebook" / model_name + write_model_card(model_card_dir, src_lang=src_lang, tgt_lang=tgt_lang)