Skip to content

Commit

Permalink
Add the final changes to the spring-2024 training configs (#678)
Browse files Browse the repository at this point in the history
  • Loading branch information
eu9ene authored Oct 17, 2024
1 parent d1d1efc commit db41329
Show file tree
Hide file tree
Showing 36 changed files with 895 additions and 282 deletions.
11 changes: 9 additions & 2 deletions configs/spring-2024/bs-en-spring-2024.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ experiment:
mono-max-sentences-trg: 200_000_000
spm-sample-size: 10_000_000
spm-vocab-size: 32000
teacher-ensemble: 1
teacher-ensemble: 2
teacher-mode: two-stage
pretrained-models: {}
datasets:
Expand Down Expand Up @@ -111,9 +111,16 @@ marian-args:
early-stopping: '20'
training-student-finetuned:
early-stopping: '20'
target-stage: evaluate-teacher
target-stage: all
start-stage: train-teacher
previous_group_ids: ["M40EuFhERqSXuwlECYq9AQ"]
existing_tasks: { "train-teacher-bs-en-1": "Mp4y39ByTSG29ibwj7toaQ" }
wandb-publication: true
taskcluster:
split-chunks: 20
worker-classes:
default: gcp-spot
alignments-original: gcp-standard
alignments-backtranslated: gcp-standard
alignments-student: gcp-standard
shortlist: gcp-standard
18 changes: 15 additions & 3 deletions configs/spring-2024/cs-en-spring-2024.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,14 @@ experiment:
mono-max-sentences-trg: 200_000_000
spm-sample-size: 10_000_000
spm-vocab-size: 32000
teacher-ensemble: 1
teacher-ensemble: 2
teacher-mode: two-stage
pretrained-models: {}
pretrained-models:
train-backwards:
urls:
- "https://firefox-ci-tc.services.mozilla.com/api/queue/v1/task/A_omF_rtRWatl_qbfvhmPg/artifacts/public/build"
mode: use
type: default
datasets:
devtest:
- mtdata_aug-mix_Neulab-tedtalks_dev-1-eng-ces
Expand Down Expand Up @@ -225,9 +230,16 @@ marian-args:
early-stopping: '20'
training-student-finetuned:
early-stopping: '20'
target-stage: evaluate-teacher
target-stage: all
start-stage: evaluate-teacher-ensemble
previous_group_ids: ["ThgMJX-PR4Kao_qkk4Aszw"]
existing_tasks: { "train-vocab-cs-en": "SgJB5LMMRyuQoYxjwueZQA" }
wandb-publication: true
taskcluster:
split-chunks: 20
worker-classes:
default: gcp-spot
alignments-original: gcp-standard
alignments-backtranslated: gcp-standard
alignments-student: gcp-standard
shortlist: gcp-standard
10 changes: 8 additions & 2 deletions configs/spring-2024/da-en-spring-2024.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ experiment:
mono-max-sentences-trg: 200_000_000
spm-sample-size: 10_000_000
spm-vocab-size: 32000
teacher-ensemble: 1
teacher-ensemble: 2
teacher-mode: two-stage
pretrained-models: {}
datasets:
Expand Down Expand Up @@ -231,9 +231,15 @@ marian-args:
early-stopping: '20'
training-student-finetuned:
early-stopping: '20'
target-stage: evaluate-teacher
target-stage: all
start-stage: evaluate-quantized
previous_group_ids: ["crE38R0zQO-SppXcyaSLYw"]
wandb-publication: true
taskcluster:
split-chunks: 20
worker-classes:
default: gcp-spot
alignments-original: gcp-standard
alignments-backtranslated: gcp-standard
alignments-student: gcp-standard
shortlist: gcp-standard
21 changes: 18 additions & 3 deletions configs/spring-2024/el-en-spring-2024.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,14 @@ experiment:
mono-max-sentences-trg: 200_000_000
spm-sample-size: 10_000_000
spm-vocab-size: 32000
teacher-ensemble: 1
teacher-ensemble: 2
teacher-mode: two-stage
pretrained-models: {}
pretrained-models:
train-backwards:
urls:
- "https://firefox-ci-tc.services.mozilla.com/api/queue/v1/task/UMKG4cgoTFmsJMnpQVoczw/artifacts/public/build"
mode: use
type: default
datasets:
devtest:
- mtdata_aug-mix_Neulab-tedtalks_dev-1-eng-ell
Expand Down Expand Up @@ -258,13 +263,23 @@ marian-args:
early-stopping: '5'
training-teacher:
early-stopping: '20'
# teacher-2 diverged, so the learning rate is reduced
learn-rate: '0.0002'
training-student:
early-stopping: '20'
training-student-finetuned:
early-stopping: '20'
target-stage: evaluate-teacher
target-stage: all
start-stage: merge-translated
previous_group_ids: ["EE6ytMIqTOGUpjNr4rUtUA"]
existing_tasks: { "train-vocab-el-en": "W60nDU12TWi7gVMtL30ZXQ", "train-teacher-el-en-2": "SO6jIGfVQpmBbfMmevxjnw" }
wandb-publication: true
taskcluster:
split-chunks: 20
worker-classes:
default: gcp-spot
alignments-original: gcp-standard
alignments-backtranslated: gcp-standard
alignments-student: gcp-standard
shortlist: gcp-standard
alignments-priors2: gcp-standard
4 changes: 4 additions & 0 deletions configs/spring-2024/en-bs-spring-2024.yml
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,7 @@ taskcluster:
split-chunks: 20
worker-classes:
default: gcp-spot
alignments-original: gcp-standard
alignments-backtranslated: gcp-standard
alignments-student: gcp-standard
shortlist: gcp-standard
44 changes: 41 additions & 3 deletions configs/spring-2024/en-cs-spring-2024.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,14 @@ experiment:
mono-max-sentences-trg: 200_000_000
spm-sample-size: 10_000_000
spm-vocab-size: 32000
teacher-ensemble: 1
teacher-ensemble: 2
teacher-mode: two-stage
pretrained-models: {}
pretrained-models:
train-backwards:
urls:
- "https://firefox-ci-tc.services.mozilla.com/api/queue/v1/task/DXImmtKYQQGRPG9U0THoNw/artifacts/public/build"
mode: use
type: default
datasets:
devtest:
- mtdata_aug-mix_Neulab-tedtalks_dev-1-eng-ces
Expand Down Expand Up @@ -227,9 +232,42 @@ marian-args:
early-stopping: '20'
training-student-finetuned:
early-stopping: '20'
target-stage: evaluate-teacher
target-stage: all
start-stage: train-student
previous_group_ids: ["DtSyAeaVRoGNZDnUKscGWw"]
existing_tasks: {
"build-docker-image-base": "BAvLUilqQ3SYqy6Ck55CUQ",
"build-docker-image-test": "f0gbptvMTDaKODjqL9hlOw",
"build-docker-image-toolchain-build": "LlZa8-L9TRemgyzQcAxuHw",
"build-docker-image-train": "fBMJa9R5SKaXd2wgWeD5yQ",
"fetch-browsermt-marian": "BRviRlEMTie8AUFf5prHvg",
"fetch-cuda": "Kc8iWZguSyeGMZKY7OxnTQ",
"fetch-cuda-11": "RjR9dsYTQhe0HQJPHNN4Tg",
"fetch-cyhunspell": "XNYpMzBvSraicoNKyUIwxA",
"fetch-extract-lex": "J2FS7TLLT4m2mjD0IGw91A",
"fetch-fast-align": "Tim8u7s-TAeTYG5VnzmXfA",
"fetch-hunspell": "Wn1pnCSQSpqKeRpCV52FqQ",
"fetch-kenlm": "J4U7RFz2TASaNNTTqoQ8sg",
"fetch-marian": "Sw_bpajdSgWxEDG3uW0-nQ",
"fetch-preprocess": "Scn2N5dLRXKCEU4T1JYE3A",
"toolchain-browsermt-marian": "aP5l3b05S9q3G25Nm85d6w",
"toolchain-cuda-toolkit": "UuUG70nvSj2pHcKt8JFbKw",
"toolchain-cuda-toolkit-11": "YhKI4TKlTFep-FpU7D2L7A",
"toolchain-cyhunspell": "DTvS_tZeSluSlAHkViW3lg",
"toolchain-extract-lex": "Xb7KAXA7TziSrxVQWS0Wmw",
"toolchain-fast-align": "Ia-7gLTQSJeCj_RLs7sg4w",
"toolchain-hunspell": "V84fX3jvQ-Knr4hZT9B8DQ",
"toolchain-kenlm": "X6SgAIzhQlyL7g_nIfE-YQ",
"toolchain-marian": "AoV-W4IzRo22lQBtJWsTxQ",
"toolchain-marian-cpu": "Za5VkFoyS6mauNnmEYxV7g",
"toolchain-preprocess": "ZozJMTdgQD-Bm9sSaG7soA"
}
wandb-publication: true
taskcluster:
split-chunks: 20
worker-classes:
default: gcp-spot
alignments-original: gcp-standard
alignments-backtranslated: gcp-standard
alignments-student: gcp-standard
shortlist: gcp-standard
37 changes: 23 additions & 14 deletions configs/spring-2024/en-da-spring-2024.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,14 @@ experiment:
mono-max-sentences-trg: 200_000_000
spm-sample-size: 10_000_000
spm-vocab-size: 32000
teacher-ensemble: 1
teacher-ensemble: 2
teacher-mode: two-stage
pretrained-models: {}
pretrained-models:
train-backwards:
urls:
- "https://firefox-ci-tc.services.mozilla.com/api/queue/v1/task/Ft4VB6mVRz-iD4v4mQGyNQ/artifacts/public/build"
mode: use
type: default
datasets:
devtest:
- mtdata_aug-mix_Neulab-tedtalks_dev-1-eng-dan
Expand Down Expand Up @@ -159,16 +164,17 @@ datasets:
- opus_ELRC-901-Denmark_Prosecution_/v1 # 1,163 sentences
- opus_ELRC-900-Danish_Working_Envir/v1 # 1,138 sentences
- opus_ELRC-890-Holstebro_Kunstmuseu/v1 # 1,023 sentences
- opus_ELRC-3204-antibiotic/v1 # 801 sentences
- opus_ELRC-894-Gallery_Denmark/v1 # 769 sentences
- opus_ELRC-3295-EUROPARL_covid/v1 # 634 sentences
- opus_ELRC-3066-wikipedia_health/v1 # 523 sentences
- opus_ELRC-wikipedia_health/v1 # 523 sentences
- opus_ELRC_2922/v1 # 522 sentences
- opus_tldr-pages/v2023-08-29 # 495 sentences
- opus_ELRC-2754-vaccination/v1 # 462 sentences
- opus_ELRC-vaccination/v1 # 462 sentences
- opus_ELRC_2923/v1 # 389 sentences
# The datasets below are commented out to work around https://github.com/mozilla/firefox-translations-training/issues/653
# - opus_ELRC-3204-antibiotic/v1 # 801 sentences
# - opus_ELRC-894-Gallery_Denmark/v1 # 769 sentences
# - opus_ELRC-3295-EUROPARL_covid/v1 # 634 sentences
# - opus_ELRC-3066-wikipedia_health/v1 # 523 sentences
# - opus_ELRC-wikipedia_health/v1 # 523 sentences
# - opus_ELRC_2922/v1 # 522 sentences
# - opus_tldr-pages/v2023-08-29 # 495 sentences
# - opus_ELRC-2754-vaccination/v1 # 462 sentences
# - opus_ELRC-vaccination/v1 # 462 sentences
# - opus_ELRC_2923/v1 # 389 sentences
- mtdata_ELRC-danish_higher_education_science_3-1-dan-eng # ~12,611 sentences (1.4 MB)
- mtdata_ELRC-danish_higher_education_science_2-1-dan-eng # ~13,011 sentences (1.5 MB)
- mtdata_ELRC-danish_higher_education_science-1-dan-eng # ~6,646 sentences (751.0 kB)
Expand All @@ -187,7 +193,6 @@ datasets:
- mtdata_Tilde-eesc-2017-dan-eng # ~1,936,973 sentences (218.9 MB)
- mtdata_Tilde-ema-2016-dan-eng # ~215,232 sentences (24.3 MB)
- mtdata_Tilde-rapid-2016-dan-eng # ~451,067 sentences (51.0 MB)

# The monolingual data contains:
# ~195,823,002 sentences
mono-src:
Expand Down Expand Up @@ -231,9 +236,13 @@ marian-args:
early-stopping: '20'
training-student-finetuned:
early-stopping: '20'
target-stage: evaluate-teacher
target-stage: all
wandb-publication: true
taskcluster:
split-chunks: 20
worker-classes:
default: gcp-spot
alignments-original: gcp-standard
alignments-backtranslated: gcp-standard
alignments-student: gcp-standard
shortlist: gcp-standard
Loading

0 comments on commit db41329

Please sign in to comment.