diff --git a/taskcluster/config.yml b/taskcluster/config.yml index 268b34df8..e7c09b5a6 100644 --- a/taskcluster/config.yml +++ b/taskcluster/config.yml @@ -60,78 +60,105 @@ valid-stages: workers: aliases: - # Use for quick tasks that don't require GPUs, eg: linting, tests - b-cpu: + b-linux-large-gcp-d2g: provisioner: '{trust-domain}-{level}' implementation: docker-worker os: linux - worker-type: 'b-linux-large-gcp-d2g' - # Use for tasks that don't require GPUs, but need lots of disk space - # eg: dataset cleaning & merging - b-cpu-largedisk: + worker-type: '{alias}' + b-linux-large-gcp-d2g-300gb: provisioner: '{trust-domain}-{level}' implementation: docker-worker os: linux - worker-type: 'b-linux-large-gcp-d2g-300gb' - # Use for tasks that don't require GPUs, but need immense amounts of disk space - # eg: alignments - b-cpu-xlargedisk: + worker-type: '{alias}' + b-linux-large-gcp-d2g-1tb: provisioner: '{trust-domain}-{level}' implementation: docker-worker os: linux - worker-type: 'b-linux-large-gcp-d2g-1tb' - # Use for tasks that don't require GPUs, but need immense amounts of disk space - # and higher reliability - b-cpu-xlargedisk-standard: + worker-type: '{alias}' + b-linux-large-gcp-d2g-1tb-standard: provisioner: '{trust-domain}-{level}' implementation: docker-worker os: linux - worker-type: 'b-linux-large-gcp-d2g-1tb-standard' - # Use for quick tasks that need a GPU, eg: evaluate - b-gpu: + worker-type: '{alias}' + b-linux-v100-gpu: provisioner: '{trust-domain}-{level}' implementation: generic-worker os: linux - worker-type: 'b-linux-v100-gpu' - # Use for tasks that need lots of GPU power, but not lots of disk space - # eg: translation & scoring - b-largegpu: + worker-type: '{alias}' + b-linux-v100-gpu-4: provisioner: '{trust-domain}-{level}' implementation: generic-worker os: linux - worker-type: 'b-linux-v100-gpu-4' - # Use for tasks that needs lots of GPU power and increased disk space - # eg: bicleaner - b-largegpu-largedisk: + worker-type: '{alias}' + 
b-linux-v100-gpu-4-300gb: provisioner: '{trust-domain}-{level}' implementation: generic-worker os: linux - worker-type: 'b-linux-v100-gpu-4-300gb' - # Use for tasks that need lots of GPU power and immensive amounts of disk space - # eg: training - b-largegpu-xlargedisk: + worker-type: '{alias}' + b-linux-v100-gpu-4-300gb-standard: provisioner: '{trust-domain}-{level}' implementation: generic-worker os: linux - worker-type: 'b-linux-v100-gpu-4-1tb' - # Use for tasks that needs lots of GPU power, increased disk space, and higher reliability - b-largegpu-largedisk-standard: + worker-type: '{alias}' + b-linux-v100-gpu-4-1tb: provisioner: '{trust-domain}-{level}' implementation: generic-worker os: linux - worker-type: 'b-linux-v100-gpu-4-300gb-standard' - # Use for tasks that needs lots of GPU power, increased disk space, and higher reliability - b-largegpu-xlargedisk-standard: + worker-type: '{alias}' + b-linux-v100-gpu-4-1tb-standard: provisioner: '{trust-domain}-{level}' implementation: generic-worker os: linux - worker-type: 'b-linux-v100-gpu-4-1tb-standard' + worker-type: '{alias}' images: provisioner: '{trust-domain}-{level}' implementation: docker-worker os: linux worker-type: '{alias}-gcp' +# Ideally these would be in `workers.aliases` above, but those aliases are +# resolved by Taskgraph, which is unaware of the `worker-class` lookups +# we need to do below. 
+local-worker-aliases: + # Use for quick tasks that don't require GPUs, eg: linting, tests + b-cpu: + by-worker-class: + gcp-standard: 'b-linux-large-gcp-d2g' + default: 'b-linux-large-gcp-d2g' + b-cpu-largedisk: + by-worker-class: + gcp-standard: 'b-linux-large-gcp-d2g-300gb' + default: 'b-linux-large-gcp-d2g-300gb' + # Use for tasks that don't require GPUs, but need immense amounts of disk space + # eg: alignments + b-cpu-xlargedisk: + by-worker-class: + gcp-standard: 'b-linux-large-gcp-d2g-1tb-standard' + default: 'b-linux-large-gcp-d2g-1tb' + # Use for quick tasks that need a GPU, eg: evaluate + b-gpu: + by-worker-class: + gcp-standard: 'b-linux-v100-gpu' + default: 'b-linux-v100-gpu' + # Use for tasks that need lots of GPU power, but not lots of disk space + # eg: translation & scoring + b-largegpu: + by-worker-class: + gcp-standard: 'b-linux-v100-gpu-4' + default: 'b-linux-v100-gpu-4' + # Use for tasks that need lots of GPU power and increased disk space + # eg: bicleaner + b-largegpu-largedisk: + by-worker-class: + gcp-standard: 'b-linux-v100-gpu-4-300gb-standard' + default: 'b-linux-v100-gpu-4-300gb' + # Use for tasks that need lots of GPU power and immense amounts of disk space + # eg: training + b-largegpu-xlargedisk: + by-worker-class: + gcp-standard: 'b-linux-v100-gpu-4-1tb-standard' + default: 'b-linux-v100-gpu-4-1tb' + # Keys are worker type, and align with the `worker-type` entries in the # `worker.aliases` above. 
worker-configuration: diff --git a/taskcluster/configs/config.ci.yml b/taskcluster/configs/config.ci.yml index 263d56a9a..aa0f7f2f0 100644 --- a/taskcluster/configs/config.ci.yml +++ b/taskcluster/configs/config.ci.yml @@ -78,3 +78,5 @@ datasets: target-stage: all taskcluster: split-chunks: 2 + worker-classes: + default: gcp-spot diff --git a/taskcluster/configs/config.prod.yml b/taskcluster/configs/config.prod.yml index 6206352ad..7d0726293 100644 --- a/taskcluster/configs/config.prod.yml +++ b/taskcluster/configs/config.prod.yml @@ -218,3 +218,16 @@ taskcluster: # then split into an even number of chunks. # Adjust depending on the amount of data to translate split-chunks: 20 + # Worker classes by `kind`, and a default for `kinds` not specified. + # Available options are in `taskcluster/translations_taskgraph/actions/train.py`. + # By default we like to use `gcp-spot`, which is the cheapest option. To use + # standard (non-spot) instances for all training tasks you would configure + # as follows: + # worker-classes: + # finetune-student: gcp-standard + # train-backwards: gcp-standard + # train-teacher: gcp-standard + # train-student: gcp-standard + # default: gcp-spot + worker-classes: + default: gcp-spot diff --git a/taskcluster/kinds/alignments-backtranslated/kind.yml b/taskcluster/kinds/alignments-backtranslated/kind.yml index 93bc53a2e..17f2d63c2 100644 --- a/taskcluster/kinds/alignments-backtranslated/kind.yml +++ b/taskcluster/kinds/alignments-backtranslated/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/alignments-original/kind.yml b/taskcluster/kinds/alignments-original/kind.yml index f5f25d711..dce46f719 100644 --- a/taskcluster/kinds/alignments-original/kind.yml +++ b/taskcluster/kinds/alignments-original/kind.yml @@ -6,6 
+6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/alignments-student/kind.yml b/taskcluster/kinds/alignments-student/kind.yml index e57cf883f..74de65f26 100644 --- a/taskcluster/kinds/alignments-student/kind.yml +++ b/taskcluster/kinds/alignments-student/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/analyze-corpus/kind.yml b/taskcluster/kinds/analyze-corpus/kind.yml index df10f3f31..35205abbc 100644 --- a/taskcluster/kinds/analyze-corpus/kind.yml +++ b/taskcluster/kinds/analyze-corpus/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - translations_taskgraph.transforms.from_datasets:per_dataset - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/analyze-mono/kind.yml b/taskcluster/kinds/analyze-mono/kind.yml index 1fe69aeea..7b1dd32ff 100644 --- a/taskcluster/kinds/analyze-mono/kind.yml +++ b/taskcluster/kinds/analyze-mono/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - translations_taskgraph.transforms.from_datasets:mono - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/bicleaner-model/kind.yml b/taskcluster/kinds/bicleaner-model/kind.yml index 3b5b06d20..f633bb64a 100644 --- a/taskcluster/kinds/bicleaner-model/kind.yml +++ b/taskcluster/kinds/bicleaner-model/kind.yml @@ -6,6 +6,7 @@ 
loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/bicleaner/kind.yml b/taskcluster/kinds/bicleaner/kind.yml index bc91db92b..6a5620253 100644 --- a/taskcluster/kinds/bicleaner/kind.yml +++ b/taskcluster/kinds/bicleaner/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - translations_taskgraph.transforms.from_datasets:per_dataset - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/cefilter/kind.yml b/taskcluster/kinds/cefilter/kind.yml index 5526c0b46..b68a9cf20 100644 --- a/taskcluster/kinds/cefilter/kind.yml +++ b/taskcluster/kinds/cefilter/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/clean-corpus/kind.yml b/taskcluster/kinds/clean-corpus/kind.yml index 0cf5fa749..6a85ddc0d 100644 --- a/taskcluster/kinds/clean-corpus/kind.yml +++ b/taskcluster/kinds/clean-corpus/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - translations_taskgraph.transforms.from_datasets:per_dataset - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/clean-mono/kind.yml b/taskcluster/kinds/clean-mono/kind.yml index f34977366..426023e1a 100644 --- a/taskcluster/kinds/clean-mono/kind.yml +++ b/taskcluster/kinds/clean-mono/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - 
translations_taskgraph.transforms.from_datasets:mono - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/collect-corpus/kind.yml b/taskcluster/kinds/collect-corpus/kind.yml index ce1ab903d..b90531f20 100644 --- a/taskcluster/kinds/collect-corpus/kind.yml +++ b/taskcluster/kinds/collect-corpus/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.from_deps - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/collect-mono-src/kind.yml b/taskcluster/kinds/collect-mono-src/kind.yml index f3e501061..e76698fec 100644 --- a/taskcluster/kinds/collect-mono-src/kind.yml +++ b/taskcluster/kinds/collect-mono-src/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.from_deps - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/collect-mono-trg/kind.yml b/taskcluster/kinds/collect-mono-trg/kind.yml index 89a34c4df..743521eca 100644 --- a/taskcluster/kinds/collect-mono-trg/kind.yml +++ b/taskcluster/kinds/collect-mono-trg/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.from_deps - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/dataset/kind.yml b/taskcluster/kinds/dataset/kind.yml index 713be8ac0..e46f4e43a 100644 --- a/taskcluster/kinds/dataset/kind.yml +++ b/taskcluster/kinds/dataset/kind.yml @@ -10,6 +10,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.from_datasets:per_dataset + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - 
taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/evaluate-quantized/kind.yml b/taskcluster/kinds/evaluate-quantized/kind.yml index b6aeba6bc..c7b054605 100644 --- a/taskcluster/kinds/evaluate-quantized/kind.yml +++ b/taskcluster/kinds/evaluate-quantized/kind.yml @@ -7,7 +7,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.from_datasets:per_dataset - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml index 8484072f5..5fc744ae2 100644 --- a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml +++ b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml @@ -7,7 +7,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.from_datasets:per_dataset - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.from_deps - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/evaluate/kind.yml b/taskcluster/kinds/evaluate/kind.yml index 9571cd0a8..40b1bd09f 100644 --- a/taskcluster/kinds/evaluate/kind.yml +++ b/taskcluster/kinds/evaluate/kind.yml @@ -7,7 +7,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.from_datasets:per_dataset - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - translations_taskgraph.transforms.cast_to - taskgraph.transforms.chunking diff --git a/taskcluster/kinds/export/kind.yml b/taskcluster/kinds/export/kind.yml index 1046869ef..55af108a0 100644 
--- a/taskcluster/kinds/export/kind.yml +++ b/taskcluster/kinds/export/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/extract-best/kind.yml b/taskcluster/kinds/extract-best/kind.yml index 84f1a18ed..a440b0283 100644 --- a/taskcluster/kinds/extract-best/kind.yml +++ b/taskcluster/kinds/extract-best/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - translations_taskgraph.transforms.cast_to - taskgraph.transforms.chunking diff --git a/taskcluster/kinds/finetune-student/kind.yml b/taskcluster/kinds/finetune-student/kind.yml index 744207d8e..5dfdde2c5 100644 --- a/taskcluster/kinds/finetune-student/kind.yml +++ b/taskcluster/kinds/finetune-student/kind.yml @@ -7,7 +7,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.marian_args:transforms - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/merge-corpus/kind.yml b/taskcluster/kinds/merge-corpus/kind.yml index c289f5334..df234789f 100644 --- a/taskcluster/kinds/merge-corpus/kind.yml +++ b/taskcluster/kinds/merge-corpus/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - translations_taskgraph.transforms.find_upstreams:by_locales - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/merge-devset/kind.yml b/taskcluster/kinds/merge-devset/kind.yml index 
1edcfe40f..e3602f400 100644 --- a/taskcluster/kinds/merge-devset/kind.yml +++ b/taskcluster/kinds/merge-devset/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - translations_taskgraph.transforms.find_upstreams:by_locales - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/merge-mono/kind.yml b/taskcluster/kinds/merge-mono/kind.yml index cc762c88d..bac4e3d20 100644 --- a/taskcluster/kinds/merge-mono/kind.yml +++ b/taskcluster/kinds/merge-mono/kind.yml @@ -5,6 +5,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - translations_taskgraph.transforms.find_upstreams:mono - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/merge-translated/kind.yml b/taskcluster/kinds/merge-translated/kind.yml index 2193b5569..c1c3ad043 100644 --- a/taskcluster/kinds/merge-translated/kind.yml +++ b/taskcluster/kinds/merge-translated/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/quantize/kind.yml b/taskcluster/kinds/quantize/kind.yml index 2d65bbc56..20ab850ae 100644 --- a/taskcluster/kinds/quantize/kind.yml +++ b/taskcluster/kinds/quantize/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/score/kind.yml b/taskcluster/kinds/score/kind.yml index 7f3ca24cc..f7a7b5517 100644 --- a/taskcluster/kinds/score/kind.yml +++ 
b/taskcluster/kinds/score/kind.yml @@ -6,7 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/shortlist/kind.yml b/taskcluster/kinds/shortlist/kind.yml index d94b2f870..84aa8d14c 100644 --- a/taskcluster/kinds/shortlist/kind.yml +++ b/taskcluster/kinds/shortlist/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/split-corpus/kind.yml b/taskcluster/kinds/split-corpus/kind.yml index 67b38fd14..eaecb45f4 100644 --- a/taskcluster/kinds/split-corpus/kind.yml +++ b/taskcluster/kinds/split-corpus/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/split-mono-src/kind.yml b/taskcluster/kinds/split-mono-src/kind.yml index 47096536c..4d3019110 100644 --- a/taskcluster/kinds/split-mono-src/kind.yml +++ b/taskcluster/kinds/split-mono-src/kind.yml @@ -5,6 +5,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/split-mono-trg/kind.yml b/taskcluster/kinds/split-mono-trg/kind.yml index f94f228d1..fd16b8bfd 100644 --- a/taskcluster/kinds/split-mono-trg/kind.yml +++ 
b/taskcluster/kinds/split-mono-trg/kind.yml @@ -5,6 +5,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/tests/kind.yml b/taskcluster/kinds/tests/kind.yml index b28c6dc30..67bf4a93d 100644 --- a/taskcluster/kinds/tests/kind.yml +++ b/taskcluster/kinds/tests/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - taskgraph.transforms.task:transforms diff --git a/taskcluster/kinds/toolchain/kind.yml b/taskcluster/kinds/toolchain/kind.yml index 571e29bc3..62cdb87fc 100644 --- a/taskcluster/kinds/toolchain/kind.yml +++ b/taskcluster/kinds/toolchain/kind.yml @@ -8,6 +8,7 @@ kind-dependencies: - fetch transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.run:transforms - taskgraph.transforms.cached_tasks:transforms - taskgraph.transforms.task:transforms diff --git a/taskcluster/kinds/train-backwards/kind.yml b/taskcluster/kinds/train-backwards/kind.yml index 0c7b4a46e..e5fff3f18 100644 --- a/taskcluster/kinds/train-backwards/kind.yml +++ b/taskcluster/kinds/train-backwards/kind.yml @@ -8,7 +8,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.training_continuation:transforms - translations_taskgraph.transforms.marian_args:transforms - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms @@ -58,7 +58,7 @@ tasks: worker-type: by-tasks-for: github-pull-request: b-largegpu - default: b-largegpu-largedisk-standard + default: 
b-largegpu-largedisk worker: max-run-time: 2592000 env: diff --git a/taskcluster/kinds/train-student/kind.yml b/taskcluster/kinds/train-student/kind.yml index 678f9cc90..e5840d73f 100644 --- a/taskcluster/kinds/train-student/kind.yml +++ b/taskcluster/kinds/train-student/kind.yml @@ -7,7 +7,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.marian_args:transforms - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms @@ -56,7 +56,7 @@ tasks: worker-type: by-tasks-for: github-pull-request: b-largegpu-largedisk - default: b-largegpu-xlargedisk-standard + default: b-largegpu-xlargedisk worker: max-run-time: 2592000 env: diff --git a/taskcluster/kinds/train-teacher/kind.yml b/taskcluster/kinds/train-teacher/kind.yml index a272b3cdb..520ee3cd2 100644 --- a/taskcluster/kinds/train-teacher/kind.yml +++ b/taskcluster/kinds/train-teacher/kind.yml @@ -8,7 +8,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.training_continuation:transforms - translations_taskgraph.transforms.marian_args:transforms - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - translations_taskgraph.transforms.cast_to - taskgraph.transforms.chunking @@ -79,7 +79,7 @@ tasks: worker-type: by-tasks-for: github-pull-request: b-largegpu - default: b-largegpu-xlargedisk-standard + default: b-largegpu-xlargedisk worker: max-run-time: 2592000 env: diff --git a/taskcluster/kinds/train-vocab/kind.yml b/taskcluster/kinds/train-vocab/kind.yml index 0cfe33f34..f89a372d2 100644 --- a/taskcluster/kinds/train-vocab/kind.yml +++ b/taskcluster/kinds/train-vocab/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - 
translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/translate-corpus/kind.yml b/taskcluster/kinds/translate-corpus/kind.yml index dd1ceb720..12387d7eb 100644 --- a/taskcluster/kinds/translate-corpus/kind.yml +++ b/taskcluster/kinds/translate-corpus/kind.yml @@ -6,7 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.marian_args:transforms - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - translations_taskgraph.transforms.cast_to - taskgraph.transforms.chunking diff --git a/taskcluster/kinds/translate-mono-src/kind.yml b/taskcluster/kinds/translate-mono-src/kind.yml index b0dda8c34..ec601b6c9 100644 --- a/taskcluster/kinds/translate-mono-src/kind.yml +++ b/taskcluster/kinds/translate-mono-src/kind.yml @@ -5,7 +5,7 @@ loader: taskgraph.loader.transform:loader transforms: - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - translations_taskgraph.transforms.cast_to - taskgraph.transforms.chunking diff --git a/taskcluster/kinds/translate-mono-trg/kind.yml b/taskcluster/kinds/translate-mono-trg/kind.yml index bab9dc2b6..ef77f32db 100644 --- a/taskcluster/kinds/translate-mono-trg/kind.yml +++ b/taskcluster/kinds/translate-mono-trg/kind.yml @@ -6,7 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.marian_args:transforms - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - translations_taskgraph.transforms.cast_to - taskgraph.transforms.chunking diff --git a/taskcluster/test/params/large-lt-en.yml b/taskcluster/test/params/large-lt-en.yml index 
4429255ba..7927ffa5b 100644 --- a/taskcluster/test/params/large-lt-en.yml +++ b/taskcluster/test/params/large-lt-en.yml @@ -163,4 +163,10 @@ training_config: target-stage: all taskcluster: split-chunks: 10 + worker-classes: + finetune-student: gcp-standard + train-backwards: gcp-standard + train-student: gcp-standard + train-teacher: gcp-standard + default: gcp-spot version: null diff --git a/taskcluster/test/params/small-ru-en.yml b/taskcluster/test/params/small-ru-en.yml index 8467c8fa6..245a7104e 100644 --- a/taskcluster/test/params/small-ru-en.yml +++ b/taskcluster/test/params/small-ru-en.yml @@ -91,4 +91,10 @@ training_config: target-stage: clean-corpus taskcluster: split-chunks: 2 + worker-classes: + finetune-student: gcp-standard + train-backwards: gcp-standard + train-student: gcp-standard + train-teacher: gcp-standard + default: gcp-spot version: null diff --git a/taskcluster/translations_taskgraph/actions/train.py b/taskcluster/translations_taskgraph/actions/train.py index 74e0fa704..650e6fef9 100644 --- a/taskcluster/translations_taskgraph/actions/train.py +++ b/taskcluster/translations_taskgraph/actions/train.py @@ -15,6 +15,13 @@ "https://github.com/mozilla-releng/staging-firefox-translations-training", ) +WORKER_CLASSES = ( + # Regular, on-demand GCP instances + "gcp-standard", + # Spot instances in GCP + "gcp-spot", +) + def can_train(parameters): return parameters["head_repository"] in TRAIN_ON_PROJECTS or ( @@ -313,6 +320,15 @@ def validate_pretrained_models(params): "type": "number", "description": "The number of chunks (parallel jobs) to use in `split` steps", }, + "worker-classes": { + "type": "object", + "description": "The class of workers to use for this training, by kind", + "additionalProperties": { + "type": "string", + # TODO: add snakepit type(s) when they are brought online + "enum": ["gcp-standard", "gcp-spot"], + }, + }, }, }, }, diff --git a/taskcluster/translations_taskgraph/parameters.py 
b/taskcluster/translations_taskgraph/parameters.py index a77e42a22..da7516234 100644 --- a/taskcluster/translations_taskgraph/parameters.py +++ b/taskcluster/translations_taskgraph/parameters.py @@ -3,7 +3,7 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. from taskgraph.parameters import extend_parameters_schema -from voluptuous import Optional, Required +from voluptuous import Extra, Optional, Required # These defaults line up with the `config.ci.yml` pipeline as much as possible. @@ -98,6 +98,9 @@ def get_defaults(_): # Taskcluster-specific configuration "taskcluster": { "split-chunks": 2, + "worker-classes": { + "default": "gcp-spot", + }, }, # Disable Weight & Biases publication on CI "wandb-publication": False, @@ -152,6 +155,10 @@ def get_defaults(_): }, Optional("taskcluster"): { Optional("split-chunks"): int, + Required("worker-classes"): { + Required("default"): str, + Extra: str, + }, }, Optional("wandb-publication"): bool, }, diff --git a/taskcluster/translations_taskgraph/transforms/worker_env.py b/taskcluster/translations_taskgraph/transforms/worker_env.py deleted file mode 100644 index c5f64b702..000000000 --- a/taskcluster/translations_taskgraph/transforms/worker_env.py +++ /dev/null @@ -1,54 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at http://mozilla.org/MPL/2.0/. -# -# This transform sequence injects worker-specific environment variables -# (such as those that dependent on the number and type of GPUs a worker has) -# into task definitions. This avoids the need to discover this information at -# runtime, or adjust in kinds when changing worker types. 
- -from taskgraph.transforms.base import TransformSequence -from taskgraph.util.schema import resolve_keyed_by - -transforms = TransformSequence() - - -@transforms.add -def evaluate_keyed_by(config, jobs): - for job in jobs: - resolve_keyed_by( - job, - "worker-type", - item_name=job["description"], - **{"tasks-for": config.params["tasks_for"]}, - ) - - yield job - - -@transforms.add -def inject_worker_env(config, jobs): - for job in jobs: - # This is called worker-type in jobs, but in reality it's an alias resolved in the graph config... - worker_alias = job["worker-type"] - - worker_definition = config.graph_config["workers"]["aliases"].get(worker_alias) - if not worker_definition: - raise Exception(f"Couldn't find worker definition for {worker_alias} in graph config!") - - worker_type = worker_definition["worker-type"] - worker_config = config.graph_config["worker-configuration"].get(worker_type) - if not worker_config: - raise Exception( - f"Couldn't find worker configuration for {worker_type} in graph config!" - ) - - worker_env = worker_config["env"] - if "GPUS" not in worker_env or "WORKSPACE" not in worker_env: - raise Exception( - "GPUS and/or WORKSPACE values missing from worker env, this is probably misconfiguration." - ) - - job["worker"]["env"].update(worker_env) - - yield job diff --git a/taskcluster/translations_taskgraph/transforms/worker_selection.py b/taskcluster/translations_taskgraph/transforms/worker_selection.py new file mode 100644 index 000000000..44ecd9b4b --- /dev/null +++ b/taskcluster/translations_taskgraph/transforms/worker_selection.py @@ -0,0 +1,71 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# This transform sequence resolves worker aliases to concrete worker types and +# injects worker-specific environment variables (such as those that depend on the +# number and type of GPUs a worker has) into task definitions. 
This avoids the need +# to discover this information at runtime, or adjust in kinds when changing worker types. + +from taskgraph.transforms.base import TransformSequence +from taskgraph.util.schema import evaluate_keyed_by + +transforms = TransformSequence() + + +@transforms.add +def set_worker_type(config, jobs): + """Resolve each task's `worker-type` to a concrete worker type. The kind's value + (optionally keyed by `tasks-for`) names an entry in `local-worker-aliases` in + config.yml, eg: b-cpu; that entry is then keyed by the `worker-class` configured for + this kind in `training_config.taskcluster.worker-classes`, falling back to `default`.""" + + training_config = config.params.get("training_config") + worker_classes = training_config["taskcluster"]["worker-classes"] + worker_class = worker_classes.get(config.kind, worker_classes["default"]) + for job in jobs: + # First, evaluate the `keyed-by` in the initial task specification from + # the kind, if present. This should give us one of the keys from + # `local-worker-aliases` in config.yml. + task_worker_type = evaluate_keyed_by( + job["worker-type"], + job["description"], + {"tasks-for": config.params["tasks_for"]}, + ) + + # Now that we have one of the aliases, we need to resolve it to a + # specific worker type via its `by-worker-class` block, which may give + # different worker types depending on the training config. An alias missing + # from `local-worker-aliases` raises KeyError here. + worker_alias_block = config.graph_config["local-worker-aliases"][task_worker_type].copy() + job["worker-type"] = evaluate_keyed_by( + worker_alias_block, + task_worker_type, + {"worker-class": worker_class}, + ) + + yield job + + +@transforms.add +def inject_worker_env(config, jobs): + for job in jobs: + # By this point `set_worker_type` has replaced the alias with a concrete worker type, which keys `worker-configuration`. 
+ worker_type = job["worker-type"] + worker_config = config.graph_config["worker-configuration"].get(worker_type, {}) + + worker_env = worker_config.get("env", {}) + if "GPUS" not in worker_env or "WORKSPACE" not in worker_env: + # GPU tasks will not function correctly without these set; make this an error + # before they even run. Non-GPU workers are passed through with their env untouched. + if "gpu" in worker_type: + raise Exception( + "GPUS and/or WORKSPACE values missing from worker env, this is probably misconfiguration." + ) + else: + yield job + continue + + job["worker"]["env"].update(worker_env) + + yield job diff --git a/tests/fixtures/config.pytest.yml b/tests/fixtures/config.pytest.yml index 69661b79a..c17296686 100644 --- a/tests/fixtures/config.pytest.yml +++ b/tests/fixtures/config.pytest.yml @@ -57,3 +57,5 @@ marian-args: target-stage: all taskcluster: split-chunks: 10 + worker-classes: + default: gcp-spot