From 067ce65e3c19cce0de950401da802cf4bf07e7a3 Mon Sep 17 00:00:00 2001 From: "Ben Hearsum (he/him)" Date: Thu, 9 May 2024 09:20:57 -0400 Subject: [PATCH] allow runtime selection of worker classes through training config (fixes #300) (#532) * Rename worker_env.py to the more general worker_selection.py to accommodate upcoming runtime worker selection work * allow runtime selection of worker classes through training config This patch adds a new `worker-classes` section of the training config which allows for selecting the "class" of worker to use (currently GCP spot or GCP standard) by kind, with support for a default. This allows us quite flexible configuration, eg: using spot instances for translation and standard ones for training. We will also be able to add one or more classes for Snakepit machines when we bring those online. The default value is `{"default": "gcp-spot"}` - which means that we'll use all spot machines by default. (We can change the default in `config.prod.yml` if desired.) Most of this patch is quite a boring addition of the new `worker_selection` transform to the pipeline kinds. The most notable part otherwise is a fairly big rework of most things to do with workers in `config.yml`: * The taskgraph-required `workers.aliases` is now a very simple, straightforward list of all available worker types. * A new `local-worker-aliases` has been introduced. This maps the generic names like `b-largegpu` to concrete worker types `by-worker-class`. This removes the need for `-standard` variants for the generic names. * This necessitated a new transform function that looks up the concrete worker type in each kind before we hand off to the `task` transforms. (Previously, that transform simply looked up things like `b-largegpu` in the `workers.aliases` block. If we had the ability to feed it `worker-class` information we could have kept all our mappings there - but I couldn't come up with a reasonable way to do this upstream.) 
--- taskcluster/config.yml | 99 ++++++++++++------- taskcluster/configs/config.ci.yml | 2 + taskcluster/configs/config.prod.yml | 13 +++ .../kinds/alignments-backtranslated/kind.yml | 1 + .../kinds/alignments-original/kind.yml | 1 + taskcluster/kinds/alignments-student/kind.yml | 1 + taskcluster/kinds/analyze-corpus/kind.yml | 1 + taskcluster/kinds/analyze-mono/kind.yml | 1 + taskcluster/kinds/bicleaner-model/kind.yml | 1 + taskcluster/kinds/bicleaner/kind.yml | 1 + taskcluster/kinds/cefilter/kind.yml | 1 + taskcluster/kinds/clean-corpus/kind.yml | 1 + taskcluster/kinds/clean-mono/kind.yml | 1 + taskcluster/kinds/collect-corpus/kind.yml | 1 + taskcluster/kinds/collect-mono-src/kind.yml | 1 + taskcluster/kinds/collect-mono-trg/kind.yml | 1 + taskcluster/kinds/dataset/kind.yml | 1 + taskcluster/kinds/evaluate-quantized/kind.yml | 2 +- .../kinds/evaluate-teacher-ensemble/kind.yml | 2 +- taskcluster/kinds/evaluate/kind.yml | 2 +- taskcluster/kinds/export/kind.yml | 1 + taskcluster/kinds/extract-best/kind.yml | 1 + taskcluster/kinds/finetune-student/kind.yml | 2 +- taskcluster/kinds/merge-corpus/kind.yml | 1 + taskcluster/kinds/merge-devset/kind.yml | 1 + taskcluster/kinds/merge-mono/kind.yml | 1 + taskcluster/kinds/merge-translated/kind.yml | 1 + taskcluster/kinds/quantize/kind.yml | 1 + taskcluster/kinds/score/kind.yml | 2 +- taskcluster/kinds/shortlist/kind.yml | 1 + taskcluster/kinds/split-corpus/kind.yml | 1 + taskcluster/kinds/split-mono-src/kind.yml | 1 + taskcluster/kinds/split-mono-trg/kind.yml | 1 + taskcluster/kinds/tests/kind.yml | 1 + taskcluster/kinds/toolchain/kind.yml | 1 + taskcluster/kinds/train-backwards/kind.yml | 4 +- taskcluster/kinds/train-student/kind.yml | 4 +- taskcluster/kinds/train-teacher/kind.yml | 4 +- taskcluster/kinds/train-vocab/kind.yml | 1 + taskcluster/kinds/translate-corpus/kind.yml | 2 +- taskcluster/kinds/translate-mono-src/kind.yml | 2 +- taskcluster/kinds/translate-mono-trg/kind.yml | 2 +- taskcluster/test/params/large-lt-en.yml 
| 6 ++ taskcluster/test/params/small-ru-en.yml | 6 ++ .../translations_taskgraph/actions/train.py | 16 +++ .../translations_taskgraph/parameters.py | 9 +- .../transforms/worker_env.py | 54 ---------- .../transforms/worker_selection.py | 71 +++++++++++++ tests/fixtures/config.pytest.yml | 2 + 49 files changed, 229 insertions(+), 105 deletions(-) delete mode 100644 taskcluster/translations_taskgraph/transforms/worker_env.py create mode 100644 taskcluster/translations_taskgraph/transforms/worker_selection.py diff --git a/taskcluster/config.yml b/taskcluster/config.yml index 268b34df8..e7c09b5a6 100644 --- a/taskcluster/config.yml +++ b/taskcluster/config.yml @@ -60,78 +60,105 @@ valid-stages: workers: aliases: - # Use for quick tasks that don't require GPUs, eg: linting, tests - b-cpu: + b-linux-large-gcp-d2g: provisioner: '{trust-domain}-{level}' implementation: docker-worker os: linux - worker-type: 'b-linux-large-gcp-d2g' - # Use for tasks that don't require GPUs, but need lots of disk space - # eg: dataset cleaning & merging - b-cpu-largedisk: + worker-type: '{alias}' + b-linux-large-gcp-d2g-300gb: provisioner: '{trust-domain}-{level}' implementation: docker-worker os: linux - worker-type: 'b-linux-large-gcp-d2g-300gb' - # Use for tasks that don't require GPUs, but need immense amounts of disk space - # eg: alignments - b-cpu-xlargedisk: + worker-type: '{alias}' + b-linux-large-gcp-d2g-1tb: provisioner: '{trust-domain}-{level}' implementation: docker-worker os: linux - worker-type: 'b-linux-large-gcp-d2g-1tb' - # Use for tasks that don't require GPUs, but need immense amounts of disk space - # and higher reliability - b-cpu-xlargedisk-standard: + worker-type: '{alias}' + b-linux-large-gcp-d2g-1tb-standard: provisioner: '{trust-domain}-{level}' implementation: docker-worker os: linux - worker-type: 'b-linux-large-gcp-d2g-1tb-standard' - # Use for quick tasks that need a GPU, eg: evaluate - b-gpu: + worker-type: '{alias}' + b-linux-v100-gpu: provisioner: 
'{trust-domain}-{level}' implementation: generic-worker os: linux - worker-type: 'b-linux-v100-gpu' - # Use for tasks that need lots of GPU power, but not lots of disk space - # eg: translation & scoring - b-largegpu: + worker-type: '{alias}' + b-linux-v100-gpu-4: provisioner: '{trust-domain}-{level}' implementation: generic-worker os: linux - worker-type: 'b-linux-v100-gpu-4' - # Use for tasks that needs lots of GPU power and increased disk space - # eg: bicleaner - b-largegpu-largedisk: + worker-type: '{alias}' + b-linux-v100-gpu-4-300gb: provisioner: '{trust-domain}-{level}' implementation: generic-worker os: linux - worker-type: 'b-linux-v100-gpu-4-300gb' - # Use for tasks that need lots of GPU power and immensive amounts of disk space - # eg: training - b-largegpu-xlargedisk: + worker-type: '{alias}' + b-linux-v100-gpu-4-300gb-standard: provisioner: '{trust-domain}-{level}' implementation: generic-worker os: linux - worker-type: 'b-linux-v100-gpu-4-1tb' - # Use for tasks that needs lots of GPU power, increased disk space, and higher reliability - b-largegpu-largedisk-standard: + worker-type: '{alias}' + b-linux-v100-gpu-4-1tb: provisioner: '{trust-domain}-{level}' implementation: generic-worker os: linux - worker-type: 'b-linux-v100-gpu-4-300gb-standard' - # Use for tasks that needs lots of GPU power, increased disk space, and higher reliability - b-largegpu-xlargedisk-standard: + worker-type: '{alias}' + b-linux-v100-gpu-4-1tb-standard: provisioner: '{trust-domain}-{level}' implementation: generic-worker os: linux - worker-type: 'b-linux-v100-gpu-4-1tb-standard' + worker-type: '{alias}' images: provisioner: '{trust-domain}-{level}' implementation: docker-worker os: linux worker-type: '{alias}-gcp' +# Ideally these would be in `workers.aliases` above, but those aliases are +# resolved by Taskgraph, which is unaware of the `worker-class` lookups +# we need to do below.
+local-worker-aliases: + # Use for quick tasks that don't require GPUs, eg: linting, tests + b-cpu: + by-worker-class: + gcp-standard: 'b-linux-large-gcp-d2g' + default: 'b-linux-large-gcp-d2g' + b-cpu-largedisk: + by-worker-class: + gcp-standard: 'b-linux-large-gcp-d2g-300gb' + default: 'b-linux-large-gcp-d2g-300gb' + # Use for tasks that don't require GPUs, but need immense amounts of disk space + # eg: alignments + b-cpu-xlargedisk: + by-worker-class: + gcp-standard: 'b-linux-large-gcp-d2g-1tb-standard' + default: 'b-linux-large-gcp-d2g-1tb' + # Use for quick tasks that need a GPU, eg: evaluate + b-gpu: + by-worker-class: + gcp-standard: 'b-linux-v100-gpu' + default: 'b-linux-v100-gpu' + # Use for tasks that need lots of GPU power, but not lots of disk space + # eg: translation & scoring + b-largegpu: + by-worker-class: + gcp-standard: 'b-linux-v100-gpu-4' + default: 'b-linux-v100-gpu-4' + # Use for tasks that need lots of GPU power and increased disk space + # eg: bicleaner + b-largegpu-largedisk: + by-worker-class: + gcp-standard: 'b-linux-v100-gpu-4-300gb-standard' + default: 'b-linux-v100-gpu-4-300gb' + # Use for tasks that need lots of GPU power and immense amounts of disk space + # eg: training + b-largegpu-xlargedisk: + by-worker-class: + gcp-standard: 'b-linux-v100-gpu-4-1tb-standard' + default: 'b-linux-v100-gpu-4-1tb' + # Keys are worker type, and align with the `worker-type` entries in the # `worker.aliases` above.
worker-configuration: diff --git a/taskcluster/configs/config.ci.yml b/taskcluster/configs/config.ci.yml index 263d56a9a..aa0f7f2f0 100644 --- a/taskcluster/configs/config.ci.yml +++ b/taskcluster/configs/config.ci.yml @@ -78,3 +78,5 @@ datasets: target-stage: all taskcluster: split-chunks: 2 + worker-classes: + default: gcp-spot diff --git a/taskcluster/configs/config.prod.yml b/taskcluster/configs/config.prod.yml index 6206352ad..7d0726293 100644 --- a/taskcluster/configs/config.prod.yml +++ b/taskcluster/configs/config.prod.yml @@ -218,3 +218,16 @@ taskcluster: # then split into an even number of chunks. # Adjust depending on the amount of data to translate split-chunks: 20 + # Worker classes by `kind`, and a default for `kinds` not specified. + # Available options are in `taskcluster/translations_taskgraph/actions/train.py`. + # By default we like to use `gcp-spot`, which is the cheapest option. To use + # standard (non-spot) instances for all training tasks you would configure + # as follows: + # worker-classes: + # finetune-student: gcp-standard + # train-backwards: gcp-standard + # train-teacher: gcp-standard + # train-student: gcp-standard + # default: gcp-spot + worker-classes: + default: gcp-spot diff --git a/taskcluster/kinds/alignments-backtranslated/kind.yml b/taskcluster/kinds/alignments-backtranslated/kind.yml index 93bc53a2e..17f2d63c2 100644 --- a/taskcluster/kinds/alignments-backtranslated/kind.yml +++ b/taskcluster/kinds/alignments-backtranslated/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/alignments-original/kind.yml b/taskcluster/kinds/alignments-original/kind.yml index f5f25d711..dce46f719 100644 --- a/taskcluster/kinds/alignments-original/kind.yml +++ b/taskcluster/kinds/alignments-original/kind.yml @@ -6,6
+6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/alignments-student/kind.yml b/taskcluster/kinds/alignments-student/kind.yml index e57cf883f..74de65f26 100644 --- a/taskcluster/kinds/alignments-student/kind.yml +++ b/taskcluster/kinds/alignments-student/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/analyze-corpus/kind.yml b/taskcluster/kinds/analyze-corpus/kind.yml index df10f3f31..35205abbc 100644 --- a/taskcluster/kinds/analyze-corpus/kind.yml +++ b/taskcluster/kinds/analyze-corpus/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - translations_taskgraph.transforms.from_datasets:per_dataset - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/analyze-mono/kind.yml b/taskcluster/kinds/analyze-mono/kind.yml index 1fe69aeea..7b1dd32ff 100644 --- a/taskcluster/kinds/analyze-mono/kind.yml +++ b/taskcluster/kinds/analyze-mono/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - translations_taskgraph.transforms.from_datasets:mono - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/bicleaner-model/kind.yml b/taskcluster/kinds/bicleaner-model/kind.yml index 3b5b06d20..f633bb64a 100644 --- a/taskcluster/kinds/bicleaner-model/kind.yml +++ b/taskcluster/kinds/bicleaner-model/kind.yml @@ -6,6 +6,7 @@ 
loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/bicleaner/kind.yml b/taskcluster/kinds/bicleaner/kind.yml index bc91db92b..6a5620253 100644 --- a/taskcluster/kinds/bicleaner/kind.yml +++ b/taskcluster/kinds/bicleaner/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - translations_taskgraph.transforms.from_datasets:per_dataset - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/cefilter/kind.yml b/taskcluster/kinds/cefilter/kind.yml index 5526c0b46..b68a9cf20 100644 --- a/taskcluster/kinds/cefilter/kind.yml +++ b/taskcluster/kinds/cefilter/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/clean-corpus/kind.yml b/taskcluster/kinds/clean-corpus/kind.yml index 0cf5fa749..6a85ddc0d 100644 --- a/taskcluster/kinds/clean-corpus/kind.yml +++ b/taskcluster/kinds/clean-corpus/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - translations_taskgraph.transforms.from_datasets:per_dataset - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/clean-mono/kind.yml b/taskcluster/kinds/clean-mono/kind.yml index f34977366..426023e1a 100644 --- a/taskcluster/kinds/clean-mono/kind.yml +++ b/taskcluster/kinds/clean-mono/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - 
translations_taskgraph.transforms.from_datasets:mono - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/collect-corpus/kind.yml b/taskcluster/kinds/collect-corpus/kind.yml index ce1ab903d..b90531f20 100644 --- a/taskcluster/kinds/collect-corpus/kind.yml +++ b/taskcluster/kinds/collect-corpus/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.from_deps - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/collect-mono-src/kind.yml b/taskcluster/kinds/collect-mono-src/kind.yml index f3e501061..e76698fec 100644 --- a/taskcluster/kinds/collect-mono-src/kind.yml +++ b/taskcluster/kinds/collect-mono-src/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.from_deps - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/collect-mono-trg/kind.yml b/taskcluster/kinds/collect-mono-trg/kind.yml index 89a34c4df..743521eca 100644 --- a/taskcluster/kinds/collect-mono-trg/kind.yml +++ b/taskcluster/kinds/collect-mono-trg/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.from_deps - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/dataset/kind.yml b/taskcluster/kinds/dataset/kind.yml index 713be8ac0..e46f4e43a 100644 --- a/taskcluster/kinds/dataset/kind.yml +++ b/taskcluster/kinds/dataset/kind.yml @@ -10,6 +10,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.from_datasets:per_dataset + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - 
taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/evaluate-quantized/kind.yml b/taskcluster/kinds/evaluate-quantized/kind.yml index b6aeba6bc..c7b054605 100644 --- a/taskcluster/kinds/evaluate-quantized/kind.yml +++ b/taskcluster/kinds/evaluate-quantized/kind.yml @@ -7,7 +7,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.from_datasets:per_dataset - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml index 8484072f5..5fc744ae2 100644 --- a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml +++ b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml @@ -7,7 +7,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.from_datasets:per_dataset - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.from_deps - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/evaluate/kind.yml b/taskcluster/kinds/evaluate/kind.yml index 9571cd0a8..40b1bd09f 100644 --- a/taskcluster/kinds/evaluate/kind.yml +++ b/taskcluster/kinds/evaluate/kind.yml @@ -7,7 +7,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.from_datasets:per_dataset - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - translations_taskgraph.transforms.cast_to - taskgraph.transforms.chunking diff --git a/taskcluster/kinds/export/kind.yml b/taskcluster/kinds/export/kind.yml index 1046869ef..55af108a0 100644 
--- a/taskcluster/kinds/export/kind.yml +++ b/taskcluster/kinds/export/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/extract-best/kind.yml b/taskcluster/kinds/extract-best/kind.yml index 84f1a18ed..a440b0283 100644 --- a/taskcluster/kinds/extract-best/kind.yml +++ b/taskcluster/kinds/extract-best/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - translations_taskgraph.transforms.cast_to - taskgraph.transforms.chunking diff --git a/taskcluster/kinds/finetune-student/kind.yml b/taskcluster/kinds/finetune-student/kind.yml index 744207d8e..5dfdde2c5 100644 --- a/taskcluster/kinds/finetune-student/kind.yml +++ b/taskcluster/kinds/finetune-student/kind.yml @@ -7,7 +7,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.marian_args:transforms - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/merge-corpus/kind.yml b/taskcluster/kinds/merge-corpus/kind.yml index c289f5334..df234789f 100644 --- a/taskcluster/kinds/merge-corpus/kind.yml +++ b/taskcluster/kinds/merge-corpus/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - translations_taskgraph.transforms.find_upstreams:by_locales - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/merge-devset/kind.yml b/taskcluster/kinds/merge-devset/kind.yml index 
1edcfe40f..e3602f400 100644 --- a/taskcluster/kinds/merge-devset/kind.yml +++ b/taskcluster/kinds/merge-devset/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - translations_taskgraph.transforms.find_upstreams:by_locales - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/merge-mono/kind.yml b/taskcluster/kinds/merge-mono/kind.yml index cc762c88d..bac4e3d20 100644 --- a/taskcluster/kinds/merge-mono/kind.yml +++ b/taskcluster/kinds/merge-mono/kind.yml @@ -5,6 +5,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - translations_taskgraph.transforms.find_upstreams:mono - taskgraph.transforms.run:transforms diff --git a/taskcluster/kinds/merge-translated/kind.yml b/taskcluster/kinds/merge-translated/kind.yml index 2193b5569..c1c3ad043 100644 --- a/taskcluster/kinds/merge-translated/kind.yml +++ b/taskcluster/kinds/merge-translated/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/quantize/kind.yml b/taskcluster/kinds/quantize/kind.yml index 2d65bbc56..20ab850ae 100644 --- a/taskcluster/kinds/quantize/kind.yml +++ b/taskcluster/kinds/quantize/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/score/kind.yml b/taskcluster/kinds/score/kind.yml index 7f3ca24cc..f7a7b5517 100644 --- a/taskcluster/kinds/score/kind.yml +++ 
b/taskcluster/kinds/score/kind.yml @@ -6,7 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/shortlist/kind.yml b/taskcluster/kinds/shortlist/kind.yml index d94b2f870..84aa8d14c 100644 --- a/taskcluster/kinds/shortlist/kind.yml +++ b/taskcluster/kinds/shortlist/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/split-corpus/kind.yml b/taskcluster/kinds/split-corpus/kind.yml index 67b38fd14..eaecb45f4 100644 --- a/taskcluster/kinds/split-corpus/kind.yml +++ b/taskcluster/kinds/split-corpus/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/split-mono-src/kind.yml b/taskcluster/kinds/split-mono-src/kind.yml index 47096536c..4d3019110 100644 --- a/taskcluster/kinds/split-mono-src/kind.yml +++ b/taskcluster/kinds/split-mono-src/kind.yml @@ -5,6 +5,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/split-mono-trg/kind.yml b/taskcluster/kinds/split-mono-trg/kind.yml index f94f228d1..fd16b8bfd 100644 --- a/taskcluster/kinds/split-mono-trg/kind.yml +++ 
b/taskcluster/kinds/split-mono-trg/kind.yml @@ -5,6 +5,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/tests/kind.yml b/taskcluster/kinds/tests/kind.yml index b28c6dc30..67bf4a93d 100644 --- a/taskcluster/kinds/tests/kind.yml +++ b/taskcluster/kinds/tests/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - taskgraph.transforms.task:transforms diff --git a/taskcluster/kinds/toolchain/kind.yml b/taskcluster/kinds/toolchain/kind.yml index 571e29bc3..62cdb87fc 100644 --- a/taskcluster/kinds/toolchain/kind.yml +++ b/taskcluster/kinds/toolchain/kind.yml @@ -8,6 +8,7 @@ kind-dependencies: - fetch transforms: + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.run:transforms - taskgraph.transforms.cached_tasks:transforms - taskgraph.transforms.task:transforms diff --git a/taskcluster/kinds/train-backwards/kind.yml b/taskcluster/kinds/train-backwards/kind.yml index 0c7b4a46e..e5fff3f18 100644 --- a/taskcluster/kinds/train-backwards/kind.yml +++ b/taskcluster/kinds/train-backwards/kind.yml @@ -8,7 +8,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.training_continuation:transforms - translations_taskgraph.transforms.marian_args:transforms - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms @@ -58,7 +58,7 @@ tasks: worker-type: by-tasks-for: github-pull-request: b-largegpu - default: b-largegpu-largedisk-standard + default: 
b-largegpu-largedisk worker: max-run-time: 2592000 env: diff --git a/taskcluster/kinds/train-student/kind.yml b/taskcluster/kinds/train-student/kind.yml index 678f9cc90..e5840d73f 100644 --- a/taskcluster/kinds/train-student/kind.yml +++ b/taskcluster/kinds/train-student/kind.yml @@ -7,7 +7,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.marian_args:transforms - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms @@ -56,7 +56,7 @@ tasks: worker-type: by-tasks-for: github-pull-request: b-largegpu-largedisk - default: b-largegpu-xlargedisk-standard + default: b-largegpu-xlargedisk worker: max-run-time: 2592000 env: diff --git a/taskcluster/kinds/train-teacher/kind.yml b/taskcluster/kinds/train-teacher/kind.yml index a272b3cdb..520ee3cd2 100644 --- a/taskcluster/kinds/train-teacher/kind.yml +++ b/taskcluster/kinds/train-teacher/kind.yml @@ -8,7 +8,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.training_continuation:transforms - translations_taskgraph.transforms.marian_args:transforms - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - translations_taskgraph.transforms.cast_to - taskgraph.transforms.chunking @@ -79,7 +79,7 @@ tasks: worker-type: by-tasks-for: github-pull-request: b-largegpu - default: b-largegpu-xlargedisk-standard + default: b-largegpu-xlargedisk worker: max-run-time: 2592000 env: diff --git a/taskcluster/kinds/train-vocab/kind.yml b/taskcluster/kinds/train-vocab/kind.yml index 0cfe33f34..f89a372d2 100644 --- a/taskcluster/kinds/train-vocab/kind.yml +++ b/taskcluster/kinds/train-vocab/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - 
translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - taskgraph.transforms.run:transforms - translations_taskgraph.transforms.cached_tasks:transforms diff --git a/taskcluster/kinds/translate-corpus/kind.yml b/taskcluster/kinds/translate-corpus/kind.yml index dd1ceb720..12387d7eb 100644 --- a/taskcluster/kinds/translate-corpus/kind.yml +++ b/taskcluster/kinds/translate-corpus/kind.yml @@ -6,7 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.marian_args:transforms - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - translations_taskgraph.transforms.cast_to - taskgraph.transforms.chunking diff --git a/taskcluster/kinds/translate-mono-src/kind.yml b/taskcluster/kinds/translate-mono-src/kind.yml index b0dda8c34..ec601b6c9 100644 --- a/taskcluster/kinds/translate-mono-src/kind.yml +++ b/taskcluster/kinds/translate-mono-src/kind.yml @@ -5,7 +5,7 @@ loader: taskgraph.loader.transform:loader transforms: - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - translations_taskgraph.transforms.cast_to - taskgraph.transforms.chunking diff --git a/taskcluster/kinds/translate-mono-trg/kind.yml b/taskcluster/kinds/translate-mono-trg/kind.yml index bab9dc2b6..ef77f32db 100644 --- a/taskcluster/kinds/translate-mono-trg/kind.yml +++ b/taskcluster/kinds/translate-mono-trg/kind.yml @@ -6,7 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: - translations_taskgraph.transforms.marian_args:transforms - - translations_taskgraph.transforms.worker_env + - translations_taskgraph.transforms.worker_selection - taskgraph.transforms.task_context - translations_taskgraph.transforms.cast_to - taskgraph.transforms.chunking diff --git a/taskcluster/test/params/large-lt-en.yml b/taskcluster/test/params/large-lt-en.yml index 
4429255ba..7927ffa5b 100644 --- a/taskcluster/test/params/large-lt-en.yml +++ b/taskcluster/test/params/large-lt-en.yml @@ -163,4 +163,10 @@ training_config: target-stage: all taskcluster: split-chunks: 10 + worker-classes: + finetune-student: gcp-standard + train-backwards: gcp-standard + train-student: gcp-standard + train-teacher: gcp-standard + default: gcp-spot version: null diff --git a/taskcluster/test/params/small-ru-en.yml b/taskcluster/test/params/small-ru-en.yml index 8467c8fa6..245a7104e 100644 --- a/taskcluster/test/params/small-ru-en.yml +++ b/taskcluster/test/params/small-ru-en.yml @@ -91,4 +91,10 @@ training_config: target-stage: clean-corpus taskcluster: split-chunks: 2 + worker-classes: + finetune-student: gcp-standard + train-backwards: gcp-standard + train-student: gcp-standard + train-teacher: gcp-standard + default: gcp-spot version: null diff --git a/taskcluster/translations_taskgraph/actions/train.py b/taskcluster/translations_taskgraph/actions/train.py index 74e0fa704..650e6fef9 100644 --- a/taskcluster/translations_taskgraph/actions/train.py +++ b/taskcluster/translations_taskgraph/actions/train.py @@ -15,6 +15,13 @@ "https://github.com/mozilla-releng/staging-firefox-translations-training", ) +WORKER_CLASSES = ( + # Regular, on-demand GCP instances + "gcp-standard", + # Spot instances in GCP + "gcp-spot", +) + def can_train(parameters): return parameters["head_repository"] in TRAIN_ON_PROJECTS or ( @@ -313,6 +320,15 @@ def validate_pretrained_models(params): "type": "number", "description": "The number of chunks (parallel jobs) to use in `split` steps", }, + "worker-classes": { + "type": "object", + "description": "The class of workers to use for this training, by kind", + "additionalProperties": { + "type": "string", + # TODO: add snakepit type(s) when they are brought online + "enum": ["gcp-standard", "gcp-spot"], + }, + }, }, }, }, diff --git a/taskcluster/translations_taskgraph/parameters.py 
b/taskcluster/translations_taskgraph/parameters.py index a77e42a22..da7516234 100644 --- a/taskcluster/translations_taskgraph/parameters.py +++ b/taskcluster/translations_taskgraph/parameters.py @@ -3,7 +3,7 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. from taskgraph.parameters import extend_parameters_schema -from voluptuous import Optional, Required +from voluptuous import Extra, Optional, Required # These defaults line up with the `config.ci.yml` pipeline as much as possible. @@ -98,6 +98,9 @@ def get_defaults(_): # Taskcluster-specific configuration "taskcluster": { "split-chunks": 2, + "worker-classes": { + "default": "gcp-spot", + }, }, # Disable Weight & Biases publication on CI "wandb-publication": False, @@ -152,6 +155,10 @@ def get_defaults(_): }, Optional("taskcluster"): { Optional("split-chunks"): int, + Required("worker-classes"): { + Required("default"): str, + Extra: str, + }, }, Optional("wandb-publication"): bool, }, diff --git a/taskcluster/translations_taskgraph/transforms/worker_env.py b/taskcluster/translations_taskgraph/transforms/worker_env.py deleted file mode 100644 index c5f64b702..000000000 --- a/taskcluster/translations_taskgraph/transforms/worker_env.py +++ /dev/null @@ -1,54 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at http://mozilla.org/MPL/2.0/. -# -# This transform sequence injects worker-specific environment variables -# (such as those that dependent on the number and type of GPUs a worker has) -# into task definitions. This avoids the need to discover this information at -# runtime, or adjust in kinds when changing worker types. 
- -from taskgraph.transforms.base import TransformSequence -from taskgraph.util.schema import resolve_keyed_by - -transforms = TransformSequence() - - -@transforms.add -def evaluate_keyed_by(config, jobs): - for job in jobs: - resolve_keyed_by( - job, - "worker-type", - item_name=job["description"], - **{"tasks-for": config.params["tasks_for"]}, - ) - - yield job - - -@transforms.add -def inject_worker_env(config, jobs): - for job in jobs: - # This is called worker-type in jobs, but in reality it's an alias resolved in the graph config... - worker_alias = job["worker-type"] - - worker_definition = config.graph_config["workers"]["aliases"].get(worker_alias) - if not worker_definition: - raise Exception(f"Couldn't find worker definition for {worker_alias} in graph config!") - - worker_type = worker_definition["worker-type"] - worker_config = config.graph_config["worker-configuration"].get(worker_type) - if not worker_config: - raise Exception( - f"Couldn't find worker configuration for {worker_type} in graph config!" - ) - - worker_env = worker_config["env"] - if "GPUS" not in worker_env or "WORKSPACE" not in worker_env: - raise Exception( - "GPUS and/or WORKSPACE values missing from worker env, this is probably misconfiguration." - ) - - job["worker"]["env"].update(worker_env) - - yield job diff --git a/taskcluster/translations_taskgraph/transforms/worker_selection.py b/taskcluster/translations_taskgraph/transforms/worker_selection.py new file mode 100644 index 000000000..44ecd9b4b --- /dev/null +++ b/taskcluster/translations_taskgraph/transforms/worker_selection.py @@ -0,0 +1,71 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# This transform sequence injects worker-specific environment variables +# (such as those that depend on the number and type of GPUs a worker has) +# into task definitions.
This avoids the need to discover this information at +# runtime, or adjust in kinds when changing worker types. + +from taskgraph.transforms.base import TransformSequence +from taskgraph.util.schema import evaluate_keyed_by + +transforms = TransformSequence() + + +@transforms.add +def set_worker_type(config, jobs): + """Determines the general type of worker each task wants, which sometimes + depends on `tasks-for`. Tasks typically will end up specifying one of the + worker `aliases` from config.yml after this is evaluated, eg: b-cpu, + b-largegpu-largedisk.""" + + training_config = config.params.get("training_config") + worker_classes = training_config["taskcluster"]["worker-classes"] + worker_class = worker_classes.get(config.kind, worker_classes["default"]) + for job in jobs: + # First, evaluate the `keyed-by` in the initial task specification from + # the kind, if present. This should give us one of the keys from + # `local-worker-aliases` in config.yml. + task_worker_type = evaluate_keyed_by( + job["worker-type"], + job["description"], + {"tasks-for": config.params["tasks_for"]}, + ) + + # Now that we have one of the aliases, we need to resolve it to a + # specific worker type, as some of those aliases have their own + # `keyed-by` blocks, which may give different worker types depending + # on what's in the training config. + worker_alias_block = config.graph_config["local-worker-aliases"][task_worker_type].copy() + job["worker-type"] = evaluate_keyed_by( + worker_alias_block, + task_worker_type, + {"worker-class": worker_class}, + ) + + yield job + + +@transforms.add +def inject_worker_env(config, jobs): + for job in jobs: + # This is called worker-type in jobs, but in reality it's an alias resolved in the graph config...
+ worker_type = job["worker-type"] + worker_config = config.graph_config["worker-configuration"].get(worker_type, {}) + + worker_env = worker_config.get("env", {}) + if "GPUS" not in worker_env or "WORKSPACE" not in worker_env: + # GPU tasks will not function correctly without these set; make this an error + # before they even run. + if "gpu" in worker_type: + raise Exception( + "GPUS and/or WORKSPACE values missing from worker env, this is probably misconfiguration." + ) + else: + yield job + continue + + job["worker"]["env"].update(worker_env) + + yield job diff --git a/tests/fixtures/config.pytest.yml b/tests/fixtures/config.pytest.yml index 69661b79a..c17296686 100644 --- a/tests/fixtures/config.pytest.yml +++ b/tests/fixtures/config.pytest.yml @@ -57,3 +57,5 @@ marian-args: target-stage: all taskcluster: split-chunks: 10 + worker-classes: + default: gcp-spot