diff --git a/taskcluster/ci/evaluate-teacher-ensemble/kind.yml b/taskcluster/ci/evaluate-teacher-ensemble/kind.yml index cc48c0515..19321a3dc 100644 --- a/taskcluster/ci/evaluate-teacher-ensemble/kind.yml +++ b/taskcluster/ci/evaluate-teacher-ensemble/kind.yml @@ -16,7 +16,6 @@ transforms: kind-dependencies: - dataset - train-teacher - - train-vocab - alignments - toolchain @@ -110,13 +109,11 @@ tasks: unique-kinds: false kinds: - train-teacher - - train-vocab fetches: train-teacher: - artifact: final.model.npz.best-{best_model}.npz dest: model{this_chunk} - artifact: final.model.npz.best-chrf.npz.decoder.yml - train-vocab: - artifact: vocab.spm extract: false diff --git a/taskcluster/ci/evaluate/kind.yml b/taskcluster/ci/evaluate/kind.yml index b05c7ad9f..2f383267d 100644 --- a/taskcluster/ci/evaluate/kind.yml +++ b/taskcluster/ci/evaluate/kind.yml @@ -18,7 +18,6 @@ kind-dependencies: - dataset - train-backwards - train-teacher - - train-vocab - train-student - finetune-student - alignments @@ -112,7 +111,6 @@ tasks: dependencies: dataset: dataset-{provider}-{dataset_sanitized}-{src_locale}-{trg_locale} train-backwards: train-backwards-{src_locale}-{trg_locale} - train-vocab: train-vocab-{src_locale}-{trg_locale} fetches: dataset: - artifact: "{dataset_sanitized}.{src_locale}.zst" @@ -124,7 +122,6 @@ tasks: extract: false - artifact: final.model.npz.best-{best_model}.npz.decoder.yml extract: false - train-vocab: - artifact: vocab.spm extract: false toolchain: @@ -162,7 +159,6 @@ tasks: dependencies: dataset: dataset-{provider}-{dataset_sanitized}-{src_locale}-{trg_locale} train-teacher: train-teacher-{src_locale}-{trg_locale}-{this_chunk}/{total_chunks} - train-vocab: train-vocab-{src_locale}-{trg_locale} fetches: dataset: - artifact: "{dataset_sanitized}.{src_locale}.zst" @@ -174,7 +170,6 @@ tasks: extract: false - artifact: final.model.npz.best-{best_model}.npz.decoder.yml extract: false - train-vocab: - artifact: vocab.spm extract: false toolchain: @@ -201,7 
+196,6 @@ tasks: dependencies: dataset: dataset-{provider}-{dataset_sanitized}-{src_locale}-{trg_locale} train-student: train-student-{src_locale}-{trg_locale} - train-vocab: train-vocab-{src_locale}-{trg_locale} fetches: dataset: - artifact: "{dataset_sanitized}.{src_locale}.zst" @@ -213,7 +207,6 @@ tasks: extract: false - artifact: final.model.npz.best-{best_model}.npz.decoder.yml extract: false - train-vocab: - artifact: vocab.spm extract: false toolchain: @@ -240,7 +233,6 @@ tasks: dependencies: dataset: dataset-{provider}-{dataset_sanitized}-{src_locale}-{trg_locale} finetune-student: finetune-student-{src_locale}-{trg_locale} - train-vocab: train-vocab-{src_locale}-{trg_locale} fetches: dataset: - artifact: "{dataset_sanitized}.{src_locale}.zst" @@ -252,7 +244,6 @@ tasks: extract: false - artifact: final.model.npz.best-{best_model}.npz.decoder.yml extract: false - train-vocab: - artifact: vocab.spm extract: false toolchain: diff --git a/taskcluster/ci/finetune-student/kind.yml b/taskcluster/ci/finetune-student/kind.yml index 85a74e820..a4ea43ef1 100644 --- a/taskcluster/ci/finetune-student/kind.yml +++ b/taskcluster/ci/finetune-student/kind.yml @@ -84,7 +84,7 @@ tasks: pip3 install -r $VCS_PATH/pipeline/train/requirements/train.txt && export PATH="$HOME/.local/bin:$PATH" && export MARIAN=$MOZ_FETCHES_DIR && - $VCS_PATH/pipeline/train/train.sh + $VCS_PATH/taskcluster/scripts/pipeline/train-taskcluster.sh student finetune {src_locale} @@ -92,9 +92,10 @@ tasks: $MOZ_FETCHES_DIR/corpus $MOZ_FETCHES_DIR/devset $TASK_WORKDIR/artifacts - $MOZ_FETCHES_DIR/vocab.spm {best_model} $MOZ_FETCHES_DIR/corpus.aln.zst + None + None --pretrained-model $MOZ_FETCHES_DIR/final.model.npz.best-{best_model}.npz {marian_args} diff --git a/taskcluster/ci/score/kind.yml b/taskcluster/ci/score/kind.yml index c80ea5aff..e2602e638 100644 --- a/taskcluster/ci/score/kind.yml +++ b/taskcluster/ci/score/kind.yml @@ -13,7 +13,6 @@ transforms: kind-dependencies: - train-backwards - - 
train-vocab - merge-translated - toolchain @@ -83,7 +82,6 @@ tasks: dependencies: train-backwards: train-backwards-{src_locale}-{trg_locale} - train-vocab: train-vocab-{src_locale}-{trg_locale} merge-translated: merge-translated-{src_locale}-{trg_locale} fetches: @@ -92,7 +90,6 @@ tasks: train-backwards: - artifact: final.model.npz.best-{best_model}.npz extract: false - train-vocab: - artifact: vocab.spm extract: false merge-translated: diff --git a/taskcluster/ci/train-backwards/kind.yml b/taskcluster/ci/train-backwards/kind.yml index a9b12847c..1ecb86fd0 100644 --- a/taskcluster/ci/train-backwards/kind.yml +++ b/taskcluster/ci/train-backwards/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.training_continuation:transforms - translations_taskgraph.transforms.marian_args:transforms - taskgraph.transforms.task_context - taskgraph.transforms.job:transforms @@ -29,16 +30,20 @@ tasks: type: train-backwards resources: - pipeline/train/train.sh + - taskcluster/scripts/pipeline/train-taskcluster.sh - pipeline/train/configs/model/backward.yml - pipeline/train/configs/opustrainer/backward.yml - pipeline/train/configs/training/backward.train.yml from-parameters: marian_args: training_config.marian-args.training-backward + pretrained_backward: training_config.experiment.pretrained-models.train-backwards task-context: from-parameters: best_model: training_config.experiment.best-model src_locale: training_config.experiment.src trg_locale: training_config.experiment.trg + pretrained_backward_mode: training_config.experiment.pretrained-models.train-backwards.mode + pretrained_backward_type: training_config.experiment.pretrained-models.train-backwards.type substitution-fields: - description - name @@ -81,7 +86,7 @@ tasks: pip3 install -r $VCS_PATH/pipeline/train/requirements/train.txt && export PATH="$HOME/.local/bin:$PATH" && export MARIAN=$MOZ_FETCHES_DIR && - $VCS_PATH/pipeline/train/train.sh + 
$VCS_PATH/taskcluster/scripts/pipeline/train-taskcluster.sh backward train {trg_locale} @@ -89,9 +94,10 @@ tasks: $MOZ_FETCHES_DIR/corpus $MOZ_FETCHES_DIR/devset $TASK_WORKDIR/artifacts - $MOZ_FETCHES_DIR/vocab.spm {best_model} None + {pretrained_backward_mode} + {pretrained_backward_type} {marian_args} dependencies: diff --git a/taskcluster/ci/train-student/kind.yml b/taskcluster/ci/train-student/kind.yml index 0e24b3933..168cfa88b 100644 --- a/taskcluster/ci/train-student/kind.yml +++ b/taskcluster/ci/train-student/kind.yml @@ -81,7 +81,7 @@ tasks: pip3 install -r $VCS_PATH/pipeline/train/requirements/train.txt && export PATH="$HOME/.local/bin:$PATH" && export MARIAN=$MOZ_FETCHES_DIR && - $VCS_PATH/pipeline/train/train.sh + $VCS_PATH/taskcluster/scripts/pipeline/train-taskcluster.sh student train {src_locale} @@ -89,9 +89,10 @@ tasks: $MOZ_FETCHES_DIR/corpus $MOZ_FETCHES_DIR/devset $TASK_WORKDIR/artifacts - $MOZ_FETCHES_DIR/vocab.spm {best_model} $MOZ_FETCHES_DIR/corpus.aln.zst + None + None {marian_args} dependencies: diff --git a/taskcluster/ci/train-teacher/kind.yml b/taskcluster/ci/train-teacher/kind.yml index 7d5dda039..a403e99ae 100644 --- a/taskcluster/ci/train-teacher/kind.yml +++ b/taskcluster/ci/train-teacher/kind.yml @@ -6,6 +6,7 @@ loader: taskgraph.loader.transform:loader transforms: + - translations_taskgraph.transforms.training_continuation:transforms - translations_taskgraph.transforms.marian_args:transforms - taskgraph.transforms.task_context - translations_taskgraph.transforms.cast_to @@ -41,6 +42,8 @@ tasks: trg_locale: training_config.experiment.trg best_model: training_config.experiment.best-model teacher_ensemble: training_config.experiment.teacher-ensemble + pretrained_teacher_mode: training_config.experiment.pretrained-models.train-teacher.mode + pretrained_teacher_type: training_config.experiment.pretrained-models.train-teacher.type substitution-fields: - description - name @@ -61,9 +64,11 @@ tasks: - 
pipeline/train/configs/opustrainer/teacher.yml - pipeline/train/configs/training/teacher.train.yml - pipeline/train/train.sh + - taskcluster/scripts/pipeline/train-taskcluster.sh from-parameters: marian_args: training_config.marian-args.training-teacher teacher-ensemble: training_config.experiment.teacher-ensemble + pretrained_teacher: training_config.experiment.pretrained-models.train-teacher worker-type: b-linux-v100-gpu-4-1tb expires-after: "90 days" worker: @@ -99,7 +104,7 @@ tasks: pip3 install -r $VCS_PATH/pipeline/train/requirements/train.txt && export PATH="$HOME/.local/bin:$PATH" && export MARIAN=$MOZ_FETCHES_DIR && - $VCS_PATH/pipeline/train/train.sh + $VCS_PATH/taskcluster/scripts/pipeline/train-taskcluster.sh teacher train {src_locale} @@ -107,9 +112,10 @@ tasks: $MOZ_FETCHES_DIR/corpus,$MOZ_FETCHES_DIR/mono $MOZ_FETCHES_DIR/devset $TASK_WORKDIR/artifacts - $MOZ_FETCHES_DIR/vocab.spm {best_model} None + {pretrained_teacher_mode} + {pretrained_teacher_type} {marian_args} dependencies: diff --git a/taskcluster/ci/translate-corpus/kind.yml b/taskcluster/ci/translate-corpus/kind.yml index 61409576d..95bdf3ed7 100644 --- a/taskcluster/ci/translate-corpus/kind.yml +++ b/taskcluster/ci/translate-corpus/kind.yml @@ -17,7 +17,6 @@ transforms: kind-dependencies: - split-corpus - train-teacher - - train-vocab - toolchain tasks: @@ -59,17 +58,15 @@ tasks: kinds: - train-teacher - split-corpus - - train-vocab fetches: split-corpus: - artifact: src-file.{this_chunk}.zst extract: true - train-vocab: - - artifact: vocab.spm - extract: false train-teacher: - artifact: final.model.npz.best-{best_model}.npz dest: model{this_chunk} + - artifact: vocab.spm + extract: false task-context: from-parameters: diff --git a/taskcluster/ci/translate-mono-src/kind.yml b/taskcluster/ci/translate-mono-src/kind.yml index d4d1ef2fc..d96381b12 100644 --- a/taskcluster/ci/translate-mono-src/kind.yml +++ b/taskcluster/ci/translate-mono-src/kind.yml @@ -16,7 +16,6 @@ transforms: 
kind-dependencies: - split-mono-src - train-teacher - - train-vocab - toolchain task-defaults: @@ -114,14 +113,12 @@ tasks: kinds: - train-teacher - split-mono-src - - train-vocab fetches: split-mono-src: - artifact: out-file.{this_chunk}.zst extract: true - train-vocab: - - artifact: vocab.spm - extract: false train-teacher: - artifact: final.model.npz.best-{best_model}.npz dest: model{this_chunk} + - artifact: vocab.spm + extract: false diff --git a/taskcluster/ci/translate-mono-trg/kind.yml b/taskcluster/ci/translate-mono-trg/kind.yml index 9ec38a531..d32440037 100644 --- a/taskcluster/ci/translate-mono-trg/kind.yml +++ b/taskcluster/ci/translate-mono-trg/kind.yml @@ -16,7 +16,6 @@ transforms: kind-dependencies: - split-mono-trg - train-backwards - - train-vocab - toolchain task-defaults: @@ -84,13 +83,13 @@ task-defaults: {marian_args} dependencies: - train-vocab: train-vocab-{src_locale}-{trg_locale} + train-backwards: train-backwards-{src_locale}-{trg_locale} fetches: toolchain: - marian - cuda-toolkit - train-vocab: + train-backwards: - artifact: vocab.spm extract: false diff --git a/taskcluster/scripts/pipeline/train-taskcluster.sh b/taskcluster/scripts/pipeline/train-taskcluster.sh new file mode 100755 index 000000000..e4e303d1e --- /dev/null +++ b/taskcluster/scripts/pipeline/train-taskcluster.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +set -x +set -euo pipefail + +[[ -v MOZ_FETCHES_DIR ]] || { echo "MOZ_FETCHES_DIR is not set"; exit 1; } + +pushd `dirname $0`/../../.. 
&>/dev/null +VCS_ROOT=$(pwd) +popd &>/dev/null + +if [ "$#" -lt 11 ]; then + echo "Usage: $0 <model_type> <training_type> <src> <trg> <train_set_prefix> <valid_set_prefix> <model_dir> <best_model_metric> <alignments> <pretrained_model_mode> <pretrained_model_type> [extra_params...]" + exit 1 +fi + +model_type=$1 +training_type=$2 +src=$3 +trg=$4 +train_set_prefix=$5 +valid_set_prefix=$6 +model_dir=$7 +best_model_metric=$8 +alignments=$9 +pretrained_model_mode=${10} +pretrained_model_type=${11} +extra_params=( "${@:12}" ) + +if [ "$pretrained_model_mode" == "None" ]; then + vocab="$MOZ_FETCHES_DIR/vocab.spm" +else + vocab="$TASK_WORKDIR/artifacts/vocab.spm" +fi + +export MARIAN=$MOZ_FETCHES_DIR + +case "$pretrained_model_mode" in + "use") + echo "The training mode is 'use', using existing model without further training." + exit 0 + ;; + "continue"|"init"|"None") + if [ "$pretrained_model_mode" == "init" ]; then + extra_params+=("--pretrained-model" "$TASK_WORKDIR/artifacts/final.model.npz.best-$best_model_metric.npz" "--no-restore-corpus") + fi + $VCS_ROOT/pipeline/train/train.sh \ + "$model_type" \ + "$training_type" \ + "$src" \ + "$trg" \ + "$train_set_prefix" \ + "$valid_set_prefix" \ + "$model_dir" \ + "$vocab" \ + "$best_model_metric" \ + "$alignments" \ + "${extra_params[@]}" + if [ "$pretrained_model_mode" == "None" ]; then + cp "$vocab" "$model_dir" + fi + ;; +esac diff --git a/taskcluster/translations_taskgraph/actions/train.py b/taskcluster/translations_taskgraph/actions/train.py index abf82a29a..05e87daa1 100644 --- a/taskcluster/translations_taskgraph/actions/train.py +++ b/taskcluster/translations_taskgraph/actions/train.py @@ -21,6 +21,27 @@ def can_train(parameters): defaults = get_defaults("")["training_config"] +def validate_pretrained_models(params): + pretrained_models = params["training_config"]["experiment"].get("pretrained-models", {}) + train_teacher = pretrained_models.get("train-teacher") + if train_teacher: + teacher_ensemble = params["training_config"]["experiment"]["teacher-ensemble"] + if len(train_teacher["urls"]) != teacher_ensemble: + raise Exception( + f"The experiment's 'teacher-ensemble' 
({teacher_ensemble}) " + f"does not match the number of provided model 'urls' ({len(train_teacher['urls'])}) " + f"for the pretrained 'train-teacher' ensemble." + ) + train_backwards = pretrained_models.get("train-backwards") + if train_backwards: + if len(train_backwards["urls"]) != 1: + raise Exception( + f"The experiment's 'pretrained-models.backward.urls' ({len(train_backwards['urls'])}) " + f"must be equal to one (1). " + f"The pipeline's backward model is _not_ an ensemble." + ) + + @register_callback_action( name="train", title="Train", @@ -96,14 +117,6 @@ def can_train(parameters): "type": "number", "description": "Number of teachers to train", }, - "backward-model": { - "type": "string", - "description": "???", - }, - "vocab": { - "type": "string", - "description": "???", - }, "mono-max-sentences-src": { "type": "number", "description": "limits per downloaded src dataset", @@ -149,6 +162,53 @@ def can_train(parameters): "default-threshold", ], }, + # We are using urls because pretrained-models should be flexible enough + # to point at model (ensembles) that are not in taskcluster. + # Models could be in a long-term storage bucket, or we may use + # pretrained models hosted elsewhere. 
+ "pretrained-models": { + "type": "object", + "properties": { + "train-teacher": { + "type": "object", + "properties": { + "urls": { + "type": "array", + "items": {"type": "string", "format": "uri"}, + "minItems": 1, + }, + "mode": { + "type": "string", + "enum": ["continue", "init", "use"], + }, + "type": { + "type": "string", + "enum": ["default", "opusmt"], + }, + }, + "required": ["urls", "mode", "type"], + }, + "train-backwards": { + "type": "object", + "properties": { + "urls": { + "type": "array", + "items": {"type": "string", "format": "uri"}, + "minItems": 1, + }, + "mode": { + "type": "string", + "enum": ["continue", "init", "use"], + }, + "type": { + "type": "string", + "enum": ["default", "opusmt"], + }, + }, + "required": ["urls", "mode", "type"], + }, + }, + }, }, "required": [ "name", @@ -291,5 +351,7 @@ def train_action(parameters, graph_config, input, task_group_id, task_id): parameters["tasks_for"] = "action" parameters["training_config"] = input + validate_pretrained_models(parameters) + parameters = Parameters(**parameters) taskgraph_decision({"root": graph_config.root_dir}, parameters=parameters) diff --git a/taskcluster/translations_taskgraph/parameters.py b/taskcluster/translations_taskgraph/parameters.py index aa4ac500f..965a48043 100644 --- a/taskcluster/translations_taskgraph/parameters.py +++ b/taskcluster/translations_taskgraph/parameters.py @@ -20,10 +20,6 @@ def get_defaults(_): "src": "ru", "trg": "en", "teacher-ensemble": 1, - # Used for providing a pretrained backward model. We do not support this yet. - "backward-model": "NOT-YET-SUPPORTED", - # Used for providing a pretrained vocab. We do not support this yet. 
- "vocab": "NOT-YET-SUPPORTED", "mono-max-sentences-trg": 10000, "mono-max-sentences-src": 10000, "split-length": 5000, @@ -124,8 +120,6 @@ def get_defaults(_): Required("src"): str, Required("trg"): str, Required("teacher-ensemble"): int, - Required("backward-model"): str, - Required("vocab"): str, Required("mono-max-sentences-trg"): int, Required("mono-max-sentences-src"): int, Required("split-length"): int, @@ -139,6 +133,18 @@ def get_defaults(_): str: float, }, }, + Optional("pretrained-models"): { + Optional("train-teacher"): { + Required("urls"): [str], + Required("mode"): str, + Required("type"): str, + }, + Optional("train-backwards"): { + Required("urls"): [str], + Required("mode"): str, + Required("type"): str, + }, + }, }, Optional("datasets"): { str: [str], diff --git a/taskcluster/translations_taskgraph/transforms/training_continuation.py b/taskcluster/translations_taskgraph/transforms/training_continuation.py new file mode 100644 index 000000000..588ecefab --- /dev/null +++ b/taskcluster/translations_taskgraph/transforms/training_continuation.py @@ -0,0 +1,78 @@ +from taskgraph.transforms.base import TransformSequence +from urllib.parse import urljoin +import os + +CONTINUE_TRAINING_ARTIFACTS = ( + "devset.out", + "model.npz", + "model.npz.best-bleu-detok.npz", + "model.npz.best-bleu-detok.npz.decoder.yml", + "model.npz.best-ce-mean-words.npz", + "model.npz.best-ce-mean-words.npz.decoder.yml", + "final.model.npz.best-chrf.npz", + "model.npz.best-chrf.npz", + "final.model.npz.best-chrf.npz.decoder.yml", + "model.npz.best-chrf.npz.decoder.yml", + "model.npz.decoder.yml", + "model.npz.optimizer.npz", + "model.npz.progress.yml", + "model.npz.yml", + "train.log", + "valid.log", + "vocab.spm", +) + +INITIALIZE_MODEL_ARTIFACTS = ( + "model.npz.best-bleu-detok.npz", + "model.npz.best-ce-mean-words.npz", + "final.model.npz.best-chrf.npz", + "model.npz.best-chrf.npz", +) + + +def get_artifact_mount(url, directory, artifact_name): + normalized_url = f"{url}/" 
if not url.endswith("/") else url + artifact_url = urljoin(normalized_url, artifact_name) + return { + "content": { + "url": artifact_url, + }, + "file": os.path.join(directory, artifact_name), + } + + +def get_artifact_mounts(urls, directory, artifact_names): + for url in urls: + artifact_mounts = [] + for artifact_name in artifact_names: + artifact_mounts.append(get_artifact_mount(url, directory, artifact_name)) + yield artifact_mounts + + +transforms = TransformSequence() + + +@transforms.add +def add_pretrained_model_mounts(config, jobs): + pretrained_models = config.params["training_config"]["experiment"].get("pretrained-models", {}) + for job in jobs: + pretrained_models_training_artifact_mounts = { + pretrained_model: get_artifact_mounts( + pretrained_models[pretrained_model]["urls"], + "./artifacts", + INITIALIZE_MODEL_ARTIFACTS + if pretrained_models[pretrained_model]["mode"] == "init" + else CONTINUE_TRAINING_ARTIFACTS, + ) + for pretrained_model in pretrained_models + } + pretrained_model_training_artifact_mounts = next( + pretrained_models_training_artifact_mounts.get(config.kind, iter((None,))) + ) + if pretrained_model_training_artifact_mounts: + mounts = job["worker"].get("mounts", []) + mounts.extend(pretrained_model_training_artifact_mounts) + job["worker"]["mounts"] = mounts + job["dependencies"].pop("train-vocab") + job["fetches"].pop("train-vocab") + yield job