Publish evaluation metrics (#598)

* Configure evaluation tasks * Extract w&b code into module * Do not check taskcluster when publication is disabled * Publish evaluation metrics to W&B * Fix running eval tracking on CI * Use args.wandb_run_name instead of default teacher * Remove duplicated arguments * Retrieve dataset from Taskcluster directly * Add missing calls to publisher and logging * Allow publishing metrics as a table on existing runs (i.e. previous trainings) * Update regex to parse labels ending with '-1' * Generic support for train/eval different naming * Update tests * Support disabled publication --------- Co-authored-by: Bastien Abadie <[email protected]> Co-authored-by: Bastien Abadie <[email protected]> Co-authored-by: Evgeny Pavlov <[email protected]>
mozilla · May 22, 2024 · 8a1d8ef · 8a1d8ef
1 parent 6411267
commit 8a1d8ef
Show file tree

Hide file tree

Showing 11 changed files with 315 additions and 162 deletions.
diff --git a/pipeline/eval/eval.py b/pipeline/eval/eval.py
@@ -54,6 +54,14 @@
 from pipeline.common.logging import get_logger
 
 logger = get_logger("eval")
+try:
+    from translations_parser.utils import metric_from_tc_context
+    from translations_parser.wandb import add_wandb_arguments, get_wandb_publisher
+
+    WANDB_AVAILABLE = True
+except ImportError as e:
+    print(f"Failed to import tracking module: {e}")
+    WANDB_AVAILABLE = False
 
 
 def run_bash_oneliner(command: str):
@@ -136,6 +144,11 @@ def main(args_list: Optional[list[str]] = None) -> None:
     parser.add_argument(
         "--model_variant", type=str, help="The model variant to use, (gpu, cpu, quantized)"
     )
+
+    # Add Weight & Biases CLI args when module is loaded
+    if WANDB_AVAILABLE:
+        add_wandb_arguments(parser)
+
     args = parser.parse_args(args_list)
 
     src = args.src
@@ -329,6 +342,24 @@ def main(args_list: Optional[list[str]] = None) -> None:
     with open(metrics_file, "w") as file:
         file.write(f"{bleu_details['score']}\n" f"{chrf_details['score']}\n" f"{comet_score}\n")
 
+    if WANDB_AVAILABLE:
+        wandb = get_wandb_publisher(  # noqa
+            project_name=args.wandb_project,
+            group_name=args.wandb_group,
+            run_name=args.wandb_run_name,
+            taskcluster_secret=args.taskcluster_secret,
+            artifacts=args.wandb_artifacts,
+            publication=args.wandb_publication,
+        )
+        if wandb:
+            logger.info("Initializing Weight & Biases client")
+            # Allow publishing metrics as a table on existing runs (i.e. previous trainings)
+            wandb.open(resume=True)
+            logger.info(f"Publishing metrics to Weight & Biases ({wandb.extra_kwargs})")
+            metric = metric_from_tc_context(chrf=chrf_details["score"], bleu=bleu_details["score"])
+            wandb.handle_metrics(metrics=[metric])
+            wandb.close()
+
 
 if __name__ == "__main__":
     main()
diff --git a/taskcluster/kinds/evaluate-quantized/kind.yml b/taskcluster/kinds/evaluate-quantized/kind.yml
@@ -45,9 +45,11 @@ tasks:
             substitution-fields:
                 - run.command
                 - fetches
+                - worker.env
             from-parameters:
                 src_locale: training_config.experiment.src
                 trg_locale: training_config.experiment.trg
+                wandb_publication: training_config.wandb-publication
 
         worker-type: b-gpu
         worker:
@@ -60,6 +62,19 @@ tasks:
                 # This is a separate environment variable so tests can override it.
                 BMT_MARIAN: $MOZ_FETCHES_DIR
 
+                # Weight & Biases trigger
+                WANDB_PUBLICATION: "{wandb_publication}"
+
+                # Weight & Biases publication token is stored in that secret
+                TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases
+
+            # Taskcluster proxy is required to read secrets
+            taskcluster-proxy: true
+
+        # The task needs to be able to read that secret to publish on Weight & Biases
+        scopes:
+          - secrets:get:project/translations/level-1/weights-and-biases
+
         # Don't run unless explicitly scheduled
         run-on-tasks-for: []
 
@@ -69,9 +84,11 @@ tasks:
                 - bash
                 - -c
                 - >-
-                    pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
-                    export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
                     export PATH=$PATH:~/.local/bin &&
+                    export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
+                    pip install --upgrade pip &&
+                    pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
+                    pip install $VCS_PATH/tracking &&
                     zstd --rm -d $MOZ_FETCHES_DIR/lex.s2t.pruned.zst &&
                     $VCS_PATH/pipeline/eval/eval.py
                     --src               {src_locale}

diff --git a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml
@@ -43,10 +43,13 @@ tasks:
             substitution-fields:
                 - fetches
                 - run.command
+                - worker.env
             from-parameters:
                 best_model: training_config.experiment.best-model
                 src_locale: training_config.experiment.src
                 trg_locale: training_config.experiment.trg
+                wandb_publication: training_config.wandb-publication
+
         worker-type: b-gpu
         worker:
             artifacts:
@@ -58,6 +61,19 @@ tasks:
                 # This is a separate environment variable so tests can override it.
                 MARIAN: $MOZ_FETCHES_DIR
 
+                # Weight & Biases trigger
+                WANDB_PUBLICATION: "{wandb_publication}"
+
+                # Weight & Biases publication token is stored in that secret
+                TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases
+
+            # Taskcluster proxy is required to read secrets
+            taskcluster-proxy: true
+
+        # The task needs to be able to read that secret to publish on Weight & Biases
+        scopes:
+          - secrets:get:project/translations/level-1/weights-and-biases
+
         # Don't run unless explicitly scheduled
         run-on-tasks-for: []
 
@@ -73,9 +89,11 @@ tasks:
                 - bash
                 - -c
                 - >-
-                    pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
-                    export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
                     export PATH=$PATH:~/.local/bin &&
+                    export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
+                    pip install --upgrade pip &&
+                    pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
+                    pip install $VCS_PATH/tracking &&
                     sed -i -e "s,- .*fetches,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml &&
                     sed -i -e "s,- .*artifacts,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml &&
                     $VCS_PATH/pipeline/eval/eval.py

diff --git a/taskcluster/kinds/evaluate/kind.yml b/taskcluster/kinds/evaluate/kind.yml
@@ -41,11 +41,13 @@ task-defaults:
     task-context:
         substitution-fields:
             - run.command
+            - worker.env
         from-parameters:
             best_model: training_config.experiment.best-model
             src_locale: training_config.experiment.src
             trg_locale: training_config.experiment.trg
             split_chunks: training_config.experiment.teacher-ensemble
+            wandb_publication: training_config.wandb-publication
     worker-type: b-gpu
     worker:
         artifacts:
@@ -57,6 +59,19 @@ task-defaults:
             # This is a separate environment variable so tests can override it.
             MARIAN: $MOZ_FETCHES_DIR
 
+            # Weight & Biases trigger
+            WANDB_PUBLICATION: "{wandb_publication}"
+
+            # Weight & Biases publication token is stored in that secret
+            TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases
+
+        # Taskcluster proxy is required to read secrets
+        taskcluster-proxy: true
+
+    # The task needs to be able to read that secret to publish on Weight & Biases
+    scopes:
+      - secrets:get:project/translations/level-1/weights-and-biases
+
     # Don't run unless explicitly scheduled
     run-on-tasks-for: []
 
@@ -72,9 +87,11 @@ task-defaults:
             - bash
             - -c
             - >-
-                pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
                 export PATH=$PATH:~/.local/bin &&
                 export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
+                pip install --upgrade pip &&
+                pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
+                pip install $VCS_PATH/tracking &&
                 sed -i -e "s,- .*fetches,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml &&
                 sed -i -e "s,- .*artifacts,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml &&
                 $VCS_PATH/pipeline/eval/eval.py

diff --git a/tests/test_tracking_cli.py b/tests/test_tracking_cli.py
@@ -123,9 +123,9 @@ def test_experiments_marian_1_10(wandb_mock, getargs_mock, caplog, samples_dir,
             ),
             (logging.INFO, "Found 2 quantized metrics from speed folder"),
             (logging.INFO, "Found 16 metrics from task logs"),
-            (logging.INFO, "Creating missing run backward with associated metrics"),
+            (logging.INFO, "Creating missing run backwards with associated metrics"),
             (logging.INFO, "Creating missing run quantized with associated metrics"),
-            (logging.INFO, "Creating missing run student-finetuned with associated metrics"),
+            (logging.INFO, "Creating missing run student-finetune with associated metrics"),
             (logging.INFO, "Creating missing run teacher-base-0 with associated metrics"),
             (logging.INFO, "Creating missing run teacher-base-1 with associated metrics"),
             (logging.INFO, "Creating missing run teacher-ensemble with associated metrics"),

diff --git a/tests/test_tracking_utils.py b/tests/test_tracking_utils.py
@@ -33,7 +33,7 @@
         ),
         (
             "eval_student-finetuned_flores_devtest",
-            ("student-finetuned", "flores", "devtest", None),
+            ("student-finetune", "flores", "devtest", None),
         ),
         (
             "eval_teacher-base0_flores_devtest",
@@ -65,7 +65,7 @@
         ),
         (
             "evaluate-backward-url-gcp_pytest-dataset_a0017e-en-ru",
-            ("backward", "url", "gcp_pytest-dataset_a0017e", None),
+            ("backwards", "url", "gcp_pytest-dataset_a0017e", None),
         ),
         (
             "train-teacher-ast-en-1",
@@ -76,6 +76,10 @@
             "evaluate-student-sacrebleu-wmt19-ast-en",
             ("student", "sacrebleu", "wmt19", None),
         ),
+        (
+            "evaluate-teacher-flores-devtest-ru-en-1",
+            ("teacher-1", "flores", "devtest", None),
+        ),
     ],
 )
 def test_parse_task_label(task_label, parsed_values):