Skip to content

Commit

Permalink
Publish evaluation metrics (#598)
Browse files Browse the repository at this point in the history
* Configure evaluation tasks

* Extract w&b code into module

* Do not check Taskcluster when publication is disabled

* Publish evaluation metrics to W&B

* Fix running eval tracking on CI

* Use args.wandb_run_name instead of default teacher

* Remove duplicated arguments

* Retrieve dataset from Taskcluster directly

* Add missing calls to publisher and logging

* Allow publishing metrics as a table on existing runs (i.e. previous trainings)

* Update regex to parse labels ending with '-1'

* Generic support for different train/eval naming

* Update tests

* Support disabled publication

---------

Co-authored-by: Bastien Abadie <[email protected]>
Co-authored-by: Bastien Abadie <[email protected]>
Co-authored-by: Evgeny Pavlov <[email protected]>
  • Loading branch information
4 people authored May 22, 2024
1 parent 6411267 commit 8a1d8ef
Show file tree
Hide file tree
Showing 11 changed files with 315 additions and 162 deletions.
31 changes: 31 additions & 0 deletions pipeline/eval/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,14 @@
from pipeline.common.logging import get_logger

logger = get_logger("eval")
try:
from translations_parser.utils import metric_from_tc_context
from translations_parser.wandb import add_wandb_arguments, get_wandb_publisher

WANDB_AVAILABLE = True
except ImportError as e:
print(f"Failed to import tracking module: {e}")
WANDB_AVAILABLE = False


def run_bash_oneliner(command: str):
Expand Down Expand Up @@ -136,6 +144,11 @@ def main(args_list: Optional[list[str]] = None) -> None:
parser.add_argument(
"--model_variant", type=str, help="The model variant to use, (gpu, cpu, quantized)"
)

# Add Weights & Biases CLI args when the tracking module is available
if WANDB_AVAILABLE:
add_wandb_arguments(parser)

args = parser.parse_args(args_list)

src = args.src
Expand Down Expand Up @@ -329,6 +342,24 @@ def main(args_list: Optional[list[str]] = None) -> None:
with open(metrics_file, "w") as file:
file.write(f"{bleu_details['score']}\n" f"{chrf_details['score']}\n" f"{comet_score}\n")

if WANDB_AVAILABLE:
wandb = get_wandb_publisher( # noqa
project_name=args.wandb_project,
group_name=args.wandb_group,
run_name=args.wandb_run_name,
taskcluster_secret=args.taskcluster_secret,
artifacts=args.wandb_artifacts,
publication=args.wandb_publication,
)
if wandb:
logger.info("Initializing Weight & Biases client")
# Allow publishing metrics as a table on existing runs (i.e. previous trainings)
wandb.open(resume=True)
logger.info(f"Publishing metrics to Weight & Biases ({wandb.extra_kwargs})")
metric = metric_from_tc_context(chrf=chrf_details["score"], bleu=bleu_details["score"])
wandb.handle_metrics(metrics=[metric])
wandb.close()


if __name__ == "__main__":
main()
21 changes: 19 additions & 2 deletions taskcluster/kinds/evaluate-quantized/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,11 @@ tasks:
substitution-fields:
- run.command
- fetches
- worker.env
from-parameters:
src_locale: training_config.experiment.src
trg_locale: training_config.experiment.trg
wandb_publication: training_config.wandb-publication

worker-type: b-gpu
worker:
Expand All @@ -60,6 +62,19 @@ tasks:
# This is a separate environment variable so tests can override it.
BMT_MARIAN: $MOZ_FETCHES_DIR

# Weights & Biases publication trigger
WANDB_PUBLICATION: "{wandb_publication}"

# The Weights & Biases publication token is stored in this secret
TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases

# Taskcluster proxy is required to read secrets
taskcluster-proxy: true

# The task needs to be able to read that secret to publish to Weights & Biases
scopes:
- secrets:get:project/translations/level-1/weights-and-biases

# Don't run unless explicitly scheduled
run-on-tasks-for: []

Expand All @@ -69,9 +84,11 @@ tasks:
- bash
- -c
- >-
pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
export PATH=$PATH:~/.local/bin &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
pip install --upgrade pip &&
pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
pip install $VCS_PATH/tracking &&
zstd --rm -d $MOZ_FETCHES_DIR/lex.s2t.pruned.zst &&
$VCS_PATH/pipeline/eval/eval.py
--src {src_locale}
Expand Down
22 changes: 20 additions & 2 deletions taskcluster/kinds/evaluate-teacher-ensemble/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,13 @@ tasks:
substitution-fields:
- fetches
- run.command
- worker.env
from-parameters:
best_model: training_config.experiment.best-model
src_locale: training_config.experiment.src
trg_locale: training_config.experiment.trg
wandb_publication: training_config.wandb-publication

worker-type: b-gpu
worker:
artifacts:
Expand All @@ -58,6 +61,19 @@ tasks:
# This is a separate environment variable so tests can override it.
MARIAN: $MOZ_FETCHES_DIR

# Weights & Biases publication trigger
WANDB_PUBLICATION: "{wandb_publication}"

# The Weights & Biases publication token is stored in this secret
TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases

# Taskcluster proxy is required to read secrets
taskcluster-proxy: true

# The task needs to be able to read that secret to publish to Weights & Biases
scopes:
- secrets:get:project/translations/level-1/weights-and-biases

# Don't run unless explicitly scheduled
run-on-tasks-for: []

Expand All @@ -73,9 +89,11 @@ tasks:
- bash
- -c
- >-
pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
export PATH=$PATH:~/.local/bin &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
pip install --upgrade pip &&
pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
pip install $VCS_PATH/tracking &&
sed -i -e "s,- .*fetches,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml &&
sed -i -e "s,- .*artifacts,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml &&
$VCS_PATH/pipeline/eval/eval.py
Expand Down
19 changes: 18 additions & 1 deletion taskcluster/kinds/evaluate/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,13 @@ task-defaults:
task-context:
substitution-fields:
- run.command
- worker.env
from-parameters:
best_model: training_config.experiment.best-model
src_locale: training_config.experiment.src
trg_locale: training_config.experiment.trg
split_chunks: training_config.experiment.teacher-ensemble
wandb_publication: training_config.wandb-publication
worker-type: b-gpu
worker:
artifacts:
Expand All @@ -57,6 +59,19 @@ task-defaults:
# This is a separate environment variable so tests can override it.
MARIAN: $MOZ_FETCHES_DIR

# Weights & Biases publication trigger
WANDB_PUBLICATION: "{wandb_publication}"

# The Weights & Biases publication token is stored in this secret
TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases

# Taskcluster proxy is required to read secrets
taskcluster-proxy: true

# The task needs to be able to read that secret to publish to Weights & Biases
scopes:
- secrets:get:project/translations/level-1/weights-and-biases

# Don't run unless explicitly scheduled
run-on-tasks-for: []

Expand All @@ -72,9 +87,11 @@ task-defaults:
- bash
- -c
- >-
pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
export PATH=$PATH:~/.local/bin &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
pip install --upgrade pip &&
pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
pip install $VCS_PATH/tracking &&
sed -i -e "s,- .*fetches,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml &&
sed -i -e "s,- .*artifacts,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml &&
$VCS_PATH/pipeline/eval/eval.py
Expand Down
4 changes: 2 additions & 2 deletions tests/test_tracking_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,9 @@ def test_experiments_marian_1_10(wandb_mock, getargs_mock, caplog, samples_dir,
),
(logging.INFO, "Found 2 quantized metrics from speed folder"),
(logging.INFO, "Found 16 metrics from task logs"),
(logging.INFO, "Creating missing run backward with associated metrics"),
(logging.INFO, "Creating missing run backwards with associated metrics"),
(logging.INFO, "Creating missing run quantized with associated metrics"),
(logging.INFO, "Creating missing run student-finetuned with associated metrics"),
(logging.INFO, "Creating missing run student-finetune with associated metrics"),
(logging.INFO, "Creating missing run teacher-base-0 with associated metrics"),
(logging.INFO, "Creating missing run teacher-base-1 with associated metrics"),
(logging.INFO, "Creating missing run teacher-ensemble with associated metrics"),
Expand Down
8 changes: 6 additions & 2 deletions tests/test_tracking_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
),
(
"eval_student-finetuned_flores_devtest",
("student-finetuned", "flores", "devtest", None),
("student-finetune", "flores", "devtest", None),
),
(
"eval_teacher-base0_flores_devtest",
Expand Down Expand Up @@ -65,7 +65,7 @@
),
(
"evaluate-backward-url-gcp_pytest-dataset_a0017e-en-ru",
("backward", "url", "gcp_pytest-dataset_a0017e", None),
("backwards", "url", "gcp_pytest-dataset_a0017e", None),
),
(
"train-teacher-ast-en-1",
Expand All @@ -76,6 +76,10 @@
"evaluate-student-sacrebleu-wmt19-ast-en",
("student", "sacrebleu", "wmt19", None),
),
(
"evaluate-teacher-flores-devtest-ru-en-1",
("teacher-1", "flores", "devtest", None),
),
],
)
def test_parse_task_label(task_label, parsed_values):
Expand Down
Loading

0 comments on commit 8a1d8ef

Please sign in to comment.