From ea95bc0cb18e164b222269c201099ec5a0da3415 Mon Sep 17 00:00:00 2001 From: Valentin Rigal Date: Wed, 15 May 2024 20:35:54 +0200 Subject: [PATCH] Override W&B data on a resumed training (#595) * Override W&B data on a resumed training * Suggestions --- tracking/translations_parser/publishers.py | 43 +++++++++++++++++----- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/tracking/translations_parser/publishers.py b/tracking/translations_parser/publishers.py index 9057c1eb9..7f6fa16ed 100644 --- a/tracking/translations_parser/publishers.py +++ b/tracking/translations_parser/publishers.py @@ -1,5 +1,6 @@ import csv import logging +import os from abc import ABC from collections import defaultdict from pathlib import Path @@ -99,8 +100,9 @@ def open(self, parser) -> None: config.update(self.extra_kwargs.pop("config", {})) try: - # Check if a W&B run already exists with this name project = next(filter(lambda p: p.name == self.project, wandb.Api().projects()), None) + # Check if a W&B run already exists with this name + existing_runs = [] if project and (name := self.extra_kwargs.get("name")): existing_runs = list( wandb.Api().runs( @@ -108,18 +110,39 @@ def open(self, parser) -> None: filters={"display_name": name, "group": self.extra_kwargs.get("group")}, ) ) - if len(existing_runs) > 0: + if len(existing_runs) == 0: + # Start a new W&B run + self.wandb = wandb.init( + project=self.project, + config=config, + **self.extra_kwargs, + ) + return + elif len(existing_runs) == 1: + run = existing_runs[0] + # Avoid overriding an existing run on a first training, this should not happen + if int(os.environ.get("RUN_ID", 0)) < 1: logger.warning( - f"This run already exists on W&B: {existing_runs}. No data will be published." + f"A W&B run already exists with name '{name}': {run}. No data will be published." ) return - - # Start a W&B run - self.wandb = wandb.init( - project=self.project, - config=config, - **self.extra_kwargs, - ) + # Resume an existing run + logger.info( + f"Training has been resumed from an earlier run wit name '{name}', " + f"continue W&B publication with run {run}." + ) + self.wandb = wandb.init( + project=self.project, + config=config, + id=run.id, + resume="must", + **self.extra_kwargs, + ) + else: + logger.warning( + f"Multiple W&B runs already exist with name '{name}': {existing_runs}. No data will be published." + ) + return except Exception as e: logger.error(f"WandB client could not be initialized: {e}. No data will be published.")