Skip to content

Commit

Permalink
Merge pull request #555 from allenai/shanea/wandb-cancel-failure-bypass
Browse files Browse the repository at this point in the history
Catch and ignore CommError during W&B cancel check
  • Loading branch information
2015aroras authored Apr 23, 2024
2 parents 7be71cd + 40b6853 commit ccf7bf0
Showing 1 changed file with 3 additions and 2 deletions.
5 changes: 3 additions & 2 deletions olmo/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -914,6 +914,7 @@ def check_if_cancelled(self) -> Tuple[bool, int]:
# Finally, check if someone canceled the run from W&B by adding the 'cancel' / 'canceled' tag.
# We won't see it in the run object, so we have to use the import/export API to check.
from requests.exceptions import RequestException
from wandb.errors import CommError

try:
api = wandb.Api(api_key=api_key)
Expand All @@ -924,8 +925,8 @@ def check_if_cancelled(self) -> Tuple[bool, int]:
cancel_reason = "Weights & Biases tag"
extra_steps = self.cfg.extra_steps_after_cancel
break
except RequestException:
pass
except (RequestException, CommError):
log.info("Failed to check if W&B run is cancelled, continuing run.")

run_canceled = synchronize_flag(should_cancel, self.device)
if run_canceled:
Expand Down

0 comments on commit ccf7bf0

Please sign in to comment.