MLFlow issue: Every single run is marked as FINISHED never FAILED #20827

Saya47 · 2025-05-14T07:39:24Z

Saya47
May 14, 2025

Hello good day to everybody.

I track my experiments using MLFLow. My issue is that even if my code has bugs and raises during run, Lightning marks all my runs as FINISHED in MLFLow. Below I asked an LLM to demonstrate this:

import sys

import mlflow
import pytorch_lightning as pl
import torch
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.loggers import MLFlowLogger

# Configuration
EXPERIMENT_NAME = "test_experiment"
TRACKING_URI = "file:./mlruns"  # Local MLflow tracking


# Simple Lightning model
class TestModel(pl.LightningModule):
    def __init__(self, fail=False):
        super().__init__()
        self.layer = torch.nn.Linear(2, 1)
        self.fail = fail  # Toggle to simulate failure

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        if self.fail and batch_idx == 2:  # Simulate failure
            raise RuntimeError("Simulated training error")
        y_hat = self(x)
        loss = torch.nn.functional.mse_loss(y_hat, y)
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.01)

    def train_dataloader(self):
        # Synthetic data: 100 samples, 2 features
        x = torch.randn(100, 2)
        y = torch.sum(x, dim=1, keepdim=True)
        dataset = torch.utils.data.TensorDataset(x, y)
        return torch.utils.data.DataLoader(dataset, batch_size=10)


# Callback to handle training exceptions
class MLflowStatusCallback(Callback):
    def __init__(self, mlflow_run_id):
        super().__init__()
        self.mlflow_run_id = mlflow_run_id

    def on_exception(self, trainer, pl_module, exception):
        with mlflow.start_run(run_id=self.mlflow_run_id, nested=True):
            mlflow.end_run(status="FAILED")
        raise exception


# Main function
def main(fail):
    # Set up MLflow
    mlflow.set_tracking_uri(TRACKING_URI)
    mlflow.set_experiment(EXPERIMENT_NAME)

    # Start MLflow run manually
    with mlflow.start_run() as active_run:
        run_id = active_run.info.run_id

        # Configure MLFlowLogger with manual run ID
        mlf_logger = MLFlowLogger(
            experiment_name=EXPERIMENT_NAME, tracking_uri=TRACKING_URI, run_id=run_id
        )

        # Initialize model and trainer
        model = TestModel(fail=fail)
        trainer = Trainer(
            logger=mlf_logger,
            max_epochs=3,
            callbacks=[MLflowStatusCallback(run_id)],
            enable_progress_bar=True,
        )

        try:
            trainer.fit(model)
            mlflow.end_run(status="FINISHED")
            print("Training completed successfully")
        except KeyboardInterrupt:
            print("Training interrupted by user")
            mlflow.end_run(status="KILLED")
            sys.exit(1)
        except Exception as e:
            print(f"Training failed: {e}")
            mlflow.end_run(status="FAILED")
            raise


if __name__ == "__main__":
    # Run with fail=False for normal completion or Ctrl+C to test interrupt
    # Run with fail=True to test failure
    main(fail=True)  # Change to True to simulate a failure

The run is marked with status 3 (FINISHED) on exceptions which is really bad.
I use the status to filter out bad runs because I use MLFLow to aggregate metrics/losses across epochs/runs and resume checkpoints.

Saya47 · 2025-05-14T20:41:35Z

Saya47
May 14, 2025
Author

Figured out my code had a bug, sorry if I took anyone's time, I've struggled too long with this until I posted this then after 12 hours realized the bug.

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

MLFlow issue: Every single run is marked as FINISHED never FAILED #20827

Uh oh!

{{title}}

Uh oh!

Replies: 1 comment

Uh oh!

{{title}}

Uh oh!

Select a reply

Uh oh!

MLFlow issue: Every single run is marked as FINISHED never FAILED #20827

Uh oh!

Saya47 May 14, 2025

Replies: 1 comment

Uh oh!

Saya47 May 14, 2025 Author

Saya47
May 14, 2025

Saya47
May 14, 2025
Author