From 215c8f6aba0e6a787c0cd659a8cc3bc3290410ed Mon Sep 17 00:00:00 2001 From: Imahn Shekhzadeh Date: Thu, 16 May 2024 23:45:45 +0200 Subject: [PATCH] add logging of energy consumption of entire training, add package for this to `pyproject.toml` --- lstm_vision/train.py | 20 ++++++++++++++++++-- lstm_vision/utils.py | 10 ++++++++-- pyproject.toml | 1 + 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/lstm_vision/train.py b/lstm_vision/train.py index d06bc6b..3c5732f 100644 --- a/lstm_vision/train.py +++ b/lstm_vision/train.py @@ -11,9 +11,10 @@ from torch.cuda.amp import GradScaler from torch.nn.utils import clip_grad_norm_ from torch.utils.data import DataLoader +from zeus.monitor import ZeusMonitor from utils import ( - end_timer_and_print, + log_training_stats, print__batch_info, save_checkpoint, start_timer, @@ -77,12 +78,22 @@ def train_and_validate( reduction="mean", label_smoothing=label_smoothing ) + # auxiliary variables start_time = start_timer(device=rank) train_losses, val_losses, train_accs, val_accs = [], [], [], [] min_val_loss = float("inf") + # AMP scaler = GradScaler(enabled=use_amp) + # measure energy consumption (rank 0 already measures energy consumption + # of all GPUs) + if rank in [0, torch.device("cuda:0"), torch.device("cuda")]: + monitor = ZeusMonitor( + gpu_indices=[i for i in range(torch.cuda.device_count())] + ) + monitor.begin_window("training") + for epoch in range(num_epochs): t0 = start_timer(device=rank) @@ -172,12 +183,17 @@ def train_and_validate( "%\n" ) + # stop energy consumption measurement + if rank in [0, torch.device("cuda:0"), torch.device("cuda")]: + measurement = monitor.end_window("training") + # number of iterations per device num_iters = len(train_loader) * num_epochs if rank in [0, torch.device("cpu")]: - end_timer_and_print( + log_training_stats( start_time=start_time, + energy_consump=measurement.total_energy, device=rank, local_msg=( f"Training {num_epochs} epochs ({num_iters} iterations)" diff --git a/lstm_vision/utils.py b/lstm_vision/utils.py index fa15913..ff80d8b 100644 --- a/lstm_vision/utils.py +++ b/lstm_vision/utils.py @@ -380,8 +380,11 @@ def start_timer(device: torch.device | int) -> float: return perf_counter() -def end_timer_and_print( - start_time: float, device: torch.device | int, local_msg: str = "" +def log_training_stats( + start_time: float, + energy_consump: float, + device: torch.device | int, + local_msg: str = "", ) -> float: """ End the timer and print the time it took to execute the code as well as the @@ -389,6 +392,7 @@ def end_timer_and_print( Args: start_time: Time at which the training started. + energy_consump: Energy consumption of the entire training in Joules. device: Device on which the code was executed. Can also be an int representing the GPU ID. local_msg: Local message to print. @@ -412,6 +416,8 @@ def end_timer_and_print( msg = f"{local_msg}\n\tTotal execution time = {time_diff:.3f} [sec]" if device.type == "cuda": msg += ( + f"\n\tEnergy consumption of entire training = " + f"{energy_consump / 1e3:.3f} [kJ]" f"\n\tMax memory used by tensors = " f"{torch.cuda.max_memory_allocated(device=device) / 1024**2:.3f} " "[MB]" diff --git a/pyproject.toml b/pyproject.toml index c53ff9d..6bdabd1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ dependencies = [ "python-dotenv~=1.0", "matplotlib~=3.2", "pre-commit", + "zeus-ml~=0.9" ] [tool.isort]