diff --git a/megatron/training.py b/megatron/training.py index 5adef3c11..a9debff30 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -838,7 +838,10 @@ def build_train_valid_test_data_iterators( assert args.train_samples is None, \ 'only backward compatiblity support for iteration-based training' args.consumed_train_samples = args.iteration * args.global_batch_size - if args.iteration > 0 and args.consumed_valid_samples == 0: + # Training may have run without reaching the first evaluation, in which + # case args.consumed_valid_samples == 0 is a legitimate value. + # TODO: eval_interval could have changed between runs, so this might still be wrong + if args.iteration // args.eval_interval > 0 and args.consumed_valid_samples == 0: assert args.train_samples is None, \ 'only backward compatiblity support for iteration-based training' args.consumed_valid_samples = (args.iteration // args.eval_interval) * \