Update ignore_index parameter for flash attention #580

Merged: 3 commits, May 16, 2024
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -12,6 +12,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added original legacy unsharding implementation back, as the default. The new
   shared memory implementation can be used by passing `use_legacy_shared_mem_impl` to `unshard.py`.
 
+### Fixed
+
+- Changed from `ignored_index` to `ignore_index` for `cross_entropy_loss` when `flash-attn>=2.5.8`.
+
 ## [v0.3.0](https://github.com/allenai/OLMo/releases/tag/v0.3.0) - 2024-04-25
 
 ### Added
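In short, the fix detects the installed flash-attn version with `packaging.version` and passes whichever keyword the kernel expects. Below is a minimal standalone sketch of that gate (the helper name and the ImportError fallback are illustrative, not part of the PR); the `olmo/train.py` diff that follows applies the same check inside `Trainer.__post_init__`.

```python
# Sketch only: choose the keyword name that flash-attn's cross_entropy_loss
# expects. The parameter was renamed from `ignored_index` to `ignore_index`
# in flash-attn v2.5.8.
from packaging import version

try:
    import flash_attn
    _flash_attn_version = version.parse(flash_attn.__version__)
except ImportError:  # illustrative fallback; the fused loss path requires flash-attn
    _flash_attn_version = None


def ignore_index_kwarg(ignore_index: int = -100) -> dict:
    """Return the "ignore this label" keyword for the installed flash-attn version."""
    if _flash_attn_version is not None and _flash_attn_version >= version.parse("2.5.8"):
        return {"ignore_index": ignore_index}
    return {"ignored_index": ignore_index}


# Usage (hypothetical): cross_entropy_loss(logits, labels, **ignore_index_kwarg(-100))
```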
12 changes: 11 additions & 1 deletion olmo/train.py
@@ -20,6 +20,7 @@
 import torch.distributed as dist
 import torch.nn.functional as F
 import wandb
+from packaging import version
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.utils.data import DataLoader
 
@@ -145,22 +146,31 @@ class Trainer:
 
     def __post_init__(self):
         if self.cfg.fused_loss:
+            import flash_attn
             from flash_attn.ops.triton.cross_entropy import (  # type: ignore
                 cross_entropy_loss,
             )
 
+            # The `ignored_index` parameter of `cross_entropy_loss` was changed to `ignore_index` in v2.5.8 with commit https://github.com/Dao-AILab/flash-attention/commit/ec6d22143b5d375e253b2ebfc563b26a43f43684
+            ce_loss_use_ignore_index_param = version.parse(flash_attn.__version__) >= version.parse("2.5.8")
+
             def fused_loss_fn(
                 logits, labels, ignore_index: int = -100, reduction: str = "mean", compute_z_loss: bool = False
             ):
+                if ce_loss_use_ignore_index_param:
+                    ignore_index_kwarg = {"ignore_index": ignore_index}
+                else:
+                    ignore_index_kwarg = {"ignored_index": ignore_index}
+
                 loss, z_loss = cross_entropy_loss(
                     logits,
                     labels,
                     label_smoothing=0.0,
                     logit_scale=1.0,
                     lse_square_scale=0.0,
-                    ignored_index=ignore_index,
                     inplace_backward=False,
                     process_group=None,
+                    **ignore_index_kwarg,
                 )
 
                 mask = labels != ignore_index
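For context on what the renamed parameter controls, here is a small, hedged illustration using the stock PyTorch op (CPU-friendly and not part of the PR). The index has the same meaning for the fused kernel, except that the Triton path returns per-token losses which OLMo then masks with `labels != ignore_index`, as the last visible diff line shows.

```python
# Illustration (not from the PR): positions labeled with ignore_index contribute
# nothing to the loss, whichever spelling of the keyword the backend uses.
import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)             # 4 tokens, vocabulary of 10
labels = torch.tensor([1, 3, -100, 7])  # the -100 position is padding

loss = F.cross_entropy(logits, labels, ignore_index=-100, reduction="mean")
# Only the three non-ignored tokens contribute to the mean.
print(loss)
```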