
Commit 998f4e4

Generalize JSD to FKL/RKL (#393)
1 parent 2a39f0d commit 998f4e4

7 files changed, +42 −28 lines


README.md

+2 −2

@@ -256,8 +256,8 @@ $$\text{GeGLU}(x)=\text{GELU}(xW+b)\otimes(xV+c)$$
 <!-- TODO: verify vocab sizes are accurate -->
 - **FusedLinearCrossEntropy**: Peak memory usage of cross entropy loss is further improved by fusing the model head with the CE loss and chunking the input for block-wise loss and gradient calculation, a technique inspired by [Efficient Cross Entropy](https://github.com/mgmalek/efficient_cross_entropy). It achieves >4X memory reduction for 128k vocab size. **This is highly effective for large batch size, large sequence length, and large vocabulary sizes.** Please refer to the [Medusa example](https://github.com/linkedin/Liger-Kernel/tree/main/examples/medusa) for individual kernel usage.
 - **KLDivergence**: [KL Divergence](https://pytorch.org/docs/stable/generated/torch.nn.KLDivLoss.html) is implemented by fusing the forward into a single triton kernel, with reduction done outside the kernel. It achieves ~1.5X speed and ~15% memory reduction for 128K vocab size.
-- **JSD**: [Generalized JSD](https://arxiv.org/pdf/2306.13649) (Jensen-Shannon divergence), is implemented by computing both the loss and gradient in the forward pass. It achieves ~1.5X speed and ~54% memory reduction for 128k vocab size.
-- **FusedLinearJSD**: Peak memory usage of JSD loss is further improved by fusing the model head with the JSD and chunking the input for block-wise loss and gradient calculation. It achieves ~85% memory reduction for 128k vocab size where batch size $\times$ sequence length is 8192.
+- **JSD**: [Generalized JSD](https://arxiv.org/pdf/2306.13649) (Jensen-Shannon divergence), is implemented by computing both the loss and gradient in the forward pass. It achieves ~1.5X speed and ~54% memory reduction for 128k vocab size. **NOTE**: It implements forward/reverse KL when `beta` equals 0 and 1 respectively.
+- **FusedLinearJSD**: Peak memory usage of JSD loss is further improved by fusing the model head with the JSD and chunking the input for block-wise loss and gradient calculation. It achieves ~85% memory reduction for 128k vocab size where batch size $\times$ sequence length is 8192. **NOTE**: It implements forward/reverse KL when `beta` equals 0 and 1 respectively.


 ### Experimental Kernels
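For reference, a sketch of the math behind the two new NOTEs, matching the kernel branches in this diff (with $P$ the teacher and $Q$ the student distribution, as in the `LigerJSD` docstring below): the generalized JSD is

$$\mathrm{JSD}_\beta(P \Vert Q) = \beta\,\mathrm{KL}(P \Vert M) + (1-\beta)\,\mathrm{KL}(Q \Vert M), \qquad M = \beta P + (1-\beta)Q,$$

and at the endpoints the kernel computes $\mathrm{KL}(P \Vert Q)$ (forward KL) for $\beta = 0$ and $\mathrm{KL}(Q \Vert P)$ (reverse KL) for $\beta = 1$ directly, since the mixture formula itself degenerates to zero there.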

src/liger_kernel/ops/fused_linear_jsd.py

+1 −1

@@ -202,7 +202,7 @@ def forward(
         teacher_input (torch.tensor): input of the last projection layer in teacher model, with shape (B*T, H), where B is batch size, T is sequence length, H is hidden dimension.
         teacher_weight (torch.tensor): the last projection layer in teacher model, with shape (V, H), where V is vocab size
         shift_labels (Optional[torch.LongTensor]): indicator of next predicted vocab with shape (BT) where each value is in [0, V-1].
-        jsd_beta (float): coefficient beta of generalized JSD in the open interval (0, 1). Default: `0.5`
+        jsd_beta (float): coefficient beta of generalized JSD in the interval [0, 1]. It implements forward/reverse KL when beta equals 0 and 1 respectively. Default: `0.5`
         ignore_index (int): the index to ignore. Default: -100
         temperature (float): temperature in softmax function to control the output probability distribution. Default: `1.0`

src/liger_kernel/ops/jsd.py

+19 −10

@@ -18,7 +18,7 @@ def _jsd_kernel(
     dX_ptr,
     dX_stride,
     label_ptr,
-    beta,
+    beta: tl.constexpr,
     n_non_ignore: int,
     ignore_index: tl.constexpr,
     n_cols,
@@ -50,17 +50,26 @@ def _jsd_kernel(
     X = tl.load(X_ptr + offsets, mask=mask, other=float("-inf")).to(tl.float32)
     Y = tl.load(Y_ptr + offsets, mask=mask, other=float("-inf")).to(tl.float32)

-    Q = tl.exp(X)
-    P = tl.exp(Y)
-    M = beta * P + (1 - beta) * Q
-    log_M = tl.log(M)
+    if beta == 0.0:  # forward KL
+        Y_prob = tl.exp(Y)
+        loss = Y_prob * (Y - X)
+        dX = -Y_prob
+    elif beta == 1.0:
+        X_prob = tl.exp(X)
+        loss = X_prob * (X - Y)
+        dX = loss + X_prob
+    else:
+        Q = tl.exp(X)
+        P = tl.exp(Y)
+        M = beta * P + (1 - beta) * Q
+        log_M = tl.log(M)
+
+        loss = beta * P * Y + (1 - beta) * Q * X - M * log_M
+        dX = (1 - beta) * Q * (X - log_M)

-    loss = beta * P * Y + (1 - beta) * Q * X - M * log_M
-    # reduction == "batchmean"
     loss = loss / n_non_ignore
+    dX = dX / n_non_ignore
     tl.store(loss_ptr + offsets, loss, mask=mask)
-
-    dX = (1 - beta) * Q * (X - log_M) / n_non_ignore
     tl.store(dX_ptr + offsets, dX, mask=mask)


@@ -142,7 +151,7 @@ def forward(
         _input (torch.Tensor): predict values with shape (BT, V) in logspace
         target (torch.Tensor): ground truth values with shape (BT, V) in logspace
         shift_labels (Optional[torch.LongTensor]): indicator of next predicted vocab with shape (BT) where each value is in [0, V-1].
-        beta (float): coefficient beta of generalized JSD in the open interval (0, 1)
+        beta (float): coefficient beta of generalized JSD in the interval [0, 1]. It implements forward/reverse KL when beta equals 0 and 1 respectively. Default: `0.5`
         ignore_index (int): the index to ignore. Default: -100

     Returns:
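To make the new kernel branches easier to check, here is a minimal unfused PyTorch sketch of the same per-element loss. The function name `jsd_loss_reference` and its arguments are illustrative, not part of the repo; `X`/`Y` follow the kernel's convention of student/teacher log-probabilities.

```python
import torch

def jsd_loss_reference(X: torch.Tensor, Y: torch.Tensor, beta: float, n_non_ignore: int) -> torch.Tensor:
    """Unfused sketch of the per-element loss computed by _jsd_kernel.

    X: student log-probs with shape (BT, V); Y: teacher log-probs, same shape.
    """
    if beta == 0.0:  # forward KL: KL(P || Q), with P = exp(Y)
        P = Y.exp()
        loss = P * (Y - X)
    elif beta == 1.0:  # reverse KL: KL(Q || P), with Q = exp(X)
        Q = X.exp()
        loss = Q * (X - Y)
    else:  # generalized JSD with mixture M = beta * P + (1 - beta) * Q
        Q, P = X.exp(), Y.exp()
        M = beta * P + (1 - beta) * Q
        loss = beta * P * Y + (1 - beta) * Q * X - M * M.log()
    # "batchmean"-style reduction, matching the kernel's division by n_non_ignore
    return loss.sum() / n_non_ignore
```

The `dX` expressions in the kernel are the analytic derivatives of these per-element losses with respect to `X`, divided by the same `n_non_ignore` count.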

src/liger_kernel/transformers/fused_linear_jsd.py

+1 −4

@@ -12,7 +12,7 @@ class LigerFusedLinearJSD(torch.nn.Module):
     the materialization of the large logits tensor.

     Args:
-        jsd_beta (float): coefficient beta of generalized JSD in the open interval (0, 1). Default: `0.5`
+        jsd_beta (float): coefficient beta of generalized JSD in the interval [0, 1]. It implements forward/reverse KL when beta equals 0 and 1 respectively. Default: `0.5`
         ignore_index (int): The index to ignore in the target. Default: `-100`
         temperature (float): temperature in softmax function to control the output probability distribution. Default: `1.0`

@@ -70,9 +70,6 @@ class LigerFusedLinearJSD(torch.nn.Module):

     def __init__(self, jsd_beta=0.5, ignore_index=-100, temperature=1.0):
         super().__init__()
-        assert (
-            jsd_beta > 0 and jsd_beta < 1
-        ), f"beta must be greater than 0 and less than 1. Got: {jsd_beta}"
         assert temperature != 0, "temperature cannot be 0."
         self.jsd_beta = jsd_beta
         self.temperature = temperature
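With the `(0, 1)` assert removed, `jsd_beta=0.0` (forward KL) and `jsd_beta=1.0` (reverse KL) are now accepted, so the fused module can serve directly as a fused KL distillation loss. A minimal sketch with illustrative shapes; the forward argument order `(student_input, student_weight, teacher_input, teacher_weight)` is an assumption suggested by the ops-level docstring above, not shown in this diff.

```python
import torch
from liger_kernel.transformers.fused_linear_jsd import LigerFusedLinearJSD

B, T, H, V = 2, 128, 512, 32000  # illustrative sizes
student_hidden = torch.randn(B * T, H, device="cuda", requires_grad=True)
teacher_hidden = torch.randn(B * T, H, device="cuda")
student_head = torch.randn(V, H, device="cuda", requires_grad=True)
teacher_head = torch.randn(V, H, device="cuda")

# jsd_beta=0.0 -> forward KL; jsd_beta=1.0 -> reverse KL (both previously rejected by the assert)
fused_fkl = LigerFusedLinearJSD(jsd_beta=0.0, temperature=1.0)
loss = fused_fkl(student_hidden, student_head, teacher_hidden, teacher_head)
loss.backward()
```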

src/liger_kernel/transformers/jsd.py

+1 −4

@@ -18,7 +18,7 @@ class LigerJSD(torch.nn.Module):
     :math:`P` denotes the teacher model and :math:`Q` denotes the student model.

     Args:
-        beta (float): coefficient beta of generalized JSD in the open interval (0, 1). Default: `0.5`
+        beta (float): coefficient beta of generalized JSD in the interval [0, 1]. It implements forward/reverse KL when beta equals 0 and 1 respectively. Default: `0.5`
         ignore_index (int): The index to ignore in the target. Default: `-100`

     Shape:
@@ -58,9 +58,6 @@ class LigerJSD(torch.nn.Module):

     def __init__(self, beta: float = 0.5, ignore_index: int = -100):
         super().__init__()
-        assert (
-            beta > 0 and beta < 1
-        ), f"beta must be greater than 0 and less than 1. Got: {beta}"
         self.beta = beta
         self.ignore_index = ignore_index
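Likewise for the non-fused module: with `beta=0.0` and `beta=1.0` now allowed, `LigerJSD` doubles as a fused forward/reverse KL. A minimal sketch, assuming the call order `(student_log_probs, teacher_log_probs)` used by the reference `forward(log_q, log_p, ...)` in the test below:

```python
import torch
import torch.nn.functional as F
from liger_kernel.transformers.jsd import LigerJSD

BT, V = 16, 4096  # illustrative sizes
student_log_probs = F.log_softmax(torch.randn(BT, V, device="cuda"), dim=-1).requires_grad_()
teacher_log_probs = F.log_softmax(torch.randn(BT, V, device="cuda"), dim=-1)

fkl = LigerJSD(beta=0.0)  # forward KL: KL(teacher || student)
rkl = LigerJSD(beta=1.0)  # reverse KL: KL(student || teacher)

loss = fkl(student_log_probs, teacher_log_probs)
loss.backward()
```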

test/transformers/test_fused_linear_jsd.py

+4

@@ -105,6 +105,8 @@ def forward(self, student_input, teacher_input, label=None):
     [
         (1.0, 0.5),
         (2.0, 0.1),
+        (1.0, 0.0),  # FKL
+        (1.0, 1.0),  # RKL
     ],
 )
 def test_correctness(B, T, H, V, scalar, dtype, beta, temperature, atol, rtol):
@@ -177,7 +179,9 @@ def test_correctness(B, T, H, V, scalar, dtype, beta, temperature, atol, rtol):
     "temperature, beta, ignore_index",
     [
         (1.0, 0.5, 2),
+        (1.0, 0.0, 2),
         (2.0, 0.1, 42),
+        (1.0, 1.0, 2),
     ],
 )
 def test_correctness_with_ignore_index(

test/transformers/test_jsd.py

+14 −7

@@ -30,12 +30,19 @@ def forward(
         log_p: torch.Tensor,  # target
         label: Optional[torch.Tensor] = None,
     ):
-        log_p, log_q = log_p.to(torch.float), log_q.to(torch.float)
-        log_p, log_q = log_p.view(-1, log_p.size(-1)), log_q.view(-1, log_q.size(-1))
-        m = torch.lerp(torch.exp(log_q), torch.exp(log_p), self.beta)
-        loss = self.beta * self.kl(torch.log(m), log_p).sum(dim=-1) + (
-            1 - self.beta
-        ) * self.kl(torch.log(m), log_q).sum(dim=-1)
+        if self.beta == 0.0:
+            loss = self.kl(log_q, log_p).sum(dim=-1)
+        elif self.beta == 1.0:
+            loss = self.kl(log_p, log_q).sum(dim=-1)
+        else:
+            log_p, log_q = log_p.to(torch.float), log_q.to(torch.float)
+            log_p, log_q = log_p.view(-1, log_p.size(-1)), log_q.view(
+                -1, log_q.size(-1)
+            )
+            m = torch.lerp(torch.exp(log_q), torch.exp(log_p), self.beta)
+            loss = self.beta * self.kl(torch.log(m), log_p).sum(dim=-1) + (
+                1 - self.beta
+            ) * self.kl(torch.log(m), log_q).sum(dim=-1)

         if label is not None:
             loss = torch.where(label != self.ignore_index, loss, 0.0)
@@ -251,7 +258,7 @@ def test_correctness_not_last(B, T, V, dtype, atol, rtol):

 @pytest.mark.parametrize(*_SHAPE_PARAMS)
 @pytest.mark.parametrize(*_DTYPE_PARAMS)
-@pytest.mark.parametrize("beta", [0.1, 0.5, 0.9])
+@pytest.mark.parametrize("beta", [0.0, 0.1, 0.5, 0.9, 1.0])
 def test_correctness_with_beta(B, T, V, beta, dtype, atol, rtol):
     liger_jsd = LigerJSD(beta=beta)
     _test_correctness_with_beta_once(liger_jsd, beta, B, T, V, dtype, atol, rtol)
