Memory efficient backward #33
Changes to the 8-bit matmul autograd function (`MatMul8bitLt.forward` / `backward`):

```diff
@@ -1,4 +1,6 @@
 import operator
+import warnings
+
 import torch
 import bitsandbytes.functional as F
@@ -210,32 +212,29 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState()):
             ctx.B = B
             ctx.bias = bias
             if A.shape[-1] == B.shape[0]:
-                return torch.empty(A.shape[:-1]+B.shape[1:], dtype=torch.float16, device=A.device)
+                return torch.empty(A.shape[:-1]+B.shape[1:], dtype=A.dtype, device=A.device)
             else:
-                return torch.empty(A.shape[:-1]+B.shape[:1], dtype=torch.float16, device=A.device)
+                return torch.empty(A.shape[:-1]+B.shape[:1], dtype=A.dtype, device=A.device)

         # 1. Quantize A
         # 2. Quantize B
         # 3. Matmul
         # 4. Mixed-precision decomposition matmul
         # 5. Save state
-        requires_gradA = A.requires_grad
-        requires_gradB = B.requires_grad
-        requires_gradBias = bias is not None and bias.requires_grad
         formatB = state.formatB
         input_shape = A.shape
         if state.outlier_pool is None:
             state.outlier_pool = GlobalOutlierPooler.get_instance()

         # Cast A to fp16
-        A_dtype = A.dtype
-        A = A.to(torch.float16)
+        if A.dtype != torch.float16:
+            warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")

         # 1. Quantize A
         if len(A.shape) == 3:
             A = A.view(-1, A.shape[-1]).contiguous()
         CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(
-            A, threshold=state.threshold
+            A.to(torch.float16), threshold=state.threshold
         )

         if state.threshold > 0.0 and coo_tensorA is not None:
@@ -271,7 +270,7 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState()):
                     state.SCB,
                     state.SCBt,
                     coo_tensorB,
-                ) = F.double_quant(B)
+                ) = F.double_quant(B.to(torch.float16))
                 state.CxB, state.SB = F.transform(CB, to_order=formatB)
             else:
                 has_grad = False
@@ -292,7 +291,7 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState()):
                     (outliers * state.SCB.view(-1, 1) / 127.0)
                     .t()
                     .contiguous()
-                    .half()
+                    .to(A.dtype)
                 )
                 CA[:, state.idx.long()] = 0
                 CAt[:, state.idx.long()] = 0
@@ -309,7 +308,13 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState()):
         C32A, SA = F.transform(CA, "col32")
         out32, Sout32 = F.igemmlt(C32A, state.CxB, SA, state.SB)
         # we apply the fused bias here
-        output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=bias)
+
+        if bias is None or bias.dtype == torch.float16:
+            output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=bias)
+            output = output.to(A.dtype)
+        else:  # apply bias separately
+            output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=None)
+            output = output.to(A.dtype).add_(bias)

         # 4. Mixed-precision decomposition matmul
         if coo_tensorA is not None and subA is not None:
@@ -320,18 +325,16 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState()):
         ctx.formatB = formatB
         ctx.grad_shape = input_shape
-        ctx.req_grads = [requires_gradA, requires_gradB, requires_gradBias]
+        ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype

-        if requires_gradA or requires_gradB:
+        if any(ctx.needs_input_grad[:2]):
             ctx.tensors = (CAt, subA)
             ctx.tensor_states = (SCAt, state.idx)
         else:
             ctx.tensors = [None, None]
             ctx.tensor_states = (None, None)
             ctx.save_for_backward(None, None)

-        # Cast fp16 output back to A.dtype
-        output = output.to(A_dtype)
-
         clone_func = torch.clone if len(output_shape) == 3 else lambda x : x
         return clone_func(output.view(output_shape))
@@ -341,24 +344,24 @@ def backward(ctx, grad_output):
         if ctx.is_empty:
             bias_grad = (None if ctx.bias is None else torch.zeros_like(ctx.bias))
             return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None
-        req_gradA, req_gradB, req_gradBias = ctx.req_grads
+        req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad
         CAt, subA = ctx.tensors
         SCAt, idx = ctx.tensor_states
         formatB = ctx.formatB
         state = ctx.state
+        grad_A = grad_B = grad_bias = None

-        # Cast grad_output to fp16
-        grad_output_dtype = grad_output.dtype
-        grad_output = grad_output.to(torch.float16)
+        if req_gradBias:
+            # compute grad_bias first before changing grad_output dtype
+            grad_bias = grad_output.sum(0, dtype=ctx.dtype_bias)

         # Cast grad_output to fp16
         if len(grad_output.shape) == 3:
             grad_output = grad_output.reshape(
                 -1, grad_output.shape[-1]
             ).contiguous()

-        grad_A = grad_B = grad_bias = None
-
-        Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output)
+        Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16))
         if req_gradB:
             CxAt, SAt = F.transform(CAt, formatB, transpose=True)
             C32grad, Sgrad = F.transform(Cgradt, "col32", transpose=True)
@@ -375,21 +378,14 @@ def backward(ctx, grad_output):
                         state.CBt, to_order=formatB, transpose=True
                     )
                 gradA32, SgradA32 = F.igemmlt(C32grad, state.CxBt, Sgrad, state.SBt)
-                grad_A = F.mm_dequant(gradA32, SgradA32, SCgrad, state.SCBt).view(ctx.grad_shape)
+                grad_A = F.mm_dequant(gradA32, SgradA32, SCgrad, state.SCBt).view(ctx.grad_shape).to(ctx.dtype_A)

             elif state.CB is not None:
-                CB = state.CB.half()
-                SCB = (state.SCB.unsqueeze(1) / 127.0).half()
-                CB *= SCB
-                grad_A = torch.mm(grad_output, CB).view(ctx.grad_shape)
+                CB = state.CB.to(ctx.dtype_A, copy=True).mul_(state.SCB.unsqueeze(1).div(127.0))
+                grad_A = torch.matmul(grad_output, CB).view(ctx.grad_shape).to(ctx.dtype_A)
             else:
                 raise Exception('State must contain either CBt or CB matrix for backward')

-        if req_gradBias:
-            grad_bias = grad_output.sum(0)
-
-        # Cast grad_A back to grad_output_dtype
-        grad_output = grad_output.to(grad_output_dtype)
-
         return grad_A, grad_B, None, grad_bias, None
```
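The memory-efficient branch of `backward` above rebuilds `grad_A` directly from the int8 weight matrix `state.CB` and its per-row scales `state.SCB`, so no full-precision copy of the weights has to be kept for the backward pass. Below is a small self-contained sketch of that dequantize-then-matmul pattern; it is not part of the PR, the helper `rowwise_int8_quant` is hypothetical, and everything is kept in fp32 on CPU for simplicity (the library does this in fp16/int8 on GPU):

```python
import torch

def rowwise_int8_quant(W):
    # symmetric row-wise absmax quantization: one int8 row plus one scale per output row
    scale = W.abs().amax(dim=1)                                                # [out]
    CB = torch.clamp((W * 127.0 / scale.unsqueeze(1)).round(), -127, 127).to(torch.int8)
    return CB, scale

torch.manual_seed(0)
W = torch.randn(64, 32)            # weight of a Linear(32 -> 64)
CB, SCB = rowwise_int8_quant(W)    # int8 weights + per-row scales: all that backward needs to keep

grad_output = torch.randn(16, 64)  # gradient w.r.t. the layer output

# what the memory-efficient branch does: dequantize CB on the fly, then one matmul
CB_deq = CB.to(grad_output.dtype) * SCB.unsqueeze(1) / 127.0
grad_A = torch.matmul(grad_output, CB_deq)          # gradient w.r.t. the layer input

# reference gradient computed from the original weights
grad_A_ref = torch.matmul(grad_output, W)
print((grad_A - grad_A_ref).abs().max().item())     # small, dominated by int8 rounding error
```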
Changes to the module tests (`MLP8bit` and `test_linear8bitlt_no_fp16_weights`):

```diff
@@ -14,13 +14,15 @@ def __init__(self, initial_data):
 class MLP8bit(torch.nn.Module):
-    def __init__(self, dim1, dim2, has_fp16_weights=True, threshold=0.0):
+    def __init__(self, dim1, dim2, has_fp16_weights=True, memory_efficient_backward=False, threshold=0.0):
         super(MLP8bit, self).__init__()
         self.fc1 = bnb.nn.Linear8bitLt(
-            dim1, dim2, has_fp16_weights=has_fp16_weights, threshold=threshold
+            dim1, dim2, has_fp16_weights=has_fp16_weights, memory_efficient_backward=memory_efficient_backward,
+            threshold=threshold
         )
         self.fc2 = bnb.nn.Linear8bitLt(
-            dim2, dim1, has_fp16_weights=has_fp16_weights, threshold=threshold
+            dim2, dim1, has_fp16_weights=has_fp16_weights, memory_efficient_backward=memory_efficient_backward,
+            threshold=threshold
         )

     def forward(self, x):
@@ -451,9 +453,12 @@ def test_linear8bitlt_accumulated_gradient():
 @pytest.mark.parametrize("threshold", values, ids=names)
-def test_linear8bitlt_no_fp16_weights(threshold):
+@pytest.mark.parametrize("memory_efficient_backward", [True, False])
+def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
     l1 = (
-        bnb.nn.Linear8bitLt(32, 64, threshold=threshold, has_fp16_weights=False)
+        bnb.nn.Linear8bitLt(
+            32, 64, threshold=threshold, has_fp16_weights=False, memory_efficient_backward=memory_efficient_backward
+        )
         .cuda()
         .half()
     )
@@ -513,7 +518,9 @@ def test_linear8bitlt_no_fp16_weights(threshold):
     assert mlp.fc2.weight.dtype == torch.int8

     mlp = (
-        MLP8bit(32, 64, threshold=threshold, has_fp16_weights=False)
+        MLP8bit(
+            32, 64, threshold=threshold, has_fp16_weights=False, memory_efficient_backward=memory_efficient_backward
+        )
         .half()
         .to("cuda")
     )
@@ -531,11 +538,11 @@ def test_linear8bitlt_no_fp16_weights(threshold):
     assert mlp.fc1.weight.device.type == "cuda"
     assert mlp.fc2.weight.device.type == "cuda"

-    mlp = (
-        MLP8bit(32, 64, threshold=threshold, has_fp16_weights=False)
-        .to(torch.float16)
-        .to("cuda")
-    )
+    mlp = MLP8bit(
+        32, 64, threshold=threshold, has_fp16_weights=False, memory_efficient_backward=memory_efficient_backward
+    )
+    w1, w2 = mlp.fc1.weight.clone().cuda(), mlp.fc2.weight.clone().cuda()  # grab weights before quantization,
+    mlp = mlp.cuda().half()  # and this line triggers quantization

     for i in range(100):
         b1 = torch.randn(16, 8, 32, device="cuda").half()
@@ -545,11 +552,30 @@ def test_linear8bitlt_no_fp16_weights(threshold):
         assert mlp.fc1.state.idx is not None
     if threshold > 0:
         assert mlp.fc2.state.idx is not None

     assert mlp.fc1.weight.dtype == torch.int8
     assert mlp.fc2.weight.dtype == torch.int8
     assert mlp.fc1.weight.device.type == "cuda"
     assert mlp.fc2.weight.device.type == "cuda"

+    if memory_efficient_backward:
+        b1 = torch.randn(16, 8, 32, device="cuda", requires_grad=True, dtype=torch.half)
+        o1 = mlp(b1)
+        assert o1.dtype == torch.float16
+        assert o1.requires_grad
+        grad_proj = torch.randn_like(o1)
+
+        mlp.zero_grad()
+        (o1 * grad_proj).sum().backward()
+        grad_ref = grad_proj.flatten(2) @ w2.half() @ w1.half()
+        scale = grad_ref.abs().mean()
+        assert torch.allclose(b1.grad, grad_ref, rtol=0, atol=0.05 * scale)
+
+
 def test_linear8bitlt_fp32_bias():
     # casts model to fp16 -> int8 automatically
```
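Based on the new test above, a minimal usage sketch of the flag might look like the following. This assumes a CUDA device and this PR's branch of bitsandbytes; `threshold=6.0` is just an illustrative value, not something the PR prescribes:

```python
import torch
import bitsandbytes as bnb

# a frozen int8 linear layer that can still pass gradients to its inputs
layer = bnb.nn.Linear8bitLt(
    32, 64,
    has_fp16_weights=False,            # keep the weights quantized instead of training them
    memory_efficient_backward=True,    # compute grad_input from the int8 weights in backward
    threshold=6.0,
).cuda().half()                        # moving to CUDA triggers the int8 quantization

x = torch.randn(16, 8, 32, device="cuda", dtype=torch.half, requires_grad=True)
y = layer(x)                           # fp16 output, as asserted in the test above
y.sum().backward()                     # gradient flows into x even though the weights are frozen
print(x.grad.shape, x.grad.dtype)
```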
Curiously, when I tried to replace the next line from `output += torch.matmul(subA, state.subB)` to `output.addmm_(subA, state.subB)`, the precision would drop and the tests would fail.
I have no idea why - the dtypes of `output`, `subA` and `state.subB` are always equal (I checked).
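For reference, one way to compare the two variants in isolation could look like this; the shapes are made up and this is not the PR's benchmark, just a quick probe against an fp64 reference:

```python
import torch

torch.manual_seed(0)
out  = torch.randn(4096, 1024, dtype=torch.float16, device="cuda")
subA = torch.randn(4096, 32,   dtype=torch.float16, device="cuda")
subB = torch.randn(32,   1024, dtype=torch.float16, device="cuda")

ref = out.double() + subA.double() @ subB.double()   # fp64 reference

res_matmul = out + torch.matmul(subA, subB)          # current code path
res_addmm  = out.clone().addmm_(subA, subB)          # fused variant that was tried

print("matmul+add max error:", (res_matmul.double() - ref).abs().max().item())
print("addmm_     max error:", (res_addmm.double()  - ref).abs().max().item())
```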
I cannot remember if I stumbled upon the same thing. I remember trying to make this matrix multiplication more efficient but failing. How large is the increase in error that you see?
It does not make much sense to me, since in cuBLAS you perform `(A @ B) + D = C` and the result of `A @ B` is in fp32, so the entire operation should be more precise. The same goes for fused multiply-add in general, which is more precise than multiplication followed by addition. It might be some weird tensor core issue, but it makes no sense to me.
Even if the error is only smaller some of the time and has more variance, it would still be okay to have this change. I believe it would be a good chunk faster.
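As a toy illustration of that argument (not from this PR): with separate fp16 operations the product is rounded to fp16 before the addition, whereas accumulating in fp32 rounds only once at the end, which matters most when the addition cancels most of the leading digits:

```python
import torch

a = torch.tensor(1.0009765625, dtype=torch.float16)   # 1 + 2**-10, exactly representable in fp16
b = torch.tensor(3.0,          dtype=torch.float16)
d = torch.tensor(-3.0,         dtype=torch.float16)

exact = float(a) * float(b) + float(d)                 # 0.0029296875, representable in fp16

separate = a * b + d                                   # product rounded to fp16, then added
fused    = (a.float() * b.float() + d.float()).half()  # accumulate in fp32, round once at the end

print(exact, separate.item(), fused.item())
```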