huggingface · regisss · Nov 6, 2024 · Oct 24, 2024 · yafshar · Oct 24, 2024
@@ -578,7 +578,7 @@ def attention_all_reduce(self, attn_output):
 
     def post_attn_forward(self, attn_output):
         if hasattr(self.dense, "all_reduce"):
-            self.dense.post_all_reduce(attn_output)
+            return self.dense.post_all_reduce(attn_output)
         return attn_output
 
 
@@ -598,7 +598,7 @@ def mlp_all_reduce(self, x):
 
     def post_mlp_forward(self, x):
         if hasattr(self.dense_4h_to_h, "all_reduce"):
-            self.dense_4h_to_h.post_all_reduce(x)
+            return self.dense_4h_to_h.post_all_reduce(x)
         return x
 
 

@@ -357,7 +357,7 @@ def attention_all_reduce(self, attn_output):
 
     def post_attn_forward(self, attn_output):
         if hasattr(self.o_proj, "post_all_reduce"):
-            self.o_proj.post_all_reduce(attn_output)
+            return self.o_proj.post_all_reduce(attn_output)
         return attn_output
 
 

@@ -760,7 +760,7 @@ def attention_all_reduce(self, attn_output):
 
     def post_attn_forward(self, attn_output):
         if hasattr(self.o_proj, "post_all_reduce"):
-            self.o_proj.post_all_reduce(attn_output)
+            return self.o_proj.post_all_reduce(attn_output)
         return attn_output
 
 

@@ -164,7 +164,5 @@ def all_reduce(self, input):
             dist.inference_all_reduce(input, group=self.mp_group)
 
     def post_all_reduce(self, input):
-        # inplace addition needed for correct results
-        if self.bias is not None:
-            input += self.bias
-        return input
+        output = input + self.bias if (self.bias is not None) else input
+        return output
@@ -419,7 +419,7 @@ def attention_all_reduce(self, attn_output):
 
     def post_attn_forward(self, attn_output):
         if hasattr(self.o_proj, "post_all_reduce"):
-            self.o_proj.post_all_reduce(attn_output)
+            return self.o_proj.post_all_reduce(attn_output)
         return attn_output
 
 

@@ -491,7 +491,7 @@ def attention_all_reduce(self, attn_output):
 
     def post_attn_forward(self, attn_output):
         if hasattr(self.o_proj, "post_all_reduce"):
-            self.o_proj.post_all_reduce(attn_output)
+            return self.o_proj.post_all_reduce(attn_output)
         return attn_output
 
 

@@ -17,6 +17,7 @@
 ###############################################################################
 
 import math
+import os
 from typing import List, Optional, Tuple, Union
 
 import torch
@@ -307,7 +308,8 @@ def pre_attn_forward(
 
             if q_len == 1:
                 # next token
-                with ht.sdp_kernel(enable_recompute=False):
+                use_recompute = bool(os.getenv("QUANT_CONFIG"))
+                with ht.sdp_kernel(enable_recompute=use_recompute):
                     attn_output = FusedSDPA.apply(
                         query_states, key_states, value_states, attention_mask, 0.0, False, None
                     )
@@ -374,7 +376,7 @@ def attention_all_reduce(self, attn_output):
 
     def post_attn_forward(self, attn_output):
         if hasattr(self.o_proj, "post_all_reduce"):
-            self.o_proj.post_all_reduce(attn_output)
+            return self.o_proj.post_all_reduce(attn_output)
         return attn_output