[Fix] Refactor to avoid two passes of the same kernel

AllenFarcas · AllenFarcas · commit d58a503cd27d · 2025-10-27T10:11:07.000-05:00
diff --git a/transformer_engine/common/util/rocm_cast_kernels.cuh b/transformer_engine/common/util/rocm_cast_kernels.cuh
@@ -457,6 +457,18 @@ void fp8_quantize_rocm(const Tensor &input, const Tensor *act_input, const Tenso
       const size_t rows = input.flat_first_dim();
       const size_t cols = input.flat_last_dim();
 
+      if constexpr (IS_DBIAS) {
+        NVTE_CHECK(dbias, "DBias tensor must be provided when IS_DBIAS is true.");
+        NVTE_CHECK(workspace, "Workspace must be provided when IS_DBIAS is true.");
+        if (workspace->data.dptr == nullptr ||
+            workspace->data.dtype != DType::kFloat32 ||
+            workspace->data.shape != std::vector<size_t>{rows, cols}) {
+          workspace->data.shape = {rows, cols};
+          workspace->data.dtype = DType::kFloat32;
+          return;
+        }
+      }
+
       if (output && output->data.dptr) {
         if constexpr (IS_DACT) {
           NVTE_CHECK(act_input, "Gradient tensor must be provided for DACT output.");
@@ -470,17 +482,6 @@ void fp8_quantize_rocm(const Tensor &input, const Tensor *act_input, const Tenso
         const void *ptr_to_reduce = nullptr;
         DType dtype_to_reduce;
 
-        NVTE_CHECK(dbias, "DBias tensor must be provided when IS_DBIAS is true.");
-        NVTE_CHECK(workspace, "Workspace must be provided when IS_DBIAS is true.");
-
-        if (workspace->data.dptr == nullptr ||
-            workspace->data.dtype != DType::kFloat32 ||
-            workspace->data.shape != std::vector<size_t>{rows, cols}) {
-          workspace->data.shape = {rows, cols};
-          workspace->data.dtype = DType::kFloat32;
-          return;
-        }
-
         workspace->amax = {};
         workspace->scale = {};
         workspace->scale_inv = {};
@@ -508,7 +509,7 @@ void fp8_quantize_rocm(const Tensor &input, const Tensor *act_input, const Tenso
                 dbias, rows, cols, stream, workspace);
             );
         );
-        }
+      }
       break;
     }
     case NVTE_MXFP8_1D_SCALING: {