Guard kernel compilation and call
avbokovoy committed Nov 13, 2024
1 parent c9a4a04 commit 02ec7a2
Showing 5 changed files with 69 additions and 9 deletions.
File 1 of 5 (path not preserved)
@@ -623,14 +623,14 @@ hip_split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}{{ vd
{%- else %}
constexpr bool is_weighted = false;
{%- endif %}
- {{optimizer}}_kernel_arg_t opt_karg;
+ rocm::{{optimizer}}_kernel_arg_t opt_karg;
opt_karg.p_momentum = momentum1_dev.data();
opt_karg.eps = eps;
opt_karg.learning_rate = learning_rate;
// weight_decay(_mode) is supplied as args.split_function_args_no_defaults
opt_karg.weight_decay_mode = weight_decay_mode_v;
opt_karg.weight_decay = weight_decay;
- auto batch_mdiv = [](uint32_t d) -> magic_div_u32_t {
+ auto batch_mdiv = [](uint32_t d) -> rocm::magic_div_u32_t {
assert(d >= 1 && d <= INT32_MAX);
uint8_t shift;
for(shift = 0; shift < 32; shift++)
@@ -641,14 +641,14 @@ hip_split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}{{ vd
uint64_t magic = ((one << 32) * ((one << shift) - d)) / d + 1;
assert(magic <= 0xffffffffUL);

- magic_div_u32_t result;
+ rocm::magic_div_u32_t result;
result.magic = magic;
result.shift = shift;
return result;
}(batch);
- split_tbe_backward_hip_kernel_{{kdesc}}<
- {{optimizer}}_optimizer_t<cache_t, emb_t, embedding_dim, weight_decay_mode_v>,
- {{optimizer}}_kernel_arg_t,
+ rocm::split_tbe_backward_hip_kernel_{{kdesc}}<
+ rocm::{{optimizer}}_optimizer_t<cache_t, emb_t, embedding_dim, weight_decay_mode_v>,
+ rocm::{{optimizer}}_kernel_arg_t,
emb_t,
cache_t,
grad_t,
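
Note: the batch_mdiv lambda above precomputes a (magic, shift) pair so the kernel can later divide by the batch size with one multiply-high and a shift instead of a hardware integer divide; it picks the smallest shift with 2^shift >= d. Below is a self-contained host-side sketch of the scheme. The run step is an assumption based on the standard mulhi-add-shift formulation; the committed device implementation lives in split_embeddings_common.h and its body is not shown in this diff.

#include <cassert>
#include <cstdint>

struct magic_div_u32_t {
  uint32_t magic;
  uint8_t shift;
};

// Same construction as the batch_mdiv lambda in the hunk above.
magic_div_u32_t magic_div_u32_gen(uint32_t d) {
  assert(d >= 1 && d <= INT32_MAX);
  uint8_t shift;
  for (shift = 0; shift < 32; shift++)
    if ((1U << shift) >= d)
      break;
  const uint64_t one = 1;
  const uint64_t magic = ((one << 32) * ((one << shift) - d)) / d + 1;
  assert(magic <= 0xffffffffUL);
  return {static_cast<uint32_t>(magic), shift};
}

// Assumed run step: quotient = (mulhi(n, magic) + n) >> shift.
// The 64-bit intermediate keeps this host sketch overflow-safe.
uint32_t magic_div_u32_run(const magic_div_u32_t& mdiv, uint32_t n) {
  const uint64_t hi = (static_cast<uint64_t>(n) * mdiv.magic) >> 32;
  return static_cast<uint32_t>((hi + n) >> mdiv.shift);
}

int main() {
  // Spot-check the identity quo == n / d for a few divisors and inputs.
  for (uint32_t d : {1u, 3u, 7u, 128u, 1000u})
    for (uint32_t n : {0u, 1u, 6u, 999u, 123456789u})
      assert(magic_div_u32_run(magic_div_u32_gen(d), n) == n / d);
  return 0;
}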
File 2 of 5 (path not preserved)
@@ -26,6 +26,10 @@
#include "fbgemm_gpu/split_embeddings_utils.cuh"
#include "fbgemm_gpu/utils/ops_utils.h"

+ {%- if is_rocm %}
+ #include "fbgemm_gpu/rocm/cdna_guard.h"
+ {%- endif %}
+
using Tensor = at::Tensor;
using namespace fbgemm_gpu;

@@ -1194,7 +1198,7 @@ Tensor {{ embedding_cuda_op }}(
not dense and not is_gwd_kernel and not vbe and not ssd and not nobag %}
const bool isSupportedWeightsType = dev_weights.scalar_type() == at::ScalarType::Half
|| dev_weights.scalar_type() == at::ScalarType::Float;
- if(isSupportedWeightsType && !mixed_D)
+ if(isSupportedWeightsType && !mixed_D && rocm::is_supported_cdna())
{
constexpr int segments_per_workgroup = 4;
{%- for kDimSize in [64, 128, 160, 192, 256] %}
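
This hunk is the "call" half of the commit title: the hand-tuned HIP backward kernel is entered only when the weight type is supported, the embedding dims are uniform, and the device reports a supported CDNA arch; otherwise the op falls through to the stock path. A schematic of that dispatch shape, with hypothetical launch functions standing in for the real op (which takes far more parameters):

#include <cstdio>
#include "fbgemm_gpu/rocm/cdna_guard.h"

// Hypothetical stand-ins for the two code paths.
void launch_cdna_tuned_kernel() { std::puts("CDNA-tuned HIP kernel"); }
void launch_generic_kernel()    { std::puts("generic kernel"); }

void backward_dispatch(bool is_supported_weights_type, bool mixed_D) {
  if (is_supported_weights_type && !mixed_D &&
      fbgemm_gpu::rocm::is_supported_cdna()) {
    launch_cdna_tuned_kernel();  // only reached on gfx90a / gfx942
  } else {
    launch_generic_kernel();     // every other device/type combination
  }
}

int main() {
  backward_dispatch(/*is_supported_weights_type=*/true, /*mixed_D=*/false);
  return 0;
}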
File 3 of 5 (path not preserved)
@@ -26,6 +26,7 @@

#include "fbgemm_gpu/rocm/split_embeddings_common.h"

+ namespace fbgemm_gpu::rocm {
template <typename cache_t, typename emb_t, int32_t embedding_dim, int32_t weight_decay_mode>
struct rowwise_adagrad_optimizer_t
{
@@ -457,4 +458,4 @@ L_tail_grad_acc:

store_row_per_warp<emb_t, embedding_dim, emb_t>::run(&emb_data[0], p_emb_table + emb_idx * embedding_dim, lane_id);
}
-
+ } // namespace fbgemm_gpu::rocm
File 4 of 5: fbgemm_gpu/include/fbgemm_gpu/rocm/cdna_guard.h (new file: 51 additions, 0 deletions)
@@ -0,0 +1,51 @@
/*******************************************************************************
* Copyright (c) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
******************************************************************************/
#pragma once

#include <hip/hip_runtime.h>
#include <set>
#include <string>

#define HIP_CHECK(c) \
{ \
if (c != hipSuccess) { \
printf("HIP Error : %s", hipGetErrorString(c)); \
printf(" %s %d\n", __FILE__, __LINE__); \
exit(c); \
} \
}

namespace fbgemm_gpu::rocm {

[[nodiscard]] inline bool is_supported_cdna() {
const std::set<std::string> supported_archs{"gfx942", "gfx90a"};
int device_id = 0;
HIP_CHECK(hipGetDevice(&device_id));
hipDeviceProp_t dev_props;
HIP_CHECK(hipGetDeviceProperties(&dev_props, device_id));
std::string gcn_arch = dev_props.gcnArchName;
gcn_arch = gcn_arch.substr(0, gcn_arch.find(":"));
return supported_archs.contains(gcn_arch);
}

} // namespace fbgemm_gpu::rocm
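
Two details worth noting in the new guard: hipDeviceProp_t::gcnArchName reports the full target ID, e.g. "gfx90a:sramecc+:xnack-", so the substr call strips the feature flags before the set lookup, and std::set::contains requires C++20. A standalone illustration of just the parsing step (the sample arch string is assumed for the demo):

#include <cassert>
#include <set>
#include <string>

int main() {
  const std::set<std::string> supported_archs{"gfx942", "gfx90a"};
  // gcnArchName-style target ID: arch name plus feature flags.
  std::string gcn_arch = "gfx90a:sramecc+:xnack-";
  gcn_arch = gcn_arch.substr(0, gcn_arch.find(':'));  // -> "gfx90a"
  assert(supported_archs.contains(gcn_arch));         // contains() is C++20
  return 0;
}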
File 5 of 5: fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h (5 additions, 1 deletion)
@@ -32,6 +32,7 @@ typedef float floatx2_t __attribute__((ext_vector_type(2)));
#define THREADS_PER_ROW 64
#define BLOCK_SIZE 256

+ namespace fbgemm_gpu::rocm {
template <typename T> union amdgcn_buffer_resource {
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
int32x4_t content;
@@ -389,6 +390,7 @@ __device__ __forceinline__ void generic_dpp_reduction(data_t &result) {
// of trivial operation with an option to use custom operation
template <typename data_t, typename reduce_op_t, int wave_size = 64>
__device__ __forceinline__ void dpp_reduction(data_t &result) {
+ #if defined(__gfx942__) || defined(__gfx90a__)
if constexpr (std::is_same_v<reduce_op_t, reduce_op::sum>) {
DPP_REDUCE_F16_F32(add);
return;
@@ -404,6 +406,7 @@ __device__ __forceinline__ void dpp_reduction(data_t &result) {
} else {
generic_dpp_reduction<data_t, reduce_op_t, wave_size>(result);
}
+ #endif
}

template <typename reduce_op_t, typename data_t, int wave_size>
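
The #if added above is the "compilation" half of the commit title: the DPP_REDUCE_F16_F32 macros expand to cross-lane (DPP) reduction code written for gfx90a/gfx942, so for any other offload arch dpp_reduction now compiles to an empty body instead of breaking the build. That is safe only in combination with the runtime is_supported_cdna() guard, which keeps these kernels from ever being launched elsewhere. A minimal sketch of the same per-arch guard pattern, with a hypothetical device function:

#include <hip/hip_runtime.h>

// __gfx90a__ / __gfx942__ are predefined by the HIP compiler only while
// generating code for that offload target, so each arch gets its own body.
__device__ __forceinline__ float arch_guarded_value(float v) {
#if defined(__gfx942__) || defined(__gfx90a__)
  return v;     // stand-in for the DPP-accelerated fast path
#else
  return 0.0f;  // dead on purpose: the host-side guard never launches here
#endif
}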
@@ -458,4 +461,5 @@ magic_div_u32_run_with_mod(const magic_div_u32_t &mdiv,
const uint32_t d, uint32_t &quo, uint32_t &rem) {
quo = magic_div_u32_run(mdiv, n);
rem = n - quo * d;
- }
+ }
+ } // namespace fbgemm_gpu::rocm
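
The run_with_mod helper completes the picture: the quotient comes from magic_div_u32_run and the remainder from a single multiply-subtract, so device code never issues an integer divide. A host-side analogue, reusing magic_div_u32_t and magic_div_u32_run from the sketch after the first file (the same assumptions apply):

// Mirrors the device helper above; no '/' or '%' anywhere.
inline void magic_div_u32_run_with_mod(const magic_div_u32_t& mdiv,
                                       uint32_t n, uint32_t d,
                                       uint32_t& quo, uint32_t& rem) {
  quo = magic_div_u32_run(mdiv, n);
  rem = n - quo * d;  // exact because quo == n / d
}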
