Guard kernel compilation and call
avbokovoy committed Nov 13, 2024
1 parent c9a4a04 commit 02ec7a2
Showing 5 changed files with 69 additions and 9 deletions.
File 1 of 5 (path not preserved)
@@ -623,14 +623,14 @@ hip_split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}{{ vd
{%- else %}
constexpr bool is_weighted = false;
{%- endif %}
- {{optimizer}}_kernel_arg_t opt_karg;
+ rocm::{{optimizer}}_kernel_arg_t opt_karg;
opt_karg.p_momentum = momentum1_dev.data();
opt_karg.eps = eps;
opt_karg.learning_rate = learning_rate;
// weight_decay(_mode) is supplied as args.split_function_args_no_defaults
opt_karg.weight_decay_mode = weight_decay_mode_v;
opt_karg.weight_decay = weight_decay;
- auto batch_mdiv = [](uint32_t d) -> magic_div_u32_t {
+ auto batch_mdiv = [](uint32_t d) -> rocm::magic_div_u32_t {
assert(d >= 1 && d <= INT32_MAX);
uint8_t shift;
for(shift = 0; shift < 32; shift++)
@@ -641,14 +641,14 @@ hip_split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}{{ vd
uint64_t magic = ((one << 32) * ((one << shift) - d)) / d + 1;
assert(magic <= 0xffffffffUL);

- magic_div_u32_t result;
+ rocm::magic_div_u32_t result;
result.magic = magic;
result.shift = shift;
return result;
}(batch);
- split_tbe_backward_hip_kernel_{{kdesc}}<
- {{optimizer}}_optimizer_t<cache_t, emb_t, embedding_dim, weight_decay_mode_v>,
- {{optimizer}}_kernel_arg_t,
+ rocm::split_tbe_backward_hip_kernel_{{kdesc}}<
+ rocm::{{optimizer}}_optimizer_t<cache_t, emb_t, embedding_dim, weight_decay_mode_v>,
+ rocm::{{optimizer}}_kernel_arg_t,
emb_t,
cache_t,
grad_t,
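
Note: the batch_mdiv lambda above precomputes a (magic, shift) pair so the kernel can later divide by the batch size with one multiply-high and a shift instead of a hardware integer divide; it picks the smallest shift with 2^shift >= d. Below is a self-contained host-side sketch of the scheme. The run step is an assumption based on the standard mulhi-add-shift formulation; the committed device implementation lives in split_embeddings_common.h and its body is not shown in this diff.

#include <cassert>
#include <cstdint>

struct magic_div_u32_t {
  uint32_t magic;
  uint8_t shift;
};

// Same construction as the batch_mdiv lambda in the hunk above.
magic_div_u32_t magic_div_u32_gen(uint32_t d) {
  assert(d >= 1 && d <= INT32_MAX);
  uint8_t shift;
  for (shift = 0; shift < 32; shift++)
    if ((1U << shift) >= d)
      break;
  const uint64_t one = 1;
  const uint64_t magic = ((one << 32) * ((one << shift) - d)) / d + 1;
  assert(magic <= 0xffffffffUL);
  return {static_cast<uint32_t>(magic), shift};
}

// Assumed run step: quotient = (mulhi(n, magic) + n) >> shift.
// The 64-bit intermediate keeps this host sketch overflow-safe.
uint32_t magic_div_u32_run(const magic_div_u32_t& mdiv, uint32_t n) {
  const uint64_t hi = (static_cast<uint64_t>(n) * mdiv.magic) >> 32;
  return static_cast<uint32_t>((hi + n) >> mdiv.shift);
}

int main() {
  // Spot-check the identity quo == n / d for a few divisors and inputs.
  for (uint32_t d : {1u, 3u, 7u, 128u, 1000u})
    for (uint32_t n : {0u, 1u, 6u, 999u, 123456789u})
      assert(magic_div_u32_run(magic_div_u32_gen(d), n) == n / d);
  return 0;
}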
File 2 of 5 (path not preserved)
@@ -26,6 +26,10 @@
#include "fbgemm_gpu/split_embeddings_utils.cuh"
#include "fbgemm_gpu/utils/ops_utils.h"

+ {%- if is_rocm %}
+ #include "fbgemm_gpu/rocm/cdna_guard.h"
+ {%- endif %}
+
using Tensor = at::Tensor;
using namespace fbgemm_gpu;

@@ -1194,7 +1198,7 @@ Tensor {{ embedding_cuda_op }}(
not dense and not is_gwd_kernel and not vbe and not ssd and not nobag %}
const bool isSupportedWeightsType = dev_weights.scalar_type() == at::ScalarType::Half
|| dev_weights.scalar_type() == at::ScalarType::Float;
- if(isSupportedWeightsType && !mixed_D)
+ if(isSupportedWeightsType && !mixed_D && rocm::is_supported_cdna())
{
constexpr int segments_per_workgroup = 4;
{%- for kDimSize in [64, 128, 160, 192, 256] %}
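
This hunk is the "call" half of the commit title: the hand-tuned HIP backward kernel is entered only when the weight type is supported, the embedding dims are uniform, and the device reports a supported CDNA arch; otherwise the op falls through to the stock path. A schematic of that dispatch shape, with hypothetical launch functions standing in for the real op (which takes far more parameters):

#include <cstdio>
#include "fbgemm_gpu/rocm/cdna_guard.h"

// Hypothetical stand-ins for the two code paths.
void launch_cdna_tuned_kernel() { std::puts("CDNA-tuned HIP kernel"); }
void launch_generic_kernel()    { std::puts("generic kernel"); }

void backward_dispatch(bool is_supported_weights_type, bool mixed_D) {
  if (is_supported_weights_type && !mixed_D &&
      fbgemm_gpu::rocm::is_supported_cdna()) {
    launch_cdna_tuned_kernel();  // only reached on gfx90a / gfx942
  } else {
    launch_generic_kernel();     // every other device/type combination
  }
}

int main() {
  backward_dispatch(/*is_supported_weights_type=*/true, /*mixed_D=*/false);
  return 0;
}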
File 3 of 5 (path not preserved)
@@ -26,6 +26,7 @@

#include "fbgemm_gpu/rocm/split_embeddings_common.h"

+ namespace fbgemm_gpu::rocm {
template <typename cache_t, typename emb_t, int32_t embedding_dim, int32_t weight_decay_mode>
struct rowwise_adagrad_optimizer_t
{
@@ -457,4 +458,4 @@ L_tail_grad_acc:

store_row_per_warp<emb_t, embedding_dim, emb_t>::run(&emb_data[0], p_emb_table + emb_idx * embedding_dim, lane_id);
}
-
+ } // namespace fbgemm_gpu::rocm
File 4 of 5: fbgemm_gpu/include/fbgemm_gpu/rocm/cdna_guard.h (new file: 51 additions, 0 deletions)
@@ -0,0 +1,51 @@
/*******************************************************************************
* Copyright (c) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
******************************************************************************/
#pragma once

#include <hip/hip_runtime.h>
#include <set>
#include <string>

#define HIP_CHECK(c) \
{ \
if (c != hipSuccess) { \
printf("HIP Error : %s", hipGetErrorString(c)); \
printf(" %s %d\n", __FILE__, __LINE__); \
exit(c); \
} \
}

namespace fbgemm_gpu::rocm {

[[nodiscard]] inline bool is_supported_cdna() {
const std::set<std::string> supported_archs{"gfx942", "gfx90a"};
int device_id = 0;
HIP_CHECK(hipGetDevice(&device_id));
hipDeviceProp_t dev_props;
HIP_CHECK(hipGetDeviceProperties(&dev_props, device_id));
std::string gcn_arch = dev_props.gcnArchName;
gcn_arch = gcn_arch.substr(0, gcn_arch.find(":"));
return supported_archs.contains(gcn_arch);
}

} // namespace fbgemm_gpu::rocm
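
Two details worth noting in the new guard: hipDeviceProp_t::gcnArchName reports the full target ID, e.g. "gfx90a:sramecc+:xnack-", so the substr call strips the feature flags before the set lookup, and std::set::contains requires C++20. A standalone illustration of just the parsing step (the sample arch string is assumed for the demo):

#include <cassert>
#include <set>
#include <string>

int main() {
  const std::set<std::string> supported_archs{"gfx942", "gfx90a"};
  // gcnArchName-style target ID: arch name plus feature flags.
  std::string gcn_arch = "gfx90a:sramecc+:xnack-";
  gcn_arch = gcn_arch.substr(0, gcn_arch.find(':'));  // -> "gfx90a"
  assert(supported_archs.contains(gcn_arch));         // contains() is C++20
  return 0;
}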
File 5 of 5: fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h (5 additions, 1 deletion)
@@ -32,6 +32,7 @@ typedef float floatx2_t __attribute__((ext_vector_type(2)));
#define THREADS_PER_ROW 64
#define BLOCK_SIZE 256

+ namespace fbgemm_gpu::rocm {
template <typename T> union amdgcn_buffer_resource {
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
int32x4_t content;
@@ -389,6 +390,7 @@ __device__ __forceinline__ void generic_dpp_reduction(data_t &result) {
// of trivial operation with an option to use custom operation
template <typename data_t, typename reduce_op_t, int wave_size = 64>
__device__ __forceinline__ void dpp_reduction(data_t &result) {
+ #if defined(__gfx942__) || defined(__gfx90a__)
if constexpr (std::is_same_v<reduce_op_t, reduce_op::sum>) {
DPP_REDUCE_F16_F32(add);
return;
@@ -404,6 +406,7 @@ __device__ __forceinline__ void dpp_reduction(data_t &result) {
} else {
generic_dpp_reduction<data_t, reduce_op_t, wave_size>(result);
}
+ #endif
}

template <typename reduce_op_t, typename data_t, int wave_size>
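
The #if added above is the "compilation" half of the commit title: the DPP_REDUCE_F16_F32 macros expand to cross-lane (DPP) reduction code written for gfx90a/gfx942, so for any other offload arch dpp_reduction now compiles to an empty body instead of breaking the build. That is safe only in combination with the runtime is_supported_cdna() guard, which keeps these kernels from ever being launched elsewhere. A minimal sketch of the same per-arch guard pattern, with a hypothetical device function:

#include <hip/hip_runtime.h>

// __gfx90a__ / __gfx942__ are predefined by the HIP compiler only while
// generating code for that offload target, so each arch gets its own body.
__device__ __forceinline__ float arch_guarded_value(float v) {
#if defined(__gfx942__) || defined(__gfx90a__)
  return v;     // stand-in for the DPP-accelerated fast path
#else
  return 0.0f;  // dead on purpose: the host-side guard never launches here
#endif
}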
@@ -458,4 +461,5 @@ magic_div_u32_run_with_mod(const magic_div_u32_t &mdiv,
const uint32_t d, uint32_t &quo, uint32_t &rem) {
quo = magic_div_u32_run(mdiv, n);
rem = n - quo * d;
- }
+ }
+ } // namespace fbgemm_gpu::rocm
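
The run_with_mod helper completes the picture: the quotient comes from magic_div_u32_run and the remainder from a single multiply-subtract, so device code never issues an integer divide. A host-side analogue, reusing magic_div_u32_t and magic_div_u32_run from the sketch after the first file (the same assumptions apply):

// Mirrors the device helper above; no '/' or '%' anywhere.
inline void magic_div_u32_run_with_mod(const magic_div_u32_t& mdiv,
                                       uint32_t n, uint32_t d,
                                       uint32_t& quo, uint32_t& rem) {
  quo = magic_div_u32_run(mdiv, n);
  rem = n - quo * d;  // exact because quo == n / d
}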
