
Commit b6cc488

q10 authored and facebook-github-bot committed

Patch D66310520 to make it build in OSS (pytorch#3409)

Summary:
Pull Request resolved: pytorch#3409
X-link: facebookresearch/FBGEMM#497

- Patch D66310520 to make the code build in OSS

Reviewed By: sryap

Differential Revision: D66399304

1 parent 9d78337 commit b6cc488

16 files changed: +1673 -12 lines

.github/scripts/fbgemm_gpu_test.bash (-1)

@@ -102,7 +102,6 @@ __configure_fbgemm_gpu_test_cuda () {
 
   ignored_tests=(
   )
-
 }
 
 __configure_fbgemm_gpu_test_rocm () {

fbgemm_gpu/FbgemmGpu.cmake (+20, -1)

@@ -279,18 +279,27 @@ foreach(optimizer ${SSD_OPTIMIZERS})
       "gen_embedding_backward_${optimizer}_ssd_${wdesc}_kernel_cta.cu"
       "gen_embedding_backward_${optimizer}_ssd_${wdesc}_kernel_warp.cu")
   endforeach()
+
   foreach(wdesc weighted unweighted)
     list(APPEND gen_gpu_kernel_source_files
       "gen_embedding_backward_${optimizer}_ssd_${wdesc}_vbe_cuda.cu"
       "gen_embedding_backward_${optimizer}_ssd_${wdesc}_vbe_kernel_cta.cu"
       "gen_embedding_backward_${optimizer}_ssd_${wdesc}_vbe_kernel_warp.cu")
   endforeach()
-
 endforeach()
 
 list(APPEND gen_defused_optim_py_files
   ${CMAKE_BINARY_DIR}/optimizer_args.py)
 
+################################################################################
+# FBGEMM_GPU Generated HIP-Specific Sources
+################################################################################
+
+set(gen_hip_kernel_source_files)
+foreach(wdesc weighted unweighted unweighted_nobag)
+  list(APPEND gen_hip_kernel_source_files
+    "gen_embedding_backward_split_${wdesc}_device_kernel_hip.hip")
+endforeach()
 
 ################################################################################
 # FBGEMM_GPU Static Sources

@@ -426,6 +435,9 @@ set(fbgemm_gpu_sources_gpu_gen
   ${gen_gpu_host_source_files}
   ${gen_defused_optim_source_files})
 
+set(fbgemm_gpu_sources_hip_gen
+  ${gen_hip_kernel_source_files})
+
 if(USE_ROCM)
   prepend_filepaths(
     PREFIX ${CMAKE_BINARY_DIR}

@@ -436,6 +448,11 @@ if(USE_ROCM)
     PREFIX ${CMAKE_BINARY_DIR}
     INPUT ${fbgemm_gpu_sources_gpu_gen}
     OUTPUT fbgemm_gpu_sources_gpu_gen)
+
+  prepend_filepaths(
+    PREFIX ${CMAKE_BINARY_DIR}
+    INPUT ${fbgemm_gpu_sources_hip_gen}
+    OUTPUT fbgemm_gpu_sources_hip_gen)
 endif()

@@ -478,6 +495,8 @@ gpu_cpp_library(
   GPU_SRCS
     ${fbgemm_gpu_sources_gpu_static}
     ${fbgemm_gpu_sources_gpu_gen}
+  HIP_SPECIFIC_SRCS
+    ${fbgemm_gpu_sources_hip_gen}
   GPU_FLAGS
     ${TORCH_CUDA_OPTIONS}
   DEPS
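The new `gen_hip_kernel_source_files` list is fed through `prepend_filepaths`, which, judging from its usage here, rewrites each generated filename to an absolute path under `${CMAKE_BINARY_DIR}`, since codegen emits the `.hip` files into the build tree rather than the source tree. A minimal Python sketch of that list construction and prefixing (the filenames come from the diff; the build path and the helper's internals are assumptions):

```python
import os

# The three generated HIP kernel filenames from the new CMake block above.
wdescs = ["weighted", "unweighted", "unweighted_nobag"]
gen_hip_kernel_source_files = [
    f"gen_embedding_backward_split_{wdesc}_device_kernel_hip.hip" for wdesc in wdescs
]

# Rough analogue of prepend_filepaths(PREFIX ${CMAKE_BINARY_DIR} ...): address
# each generated file relative to the build directory, where codegen writes it.
cmake_binary_dir = "/path/to/build"  # hypothetical; stands in for ${CMAKE_BINARY_DIR}
fbgemm_gpu_sources_hip_gen = [
    os.path.join(cmake_binary_dir, f) for f in gen_hip_kernel_source_files
]

print("\n".join(fbgemm_gpu_sources_hip_gen))
```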

fbgemm_gpu/codegen/genscript/generate_backward_split.py (+22)

@@ -310,6 +310,27 @@ def generate_backward_indices() -> None:
             ssd=ssd,
         )
 
+    @staticmethod
+    def generate_rocm_backward_split(**kwargs: Any) -> None:
+        # Generate backward device kernels based on weighted (True/False), VBE
+        # (True/False), no bag (True/False)
+        template_filepath = (
+            "training/backward/rocm/embedding_backward_split_device_kernel_template.hip"
+        )
+
+        BackwardSplitGenerator.render_backward_templates(
+            template_filepath,
+            "",
+            "{}gen_embedding_backward_{}_device_kernel_hip.hip",
+            {
+                "has_gpu_support": True,
+                "has_vbe_support": False,
+                "has_ssd_support": False,
+                "dense": False,
+                "gen_once": False,
+            },
+        )
+
     @staticmethod
     def generate_python_sources(
         all_optimizers: List[str], ssd_optimizers: List[str]

@@ -369,6 +390,7 @@ def generate() -> None:
         BackwardSplitGenerator.generate_backward_split(
             ssd_tensors=ssd_tensors, **optimizer
         )
+        BackwardSplitGenerator.generate_rocm_backward_split()
 
     # Generate common device kernels for backwards
     BackwardSplitGenerator.generate_backward_device()
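`render_backward_templates` is an existing helper on `BackwardSplitGenerator`; judging from the arguments above, it renders the ROCm Jinja template once per weighted/unweighted/unweighted_nobag variant and names each output via the format string. A hedged sketch of the filename expansion only, assuming the second placeholder receives a `split_{wdesc}` descriptor so the names line up with the FbgemmGpu.cmake list:

```python
# Filename expansion implied by generate_rocm_backward_split() above.
filename_format = "{}gen_embedding_backward_{}_device_kernel_hip.hip"
prefix = ""  # the empty second argument passed in the diff

for wdesc in ["weighted", "unweighted", "unweighted_nobag"]:
    # Assumed descriptor: "split_" + wdesc, matching FbgemmGpu.cmake's
    # gen_embedding_backward_split_${wdesc}_device_kernel_hip.hip entries.
    print(filename_format.format(prefix, f"split_{wdesc}"))
```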

fbgemm_gpu/codegen/training/backward/embedding_backward_dense_host_cpu.cpp (+4, -3)

@@ -171,7 +171,8 @@ Tensor split_embedding_codegen_lookup_dense_function(
         Tensor>& /* vbe_B_offsets_rank_per_feature = std::nullopt */,
     c10::SymInt /* max_B = -1 */,
     c10::SymInt /* max_B_feature_rank = -1 */,
-    c10::SymInt /* vbe_output_size = -1 */) {
+    c10::SymInt /* vbe_output_size = -1 */,
+    bool /* mixed_D = false */) {
   return SplitLookupFunction_Dense_Op::apply(
       host_weights,
       weights_offsets,

@@ -190,15 +191,15 @@ Tensor split_embedding_codegen_lookup_dense_function(
 // Deprecated for fb namespace! Please use fbgemm namespace instead!
 TORCH_LIBRARY_FRAGMENT(fb, m) {
   m.def(
-      "dense_embedding_codegen_lookup_function(Tensor dev_weights, Tensor weights_offsets, Tensor D_offsets, SymInt total_D, SymInt max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, Tensor? feature_requires_grad, int output_dtype=0, Tensor? B_offsets=None, Tensor? vbe_output_offsets_feature_rank=None, Tensor? vbe_B_offsets_rank_per_feature=None, SymInt max_B=-1, SymInt max_B_feature_rank=-1, SymInt vbe_output_size=-1) -> Tensor");
+      "dense_embedding_codegen_lookup_function(Tensor dev_weights, Tensor weights_offsets, Tensor D_offsets, SymInt total_D, SymInt max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, Tensor? feature_requires_grad, int output_dtype=0, Tensor? B_offsets=None, Tensor? vbe_output_offsets_feature_rank=None, Tensor? vbe_B_offsets_rank_per_feature=None, SymInt max_B=-1, SymInt max_B_feature_rank=-1, SymInt vbe_output_size=-1, bool mixed_D=False) -> Tensor");
   DISPATCH_TO_CPU(
       "dense_embedding_codegen_lookup_function",
       split_embedding_codegen_lookup_dense_function);
 }
 
 TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
   m.def(
-      "dense_embedding_codegen_lookup_function(Tensor dev_weights, Tensor weights_offsets, Tensor D_offsets, SymInt total_D, SymInt max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, Tensor? feature_requires_grad, int output_dtype=0, Tensor? B_offsets=None, Tensor? vbe_output_offsets_feature_rank=None, Tensor? vbe_B_offsets_rank_per_feature=None, SymInt max_B=-1, SymInt max_B_feature_rank=-1, SymInt vbe_output_size=-1) -> Tensor");
+      "dense_embedding_codegen_lookup_function(Tensor dev_weights, Tensor weights_offsets, Tensor D_offsets, SymInt total_D, SymInt max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, Tensor? feature_requires_grad, int output_dtype=0, Tensor? B_offsets=None, Tensor? vbe_output_offsets_feature_rank=None, Tensor? vbe_B_offsets_rank_per_feature=None, SymInt max_B=-1, SymInt max_B_feature_rank=-1, SymInt vbe_output_size=-1, bool mixed_D=False) -> Tensor");
   DISPATCH_TO_CPU(
       "dense_embedding_codegen_lookup_function",
       split_embedding_codegen_lookup_dense_function);
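Appending `bool mixed_D=False` at the end of the schema keeps every existing caller valid, since operator schemas allow trailing defaulted arguments. A toy demonstration of the same mechanic using `torch.library` (the `mylib::toy_lookup` operator is hypothetical; only the trailing-default pattern mirrors the diff):

```python
import torch

# Define a toy operator whose schema appends a defaulted bool, mirroring
# "... SymInt vbe_output_size=-1, bool mixed_D=False" in the diff above.
lib = torch.library.Library("mylib", "DEF")  # hypothetical namespace
lib.define("toy_lookup(Tensor x, bool mixed_D=False) -> Tensor")

def toy_lookup_cpu(x: torch.Tensor, mixed_D: bool = False) -> torch.Tensor:
    # In FBGEMM the flag would select a mixed-dimension code path; here it
    # is only used to show the argument arriving through the schema.
    return x * 2 if mixed_D else x

lib.impl("toy_lookup", toy_lookup_cpu, "CPU")

x = torch.ones(3)
print(torch.ops.mylib.toy_lookup(x))                # old call sites still work
print(torch.ops.mylib.toy_lookup(x, mixed_D=True))  # new flag is opt-in
```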

fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp (+12, -3)

@@ -152,6 +152,7 @@ enum SSDTensor {
     {%- else %}
     D_offsets,
     max_D,
+    mixed_D,
     {%- endif %} {# /* if nobag */ #}
     hash_size_cumsum,
     total_hash_size_bits,

@@ -224,6 +225,7 @@ enum SSDTensor {
         Variable(), // D_offsets
         Variable(), // total_D
         Variable(), // max_D
+        Variable(), // mixed_D
         {%- endif %}
         Variable(), // hash_size_cumsum
         Variable(), //total_hash_size_bits

@@ -304,6 +306,7 @@ enum SSDTensor {
     D_offsets,
     total_D,
     max_D,
+    mixed_D,
     {%- endif %}
     hash_size_cumsum,
     total_hash_size_bits,

@@ -484,6 +487,7 @@ Tensor
     {%- else %}
     const Tensor& D_offsets,
     const c10::SymInt max_D,
+    const bool mixed_D,
     {%- endif %}
     const Tensor& hash_size_cumsum,
     const int64_t total_hash_size_bits,

@@ -566,6 +570,7 @@ class {{ autograd_func }} :
       const Tensor& D_offsets,
       const c10::SymInt total_D,
       const c10::SymInt max_D,
+      const bool mixed_D,
       {%- else %}
       const c10::SymInt D,
       {%- endif %}

@@ -762,6 +767,7 @@ class {{ autograd_func }} :
 
     {%- if not nobag %}
     ctx->saved_data["max_D"] = max_D;
+    ctx->saved_data["mixed_D"] = mixed_D;
     ctx->saved_data["pooling_mode"] = pooling_mode;
     {%- else %}
     ctx->saved_data["D"] = D;

@@ -877,6 +883,7 @@ class {{ autograd_func }} :
 
     {%- if not nobag %}
     auto max_D = ctx->saved_data["max_D"].toSymInt();
+    const auto mixed_D = ctx->saved_data["mixed_D"].toBool();
    auto pooling_mode = ctx->saved_data["pooling_mode"].toInt();
     {%- else %}
     auto D = ctx->saved_data["D"].toSymInt();

@@ -1072,10 +1079,11 @@ Tensor {{ bwd_mdesc }}_embedding_codegen_lookup_{{ optimizer }}_function(
     {%- if ssd %}
     const std::optional<at::TensorList>& ssd_tensors = std::nullopt,
     {%- endif %}
-    const double gwd_lower_bound = 0
+    const double gwd_lower_bound = 0,
     {%- else %}
-    const c10::SymInt vbe_output_size = -1
+    const c10::SymInt vbe_output_size = -1,
     {%- endif %}
+    const bool mixed_D = false
 ) {
   // TODO: refactor into macro
   {%- if has_gpu_support %}

@@ -1191,7 +1199,8 @@ TORCH_LIBRARY_FRAGMENT({{ lib_name }}, m) {
           {%- if ssd %}
           " Tensor[]? ssd_tensors=None,"
           {%- endif %}
-          " float gwd_lower_bound=0 "
+          " float gwd_lower_bound=0, "
+          " bool mixed_D=False"
           ") -> Tensor",
           {PT2_COMPLIANT_TAG});
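Across these hunks the template threads `mixed_D` through the autograd round trip: the forward pass stores it via `ctx->saved_data["mixed_D"] = mixed_D`, and the backward pass reads it back with `.toBool()`. A minimal Python analogue of that save/restore pattern (the toy op and its gradient are placeholders, not FBGEMM's kernels):

```python
import torch

class ToyLookup(torch.autograd.Function):
    @staticmethod
    def forward(ctx, weights: torch.Tensor, mixed_D: bool) -> torch.Tensor:
        # Analogue of ctx->saved_data["mixed_D"] = mixed_D in the template:
        # non-tensor arguments ride along on ctx for the backward pass.
        ctx.mixed_D = mixed_D
        return weights * 2

    @staticmethod
    def backward(ctx, grad_out: torch.Tensor):
        # Analogue of ctx->saved_data["mixed_D"].toBool(); in FBGEMM the flag
        # would select the backward kernel variant, here it is just read back.
        _ = ctx.mixed_D
        return grad_out * 2, None  # no gradient for the bool flag

w = torch.ones(4, requires_grad=True)
ToyLookup.apply(w, True).sum().backward()
print(w.grad)
```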
