diff --git a/fbgemm_gpu/codegen/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/embedding_backward_split_template.cu index a8d9c4b70..a0a8f211b 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_template.cu +++ b/fbgemm_gpu/codegen/embedding_backward_split_template.cu @@ -930,11 +930,11 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_ ->philox_cuda_state(4); } {% endif %} - {% for kMaxVecsPerThread in range(1, max_embedding_dim // 128 + 1) %} + {% for kMaxVecsPerThread in range(1, max_embedding_dim // 256 + 1) %} {% if not nobag %} - if (max_D <= {{ 128 * kMaxVecsPerThread }}) { + if (max_D <= {{ 256 * kMaxVecsPerThread }}) { {% else %} - if (D <= {{ 128 * kMaxVecsPerThread }}) { + if (D <= {{ 256 * kMaxVecsPerThread }}) { {% endif %} // Stay under used_shared_kb of shared memory (V100: 64 KB; A100: 96 KB), BT_block_size must be a power of two. while (BT_block_size * sizeof(at::acc_type<{{ "scalar_t" if dense else "cache_t" }}, true>) * 4 * kWarpSize * {{ kMaxVecsPerThread }} >= used_shared_bytes) { diff --git a/fbgemm_gpu/codegen/embedding_forward_split_template.cu b/fbgemm_gpu/codegen/embedding_forward_split_template.cu index bd2c7e93e..1bc667de4 100644 --- a/fbgemm_gpu/codegen/embedding_forward_split_template.cu +++ b/fbgemm_gpu/codegen/embedding_forward_split_template.cu @@ -395,8 +395,8 @@ Tensor {{ "dense" if dense else "split" }}_embedding{{ "_nobag" if nobag else "" {% endif %} "batched_embedding{{ "_nobag" if nobag else "" }}_forward_kernel_2", [&] { {% if not nobag %} - {% for kMaxVecsPerThread in range(1, max_embedding_dim // 128 + 1) %} - if (max_D <= {{ 128 * kMaxVecsPerThread }}) { + {% for kMaxVecsPerThread in range(1, max_embedding_dim // 256 + 1) %} + if (max_D <= {{ 256 * kMaxVecsPerThread }}) { {% if not dense %} split_embedding_codegen_forward_{{ wdesc }}_kernel<<< {% else %}