[Codegen][LLVMGPU] Avoid long compilation times of warp reduction pipeline (#19381)

The warp reduction pipeline's tile size logic isn't very robust for dynamic
dimensions. For now, use a fallback in cases where dynamic dimensions exist
so that compilation times stay reasonable.
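
For illustration, here is a minimal standalone sketch of the fallback. It is not
the compiler code itself: kDynamic and fallbackReductionTileSizes below are
made-up stand-ins for ShapedType::kDynamic and for the logic inside
setWarpReductionConfig, which operates on MLIR ops and IREE::GPU::TargetAttr.

// Standalone model of the fallback heuristic (sketch only).
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

constexpr int64_t kDynamic = -1;  // stand-in for ShapedType::kDynamic

// Returns fallback reduction tile sizes when any loop bound is dynamic, or
// std::nullopt when all bounds are static and the regular tile-size
// selection should run.
std::optional<std::vector<int64_t>>
fallbackReductionTileSizes(const std::vector<int64_t> &bounds,
                           unsigned firstReductionDim,
                           int64_t preferredSubgroupSize) {
  // Count every dynamic dimension, parallel or reduction alike. Previously
  // only dynamic reduction dimensions took this path.
  int64_t numDynamicDims = std::count(bounds.begin(), bounds.end(), kDynamic);
  if (numDynamicDims == 0)
    return std::nullopt;

  // Tile the first reduction dimension by the subgroup size; leave the
  // other loops untiled here.
  std::vector<int64_t> reductionTileSizes(bounds.size(), 0);
  reductionTileSizes[firstReductionDim] = preferredSubgroupSize;
  return reductionTileSizes;
}

int main() {
  // Mirrors the new @dynamic_parallel_dims test: tensor<4x?x4096xf16>
  // reduced over its last dimension on a target whose preferred subgroup
  // size is 64.
  auto tiles = fallbackReductionTileSizes({4, kDynamic, 4096},
                                          /*firstReductionDim=*/2,
                                          /*preferredSubgroupSize=*/64);
  for (int64_t size : *tiles)
    std::cout << size << ' ';  // prints: 0 0 64
  std::cout << '\n';
}

With a preferred subgroup size of 64, this lines up with the [0, 0, 64]
reduction tile sizes and [64, 1, 1] workgroup size checked in the new
config_matvec.mlir test below.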

Fixes #19377

---------

Signed-off-by: MaheshRavishankar <[email protected]>
MaheshRavishankar authored Dec 6, 2024
1 parent d88d0a7 commit cb59389
Showing 3 changed files with 36 additions and 4 deletions.
5 changes: 3 additions & 2 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -1577,6 +1577,7 @@ setWarpReductionConfig(IREE::GPU::TargetAttr target,
       return failure();
     }
   }
+  int numDynamicDims = llvm::count_if(bounds, ShapedType::isDynamic);

   // Distribution of multi-dim masked writes currently aren't fully supported.
   if (numDynamicReductionDims > 1) {
@@ -1617,9 +1618,9 @@ setWarpReductionConfig(IREE::GPU::TargetAttr target,
   size_t numLoops = partitionedLoops.empty() ? 0 : partitionedLoops.back() + 1;
   SmallVector<int64_t> workgroupTileSizes(numLoops, 1);

-  // Without any bounds on dynamic reduction dims, we need specialization to
+  // Without any bounds on dynamic dims, we need specialization to
   // get peak performance. For now, just use the warp size.
-  if (numDynamicReductionDims) {
+  if (numDynamicDims > 0) {
     SmallVector<int64_t> reductionTileSizes(op.getNumLoops(), 0);
     int64_t preferredSubgroupSize = target.getPreferredSubgroupSize();
     reductionTileSizes[reductionDims[0]] = preferredSubgroupSize;
31 changes: 31 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir
@@ -273,3 +273,34 @@ func.func @not_vmt() {
 // CHECK-SAME: translation_info = #[[$TRANSLATION]]
 // CHECK: linalg.generic
 // CHECK-SAME: lowering_config = #[[$CONFIG]]
+
+// -----
+
+func.func @dynamic_parallel_dims(%dynsize : index, %input : tensor<4x?x4096xf16>) -> tensor<4x?xf32> {
+  %cst = arith.constant 0.0 : f32
+  %0 = tensor.empty(%dynsize) : tensor<4x?xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4x?xf32>) -> tensor<4x?xf32>
+  %2 = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>],
+      iterator_types = ["parallel", "parallel", "reduction"]}
+      ins(%input : tensor<4x?x4096xf16>) outs(%1 : tensor<4x?xf32>) {
+    ^bb0(%in: f16, %out: f32):
+      %3 = arith.extf %in : f16 to f32
+      %4 = arith.addf %3, %out : f32
+      linalg.yield %4 : f32
+  } -> tensor<4x?xf32>
+  return %2 : tensor<4x?xf32>
+}
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 64]{{\]}}
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [64, 1, 1]>
+// CHECK: func @dynamic_parallel_dims
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
+// CHECK: linalg.generic
+// CHECK-SAME: lowering_config = #[[CONFIG]]
+
+// CDNA3-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 32]{{\]}}
+// CDNA3-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [32, 1, 1]>
+// CDNA3: func @dynamic_parallel_dims
+// CDNA3-SAME: translation_info = #[[TRANSLATION]]
+// CDNA3: linalg.generic
+// CDNA3-SAME: lowering_config = #[[CONFIG]]
@@ -743,8 +743,8 @@ func.func @i4_dequant_matvec() {
   return
 }

-// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 256]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 32>
+// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 32]{{\]}}>
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [32, 1, 1]>
 // CHECK-LABEL: func.func @i4_dequant_matvec()
 // CHECK-SAME: translation_info = #[[$TRANSLATION]]
 // CHECK: linalg.generic
