diff --git a/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp b/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp index 25163fa85c58..dba316994288 100644 --- a/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp +++ b/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp @@ -303,6 +303,10 @@ FusionGroup::getRootParallelLoopToOpMap(Operation *op) const { if (failed(composedMap) || (newMap && composedMap != newMap)) { return failure(); } + if (composedMap.value().getNumResults() == + composedMap.value().getNumOfZeroResults()) { + return failure(); + } newMap = composedMap.value(); } } else { @@ -336,6 +340,8 @@ FusionGroup::getRootParallelLoopToOpMap(Operation *op) const { } } } + + // Fail if there is no mapping or if there are no parallel loops in common. if (!newMap) { return failure(); } diff --git a/compiler/src/iree/compiler/DispatchCreation/test/form_dispatch_regions.mlir b/compiler/src/iree/compiler/DispatchCreation/test/form_dispatch_regions.mlir index c9ac66f7c207..fcffa4520167 100644 --- a/compiler/src/iree/compiler/DispatchCreation/test/form_dispatch_regions.mlir +++ b/compiler/src/iree/compiler/DispatchCreation/test/form_dispatch_regions.mlir @@ -1875,3 +1875,38 @@ util.func public @no_fusion_use_from_above(%arg0 : tensor, // CHECK: tensor.extract %[[DISPATCH0]] // CHECK: flow.return %[[CONSUMER]] // CHECK: util.return %[[DISPATCH1]] + +// ----- + +util.func public @dont_fuse_no_shared_parallel_loops(%arg0: tensor<16x16x24xf32>, %arg1: tensor<64x3x32xf32>) -> (tensor<64x3x32xf32>, tensor<32x64x3xf32>) { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor<f32> + %1 = tensor.empty() : tensor<64x3x32xf32> + %2 = tensor.empty() : tensor<32x64x3xf32> + %3 = linalg.fill ins(%cst : f32) outs(%0 : tensor<f32>) -> tensor<f32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> ()>], iterator_types = ["reduction", "reduction", 
"reduction"]} ins(%arg0 : tensor<16x16x24xf32>) outs(%3 : tensor<f32>) { + ^bb0(%in: f32, %out: f32): + %6 = arith.mulf %in, %in : f32 + %7 = arith.addf %out, %6 : f32 + linalg.yield %7 : f32 + } -> tensor<f32> + %5:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg1, %4 : tensor<64x3x32xf32>, tensor<f32>) outs(%1, %2 : tensor<64x3x32xf32>, tensor<32x64x3xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32, %out_1: f32): + %6 = arith.addf %in, %in_0 : f32 + linalg.yield %6, %6 : f32, f32 + } -> (tensor<64x3x32xf32>, tensor<32x64x3xf32>) + util.return %5#0, %5#1 : tensor<64x3x32xf32>, tensor<32x64x3xf32> +} + +// CHECK-LABEL: util.func public @dont_fuse_no_shared_parallel_loops( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<16x16x24xf32> +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<64x3x32xf32>) +// CHECK: %[[DISPATCH0:.+]] = flow.dispatch.region +// CHECK: %[[REDUCTION:.+]] = linalg.generic +// CHECK-SAME: ins(%[[ARG0]] +// CHECK: flow.return %[[REDUCTION]] +// CHECK: %[[DISPATCH1:.+]]:2 = flow.dispatch.region +// CHECK: %[[GENERIC:.+]]:2 = linalg.generic +// CHECK-SAME: ins(%[[ARG1]], %[[DISPATCH0]] +// CHECK: flow.return %[[GENERIC]]#0, %[[GENERIC]]#1 +// CHECK: util.return %[[DISPATCH1]]#0, %[[DISPATCH1]]#1 diff --git a/tests/external/iree-test-suites/sharktank_models/benchmarks/sdxl/clip_rocm.json b/tests/external/iree-test-suites/sharktank_models/benchmarks/sdxl/clip_rocm.json index d9c94118bcd1..1d3fbbd6a438 100644 --- a/tests/external/iree-test-suites/sharktank_models/benchmarks/sdxl/clip_rocm.json +++ b/tests/external/iree-test-suites/sharktank_models/benchmarks/sdxl/clip_rocm.json @@ -23,9 +23,9 @@ "mi308": 7.45 }, "golden_dispatch": { - "mi250": 792, - "mi300": 792, - "mi308": 792 + "mi250": 794, + "mi300": 794, + "mi308": 794 }, "golden_size": { "mi250": 460000,