2 changes: 2 additions & 0 deletions .github/workflows/ci-gpu.yaml
@@ -97,6 +97,8 @@ jobs:
-DLLVM_DISTRIBUTION_COMPONENTS="llvm-headers;llvm-libraries;cmake-exports;FileCheck;count;not;mlir-headers;mlir-libraries;mlir-cmake-exports;mlir-tblgen;mlir-python-sources" \
-DMLIR_ENABLE_BINDINGS_PYTHON=ON \
-DCMAKE_INSTALL_PREFIX=${GITHUB_WORKSPACE}/llvm-mlir/_mlir_install
echo "INFO: working around a missing dependency on stubgen"
ninja MLIRPythonModules.extension._mlir.dso._mlir.type_stubs
ninja install-distribution-stripped
popd

6 changes: 3 additions & 3 deletions water/lib/Dialect/Wave/IR/WaveOps.cpp
@@ -112,15 +112,15 @@ bool wave::IterateOp::areTypesCompatible(mlir::Type lhs, mlir::Type rhs) {
}

mlir::OperandRange
-wave::IterateOp::getEntrySuccessorOperands(mlir::RegionBranchPoint point) {
+wave::IterateOp::getEntrySuccessorOperands(mlir::RegionSuccessor) {
return getIterArgs();
}

void wave::IterateOp::getSuccessorRegions(
mlir::RegionBranchPoint point,
::llvm::SmallVectorImpl<::mlir::RegionSuccessor> &regions) {
// May branch into the region or bypass it regardless of the source.
-regions.emplace_back(mlir::RegionSuccessor(getResults()));
+regions.emplace_back(mlir::RegionSuccessor(getOperation(), getResults()));
regions.emplace_back(
mlir::RegionSuccessor(&getBody(), getBody().front().getArguments()));
}
@@ -544,6 +544,6 @@ LogicalResult WriteOp::verify() {
//-----------------------------------------------------------------------------

mlir::MutableOperandRange
-wave::YieldOp::getMutableSuccessorOperands(mlir::RegionBranchPoint) {
+wave::YieldOp::getMutableSuccessorOperands(mlir::RegionSuccessor) {
return getValuesMutable();
}
2 changes: 1 addition & 1 deletion water/llvm-sha.txt
@@ -1 +1 @@
-ec3cf67434ba361124cfbb548e93589acd0d3cf2
+478e45fb94e541dfd3a53a23bbc8ed98337b8a77
2 changes: 1 addition & 1 deletion water/requirements-dev.txt
@@ -1,5 +1,5 @@
# Development requirements for building Python bindings
-nanobind>=2.4, <3.0
+nanobind>=2.9, <3.0
pybind11>=2.10.0, <=2.13.6
numpy
lit
35 changes: 17 additions & 18 deletions water/test/Dialect/Wave/lower-wave-to-mlir.mlir
@@ -43,8 +43,7 @@ module attributes {wave.normal_form = #wave.normal_form<full_types,memory_only_t
%acc = wave.register %cst_f32 : vector<4xf32>

// CHECK-NOT: wave.mma
-// CHECK: amdgpu.mfma %[[LHS]] * %[[RHS]] + %[[ACC]]
-// CHECK-SAME: blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
+// CHECK: amdgpu.mfma 16x16x16 %[[LHS]] * %[[RHS]] + %[[ACC]]
// CHECK-SAME: blgp = none
// CHECK-SAME: vector<4xf16>, vector<4xf16>, vector<4xf32>
%res = wave.mma %lhs, %rhs, %acc {kind = #wave.mma_kind<f32_16x16x16_f16>}
@@ -93,88 +92,88 @@ module attributes {wave.normal_form = #wave.normal_form<full_types,memory_only_t
// f16 kinds
// CHECK-NOT: wave.mma
// CHECK: amdgpu.mfma
-// CHECK-SAME: k = 16 : i32, m = 16 : i32, n = 16 : i32
+// CHECK-SAME: 16x16x16
%0 = wave.mma %lhs_f16, %rhs_f16, %acc_f32_4 {kind = #wave.mma_kind<f32_16x16x16_f16>}
: (vector<4xf16>, vector<4xf16>, vector<4xf32>) -> vector<4xf32>
// CHECK-NOT: wave.mma
// CHECK: amdgpu.mfma
-// CHECK-SAME: k = 8 : i32, m = 32 : i32, n = 32 : i32
+// CHECK-SAME: 32x32x8
%1 = wave.mma %lhs_f16, %rhs_f16, %acc_f32_16 {kind = #wave.mma_kind<f32_32x32x8_f16>}
: (vector<4xf16>, vector<4xf16>, vector<16xf32>) -> vector<16xf32>
// CHECK-NOT: wave.mma
// CHECK: amdgpu.mfma
-// CHECK-SAME: k = 32 : i32, m = 16 : i32, n = 16 : i32
+// CHECK-SAME: 16x16x32
%2 = wave.mma %lhs_f16_w8, %rhs_f16_w8, %acc_f32_4 {kind = #wave.mma_kind<f32_16x16x32_k8_f16>}
: (vector<8xf16>, vector<8xf16>, vector<4xf32>) -> vector<4xf32>
// CHECK-NOT: wave.mma
// CHECK: amdgpu.mfma
-// CHECK-SAME: k = 16 : i32, m = 32 : i32, n = 32 : i32
+// CHECK-SAME: 32x32x16
%3 = wave.mma %lhs_f16_w8, %rhs_f16_w8, %acc_f32_16 {kind = #wave.mma_kind<f32_32x32x16_k8_f16>}
: (vector<8xf16>, vector<8xf16>, vector<16xf32>) -> vector<16xf32>
// CHECK-NOT: wave.mma
// CHECK: amdgpu.mfma
-// CHECK-SAME: k = 16 : i32, m = 32 : i32, n = 32 : i32
+// CHECK-SAME: 32x32x16
%4 = wave.mma %lhs_f16_w8, %rhs_f16_w8, %acc_f32_16 {kind = #wave.mma_kind<f32_32x32x16_f16>}
: (vector<8xf16>, vector<8xf16>, vector<16xf32>) -> vector<16xf32>
// CHECK-NOT: wave.mma
// CHECK: amdgpu.mfma
-// CHECK-SAME: k = 32 : i32, m = 16 : i32, n = 16 : i32
+// CHECK-SAME: 16x16x32
%5 = wave.mma %lhs_f16_w8, %rhs_f16_w8, %acc_f32_4 {kind = #wave.mma_kind<f32_16x16x32_f16>}
: (vector<8xf16>, vector<8xf16>, vector<4xf32>) -> vector<4xf32>

// bf16 kinds
// CHECK-NOT: wave.mma
// CHECK: amdgpu.mfma
-// CHECK-SAME: k = 16 : i32, m = 32 : i32, n = 32 : i32
+// CHECK-SAME: 32x32x16
%6 = wave.mma %lhs_bf16, %rhs_bf16, %acc_f32_16 {kind = #wave.mma_kind<f32_32x32x16_bf16>}
: (vector<8xbf16>, vector<8xbf16>, vector<16xf32>) -> vector<16xf32>
// CHECK-NOT: wave.mma
// CHECK: amdgpu.mfma
-// CHECK-SAME: k = 32 : i32, m = 16 : i32, n = 16 : i32
+// CHECK-SAME: 16x16x32
%7 = wave.mma %lhs_bf16, %rhs_bf16, %acc_f32_4 {kind = #wave.mma_kind<f32_16x16x32_bf16>}
: (vector<8xbf16>, vector<8xbf16>, vector<4xf32>) -> vector<4xf32>

// f8 kinds
// CHECK-NOT: wave.mma
// CHECK: amdgpu.mfma
-// CHECK-SAME: k = 32 : i32, m = 16 : i32, n = 16 : i32
+// CHECK-SAME: 16x16x32
%8 = wave.mma %lhs_f8, %rhs_f8, %acc_f32_4 {kind = #wave.mma_kind<f32_16x16x32_f8>}
: (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
// CHECK-NOT: wave.mma
// CHECK: amdgpu.mfma
-// CHECK-SAME: k = 16 : i32, m = 32 : i32, n = 32 : i32
+// CHECK-SAME: 32x32x16
%9 = wave.mma %lhs_f8, %rhs_f8, %acc_f32_16 {kind = #wave.mma_kind<f32_32x32x16_f8>}
: (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<16xf32>) -> vector<16xf32>
// CHECK-NOT: wave.mma
// CHECK: amdgpu.mfma
-// CHECK-SAME: k = 32 : i32, m = 16 : i32, n = 16 : i32
+// CHECK-SAME: 16x16x32
%10 = wave.mma %lhs_f8, %rhs_f8, %acc_f32_4 {kind = #wave.mma_kind<f32_16x16x32_k4_f8>}
: (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
// CHECK-NOT: wave.mma
// CHECK: amdgpu.mfma
-// CHECK-SAME: k = 16 : i32, m = 32 : i32, n = 32 : i32
+// CHECK-SAME: 32x32x16
%11 = wave.mma %lhs_f8, %rhs_f8, %acc_f32_16 {kind = #wave.mma_kind<f32_32x32x16_k4_f8>}
: (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<16xf32>) -> vector<16xf32>

// i8 kinds
// CHECK-NOT: wave.mma
// CHECK: amdgpu.mfma
-// CHECK-SAME: k = 16 : i32, m = 16 : i32, n = 16 : i32
+// CHECK-SAME: 16x16x16
%12 = wave.mma %lhs_i8, %rhs_i8, %acc_i32_4 {kind = #wave.mma_kind<i32_16x16x16_i8>}
: (vector<4xi8>, vector<4xi8>, vector<4xi32>) -> vector<4xi32>
// CHECK-NOT: wave.mma
// CHECK: amdgpu.mfma
-// CHECK-SAME: k = 8 : i32, m = 32 : i32, n = 32 : i32
+// CHECK-SAME: 32x32x8
%13 = wave.mma %lhs_i8, %rhs_i8, %acc_i32_16 {kind = #wave.mma_kind<i32_32x32x8_i8>}
: (vector<4xi8>, vector<4xi8>, vector<16xi32>) -> vector<16xi32>
// CHECK-NOT: wave.mma
// CHECK: amdgpu.mfma
-// CHECK-SAME: k = 32 : i32, m = 16 : i32, n = 16 : i32
+// CHECK-SAME: 16x16x32
%14 = wave.mma %lhs_i8_w8, %rhs_i8_w8, %acc_i32_4 {kind = #wave.mma_kind<i32_16x16x32_i8>}
: (vector<8xi8>, vector<8xi8>, vector<4xi32>) -> vector<4xi32>
// CHECK-NOT: wave.mma
// CHECK: amdgpu.mfma
-// CHECK-SAME: k = 16 : i32, m = 32 : i32, n = 32 : i32
+// CHECK-SAME: 32x32x16
%15 = wave.mma %lhs_i8_w8, %rhs_i8_w8, %acc_i32_16 {kind = #wave.mma_kind<i32_32x32x16_i8>}
: (vector<8xi8>, vector<8xi8>, vector<16xi32>) -> vector<16xi32>
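The churn in this test tracks the new `amdgpu.mfma` assembly format that comes with the LLVM bump: the intrinsic shape, previously carried as discrete `m`/`n`/`k` integer attributes, is now spelled as an `MxNxK` prefix on the op. A minimal before/after sketch based on the forms visible in this diff (`%a`, `%b`, and `%acc` are illustrative names, not taken from the test):

```mlir
// Old assembly: shape carried as discrete integer attributes.
%d0 = amdgpu.mfma %a * %b + %acc {blocks = 1 : i32, k = 16 : i32, m = 32 : i32, n = 32 : i32} blgp = none
        : vector<8xbf16>, vector<8xbf16>, vector<16xf32>
// New assembly: the same 32x32x16 shape as a prefix; remaining attributes stay.
%d1 = amdgpu.mfma 32x32x16 %a * %b + %acc {blocks = 1 : i32} blgp = none
        : vector<8xbf16>, vector<8xbf16>, vector<16xf32>
```

Accordingly, the CHECK-SAME lines now match the `16x16x16`/`32x32x8`/`32x32x16` prefix instead of the individual `k = ..., m = ..., n = ...` attribute list.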

13 changes: 4 additions & 9 deletions water/test/Transforms/assert-in-bounds.mlir
@@ -51,8 +51,7 @@ func.func @shape_static_one_index_dynamic(%memref: memref<5x2xf32>, %i: index) -
// CHECK: %[[BOUND:.+]] = arith.andi %[[LB]], %[[UB]]
// PERDIM: cf.assert %[[BOUND]], "memref access out of bounds along dimension 1"
//
-// COMPOUND: %[[COMPOUND:.+]] = arith.andi %[[BOUND]], %[[TRUE]]
-// COMPOUND: cf.assert %[[COMPOUND]], "memref access out of bounds"
+// COMPOUND: cf.assert %[[BOUND]], "memref access out of bounds"
// INPLC: memref.load
// SPEC-NOT: memref.load
%1 = memref.load %memref[%0, %i] : memref<5x2xf32>
@@ -77,18 +76,14 @@ func.func @shape_dynamic(%memref: memref<?x?xf32>) -> f32 {
// CHECK: %[[DIM0:.+]] = memref.dim %{{.*}}, %[[ZERO2]]
// Note that folding changed index0 < dim0 into dim0 > index0.
// CHECK: %[[UB0:.+]] = arith.cmpi sgt, %[[DIM0]], %[[INDEX0]]
-// PERDIM: %[[BOUND0:.+]] = arith.andi %[[UB0]]
-// PERDIM: cf.assert %[[BOUND0]], "memref access out of bounds along dimension 0"
-// COMPOUND: %[[PREBOUND0:.+]] = arith.andi %[[UB0]]
-// COMPOUND: %[[BOUND0:.+]] = arith.andi %[[PREBOUND0]]
+// PERDIM: cf.assert %[[UB0]], "memref access out of bounds along dimension 0"
//
// CHECK: %[[ONE1:.+]] = arith.constant 1 : index
// CHECK: %[[DIM1:.+]] = memref.dim %{{.*}}, %[[ONE1]]
// CHECK: %[[UB1:.+]] = arith.cmpi sgt, %[[DIM1]], %[[INDEX1]]
-// CHECK: %[[BOUND1:.+]] = arith.andi %[[UB1]]
-// PERDIM: cf.assert %[[BOUND1]], "memref access out of bounds along dimension 1"
+// PERDIM: cf.assert %[[UB1]], "memref access out of bounds along dimension 1"
//
-// COMPOUND: %[[COMPOUND:.+]] = arith.andi %[[BOUND0]], %[[BOUND1]]
+// COMPOUND: %[[COMPOUND:.+]] = arith.andi %[[UB0]], %[[UB1]]
// COMPOUND: cf.assert %[[COMPOUND]], "memref access out of bounds"
//
// INPLC: memref.load
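For context on the PERDIM/COMPOUND updates above: after the bump, the redundant `arith.andi` of a condition with a constant `true` that the old patterns matched is folded away, so the asserts consume the per-dimension conditions directly. A hand-written sketch of the guard shape the CHECK lines now expect for a 2-D dynamic access; SSA names here are hypothetical, only the op sequence and message mirror the test:

```mlir
// Upper-bound check per dimension, in the folded sgt form noted above.
%c0   = arith.constant 0 : index
%dim0 = memref.dim %m, %c0 : memref<?x?xf32>
%ub0  = arith.cmpi sgt, %dim0, %i0 : index   // folded form of %i0 < %dim0
%c1   = arith.constant 1 : index
%dim1 = memref.dim %m, %c1 : memref<?x?xf32>
%ub1  = arith.cmpi sgt, %dim1, %i1 : index
// PERDIM asserts each condition separately; COMPOUND emits one combined assert.
%ok = arith.andi %ub0, %ub1 : i1
cf.assert %ok, "memref access out of bounds"
```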
4 changes: 2 additions & 2 deletions water/test/Transforms/lowered_gemm_pipelined.mlir
@@ -55,8 +55,8 @@ module attributes {transform.with_named_sequence} {
%58 = vector.load %view[%5, %7] : memref<64x36xbf16, #gpu.address_space<workgroup>>, vector<8xbf16>
%59 = vector.load %view_4[%8, %6] : memref<64x36xbf16, #gpu.address_space<workgroup>>, vector<8xbf16>
%60 = vector.load %view_4[%8, %7] : memref<64x36xbf16, #gpu.address_space<workgroup>>, vector<8xbf16>
-%61 = amdgpu.mfma %59 * %57 + %arg4 {blocks = 1 : i32, k = 16 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<8xbf16>, vector<8xbf16>, vector<16xf32>
-%62 = amdgpu.mfma %60 * %58 + %61 {blocks = 1 : i32, k = 16 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<8xbf16>, vector<8xbf16>, vector<16xf32>
+%61 = amdgpu.mfma 32x32x16 %59 * %57 + %arg4 {blocks = 1 : i32} blgp = none : vector<8xbf16>, vector<8xbf16>, vector<16xf32>
+%62 = amdgpu.mfma 32x32x16 %60 * %58 + %61 {blocks = 1 : i32} blgp = none : vector<8xbf16>, vector<8xbf16>, vector<16xf32>
scf.yield %62, %arg7, %arg8, %55, %56 : vector<16xf32>, vector<8xbf16>, vector<8xbf16>, vector<8xbf16>, vector<8xbf16>
}
%16 = vector.extract_strided_slice %15#0 {offsets = [0], sizes = [1], strides = [1]} : vector<16xf32> to vector<1xf32>