diff --git a/.github/workflows/ci-gpu.yaml b/.github/workflows/ci-gpu.yaml index 5ac34c1fc..bf3a4ffed 100644 --- a/.github/workflows/ci-gpu.yaml +++ b/.github/workflows/ci-gpu.yaml @@ -97,6 +97,8 @@ jobs: -DLLVM_DISTRIBUTION_COMPONENTS="llvm-headers;llvm-libraries;cmake-exports;FileCheck;count;not;mlir-headers;mlir-libraries;mlir-cmake-exports;mlir-tblgen;mlir-python-sources" \ -DMLIR_ENABLE_BINDINGS_PYTHON=ON \ -DCMAKE_INSTALL_PREFIX=${GITHUB_WORKSPACE}/llvm-mlir/_mlir_install + echo "INFO: working around a missing dependency on stubgen" + ninja MLIRPythonModules.extension._mlir.dso._mlir.type_stubs ninja install-distribution-stripped popd diff --git a/water/lib/Dialect/Wave/IR/WaveOps.cpp b/water/lib/Dialect/Wave/IR/WaveOps.cpp index 79dfd7626..ea04a05bd 100644 --- a/water/lib/Dialect/Wave/IR/WaveOps.cpp +++ b/water/lib/Dialect/Wave/IR/WaveOps.cpp @@ -112,7 +112,7 @@ bool wave::IterateOp::areTypesCompatible(mlir::Type lhs, mlir::Type rhs) { } mlir::OperandRange -wave::IterateOp::getEntrySuccessorOperands(mlir::RegionBranchPoint point) { +wave::IterateOp::getEntrySuccessorOperands(mlir::RegionSuccessor) { return getIterArgs(); } @@ -120,7 +120,7 @@ void wave::IterateOp::getSuccessorRegions( mlir::RegionBranchPoint point, ::llvm::SmallVectorImpl<::mlir::RegionSuccessor> &regions) { // May branch into the region or bypass it regardless of the source. 
- regions.emplace_back(mlir::RegionSuccessor(getResults())); + regions.emplace_back(mlir::RegionSuccessor(getOperation(), getResults())); regions.emplace_back( mlir::RegionSuccessor(&getBody(), getBody().front().getArguments())); } @@ -544,6 +544,6 @@ LogicalResult WriteOp::verify() { //----------------------------------------------------------------------------- mlir::MutableOperandRange -wave::YieldOp::getMutableSuccessorOperands(mlir::RegionBranchPoint) { +wave::YieldOp::getMutableSuccessorOperands(mlir::RegionSuccessor) { return getValuesMutable(); } diff --git a/water/llvm-sha.txt b/water/llvm-sha.txt index f94b55e73..247376ccd 100644 --- a/water/llvm-sha.txt +++ b/water/llvm-sha.txt @@ -1 +1 @@ -ec3cf67434ba361124cfbb548e93589acd0d3cf2 +478e45fb94e541dfd3a53a23bbc8ed98337b8a77 diff --git a/water/requirements-dev.txt b/water/requirements-dev.txt index 6ca5610ee..9ecc45c9a 100644 --- a/water/requirements-dev.txt +++ b/water/requirements-dev.txt @@ -1,5 +1,5 @@ # Development requirements for building Python bindings -nanobind>=2.4, <3.0 +nanobind>=2.9, <3.0 pybind11>=2.10.0, <=2.13.6 numpy lit diff --git a/water/test/Dialect/Wave/lower-wave-to-mlir.mlir b/water/test/Dialect/Wave/lower-wave-to-mlir.mlir index 8af3016ad..e7e04e91f 100644 --- a/water/test/Dialect/Wave/lower-wave-to-mlir.mlir +++ b/water/test/Dialect/Wave/lower-wave-to-mlir.mlir @@ -43,8 +43,7 @@ module attributes {wave.normal_form = #wave.normal_form // CHECK-NOT: wave.mma - // CHECK: amdgpu.mfma %[[LHS]] * %[[RHS]] + %[[ACC]] - // CHECK-SAME: blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32 + // CHECK: amdgpu.mfma 16x16x16 %[[LHS]] * %[[RHS]] + %[[ACC]] // CHECK-SAME: blgp = none // CHECK-SAME: vector<4xf16>, vector<4xf16>, vector<4xf32> %res = wave.mma %lhs, %rhs, %acc {kind = #wave.mma_kind} @@ -93,88 +92,88 @@ module attributes {wave.normal_form = #wave.normal_form} : (vector<4xf16>, vector<4xf16>, vector<4xf32>) -> vector<4xf32> // CHECK-NOT: wave.mma // CHECK: amdgpu.mfma - // 
CHECK-SAME: k = 8 : i32, m = 32 : i32, n = 32 : i32 + // CHECK-SAME: 32x32x8 %1 = wave.mma %lhs_f16, %rhs_f16, %acc_f32_16 {kind = #wave.mma_kind} : (vector<4xf16>, vector<4xf16>, vector<16xf32>) -> vector<16xf32> // CHECK-NOT: wave.mma // CHECK: amdgpu.mfma - // CHECK-SAME: k = 32 : i32, m = 16 : i32, n = 16 : i32 + // CHECK-SAME: 16x16x32 %2 = wave.mma %lhs_f16_w8, %rhs_f16_w8, %acc_f32_4 {kind = #wave.mma_kind} : (vector<8xf16>, vector<8xf16>, vector<4xf32>) -> vector<4xf32> // CHECK-NOT: wave.mma // CHECK: amdgpu.mfma - // CHECK-SAME: k = 16 : i32, m = 32 : i32, n = 32 : i32 + // CHECK-SAME: 32x32x16 %3 = wave.mma %lhs_f16_w8, %rhs_f16_w8, %acc_f32_16 {kind = #wave.mma_kind} : (vector<8xf16>, vector<8xf16>, vector<16xf32>) -> vector<16xf32> // CHECK-NOT: wave.mma // CHECK: amdgpu.mfma - // CHECK-SAME: k = 16 : i32, m = 32 : i32, n = 32 : i32 + // CHECK-SAME: 32x32x16 %4 = wave.mma %lhs_f16_w8, %rhs_f16_w8, %acc_f32_16 {kind = #wave.mma_kind} : (vector<8xf16>, vector<8xf16>, vector<16xf32>) -> vector<16xf32> // CHECK-NOT: wave.mma // CHECK: amdgpu.mfma - // CHECK-SAME: k = 32 : i32, m = 16 : i32, n = 16 : i32 + // CHECK-SAME: 16x16x32 %5 = wave.mma %lhs_f16_w8, %rhs_f16_w8, %acc_f32_4 {kind = #wave.mma_kind} : (vector<8xf16>, vector<8xf16>, vector<4xf32>) -> vector<4xf32> // bf16 kinds // CHECK-NOT: wave.mma // CHECK: amdgpu.mfma - // CHECK-SAME: k = 16 : i32, m = 32 : i32, n = 32 : i32 + // CHECK-SAME: 32x32x16 %6 = wave.mma %lhs_bf16, %rhs_bf16, %acc_f32_16 {kind = #wave.mma_kind} : (vector<8xbf16>, vector<8xbf16>, vector<16xf32>) -> vector<16xf32> // CHECK-NOT: wave.mma // CHECK: amdgpu.mfma - // CHECK-SAME: k = 32 : i32, m = 16 : i32, n = 16 : i32 + // CHECK-SAME: 16x16x32 %7 = wave.mma %lhs_bf16, %rhs_bf16, %acc_f32_4 {kind = #wave.mma_kind} : (vector<8xbf16>, vector<8xbf16>, vector<4xf32>) -> vector<4xf32> // f8 kinds // CHECK-NOT: wave.mma // CHECK: amdgpu.mfma - // CHECK-SAME: k = 32 : i32, m = 16 : i32, n = 16 : i32 + // CHECK-SAME: 16x16x32 %8 = 
wave.mma %lhs_f8, %rhs_f8, %acc_f32_4 {kind = #wave.mma_kind} : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32> // CHECK-NOT: wave.mma // CHECK: amdgpu.mfma - // CHECK-SAME: k = 16 : i32, m = 32 : i32, n = 32 : i32 + // CHECK-SAME: 32x32x16 %9 = wave.mma %lhs_f8, %rhs_f8, %acc_f32_16 {kind = #wave.mma_kind} : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<16xf32>) -> vector<16xf32> // CHECK-NOT: wave.mma // CHECK: amdgpu.mfma - // CHECK-SAME: k = 32 : i32, m = 16 : i32, n = 16 : i32 + // CHECK-SAME: 16x16x32 %10 = wave.mma %lhs_f8, %rhs_f8, %acc_f32_4 {kind = #wave.mma_kind} : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32> // CHECK-NOT: wave.mma // CHECK: amdgpu.mfma - // CHECK-SAME: k = 16 : i32, m = 32 : i32, n = 32 : i32 + // CHECK-SAME: 32x32x16 %11 = wave.mma %lhs_f8, %rhs_f8, %acc_f32_16 {kind = #wave.mma_kind} : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<16xf32>) -> vector<16xf32> // i8 kinds // CHECK-NOT: wave.mma // CHECK: amdgpu.mfma - // CHECK-SAME: k = 16 : i32, m = 16 : i32, n = 16 : i32 + // CHECK-SAME: 16x16x16 %12 = wave.mma %lhs_i8, %rhs_i8, %acc_i32_4 {kind = #wave.mma_kind} : (vector<4xi8>, vector<4xi8>, vector<4xi32>) -> vector<4xi32> // CHECK-NOT: wave.mma // CHECK: amdgpu.mfma - // CHECK-SAME: k = 8 : i32, m = 32 : i32, n = 32 : i32 + // CHECK-SAME: 32x32x8 %13 = wave.mma %lhs_i8, %rhs_i8, %acc_i32_16 {kind = #wave.mma_kind} : (vector<4xi8>, vector<4xi8>, vector<16xi32>) -> vector<16xi32> // CHECK-NOT: wave.mma // CHECK: amdgpu.mfma - // CHECK-SAME: k = 32 : i32, m = 16 : i32, n = 16 : i32 + // CHECK-SAME: 16x16x32 %14 = wave.mma %lhs_i8_w8, %rhs_i8_w8, %acc_i32_4 {kind = #wave.mma_kind} : (vector<8xi8>, vector<8xi8>, vector<4xi32>) -> vector<4xi32> // CHECK-NOT: wave.mma // CHECK: amdgpu.mfma - // CHECK-SAME: k = 16 : i32, m = 32 : i32, n = 32 : i32 + // CHECK-SAME: 32x32x16 %15 = wave.mma %lhs_i8_w8, %rhs_i8_w8, %acc_i32_16 {kind = #wave.mma_kind} : (vector<8xi8>, 
vector<8xi8>, vector<16xi32>) -> vector<16xi32> diff --git a/water/test/Transforms/assert-in-bounds.mlir b/water/test/Transforms/assert-in-bounds.mlir index 6a8dd9540..31595b6f4 100644 --- a/water/test/Transforms/assert-in-bounds.mlir +++ b/water/test/Transforms/assert-in-bounds.mlir @@ -51,8 +51,7 @@ func.func @shape_static_one_index_dynamic(%memref: memref<5x2xf32>, %i: index) - // CHECK: %[[BOUND:.+]] = arith.andi %[[LB]], %[[UB]] // PERDIM: cf.assert %[[BOUND]], "memref access out of bounds along dimension 1" // - // COMPOUND: %[[COMPOUND:.+]] = arith.andi %[[BOUND]], %[[TRUE]] - // COMPOUND: cf.assert %[[COMPOUND]], "memref access out of bounds" + // COMPOUND: cf.assert %[[BOUND]], "memref access out of bounds" // INPLC: memref.load // SPEC-NOT: memref.load %1 = memref.load %memref[%0, %i] : memref<5x2xf32> @@ -77,18 +76,14 @@ func.func @shape_dynamic(%memref: memref) -> f32 { // CHECK: %[[DIM0:.+]] = memref.dim %{{.*}}, %[[ZERO2]] // Note that folding changed index0 < dim0 into dim0 > index0. 
// CHECK: %[[UB0:.+]] = arith.cmpi sgt, %[[DIM0]], %[[INDEX0]] - // PERDIM: %[[BOUND0:.+]] = arith.andi %[[UB0]] - // PERDIM: cf.assert %[[BOUND0]], "memref access out of bounds along dimension 0" - // COMPOUND: %[[PREBOUND0:.+]] = arith.andi %[[UB0]] - // COMPOUND: %[[BOUND0:.+]] = arith.andi %[[PREBOUND0]] + // PERDIM: cf.assert %[[UB0]], "memref access out of bounds along dimension 0" // // CHECK: %[[ONE1:.+]] = arith.constant 1 : index // CHECK: %[[DIM1:.+]] = memref.dim %{{.*}}, %[[ONE1]] // CHECK: %[[UB1:.+]] = arith.cmpi sgt, %[[DIM1]], %[[INDEX1]] - // CHECK: %[[BOUND1:.+]] = arith.andi %[[UB1]] - // PERDIM: cf.assert %[[BOUND1]], "memref access out of bounds along dimension 1" + // PERDIM: cf.assert %[[UB1]], "memref access out of bounds along dimension 1" // - // COMPOUND: %[[COMPOUND:.+]] = arith.andi %[[BOUND0]], %[[BOUND1]] + // COMPOUND: %[[COMPOUND:.+]] = arith.andi %[[UB0]], %[[UB1]] // COMPOUND: cf.assert %[[COMPOUND]], "memref access out of bounds" // // INPLC: memref.load diff --git a/water/test/Transforms/lowered_gemm_pipelined.mlir b/water/test/Transforms/lowered_gemm_pipelined.mlir index 0fed3fb1c..7cecc9522 100644 --- a/water/test/Transforms/lowered_gemm_pipelined.mlir +++ b/water/test/Transforms/lowered_gemm_pipelined.mlir @@ -55,8 +55,8 @@ module attributes {transform.with_named_sequence} { %58 = vector.load %view[%5, %7] : memref<64x36xbf16, #gpu.address_space>, vector<8xbf16> %59 = vector.load %view_4[%8, %6] : memref<64x36xbf16, #gpu.address_space>, vector<8xbf16> %60 = vector.load %view_4[%8, %7] : memref<64x36xbf16, #gpu.address_space>, vector<8xbf16> - %61 = amdgpu.mfma %59 * %57 + %arg4 {blocks = 1 : i32, k = 16 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<8xbf16>, vector<8xbf16>, vector<16xf32> - %62 = amdgpu.mfma %60 * %58 + %61 {blocks = 1 : i32, k = 16 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<8xbf16>, vector<8xbf16>, vector<16xf32> + %61 = amdgpu.mfma 32x32x16 %59 * %57 + %arg4 {blocks = 1 : i32} blgp = 
none : vector<8xbf16>, vector<8xbf16>, vector<16xf32> + %62 = amdgpu.mfma 32x32x16 %60 * %58 + %61 {blocks = 1 : i32} blgp = none : vector<8xbf16>, vector<8xbf16>, vector<16xf32> scf.yield %62, %arg7, %arg8, %55, %56 : vector<16xf32>, vector<8xbf16>, vector<8xbf16>, vector<8xbf16>, vector<8xbf16> } %16 = vector.extract_strided_slice %15#0 {offsets = [0], sizes = [1], strides = [1]} : vector<16xf32> to vector<1xf32>