llvm · jroelofs · May 28, 2025 · May 28, 2025 · May 28, 2025
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -233,6 +233,7 @@ static bool isUniformShape(Value *V) {
   case Instruction::FAdd:
   case Instruction::FSub:
   case Instruction::FMul: // Scalar multiply.
+  case Instruction::FDiv:
   case Instruction::FNeg:
   case Instruction::Add:
   case Instruction::Mul:
@@ -2167,6 +2168,8 @@ class LowerMatrixIntrinsics {
         return Builder.CreateFAdd(LHS, RHS);
       case Instruction::FMul:
         return Builder.CreateFMul(LHS, RHS);
+      case Instruction::FDiv:
+        return Builder.CreateFDiv(LHS, RHS);
       case Instruction::FSub:
         return Builder.CreateFSub(LHS, RHS);
       default:

diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
+
+define void @fdiv_2x2(ptr %num, ptr %denom, ptr %out) {
+; CHECK-LABEL: @fdiv_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[NUM:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[NUM]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[DENOM:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[DENOM]], i64 2
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <2 x double> [[COL_LOAD]], [[COL_LOAD2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fdiv <2 x double> [[COL_LOAD1]], [[COL_LOAD4]]
+; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP5:%.*]] = getelementptr double, ptr [[OUT]], i64 2
+; CHECK-NEXT:    ret void
+; Input IR: an element-wise fdiv of two flat <4 x double> loads, followed by two 2x2 transposes that cancel each other. The expected output above performs the division as two separate <2 x double> fdivs (one per pair of loaded halves) and contains no transpose calls.
+  %numv = load <4 x double>, ptr %num
+  %denomv = load <4 x double>, ptr %denom
+  %div = fdiv <4 x double> %numv, %denomv
+  %divt  = call <4 x double> @llvm.matrix.transpose(<4 x double> %div, i32 2, i32 2) ; NOTE(review): presumably this transpose pair is the only thing giving %div a 2x2 shape so the pass lowers the fdiv -- confirm
+  %divtt = call <4 x double> @llvm.matrix.transpose(<4 x double> %divt, i32 2, i32 2)
+  store <4 x double> %divtt, ptr %out
+  ret void
+}