Skip to content

Commit 03fc921

Browse files
committed
[InstCombine] Minor Tweaks
1 parent 96ed7cb commit 03fc921

File tree

2 files changed

+41
-38
lines changed

2 files changed

+41
-38
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 29 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -804,25 +804,16 @@ bool VectorCombine::foldInsExtBinop(Instruction &I) {
804804

805805
bool VectorCombine::foldBitOpOfBitcasts(Instruction &I) {
806806
// Match: bitop(bitcast(x), bitcast(y)) -> bitcast(bitop(x, y))
807-
auto *BinOp = dyn_cast<BinaryOperator>(&I);
808-
if (!BinOp || !BinOp->isBitwiseLogicOp())
807+
Value *LHSSrc, *RHSSrc;
808+
if (!match(&I, m_BitwiseLogic(m_BitCast(m_Value(LHSSrc)),
809+
m_BitCast(m_Value(RHSSrc)))))
809810
return false;
810811

811-
Value *LHS = BinOp->getOperand(0);
812-
Value *RHS = BinOp->getOperand(1);
813-
814-
// Both operands must be bitcasts
815-
auto *LHSCast = dyn_cast<BitCastInst>(LHS);
816-
auto *RHSCast = dyn_cast<BitCastInst>(RHS);
817-
if (!LHSCast || !RHSCast)
818-
return false;
819-
820-
Value *LHSSrc = LHSCast->getOperand(0);
821-
Value *RHSSrc = RHSCast->getOperand(0);
822-
823812
// Source types must match
824813
if (LHSSrc->getType() != RHSSrc->getType())
825814
return false;
815+
if (!LHSSrc->getType()->getScalarType()->isIntegerTy())
816+
return false;
826817

827818
// Only handle vector types
828819
auto *SrcVecTy = dyn_cast<FixedVectorType>(LHSSrc->getType());
@@ -831,15 +822,30 @@ bool VectorCombine::foldBitOpOfBitcasts(Instruction &I) {
831822
return false;
832823

833824
// Same total bit width
834-
if (SrcVecTy->getPrimitiveSizeInBits() != DstVecTy->getPrimitiveSizeInBits())
835-
return false;
825+
assert(SrcVecTy->getPrimitiveSizeInBits() ==
826+
DstVecTy->getPrimitiveSizeInBits() &&
827+
"Bitcast should preserve total bit width");
828+
829+
// Cost Check :
830+
// OldCost = bitlogic + 2*bitcasts
831+
// NewCost = bitlogic + bitcast
832+
auto *BinOp = cast<BinaryOperator>(&I);
833+
InstructionCost OldCost =
834+
TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstVecTy) +
835+
TTI.getCastInstrCost(Instruction::BitCast, DstVecTy, LHSSrc->getType(),
836+
TTI::CastContextHint::None) +
837+
TTI.getCastInstrCost(Instruction::BitCast, DstVecTy, RHSSrc->getType(),
838+
TTI::CastContextHint::None);
839+
InstructionCost NewCost =
840+
TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcVecTy) +
841+
TTI.getCastInstrCost(Instruction::BitCast, DstVecTy, SrcVecTy,
842+
TTI::CastContextHint::None);
836843

837-
// Cost check: prefer operations on narrower element types
838-
unsigned SrcEltBits = SrcVecTy->getScalarSizeInBits();
839-
unsigned DstEltBits = DstVecTy->getScalarSizeInBits();
844+
LLVM_DEBUG(dbgs() << "Found a bitwise logic op of bitcasted values: " << I
845+
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
846+
<< "\n");
840847

841-
// Prefer smaller element sizes (more elements, finer granularity)
842-
if (SrcEltBits > DstEltBits)
848+
if (NewCost > OldCost)
843849
return false;
844850

845851
// Create the operation on the source type
@@ -848,6 +854,8 @@ bool VectorCombine::foldBitOpOfBitcasts(Instruction &I) {
848854
if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
849855
NewBinOp->copyIRFlags(BinOp);
850856

857+
Worklist.pushValue(NewOp);
858+
851859
// Bitcast the result back
852860
Value *Result = Builder.CreateBitCast(NewOp, I.getType());
853861
replaceValue(I, *Result);

llvm/test/Transforms/VectorCombine/AArch64/shrink-types.ll

Lines changed: 12 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -7,9 +7,8 @@ define i32 @test_and(<16 x i32> %a, ptr %b) {
77
; CHECK-LABEL: @test_and(
88
; CHECK-NEXT: entry:
99
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[B:%.*]], align 1
10-
; CHECK-NEXT: [[TMP0:%.*]] = trunc <16 x i32> [[A:%.*]] to <16 x i8>
11-
; CHECK-NEXT: [[TMP1:%.*]] = and <16 x i8> [[WIDE_LOAD]], [[TMP0]]
12-
; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[TMP1]] to <16 x i32>
10+
; CHECK-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
11+
; CHECK-NEXT: [[TMP2:%.*]] = and <16 x i32> [[TMP0]], [[A:%.*]]
1312
; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP2]])
1413
; CHECK-NEXT: ret i32 [[TMP3]]
1514
;
@@ -26,9 +25,8 @@ define i32 @test_mask_or(<16 x i32> %a, ptr %b) {
2625
; CHECK-NEXT: entry:
2726
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[B:%.*]], align 1
2827
; CHECK-NEXT: [[A_MASKED:%.*]] = and <16 x i32> [[A:%.*]], splat (i32 16)
29-
; CHECK-NEXT: [[TMP0:%.*]] = trunc <16 x i32> [[A_MASKED]] to <16 x i8>
30-
; CHECK-NEXT: [[TMP1:%.*]] = or <16 x i8> [[WIDE_LOAD]], [[TMP0]]
31-
; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[TMP1]] to <16 x i32>
28+
; CHECK-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
29+
; CHECK-NEXT: [[TMP2:%.*]] = or <16 x i32> [[TMP0]], [[A_MASKED]]
3230
; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP2]])
3331
; CHECK-NEXT: ret i32 [[TMP3]]
3432
;
@@ -47,15 +45,13 @@ define i32 @multiuse(<16 x i32> %u, <16 x i32> %v, ptr %b) {
4745
; CHECK-NEXT: [[U_MASKED:%.*]] = and <16 x i32> [[U:%.*]], splat (i32 255)
4846
; CHECK-NEXT: [[V_MASKED:%.*]] = and <16 x i32> [[V:%.*]], splat (i32 255)
4947
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[B:%.*]], align 1
50-
; CHECK-NEXT: [[TMP0:%.*]] = lshr <16 x i8> [[WIDE_LOAD]], splat (i8 4)
51-
; CHECK-NEXT: [[TMP1:%.*]] = trunc <16 x i32> [[V_MASKED]] to <16 x i8>
52-
; CHECK-NEXT: [[TMP2:%.*]] = or <16 x i8> [[TMP0]], [[TMP1]]
53-
; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i32>
54-
; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i8> [[WIDE_LOAD]], splat (i8 15)
55-
; CHECK-NEXT: [[TMP5:%.*]] = trunc <16 x i32> [[U_MASKED]] to <16 x i8>
56-
; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i8> [[TMP4]], [[TMP5]]
48+
; CHECK-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
49+
; CHECK-NEXT: [[TMP6:%.*]] = lshr <16 x i8> [[WIDE_LOAD]], splat (i8 4)
5750
; CHECK-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[TMP6]] to <16 x i32>
58-
; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw <16 x i32> [[TMP3]], [[TMP7]]
51+
; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP7]], [[V_MASKED]]
52+
; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i32> [[TMP0]], splat (i32 15)
53+
; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], [[U_MASKED]]
54+
; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw <16 x i32> [[TMP3]], [[TMP5]]
5955
; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
6056
; CHECK-NEXT: ret i32 [[TMP9]]
6157
;
@@ -81,9 +77,8 @@ define i32 @phi_bug(<16 x i32> %a, ptr %b) {
8177
; CHECK: vector.body:
8278
; CHECK-NEXT: [[A_PHI:%.*]] = phi <16 x i32> [ [[A:%.*]], [[ENTRY:%.*]] ]
8379
; CHECK-NEXT: [[WIDE_LOAD_PHI:%.*]] = phi <16 x i8> [ [[WIDE_LOAD]], [[ENTRY]] ]
84-
; CHECK-NEXT: [[TMP0:%.*]] = trunc <16 x i32> [[A_PHI]] to <16 x i8>
85-
; CHECK-NEXT: [[TMP1:%.*]] = and <16 x i8> [[WIDE_LOAD_PHI]], [[TMP0]]
86-
; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[TMP1]] to <16 x i32>
80+
; CHECK-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD_PHI]] to <16 x i32>
81+
; CHECK-NEXT: [[TMP2:%.*]] = and <16 x i32> [[TMP0]], [[A_PHI]]
8782
; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP2]])
8883
; CHECK-NEXT: ret i32 [[TMP3]]
8984
;

0 commit comments

Comments (0)