[VPlan] Narrow predicated single-scalar loads with header mask. #196630
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms

Author: Florian Hahn (fhahn)

Changes

Narrow single-scalar loads that are predicated only by the header mask into unpredicated single-scalar loads. The first lane of the header mask must be active, so the load is guaranteed to execute at least once per vector iteration, and all users can be replaced by the single-scalar loaded value. Masked-off lanes are poison, which the loaded value can safely replace.

With the new transform, the dedicated cost handling for predicated single-scalar loads with a header mask is no longer needed and is removed; the equivalent path for stores is retained for now. A before/after sketch of the narrowing is shown after the diff below.

Patch is 23.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/196630.diff

7 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index d04b5edcfc212..8097699ff58ff 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3397,7 +3397,7 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) {
return false;
}
-/// Return true if \p R is a predicated load/store with a loop-invariant address
+/// Return true if \p R is a predicated store with a loop-invariant address
/// only masked by the header mask.
static bool isPredicatedUniformMemOpAfterTailFolding(const VPReplicateRecipe &R,
const SCEV *PtrSCEV,
@@ -3544,20 +3544,15 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo,
UsedByLoadStoreAddress ? UI : nullptr);
- // Check if this is a predicated load/store with a loop-invariant address
- // only masked by the header mask. If so, return the uniform mem op cost.
- if (isPredicatedUniformMemOpAfterTailFolding(*this, PtrSCEV, Ctx)) {
+ // Check if this is a predicated store with a loop-invariant address only
+ // masked by the header mask. If so, return the uniform mem op cost.
+ if (!IsLoad &&
+ isPredicatedUniformMemOpAfterTailFolding(*this, PtrSCEV, Ctx)) {
InstructionCost UniformCost =
ScalarMemOpCost +
Ctx.TTI.getAddressComputationCost(ScalarPtrTy, /*SE=*/nullptr,
/*Ptr=*/nullptr, Ctx.CostKind);
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
- if (IsLoad) {
- return UniformCost +
- Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
- VectorTy, VectorTy, {}, Ctx.CostKind);
- }
-
VPValue *StoredVal = getOperand(0);
if (!StoredVal->isDefinedOutsideLoopRegions())
UniformCost += Ctx.TTI.getIndexedVectorInstrCostFromEnd(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 32d89a34105a4..e898f10cf3b70 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1697,6 +1697,24 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
match(Def, m_ComputeReductionResult(m_VPIRValue(IRV))))
return Def->replaceAllUsesWith(IRV);
+ // Narrow a single-scalar predicated load with a header mask to an
+ // unpredicated load; the header mask only restricts the active lanes, but a
+ // single-scalar load reads from a uniform address and produces the same value
+ // for every lane.
+ if (auto *RepR = dyn_cast<VPReplicateRecipe>(Def)) {
+ if (RepR->isPredicated() && RepR->isSingleScalar() &&
+ RepR->getOpcode() == Instruction::Load &&
+ vputils::isHeaderMask(RepR->getMask(), *Plan)) {
+ auto *NewR = new VPReplicateRecipe(
+ RepR->getUnderlyingInstr(), drop_end(RepR->operands()),
+ /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR, *RepR,
+ RepR->getDebugLoc());
+ NewR->insertBefore(RepR);
+ RepR->replaceAllUsesWith(NewR);
+ return;
+ }
+ }
+
// Some simplifications can only be applied after unrolling. Perform them
// below.
if (!Plan->isUnrolled())
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll
index 3e61a933d0ecb..1573bae53d52c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll
@@ -16,29 +16,19 @@ define i32 @pr70988(ptr %src, i32 %n) {
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = icmp ult i64 1, [[UMAX]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE5:%.*]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_LOAD_CONTINUE5]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT7:%.*]], [[PRED_LOAD_CONTINUE5]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[PRED_LOAD_CONTINUE5]] ]
-; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_LOAD_CONTINUE5]] ]
-; CHECK-NEXT: br i1 [[ACTIVE_LANE_MASK]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK: pred.load.if:
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
-; CHECK: pred.load.continue:
-; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT: br i1 [[ACTIVE_LANE_MASK2]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5]]
-; CHECK: pred.load.if4:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP4]], align 8
; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE5]]
-; CHECK: pred.load.continue5:
-; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF4]] ]
-; CHECK-NEXT: [[TMP15:%.*]] = tail call i32 @llvm.smax.i32(i32 [[TMP8]], i32 [[VEC_PHI]])
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = tail call i32 @llvm.smax.i32(i32 [[TMP12]], i32 [[VEC_PHI]])
; CHECK-NEXT: [[TMP16:%.*]] = tail call i32 @llvm.smax.i32(i32 [[TMP14]], i32 [[VEC_PHI3]])
; CHECK-NEXT: [[TMP17]] = select i1 [[ACTIVE_LANE_MASK]], i32 [[TMP15]], i32 [[VEC_PHI]]
; CHECK-NEXT: [[TMP18]] = select i1 [[ACTIVE_LANE_MASK2]], i32 [[TMP16]], i32 [[VEC_PHI3]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
index 89c5553d3bc03..4a6d5cb3142cb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
@@ -199,22 +199,22 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64
; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ true, [[PRED_STORE_CONTINUE4]] ]
; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[PRED_STORE_CONTINUE4]] ]
+; INTERLEAVE-NEXT: [[TMP0:%.*]] = or disjoint i64 [[INDEX]], 1
+; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr [8 x i8], ptr [[SRC]], i64 [[TMP0]]
+; INTERLEAVE-NEXT: [[TMP2:%.*]] = load double, ptr [[TMP1]], align 8
; INTERLEAVE-NEXT: br i1 [[ACTIVE_LANE_MASK]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; INTERLEAVE: pred.store.if:
-; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr [8 x i8], ptr [[SRC]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: [[TMP1:%.*]] = load double, ptr [[TMP0]], align 8
-; INTERLEAVE-NEXT: [[TMP2:%.*]] = call double @foo(double [[TMP1]], i64 [[INDEX]]) #[[ATTR4:[0-9]+]]
-; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds [8 x i8], ptr [[DST]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: store double [[TMP2]], ptr [[TMP3]], align 8
+; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr [8 x i8], ptr [[SRC]], i64 [[INDEX]]
+; INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[TMP3]], align 8
+; INTERLEAVE-NEXT: [[TMP5:%.*]] = call double @foo(double [[TMP4]], i64 [[INDEX]]) #[[ATTR4:[0-9]+]]
+; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [8 x i8], ptr [[DST]], i64 [[INDEX]]
+; INTERLEAVE-NEXT: store double [[TMP5]], ptr [[TMP6]], align 8
; INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE]]
; INTERLEAVE: pred.store.continue:
; INTERLEAVE-NEXT: br i1 [[ACTIVE_LANE_MASK2]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
; INTERLEAVE: pred.store.if3:
-; INTERLEAVE-NEXT: [[TMP4:%.*]] = or disjoint i64 [[INDEX]], 1
-; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr [8 x i8], ptr [[SRC]], i64 [[TMP4]]
-; INTERLEAVE-NEXT: [[TMP6:%.*]] = load double, ptr [[TMP5]], align 8
-; INTERLEAVE-NEXT: [[TMP7:%.*]] = call double @foo(double [[TMP6]], i64 [[TMP4]]) #[[ATTR4]]
-; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds [8 x i8], ptr [[DST]], i64 [[TMP4]]
+; INTERLEAVE-NEXT: [[TMP7:%.*]] = call double @foo(double [[TMP2]], i64 [[TMP0]]) #[[ATTR4]]
+; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds [8 x i8], ptr [[DST]], i64 [[TMP0]]
; INTERLEAVE-NEXT: store double [[TMP7]], ptr [[TMP8]], align 8
; INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE4]]
; INTERLEAVE: pred.store.continue4:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll
index 0a57bef879728..02669f976858d 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll
@@ -111,20 +111,10 @@ define i32 @FOR_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) {
; VF1IC2-NEXT: [[VEC_IV:%.*]] = add i64 [[TMP0]], 0
; VF1IC2-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]]
; VF1IC2-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV1]], [[TRIP_COUNT_MINUS_1]]
-; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
-; VF1IC2: [[PRED_LOAD_IF]]:
-; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]]
-; VF1IC2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
-; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]]
-; VF1IC2: [[PRED_LOAD_CONTINUE]]:
-; VF1IC2-NEXT: [[TMP5:%.*]] = phi i32 [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ]
-; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF2:.*]], label %[[PRED_LOAD_CONTINUE3:.*]]
-; VF1IC2: [[PRED_LOAD_IF2]]:
+; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[VEC_IV]]
; VF1IC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[VEC_IV1]]
-; VF1IC2-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4
-; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE3]]
-; VF1IC2: [[PRED_LOAD_CONTINUE3]]:
-; VF1IC2-NEXT: [[TMP8]] = phi i32 [ poison, %[[PRED_LOAD_CONTINUE]] ], [ [[TMP32]], %[[PRED_LOAD_IF2]] ]
+; VF1IC2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP6]], align 4
+; VF1IC2-NEXT: [[TMP8]] = load i32, ptr [[TMP31]], align 4
; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; VF1IC2: [[PRED_STORE_IF]]:
; VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]]
@@ -288,20 +278,10 @@ define i32 @FOR_next_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) {
; VF1IC2-NEXT: [[VEC_IV:%.*]] = add i64 [[TMP0]], 0
; VF1IC2-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]]
; VF1IC2-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV1]], [[TRIP_COUNT_MINUS_1]]
-; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
-; VF1IC2: [[PRED_LOAD_IF]]:
-; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]]
-; VF1IC2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
-; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]]
-; VF1IC2: [[PRED_LOAD_CONTINUE]]:
-; VF1IC2-NEXT: [[TMP5:%.*]] = phi i32 [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ]
-; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF2:.*]], label %[[PRED_LOAD_CONTINUE3:.*]]
-; VF1IC2: [[PRED_LOAD_IF2]]:
+; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[VEC_IV]]
; VF1IC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[VEC_IV1]]
-; VF1IC2-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4
-; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE3]]
-; VF1IC2: [[PRED_LOAD_CONTINUE3]]:
-; VF1IC2-NEXT: [[TMP8]] = phi i32 [ poison, %[[PRED_LOAD_CONTINUE]] ], [ [[TMP29]], %[[PRED_LOAD_IF2]] ]
+; VF1IC2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP6]], align 4
+; VF1IC2-NEXT: [[TMP8]] = load i32, ptr [[TMP28]], align 4
; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; VF1IC2: [[PRED_STORE_IF]]:
; VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]]
@@ -473,20 +453,10 @@ define i32 @FOR_and_next_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) {
; VF1IC2-NEXT: [[VEC_IV:%.*]] = add i64 [[TMP0]], 0
; VF1IC2-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]]
; VF1IC2-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV1]], [[TRIP_COUNT_MINUS_1]]
-; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
-; VF1IC2: [[PRED_LOAD_IF]]:
-; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]]
-; VF1IC2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
-; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]]
-; VF1IC2: [[PRED_LOAD_CONTINUE]]:
-; VF1IC2-NEXT: [[TMP5:%.*]] = phi i32 [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ]
-; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF2:.*]], label %[[PRED_LOAD_CONTINUE3:.*]]
-; VF1IC2: [[PRED_LOAD_IF2]]:
+; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[VEC_IV]]
; VF1IC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[VEC_IV1]]
-; VF1IC2-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP34]], align 4
-; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE3]]
-; VF1IC2: [[PRED_LOAD_CONTINUE3]]:
-; VF1IC2-NEXT: [[TMP8]] = phi i32 [ poison, %[[PRED_LOAD_CONTINUE]] ], [ [[TMP35]], %[[PRED_LOAD_IF2]] ]
+; VF1IC2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP6]], align 4
+; VF1IC2-NEXT: [[TMP8]] = load i32, ptr [[TMP34]], align 4
; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; VF1IC2: [[PRED_STORE_IF]]:
; VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]]
diff --git a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
index 106712c3d9a65..ccb671fe15f07 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
@@ -295,31 +295,31 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
; VF1UF4-NEXT: [[TMP5:%.*]] = icmp ule i64 [[TMP1]], 13
; VF1UF4-NEXT: [[TMP6:%.*]] = icmp ule i64 [[TMP2]], 13
; VF1UF4-NEXT: [[TMP7:%.*]] = icmp ule i64 [[TMP3]], 13
-; VF1UF4-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; VF1UF4: pred.store.if:
; VF1UF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; VF1UF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; VF1UF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; VF1UF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
; VF1UF4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF1UF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF1UF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF1UF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP10]], align 8
+; VF1UF4-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; VF1UF4: pred.store.if:
; VF1UF4-NEXT: store i64 [[TMP9]], ptr [[B:%.*]], align 8
; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE]]
; VF1UF4: pred.store.continue:
; VF1UF4-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
; VF1UF4: pred.store.if1:
-; VF1UF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
-; VF1UF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP11]], align 8
; VF1UF4-NEXT: store i64 [[TMP12]], ptr [[B]], align 8
; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE2]]
; VF1UF4: pred.store.continue2:
; VF1UF4-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
; VF1UF4: pred.store.if3:
-; VF1UF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
-; VF1UF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP14]], align 8
; VF1UF4-NEXT: store i64 [[TMP15]], ptr [[B]], align 8
; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE4]]
; VF1UF4: pred.store.continue4:
; VF1UF4-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
; VF1UF4: pred.store.if5:
-; VF1UF4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; VF1UF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP17]], align 8
; VF1UF4-NEXT: store i64 [[TMP18]], ptr [[B]], align 8
; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE6]]
; VF1UF4: pred.store.continue6:
diff --git a/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll b/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll
index 636eb6ddfaece..381818d54fc5c 100644
--- a/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll
@@ -74,26 +74,16 @@ define float @pr72720reduction_using_active_lane_mask(ptr %src) {
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE3:%.*]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[PRED_LOAD_CONTINUE3]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IV:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[VEC_IV1:%.*]] = add i32 [[INDEX]], 1
; CHECK-NEXT: [[TMP0:%.*]] = icmp ule i32 [[VEC_IV]], 14
; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i32 [[VEC_IV1]], 14
-; CHECK-NEXT: br i1 [[TMP0]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK: pred.load.if:
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[SRC]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP3]], align 4
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
-; CHECK: pred.load.continue:
-; CHECK-NEXT: [[TMP5:%.*]] = phi float [ poison, [[VECTOR_BODY]] ], [ [[TMP4]]...
[truncated]
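To make the narrowing concrete, here is a minimal before/after sketch in LLVM IR for one scalarized lane, in the spirit of the test updates above. The names (`%src`, `%btc`, the block labels) are illustrative placeholders, not taken from the patch:

```llvm
; Before: the single-scalar load is guarded by the header mask.
vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %pred.load.continue ]
  %mask = icmp ule i64 %index, %btc       ; header mask (btc = backedge-taken count)
  br i1 %mask, label %pred.load.if, label %pred.load.continue

pred.load.if:
  %gep = getelementptr i32, ptr %src, i64 %index
  %v = load i32, ptr %gep, align 4
  br label %pred.load.continue

pred.load.continue:
  %res = phi i32 [ poison, %vector.body ], [ %v, %pred.load.if ]
  ; ... users of %res, induction update, backedge ...

; After: the first lane of the header mask is always active while the
; vector loop runs, so the load is known to execute at least once per
; vector iteration; it becomes unconditional and replaces the phi
; (the poison of the masked-off case is refined by the loaded value).
vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %mask = icmp ule i64 %index, %btc       ; still used by predicated stores/selects
  %gep = getelementptr i32, ptr %src, i64 %index
  %res = load i32, ptr %gep, align 4
  ; ... users of %res, induction update, backedge ...
```

The store side keeps its predicated form (and its dedicated cost path), matching the pred.store.if blocks that remain in the tests above.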