35 changes: 32 additions & 3 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1701,6 +1701,11 @@ class LoopVectorizationCostModel {
private:
unsigned NumPredStores = 0;

/// Convenience function that returns the value of vscale_range if
/// vscale_range.min == vscale_range.max, and otherwise returns the value
/// returned by the corresponding TTI method.
Optional<unsigned> getVScaleForTuning() const;

/// \return An upper bound for the vectorization factors for both
/// fixed and scalable vectorization, where the minimum-known number of
/// elements is a power-of-2 larger than zero. If scalable vectorization is
@@ -5600,6 +5605,18 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
return MaxVF;
}

Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
auto Min = Attr.getVScaleRangeMin();
auto Max = Attr.getVScaleRangeMax();
if (Max && Min == Max)
return Max;
}

return TTI.getVScaleForTuning();
}
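// For illustration: a function carrying vscale_range(2,2) pins vscale to 2, so
// this returns 2 regardless of the target; with vscale_range(1,16) the bounds
// differ and the result falls back to TTI.getVScaleForTuning() (e.g. the value
// implied by -mcpu/-mtune on AArch64).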

bool LoopVectorizationCostModel::isMoreProfitable(
const VectorizationFactor &A, const VectorizationFactor &B) const {
InstructionCost CostA = A.Cost;
@@ -5624,7 +5641,7 @@ bool LoopVectorizationCostModel::isMoreProfitable(
// Improve estimate for the vector width if it is scalable.
unsigned EstimatedWidthA = A.Width.getKnownMinValue();
unsigned EstimatedWidthB = B.Width.getKnownMinValue();
if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) {
if (Optional<unsigned> VScale = getVScaleForTuning()) {
if (A.Width.isScalable())
EstimatedWidthA *= VScale.getValue();
if (B.Width.isScalable())
@@ -5673,7 +5690,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(

#ifndef NDEBUG
unsigned AssumedMinimumVscale = 1;
if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
if (Optional<unsigned> VScale = getVScaleForTuning())
AssumedMinimumVscale = VScale.getValue();
unsigned Width =
Candidate.Width.isScalable()
@@ -5885,8 +5902,20 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
return Result;
}

// If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
// the main loop handles 8 lanes per iteration. We could still benefit from
// vectorizing the epilogue loop with VF=4.
ElementCount EstimatedRuntimeVF = MainLoopVF;
if (MainLoopVF.isScalable()) {
EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
if (Optional<unsigned> VScale = getVScaleForTuning())
EstimatedRuntimeVF *= VScale.getValue();
}

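// For example, with MainLoopVF = vscale x 2 and a tuning vscale of 8,
// EstimatedRuntimeVF is the fixed count 16. A fixed-width candidate of 8 is
// not known to be less than vscale x 2 (whose known minimum is only 2), but
// it is known to be less than 16, so the extra clause below lets it be
// considered for the epilogue.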
for (auto &NextVF : ProfitableVFs)
if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
(Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
LVP.hasPlanWithVF(NextVF.Width))
Result = NextVF;
193 changes: 186 additions & 7 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
@@ -2,24 +2,22 @@
; REQUIRES: asserts
; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-minimum-VF=0 --debug-only=loop-vectorize -force-target-instruction-cost=1 -S 2>%t | FileCheck %s --check-prefix=CHECK
; RUN: cat %t | FileCheck %s --check-prefix=DEBUG
; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-minimum-VF=8 --debug-only=loop-vectorize -S 2>%t | FileCheck %s --check-prefix=CHECK
; RUN: cat %t | FileCheck %s --check-prefix=DEBUG
; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-force-VF=8 --debug-only=loop-vectorize -S 2>%t | FileCheck %s --check-prefix=CHECK-VF8
; RUN: cat %t | FileCheck %s --check-prefix=DEBUG-FORCED

target triple = "aarch64-linux-gnu"

; DEBUG: LV: Checking a loop in "f1"
; DEBUG: LV: Checking a loop in "main_vf_vscale_x_16"
; DEBUG: Create Skeleton for epilogue vectorized loop (first pass)
; DEBUG: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:vscale x 8, Epilogue Loop UF:1

; DEBUG-FORCED: LV: Checking a loop in "f1"
; DEBUG-FORCED: LV: Checking a loop in "main_vf_vscale_x_16"
; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced.
; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass)
; DEBUG-FORCED: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1

define void @f1(i8* %A) #0 {
; CHECK-LABEL: @f1(
define void @main_vf_vscale_x_16(i8* %A) #0 {
; CHECK-LABEL: @main_vf_vscale_x_16(
; CHECK-NEXT: iter.check:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
@@ -105,7 +103,7 @@ define void @f1(i8* %A) #0 {
; CHECK: exit:
; CHECK-NEXT: ret void
;
; CHECK-VF8-LABEL: @f1(
; CHECK-VF8-LABEL: @main_vf_vscale_x_16(
; CHECK-VF8-NEXT: iter.check:
; CHECK-VF8-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; CHECK-VF8: vector.main.loop.iter.check:
@@ -195,4 +193,185 @@ exit:
ret void
}


; DEBUG: LV: Checking a loop in "main_vf_vscale_x_2"
; DEBUG: Create Skeleton for epilogue vectorized loop (first pass)
; DEBUG: Main Loop VF:vscale x 2, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1

; DEBUG-FORCED: LV: Checking a loop in "main_vf_vscale_x_2"
; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced.
; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass)
; DEBUG-FORCED: Main Loop VF:vscale x 2, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1

; When the vector.body uses VF=vscale x 1 (or VF=vscale x 2, because that's
; the minimum VF supported by SVE), we could still use a wide fixed-width
; VF=8 for the epilogue if the vectors are known to be sufficiently wide.
; This information can be deduced from vscale_range or VScaleForTuning
; (set by -mcpu/-mtune).
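; For this test, vscale_range(8,8) pins vscale to 8, so the main loop's
; VF=vscale x 2 is expected to cover 16 lanes per iteration; the fixed-width
; VF=8 epilogue chosen below therefore still processes fewer lanes than the
; main vector loop.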
define void @main_vf_vscale_x_2(i64* %A) #0 vscale_range(8, 8) {
; CHECK-LABEL: @main_vf_vscale_x_2(
; CHECK-NEXT: iter.check:
; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; CHECK: vector.main.loop.iter.check:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP4]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <vscale x 2 x i64>*
; CHECK-NEXT: store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64>* [[TMP13]], align 1
; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 2
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64* [[TMP16]] to <vscale x 2 x i64>*
; CHECK-NEXT: store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64>* [[TMP17]], align 1
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX2]], 0
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, i64* [[TMP22]], i32 0
; CHECK-NEXT: [[TMP24:%.*]] = bitcast i64* [[TMP23]] to <8 x i64>*
; CHECK-NEXT: store <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64>* [[TMP24]], align 1
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 8
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024
; CHECK-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 1024, 1024
; CHECK-NEXT: br i1 [[CMP_N1]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]]
; CHECK-NEXT: store i64 1, i64* [[ARRAYIDX]], align 1
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 1024
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: exit.loopexit:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
; CHECK-VF8-LABEL: @main_vf_vscale_x_2(
; CHECK-VF8-NEXT: iter.check:
; CHECK-VF8-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; CHECK-VF8: vector.main.loop.iter.check:
; CHECK-VF8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF8-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-VF8-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; CHECK-VF8-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-VF8: vector.ph:
; CHECK-VF8-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF8-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-VF8: vector.body:
; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-VF8-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
; CHECK-VF8-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF8-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
; CHECK-VF8-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
; CHECK-VF8-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
; CHECK-VF8-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
; CHECK-VF8-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP4]]
; CHECK-VF8-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP9]]
; CHECK-VF8-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 0
; CHECK-VF8-NEXT: [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <vscale x 2 x i64>*
; CHECK-VF8-NEXT: store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64>* [[TMP13]], align 1
; CHECK-VF8-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-VF8-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 2
; CHECK-VF8-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 [[TMP15]]
; CHECK-VF8-NEXT: [[TMP17:%.*]] = bitcast i64* [[TMP16]] to <vscale x 2 x i64>*
; CHECK-VF8-NEXT: store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64>* [[TMP17]], align 1
; CHECK-VF8-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF8-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
; CHECK-VF8-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF8-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK-VF8: middle.block:
; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK-VF8: vec.epilog.iter.check:
; CHECK-VF8-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK-VF8: vec.epilog.ph:
; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-VF8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK-VF8: vec.epilog.vector.body:
; CHECK-VF8-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-VF8-NEXT: [[TMP21:%.*]] = add i64 [[INDEX2]], 0
; CHECK-VF8-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP21]]
; CHECK-VF8-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, i64* [[TMP22]], i32 0
; CHECK-VF8-NEXT: [[TMP24:%.*]] = bitcast i64* [[TMP23]] to <8 x i64>*
; CHECK-VF8-NEXT: store <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64>* [[TMP24]], align 1
; CHECK-VF8-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 8
; CHECK-VF8-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024
; CHECK-VF8-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-VF8: vec.epilog.middle.block:
; CHECK-VF8-NEXT: [[CMP_N1:%.*]] = icmp eq i64 1024, 1024
; CHECK-VF8-NEXT: br i1 [[CMP_N1]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK-VF8: vec.epilog.scalar.ph:
; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; CHECK-VF8-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-VF8: for.body:
; CHECK-VF8-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-VF8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]]
; CHECK-VF8-NEXT: store i64 1, i64* [[ARRAYIDX]], align 1
; CHECK-VF8-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-VF8-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 1024
; CHECK-VF8-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-VF8: exit.loopexit:
; CHECK-VF8-NEXT: br label [[EXIT]]
; CHECK-VF8: exit:
; CHECK-VF8-NEXT: ret void
;
entry:
br label %for.body

for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds i64, i64* %A, i64 %iv
store i64 1, i64* %arrayidx, align 1
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp ne i64 %iv.next, 1024
br i1 %exitcond, label %for.body, label %exit

exit:
ret void
}

attributes #0 = { "target-features"="+sve" }