diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 2288969ecc95c..8d74b12dbc4ba 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -925,11 +925,15 @@ class OpenMPIRBuilder {
   /// preheader of the loop.
   /// \param LoopType Information about type of loop worksharing.
   /// It corresponds to type of loop workshare OpenMP pragma.
+  /// \param ScheduleType Information about scheduling type.
+  /// \param ChunkSize Value of chunk size for static schedule.
   ///
   /// \returns Point where to insert code after the workshare construct.
   InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
                                          InsertPointTy AllocaIP,
-                                         omp::WorksharingLoopType LoopType);
+                                         omp::WorksharingLoopType LoopType,
+                                         omp::OMPScheduleType ScheduleType,
+                                         Value *ChunkSize);
 
   /// Modifies the canonical loop to be a statically-scheduled workshare loop.
   ///
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 02b333e9ccd56..f9cbc39a24016 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2728,7 +2728,8 @@ getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
 static void createTargetLoopWorkshareCall(
     OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
     BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
-    Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
+    Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn,
+    Value *ThreadChunkSize) {
   Type *TripCountTy = TripCount->getType();
   Module &M = OMPBuilder->M;
   IRBuilder<> &Builder = OMPBuilder->Builder;
@@ -2751,9 +2752,21 @@ static void createTargetLoopWorkshareCall(
   RealArgs.push_back(
       Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
-  RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
-  if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
+  switch (LoopType) {
+  case WorksharingLoopType::DistributeForStaticLoop:
+    RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+    ThreadChunkSize ? RealArgs.push_back(Builder.CreateZExtOrTrunc(
+                          ThreadChunkSize, TripCountTy))
+                    : RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+    break;
+  case WorksharingLoopType::DistributeStaticLoop:
     RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+    break;
+  case WorksharingLoopType::ForStaticLoop:
+    ThreadChunkSize ? RealArgs.push_back(Builder.CreateZExtOrTrunc(
+                          ThreadChunkSize, TripCountTy))
+                    : RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+    break;
   }
 
   Builder.CreateCall(RTLFn, RealArgs);
 
@@ -2764,7 +2777,7 @@ workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
                             CanonicalLoopInfo *CLI, Value *Ident,
                             Function &OutlinedFn, Type *ParallelTaskPtr,
                             const SmallVector<Instruction *, 4> &ToBeDeleted,
-                            WorksharingLoopType LoopType) {
+                            WorksharingLoopType LoopType, Value *ChunkSize) {
   IRBuilder<> &Builder = OMPIRBuilder->Builder;
   BasicBlock *Preheader = CLI->getPreheader();
   Value *TripCount = CLI->getTripCount();
@@ -2811,17 +2824,18 @@ workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
 
   createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
                                 LoopBodyArg, ParallelTaskPtr, TripCount,
-                                OutlinedFn);
+                                OutlinedFn, ChunkSize);
 
   for (auto &ToBeDeletedItem : ToBeDeleted)
     ToBeDeletedItem->eraseFromParent();
   CLI->invalidate();
 }
 
-OpenMPIRBuilder::InsertPointTy
-OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
-                                          InsertPointTy AllocaIP,
-                                          WorksharingLoopType LoopType) {
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
+    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
+    WorksharingLoopType LoopType, OMPScheduleType EffectiveScheduleType,
+    Value *ChunkSize) {
+
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
@@ -2833,6 +2847,16 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
   // Instructions which need to be deleted at the end of code generation
   SmallVector<Instruction *, 4> ToBeDeleted;
 
+  // TODO: Add support for dynamic scheduling
+  switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
+  case OMPScheduleType::BaseStatic:
+  case OMPScheduleType::BaseStaticChunked:
+    break;
+  default:
+    report_fatal_error(
+        "Unknown/unimplemented schedule kind for target workshare loop", false);
+  }
+
   OI.OuterAllocaBB = AllocaIP.getBlock();
 
   // Mark the body loop as region which needs to be extracted
@@ -2906,7 +2930,7 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
   OI.PostOutlineCB = [=, ToBeDeletedVec =
                             std::move(ToBeDeleted)](Function &OutlinedFn) {
     workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
-                                ToBeDeletedVec, LoopType);
+                                ToBeDeletedVec, LoopType, ChunkSize);
   };
   addOutlineInfo(std::move(OI));
   return CLI->getAfterIP();
@@ -2918,11 +2942,12 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop(
     bool HasSimdModifier, bool HasMonotonicModifier,
     bool HasNonmonotonicModifier, bool HasOrderedClause,
     WorksharingLoopType LoopType) {
-  if (Config.isTargetDevice())
-    return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
   OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
       SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
       HasNonmonotonicModifier, HasOrderedClause);
+  if (Config.isTargetDevice())
+    return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType,
+                                    EffectiveScheduleType, ChunkSize);
 
   bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
                    OMPScheduleType::ModifierOrdered;
diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
index 220eb85b3483e..a5f5d07262c8d 100644
--- a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
@@ -25,6 +25,19 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
     }
     llvm.return
   }
+
+  llvm.func @target_wsloop_schedule_static_chunked(%arg0: !llvm.ptr ){
+    %loop_ub = llvm.mlir.constant(9 : i32) : i32
+    %loop_lb = llvm.mlir.constant(0 : i32) : i32
+    %loop_step = llvm.mlir.constant(1 : i32) : i32
+    %chunk = llvm.mlir.constant(2 : i32) : i32
+    omp.wsloop schedule(static = %chunk : i32) for (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) {
+      %gep = llvm.getelementptr %arg0[0, %loop_cnt] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.array<10 x i32>
+      llvm.store %loop_cnt, %gep : i32, !llvm.ptr
+      omp.yield
+    }
+    llvm.return
+  }
 }
 
 // CHECK: define void @[[FUNC0:.*]](ptr %[[ARG0:.*]])
@@ -45,3 +58,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 // CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 10, i32 %[[NUM_THREADS:.*]], i32 0)
 
 // CHECK: define internal void @[[LOOP_EMPTY_BODY_FN]](i32 %[[LOOP_CNT:.*]])
+
+// CHECK: define void @[[FUNC_SCHEDULE_STATIC_WSLOOP:.*]](ptr %[[ARG1:.*]])
+// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), ptr @[[LOOP_BODY_SCHEDULE_STATIC_FN:.*]], ptr %[[SCHEDULE_LOOP_ARGS:.*]], i32 10, i32 %[[NUM_THREADS:.*]], i32 2)
+
+// CHECK: define internal void @[[LOOP_BODY_SCHEDULE_STATIC_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
index bcb7c5ad50a18..ee9ee9a14056d 100644
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -683,38 +683,38 @@ template <typename Ty> class StaticLoopChunker {
                                     Ty NumIters,
                                     bool OneIterationPerThread) {
     Ty KernelIteration = NumBlocks * BlockChunk;
+    Ty BlockIV = BId * BlockChunk;
 
-    // Start index in the chunked space.
-    Ty IV = BId * BlockChunk + TId;
-    ASSERT(IV >= 0, "Bad index");
-
+    ASSERT((BlockIV + TId * ThreadChunk) >= 0, "Bad index");
 
     // Cover the entire iteration space, assumptions in the caller might allow
     // to simplify this loop to a conditional.
     do {
-
-      Ty BlockChunkLeft =
-          BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
-      Ty ThreadChunkLeft =
-          ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
-
-      while (ThreadChunkLeft--) {
-
-        // Given the blocking it's hard to keep track of what to execute.
-        if (IV >= NumIters)
-          return;
-
-        // Execute the loop body.
-        LoopBody(IV, Arg);
-
-        if (OneIterationPerThread)
-          return;
-
-        ++IV;
+      Ty ThreadIV = TId * ThreadChunk;
+      // Cover the block space
+      while (ThreadIV < BlockChunk) {
+        Ty ThreadCnt = 0;
+        // Cover the thread space
+        while ((ThreadCnt < ThreadChunk) &&
+               ((ThreadIV + ThreadCnt) < BlockChunk)) {
+          // Index in the chunked space.
+          Ty IV = BlockIV + ThreadIV + ThreadCnt;
+
+          // Given the blocking it's hard to keep track of what to execute.
+          if (IV >= NumIters)
+            return;
+
+          // Execute the loop body.
+          LoopBody(IV, Arg);
+
+          if (OneIterationPerThread)
+            return;
+          ++ThreadCnt;
+        };
+        ThreadIV += (NumThreads * ThreadChunk);
       }
-      IV += KernelIteration;
-
-    } while (IV < NumIters);
+      BlockIV += KernelIteration;
+    } while (BlockIV < NumIters);
   }
 
 public:
@@ -731,8 +731,8 @@ template <typename Ty> class StaticLoopChunker {
     // from the `omp` getter and not the mapping directly.
     Ty TId = omp_get_thread_num();
 
-    // There are no blocks involved here.
-    Ty BlockChunk = 0;
+    // There is only one block for the whole iteration space.
+    Ty BlockChunk = NumIters;
     Ty NumBlocks = 1;
     Ty BId = 0;
 
diff --git a/openmp/libomptarget/test/offloading/fortran/target_workshare_loop_static_chunk.f90 b/openmp/libomptarget/test/offloading/fortran/target_workshare_loop_static_chunk.f90
new file mode 100644
index 0000000000000..6b539e7a3837b
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/fortran/target_workshare_loop_static_chunk.f90
@@ -0,0 +1,46 @@
+! Offloading test with a target region and chunks
+! REQUIRES: flang
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-generic
+! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+
+program main
+  use omp_lib
+  integer :: A(100)
+!$omp target map(from:A)
+!$omp parallel do schedule(static,2) num_threads(10)
+  do index_ = 1, 100
+    A(index_) = omp_get_team_num() * 1000 + omp_get_thread_num()
+  end do
+!$omp end target
+  write(*,"(A)"), "omp target parallel for thread chunk size 2"
+  call printArray(A)
+
+end program main
+
+subroutine printArray(Array)
+  integer :: Array(*)
+  do i = 1, 100
+    write(*, "(A, I0, A, I0, A)", advance="no") "B",Array(i)/1000,"T",modulo(Array(i),1000)," "
+  end do
+  write(*,'(/)')
+end subroutine printArray
+
+!CHECK: omp target parallel for thread chunk size 2
+
+!CHECK-NEXT: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+