From 6d0516ff51bf3b001e2a1a0cec1b94683722aa37 Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Fri, 8 Sep 2023 18:12:03 -0700 Subject: [PATCH 1/2] [OpenMPOpt] Allow indirect calls in AAKernelInfoCallSite The Attributor has gained support for indirect calls but it is opt-in. This patch makes AAKernelInfoCallSite able to handle multiple potential callees. --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 391 ++++++++++-------- .../Transforms/OpenMP/spmdization_indirect.ll | 130 +----- 2 files changed, 220 insertions(+), 301 deletions(-) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 63493eb78c451..44aed26978422 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -3532,6 +3532,10 @@ struct AAKernelInfo : public StateWrapper { using Base = StateWrapper; AAKernelInfo(const IRPosition &IRP, Attributor &A) : Base(IRP) {} + /// The callee value is tracked beyond a simple stripPointerCasts, so we allow + /// unknown callees. + static bool requiresCalleeForCallBase() { return false; } + /// Statistics are tracked as part of manifest for now. void trackStatistics() const override {} @@ -4797,139 +4801,157 @@ struct AAKernelInfoCallSite : AAKernelInfo { // we will handle them explicitly in the switch below. If it is not, we // will use an AAKernelInfo object on the callee to gather information and // merge that into the current state. The latter happens in the updateImpl. - Function *Callee = getAssociatedFunction(); - auto &OMPInfoCache = static_cast(A.getInfoCache()); - const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee); - if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) { - // Unknown caller or declarations are not analyzable, we give up. - if (!Callee || !A.isFunctionIPOAmendable(*Callee)) { - - // Unknown callees might contain parallel regions, except if they have - // an appropriate assumption attached. - if (!AssumptionAA || - !(AssumptionAA->hasAssumption("omp_no_openmp") || - AssumptionAA->hasAssumption("omp_no_parallelism"))) - ReachedUnknownParallelRegions.insert(&CB); - - // If SPMDCompatibilityTracker is not fixed, we need to give up on the - // idea we can run something unknown in SPMD-mode. - if (!SPMDCompatibilityTracker.isAtFixpoint()) { - SPMDCompatibilityTracker.indicatePessimisticFixpoint(); - SPMDCompatibilityTracker.insert(&CB); - } + auto CheckCallee = [&](Function *Callee, unsigned NumCallees) { + auto &OMPInfoCache = static_cast(A.getInfoCache()); + const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee); + if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) { + // Unknown caller or declarations are not analyzable, we give up. + if (!Callee || !A.isFunctionIPOAmendable(*Callee)) { + + // Unknown callees might contain parallel regions, except if they have + // an appropriate assumption attached. + if (!AssumptionAA || + !(AssumptionAA->hasAssumption("omp_no_openmp") || + AssumptionAA->hasAssumption("omp_no_parallelism"))) + ReachedUnknownParallelRegions.insert(&CB); + + // If SPMDCompatibilityTracker is not fixed, we need to give up on the + // idea we can run something unknown in SPMD-mode. + if (!SPMDCompatibilityTracker.isAtFixpoint()) { + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + SPMDCompatibilityTracker.insert(&CB); + } - // We have updated the state for this unknown call properly, there won't - // be any change so we indicate a fixpoint. 
- indicateOptimisticFixpoint(); + // We have updated the state for this unknown call properly, there + // won't be any change so we indicate a fixpoint. + indicateOptimisticFixpoint(); + } + // If the callee is known and can be used in IPO, we will update the + // state based on the callee state in updateImpl. + return; + } + if (NumCallees > 1) { + indicatePessimisticFixpoint(); + return; } - // If the callee is known and can be used in IPO, we will update the state - // based on the callee state in updateImpl. - return; - } - RuntimeFunction RF = It->getSecond(); - switch (RF) { - // All the functions we know are compatible with SPMD mode. - case OMPRTL___kmpc_is_spmd_exec_mode: - case OMPRTL___kmpc_distribute_static_fini: - case OMPRTL___kmpc_for_static_fini: - case OMPRTL___kmpc_global_thread_num: - case OMPRTL___kmpc_get_hardware_num_threads_in_block: - case OMPRTL___kmpc_get_hardware_num_blocks: - case OMPRTL___kmpc_single: - case OMPRTL___kmpc_end_single: - case OMPRTL___kmpc_master: - case OMPRTL___kmpc_end_master: - case OMPRTL___kmpc_barrier: - case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2: - case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2: - case OMPRTL___kmpc_nvptx_end_reduce_nowait: - case OMPRTL___kmpc_error: - case OMPRTL___kmpc_flush: - case OMPRTL___kmpc_get_hardware_thread_id_in_block: - case OMPRTL___kmpc_get_warp_size: - case OMPRTL_omp_get_thread_num: - case OMPRTL_omp_get_num_threads: - case OMPRTL_omp_get_max_threads: - case OMPRTL_omp_in_parallel: - case OMPRTL_omp_get_dynamic: - case OMPRTL_omp_get_cancellation: - case OMPRTL_omp_get_nested: - case OMPRTL_omp_get_schedule: - case OMPRTL_omp_get_thread_limit: - case OMPRTL_omp_get_supported_active_levels: - case OMPRTL_omp_get_max_active_levels: - case OMPRTL_omp_get_level: - case OMPRTL_omp_get_ancestor_thread_num: - case OMPRTL_omp_get_team_size: - case OMPRTL_omp_get_active_level: - case OMPRTL_omp_in_final: - case OMPRTL_omp_get_proc_bind: - case OMPRTL_omp_get_num_places: - case OMPRTL_omp_get_num_procs: - case OMPRTL_omp_get_place_proc_ids: - case OMPRTL_omp_get_place_num: - case OMPRTL_omp_get_partition_num_places: - case OMPRTL_omp_get_partition_place_nums: - case OMPRTL_omp_get_wtime: - break; - case OMPRTL___kmpc_distribute_static_init_4: - case OMPRTL___kmpc_distribute_static_init_4u: - case OMPRTL___kmpc_distribute_static_init_8: - case OMPRTL___kmpc_distribute_static_init_8u: - case OMPRTL___kmpc_for_static_init_4: - case OMPRTL___kmpc_for_static_init_4u: - case OMPRTL___kmpc_for_static_init_8: - case OMPRTL___kmpc_for_static_init_8u: { - // Check the schedule and allow static schedule in SPMD mode. - unsigned ScheduleArgOpNo = 2; - auto *ScheduleTypeCI = - dyn_cast(CB.getArgOperand(ScheduleArgOpNo)); - unsigned ScheduleTypeVal = - ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0; - switch (OMPScheduleType(ScheduleTypeVal)) { - case OMPScheduleType::UnorderedStatic: - case OMPScheduleType::UnorderedStaticChunked: - case OMPScheduleType::OrderedDistribute: - case OMPScheduleType::OrderedDistributeChunked: + RuntimeFunction RF = It->getSecond(); + switch (RF) { + // All the functions we know are compatible with SPMD mode. 
+ case OMPRTL___kmpc_is_spmd_exec_mode: + case OMPRTL___kmpc_distribute_static_fini: + case OMPRTL___kmpc_for_static_fini: + case OMPRTL___kmpc_global_thread_num: + case OMPRTL___kmpc_get_hardware_num_threads_in_block: + case OMPRTL___kmpc_get_hardware_num_blocks: + case OMPRTL___kmpc_single: + case OMPRTL___kmpc_end_single: + case OMPRTL___kmpc_master: + case OMPRTL___kmpc_end_master: + case OMPRTL___kmpc_barrier: + case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2: + case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2: + case OMPRTL___kmpc_nvptx_end_reduce_nowait: + case OMPRTL___kmpc_error: + case OMPRTL___kmpc_flush: + case OMPRTL___kmpc_get_hardware_thread_id_in_block: + case OMPRTL___kmpc_get_warp_size: + case OMPRTL_omp_get_thread_num: + case OMPRTL_omp_get_num_threads: + case OMPRTL_omp_get_max_threads: + case OMPRTL_omp_in_parallel: + case OMPRTL_omp_get_dynamic: + case OMPRTL_omp_get_cancellation: + case OMPRTL_omp_get_nested: + case OMPRTL_omp_get_schedule: + case OMPRTL_omp_get_thread_limit: + case OMPRTL_omp_get_supported_active_levels: + case OMPRTL_omp_get_max_active_levels: + case OMPRTL_omp_get_level: + case OMPRTL_omp_get_ancestor_thread_num: + case OMPRTL_omp_get_team_size: + case OMPRTL_omp_get_active_level: + case OMPRTL_omp_in_final: + case OMPRTL_omp_get_proc_bind: + case OMPRTL_omp_get_num_places: + case OMPRTL_omp_get_num_procs: + case OMPRTL_omp_get_place_proc_ids: + case OMPRTL_omp_get_place_num: + case OMPRTL_omp_get_partition_num_places: + case OMPRTL_omp_get_partition_place_nums: + case OMPRTL_omp_get_wtime: break; + case OMPRTL___kmpc_distribute_static_init_4: + case OMPRTL___kmpc_distribute_static_init_4u: + case OMPRTL___kmpc_distribute_static_init_8: + case OMPRTL___kmpc_distribute_static_init_8u: + case OMPRTL___kmpc_for_static_init_4: + case OMPRTL___kmpc_for_static_init_4u: + case OMPRTL___kmpc_for_static_init_8: + case OMPRTL___kmpc_for_static_init_8u: { + // Check the schedule and allow static schedule in SPMD mode. + unsigned ScheduleArgOpNo = 2; + auto *ScheduleTypeCI = + dyn_cast(CB.getArgOperand(ScheduleArgOpNo)); + unsigned ScheduleTypeVal = + ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0; + switch (OMPScheduleType(ScheduleTypeVal)) { + case OMPScheduleType::UnorderedStatic: + case OMPScheduleType::UnorderedStaticChunked: + case OMPScheduleType::OrderedDistribute: + case OMPScheduleType::OrderedDistributeChunked: + break; + default: + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + SPMDCompatibilityTracker.insert(&CB); + break; + }; + } break; + case OMPRTL___kmpc_target_init: + KernelInitCB = &CB; + break; + case OMPRTL___kmpc_target_deinit: + KernelDeinitCB = &CB; + break; + case OMPRTL___kmpc_parallel_51: + if (!handleParallel51(A, CB)) + indicatePessimisticFixpoint(); + return; + case OMPRTL___kmpc_omp_task: + // We do not look into tasks right now, just give up. + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + SPMDCompatibilityTracker.insert(&CB); + ReachedUnknownParallelRegions.insert(&CB); + break; + case OMPRTL___kmpc_alloc_shared: + case OMPRTL___kmpc_free_shared: + // Return without setting a fixpoint, to be resolved in updateImpl. + return; default: + // Unknown OpenMP runtime calls cannot be executed in SPMD-mode, + // generally. However, they do not hide parallel regions. 
SPMDCompatibilityTracker.indicatePessimisticFixpoint(); SPMDCompatibilityTracker.insert(&CB); break; - }; - } break; - case OMPRTL___kmpc_target_init: - KernelInitCB = &CB; - break; - case OMPRTL___kmpc_target_deinit: - KernelDeinitCB = &CB; - break; - case OMPRTL___kmpc_parallel_51: - if (!handleParallel51(A, CB)) - indicatePessimisticFixpoint(); - return; - case OMPRTL___kmpc_omp_task: - // We do not look into tasks right now, just give up. - SPMDCompatibilityTracker.indicatePessimisticFixpoint(); - SPMDCompatibilityTracker.insert(&CB); - ReachedUnknownParallelRegions.insert(&CB); - break; - case OMPRTL___kmpc_alloc_shared: - case OMPRTL___kmpc_free_shared: - // Return without setting a fixpoint, to be resolved in updateImpl. + } + // All other OpenMP runtime calls will not reach parallel regions so they + // can be safely ignored for now. Since it is a known OpenMP runtime call + // we have now modeled all effects and there is no need for any update. + indicateOptimisticFixpoint(); + }; + + const auto *AACE = + A.getAAFor(*this, getIRPosition(), DepClassTy::OPTIONAL); + if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) { + CheckCallee(getAssociatedFunction(), 1); return; - default: - // Unknown OpenMP runtime calls cannot be executed in SPMD-mode, - // generally. However, they do not hide parallel regions. - SPMDCompatibilityTracker.indicatePessimisticFixpoint(); - SPMDCompatibilityTracker.insert(&CB); - break; } - // All other OpenMP runtime calls will not reach parallel regions so they - // can be safely ignored for now. Since it is a known OpenMP runtime call we - // have now modeled all effects and there is no need for any update. - indicateOptimisticFixpoint(); + const auto &OptimisticEdges = AACE->getOptimisticEdges(); + for (auto *Callee : OptimisticEdges) { + CheckCallee(Callee, OptimisticEdges.size()); + if (isAtFixpoint()) + break; + } } ChangeStatus updateImpl(Attributor &A) override { @@ -4937,64 +4959,83 @@ struct AAKernelInfoCallSite : AAKernelInfo { // call site specific liveness information and then it makes // sense to specialize attributes for call sites arguments instead of // redirecting requests to the callee argument. - Function *F = getAssociatedFunction(); - auto &OMPInfoCache = static_cast(A.getInfoCache()); - const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(F); - - // If F is not a runtime function, propagate the AAKernelInfo of the callee. - if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) { - const IRPosition &FnPos = IRPosition::function(*F); - auto *FnAA = A.getAAFor(*this, FnPos, DepClassTy::REQUIRED); - if (!FnAA) - return indicatePessimisticFixpoint(); - if (getState() == FnAA->getState()) - return ChangeStatus::UNCHANGED; - getState() = FnAA->getState(); - return ChangeStatus::CHANGED; - } - KernelInfoState StateBefore = getState(); - CallBase &CB = cast(getAssociatedValue()); - if (It->getSecond() == OMPRTL___kmpc_parallel_51) { - if (!handleParallel51(A, CB)) - return indicatePessimisticFixpoint(); - return StateBefore == getState() ? ChangeStatus::UNCHANGED - : ChangeStatus::CHANGED; - } - - // F is a runtime function that allocates or frees memory, check - // AAHeapToStack and AAHeapToShared. 
- assert((It->getSecond() == OMPRTL___kmpc_alloc_shared || - It->getSecond() == OMPRTL___kmpc_free_shared) && - "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call"); + auto CheckCallee = [&](Function *F, int NumCallees) { + const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(F); + + // If F is not a runtime function, propagate the AAKernelInfo of the + // callee. + if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) { + const IRPosition &FnPos = IRPosition::function(*F); + auto *FnAA = + A.getAAFor(*this, FnPos, DepClassTy::REQUIRED); + if (!FnAA) + return indicatePessimisticFixpoint(); + if (getState() == FnAA->getState()) + return ChangeStatus::UNCHANGED; + getState() = FnAA->getState(); + return ChangeStatus::CHANGED; + } + if (NumCallees > 1) + return indicatePessimisticFixpoint(); - auto *HeapToStackAA = A.getAAFor( - *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL); - auto *HeapToSharedAA = A.getAAFor( - *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL); - - RuntimeFunction RF = It->getSecond(); + CallBase &CB = cast(getAssociatedValue()); + if (It->getSecond() == OMPRTL___kmpc_parallel_51) { + if (!handleParallel51(A, CB)) + return indicatePessimisticFixpoint(); + return StateBefore == getState() ? ChangeStatus::UNCHANGED + : ChangeStatus::CHANGED; + } - switch (RF) { - // If neither HeapToStack nor HeapToShared assume the call is removed, - // assume SPMD incompatibility. - case OMPRTL___kmpc_alloc_shared: - if ((!HeapToStackAA || !HeapToStackAA->isAssumedHeapToStack(CB)) && - (!HeapToSharedAA || !HeapToSharedAA->isAssumedHeapToShared(CB))) - SPMDCompatibilityTracker.insert(&CB); - break; - case OMPRTL___kmpc_free_shared: - if ((!HeapToStackAA || - !HeapToStackAA->isAssumedHeapToStackRemovedFree(CB)) && - (!HeapToSharedAA || - !HeapToSharedAA->isAssumedHeapToSharedRemovedFree(CB))) + // F is a runtime function that allocates or frees memory, check + // AAHeapToStack and AAHeapToShared. + assert( + (It->getSecond() == OMPRTL___kmpc_alloc_shared || + It->getSecond() == OMPRTL___kmpc_free_shared) && + "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call"); + + auto *HeapToStackAA = A.getAAFor( + *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL); + auto *HeapToSharedAA = A.getAAFor( + *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL); + + RuntimeFunction RF = It->getSecond(); + + switch (RF) { + // If neither HeapToStack nor HeapToShared assume the call is removed, + // assume SPMD incompatibility. 
+ case OMPRTL___kmpc_alloc_shared: + if ((!HeapToStackAA || !HeapToStackAA->isAssumedHeapToStack(CB)) && + (!HeapToSharedAA || !HeapToSharedAA->isAssumedHeapToShared(CB))) + SPMDCompatibilityTracker.insert(&CB); + break; + case OMPRTL___kmpc_free_shared: + if ((!HeapToStackAA || + !HeapToStackAA->isAssumedHeapToStackRemovedFree(CB)) && + (!HeapToSharedAA || + !HeapToSharedAA->isAssumedHeapToSharedRemovedFree(CB))) + SPMDCompatibilityTracker.insert(&CB); + break; + default: + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); SPMDCompatibilityTracker.insert(&CB); - break; - default: - SPMDCompatibilityTracker.indicatePessimisticFixpoint(); - SPMDCompatibilityTracker.insert(&CB); + } + return ChangeStatus::CHANGED; + }; + + const auto *AACE = + A.getAAFor(*this, getIRPosition(), DepClassTy::OPTIONAL); + if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) { + CheckCallee(getAssociatedFunction(), 1); + } else { + const auto &OptimisticEdges = AACE->getOptimisticEdges(); + for (auto *Callee : OptimisticEdges) { + CheckCallee(Callee, OptimisticEdges.size()); + if (isAtFixpoint()) + break; + } } return StateBefore == getState() ? ChangeStatus::UNCHANGED diff --git a/llvm/test/Transforms/OpenMP/spmdization_indirect.ll b/llvm/test/Transforms/OpenMP/spmdization_indirect.ll index 4ca646470eabe..04b0e50d4bce4 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_indirect.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_indirect.ll @@ -16,15 +16,15 @@ ;. ; AMDGPU: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" ; AMDGPU: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8 -; AMDGPU: @[[SPMD_CALLEES_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1 }, ptr @[[GLOB1]], ptr null } -; AMDGPU: @[[SPMD_CALLEES_METADATA_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1 }, ptr @[[GLOB1]], ptr null } +; AMDGPU: @[[SPMD_CALLEES_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null } +; AMDGPU: @[[SPMD_CALLEES_METADATA_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 3 }, ptr @[[GLOB1]], ptr null } ; AMDGPU: @[[SPMD_AND_NON_SPMD_CALLEES_METADATA_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1 }, ptr @[[GLOB1]], ptr null } ; AMDGPU: @[[SPMD_AND_NON_SPMD_CALLEE_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1 }, ptr @[[GLOB1]], ptr null } ;. 
; NVPTX: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" ; NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8 -; NVPTX: @[[SPMD_CALLEES_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1 }, ptr @[[GLOB1]], ptr null } -; NVPTX: @[[SPMD_CALLEES_METADATA_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1 }, ptr @[[GLOB1]], ptr null } +; NVPTX: @[[SPMD_CALLEES_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null } +; NVPTX: @[[SPMD_CALLEES_METADATA_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 3 }, ptr @[[GLOB1]], ptr null } ; NVPTX: @[[SPMD_AND_NON_SPMD_CALLEES_METADATA_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1 }, ptr @[[GLOB1]], ptr null } ; NVPTX: @[[SPMD_AND_NON_SPMD_CALLEE_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1 }, ptr @[[GLOB1]], ptr null } ;. @@ -47,40 +47,9 @@ define internal void @spmd_callees__debug(i1 %c) { ; AMDGPU-LABEL: define {{[^@]+}}@spmd_callees__debug ; AMDGPU-SAME: (i1 [[C:%.*]]) #[[ATTR1:[0-9]+]] { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_callees_kernel_environment) -; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU: is_worker_check: -; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr -; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast ptr [[WORKER_WORK_FN]] to ptr -; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU: 
worker_state_machine.finished: -; AMDGPU-NEXT: ret void -; AMDGPU: worker_state_machine.is_active.check: -; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU: worker_state_machine.parallel_region.end: -; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU: thread.user_code.check: ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] ; AMDGPU: common.ret: @@ -109,39 +78,9 @@ define internal void @spmd_callees__debug(i1 %c) { ; NVPTX-LABEL: define {{[^@]+}}@spmd_callees__debug ; NVPTX-SAME: (i1 [[C:%.*]]) #[[ATTR1:[0-9]+]] { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_callees_kernel_environment) -; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX: is_worker_check: -; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX: worker_state_machine.begin: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) -; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast ptr [[WORKER_WORK_FN]] to ptr -; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX: worker_state_machine.finished: -; NVPTX-NEXT: ret void -; NVPTX: worker_state_machine.is_active.check: -; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX: worker_state_machine.parallel_region.end: -; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX: worker_state_machine.done.barrier: -; NVPTX-NEXT: call void 
@__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX: thread.user_code.check: ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] ; NVPTX: common.ret: @@ -721,40 +660,9 @@ define weak void @spmd_callees_metadata(ptr %fp) #0 { ; AMDGPU-LABEL: define {{[^@]+}}@spmd_callees_metadata ; AMDGPU-SAME: (ptr [[FP:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_callees_metadata_kernel_environment) -; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU: is_worker_check: -; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr -; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast ptr [[WORKER_WORK_FN]] to ptr -; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU: worker_state_machine.finished: -; AMDGPU-NEXT: ret void -; AMDGPU: worker_state_machine.is_active.check: -; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU: worker_state_machine.parallel_region.end: -; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU: thread.user_code.check: ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] ; AMDGPU: common.ret: @@ -770,39 +678,9 @@ define weak void @spmd_callees_metadata(ptr %fp) #0 { ; NVPTX-LABEL: define {{[^@]+}}@spmd_callees_metadata ; NVPTX-SAME: (ptr [[FP:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: 
[[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_callees_metadata_kernel_environment) -; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX: is_worker_check: -; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX: worker_state_machine.begin: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) -; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast ptr [[WORKER_WORK_FN]] to ptr -; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX: worker_state_machine.finished: -; NVPTX-NEXT: ret void -; NVPTX: worker_state_machine.is_active.check: -; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX: worker_state_machine.parallel_region.end: -; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX: worker_state_machine.done.barrier: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX: thread.user_code.check: ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] ; NVPTX: common.ret: From 6295a3025568a92d5dea544b46eb15103697a18c Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Sun, 10 Sep 2023 19:01:23 -0700 Subject: [PATCH 2/2] Update llvm/lib/Transforms/IPO/OpenMPOpt.cpp Co-authored-by: Shilei Tian --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 44aed26978422..a18730ab35621 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -5028,7 +5028,7 @@ struct AAKernelInfoCallSite : AAKernelInfo { const auto *AACE = A.getAAFor(*this, getIRPosition(), DepClassTy::OPTIONAL); if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) { - CheckCallee(getAssociatedFunction(), 1); + CheckCallee(getAssociatedFunction(), /*NumCallees=*/1); } else { const auto &OptimisticEdges = AACE->getOptimisticEdges(); for (auto 
*Callee : OptimisticEdges) {
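
The same dispatch pattern appears in both initialize() and updateImpl() above: the per-callee logic is hoisted into a CheckCallee lambda, the AACallEdges abstract attribute is queried for the potential callees of the call site, the code falls back to the single statically associated function when the edge information is invalid or contains unknown callees, and otherwise every optimistic edge is checked until the state reaches a fixpoint. The standalone sketch below only models that control flow with hypothetical stand-in types (CallEdgesInfo, KernelState, forEachPotentialCallee); it is not the LLVM Attributor API, just an illustration of the dispatch the patch introduces.

    // Hypothetical stand-in types; the real pass uses AACallEdges,
    // AAKernelInfoCallSite, and the Attributor's fixpoint machinery.
    #include <functional>
    #include <vector>

    struct Function { const char *Name; };

    struct CallEdgesInfo {
      bool Valid = false;               // stands in for isValidState()
      bool HasUnknownCallee = true;     // stands in for hasUnknownCallee()
      std::vector<Function *> Edges;    // stands in for getOptimisticEdges()
    };

    struct KernelState {
      bool AtFixpoint = false;
      void indicatePessimisticFixpoint() { AtFixpoint = true; }
    };

    // Run CheckCallee either on the single known callee or on every potential
    // callee reported for an indirect call, stopping early once the state has
    // reached a fixpoint.
    void forEachPotentialCallee(
        const CallEdgesInfo *AACE, Function *AssociatedFn, KernelState &State,
        const std::function<void(Function *, unsigned)> &CheckCallee) {
      if (!AACE || !AACE->Valid || AACE->HasUnknownCallee) {
        // No usable edge information: behave as before and look only at the
        // statically associated function (may be null for an indirect call).
        CheckCallee(AssociatedFn, /*NumCallees=*/1);
        return;
      }
      for (Function *Callee : AACE->Edges) {
        CheckCallee(Callee, AACE->Edges.size());
        if (State.AtFixpoint)
          break;
      }
    }

In the real pass, CheckCallee contains the runtime-function switch shown in the diff, and NumCallees > 1 immediately forces a pessimistic fixpoint once a known OpenMP runtime function appears among several potential callees of one call site.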