diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp index 36f44a20d9553..252a70d44736d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp @@ -19,6 +19,7 @@ #include "llvm/Transforms/Scalar/StructurizeCFG.h" #include "llvm/Transforms/Utils/FixIrreducible.h" #include "llvm/Transforms/Utils/LCSSA.h" +#include "llvm/Transforms/Utils/LowerSwitch.h" #include "llvm/Transforms/Utils/UnifyLoopExits.h" using namespace llvm; @@ -35,6 +36,16 @@ AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder( ShadowStackGCLoweringPass>(); } +void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { + Base::addCodeGenPrepare(addPass); + + // The LowerSwitch pass may introduce unreachable blocks that can cause + // unexpected behavior in subsequent passes. Run it here so that those blocks + // are cleaned up by UnreachableBlockElim, which is inserted next in the + // pass flow. 
+ addPass(LowerSwitchPass()); +} + void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const { const bool LateCFGStructurize = AMDGPUTargetMachine::EnableLateStructurizeCFG; const bool DisableStructurizer = AMDGPUTargetMachine::DisableStructurizer; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.h b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.h index e656e166b3eb2..efb296689bd64 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.h @@ -19,10 +19,12 @@ class GCNTargetMachine; class AMDGPUCodeGenPassBuilder : public CodeGenPassBuilder { public: + using Base = CodeGenPassBuilder; + AMDGPUCodeGenPassBuilder(GCNTargetMachine &TM, const CGPassBuilderOption &Opts, PassInstrumentationCallbacks *PIC); - + void addCodeGenPrepare(AddIRPass &) const; void addPreISel(AddIRPass &addPass) const; void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const; Error addInstSelector(AddMachinePass &) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b8aa93285ad84..fdb9cf0298819 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -67,12 +67,14 @@ #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/InferAddressSpaces.h" #include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/LowerSwitch.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" #include using namespace llvm; using namespace llvm::PatternMatch; +using namespace llvm::AMDGPU; namespace { class SGPRRegisterRegAlloc : public RegisterRegAllocBase { @@ -185,109 +187,95 @@ static VGPRRegisterRegAlloc fastRegAllocVGPR( "fast", "fast register allocator", createFastVGPRRegisterAllocator); } // anonymous namespace -static cl::opt -EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, - cl::desc("Run early 
if-conversion"), - cl::init(false)); +namespace llvm::AMDGPU { +cl::opt EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, + cl::desc("Run early if-conversion"), + cl::init(false)); -static cl::opt -OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden, - cl::desc("Run pre-RA exec mask optimizations"), - cl::init(true)); +cl::opt OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden, + cl::desc("Run pre-RA exec mask optimizations"), + cl::init(true)); -static cl::opt +cl::opt LowerCtorDtor("amdgpu-lower-global-ctor-dtor", cl::desc("Lower GPU ctor / dtors to globals on the device."), cl::init(true), cl::Hidden); // Option to disable vectorizer for tests. -static cl::opt EnableLoadStoreVectorizer( - "amdgpu-load-store-vectorizer", - cl::desc("Enable load store vectorizer"), - cl::init(true), - cl::Hidden); +cl::opt + EnableLoadStoreVectorizer("amdgpu-load-store-vectorizer", + cl::desc("Enable load store vectorizer"), + cl::init(true), cl::Hidden); // Option to control global loads scalarization -static cl::opt ScalarizeGlobal( - "amdgpu-scalarize-global-loads", - cl::desc("Enable global load scalarization"), - cl::init(true), - cl::Hidden); +cl::opt ScalarizeGlobal("amdgpu-scalarize-global-loads", + cl::desc("Enable global load scalarization"), + cl::init(true), cl::Hidden); // Option to run internalize pass. -static cl::opt InternalizeSymbols( - "amdgpu-internalize-symbols", - cl::desc("Enable elimination of non-kernel functions and unused globals"), - cl::init(false), - cl::Hidden); +cl::opt InternalizeSymbols( + "amdgpu-internalize-symbols", + cl::desc("Enable elimination of non-kernel functions and unused globals"), + cl::init(false), cl::Hidden); // Option to inline all early. 
-static cl::opt EarlyInlineAll( - "amdgpu-early-inline-all", - cl::desc("Inline all functions early"), - cl::init(false), - cl::Hidden); +cl::opt EarlyInlineAll("amdgpu-early-inline-all", + cl::desc("Inline all functions early"), + cl::init(false), cl::Hidden); -static cl::opt RemoveIncompatibleFunctions( +cl::opt RemoveIncompatibleFunctions( "amdgpu-enable-remove-incompatible-functions", cl::Hidden, - cl::desc("Enable removal of functions when they" + cl::desc("Enable removal of functions when they " "use features not supported by the target GPU"), cl::init(true)); -static cl::opt EnableSDWAPeephole( - "amdgpu-sdwa-peephole", - cl::desc("Enable SDWA peepholer"), - cl::init(true)); +cl::opt EnableSDWAPeephole("amdgpu-sdwa-peephole", + cl::desc("Enable SDWA peepholer"), + cl::init(true)); -static cl::opt EnableDPPCombine( - "amdgpu-dpp-combine", - cl::desc("Enable DPP combiner"), - cl::init(true)); +cl::opt EnableDPPCombine("amdgpu-dpp-combine", + cl::desc("Enable DPP combiner"), cl::init(true)); // Enable address space based alias analysis -static cl::opt EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, - cl::desc("Enable AMDGPU Alias Analysis"), - cl::init(true)); +cl::opt + EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, + cl::desc("Enable AMDGPU Alias Analysis"), + cl::init(true)); // Option to run late CFG structurizer -static cl::opt LateCFGStructurize( - "amdgpu-late-structurize", - cl::desc("Enable late CFG structurization"), - cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), - cl::Hidden); +cl::opt LateCFGStructurize( + "amdgpu-late-structurize", cl::desc("Enable late CFG structurization"), + cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden); // Disable structurizer-based control-flow lowering in order to test convergence // control tokens. This should eventually be replaced by the wave-transform. 
-static cl::opt DisableStructurizer( +cl::opt DisableStructurizer( "amdgpu-disable-structurizer", cl::desc("Disable structurizer for experiments; produces unusable code"), cl::location(AMDGPUTargetMachine::DisableStructurizer), cl::ReallyHidden); // Enable lib calls simplifications -static cl::opt EnableLibCallSimplify( - "amdgpu-simplify-libcall", - cl::desc("Enable amdgpu library simplifications"), - cl::init(true), - cl::Hidden); - -static cl::opt EnableLowerKernelArguments( - "amdgpu-ir-lower-kernel-arguments", - cl::desc("Lower kernel argument loads in IR pass"), - cl::init(true), - cl::Hidden); - -static cl::opt EnableRegReassign( - "amdgpu-reassign-regs", - cl::desc("Enable register reassign optimizations on gfx10+"), - cl::init(true), - cl::Hidden); - -static cl::opt OptVGPRLiveRange( +cl::opt + EnableLibCallSimplify("amdgpu-simplify-libcall", + cl::desc("Enable amdgpu library simplifications"), + cl::init(true), cl::Hidden); + +cl::opt EnableLowerKernelArguments( + "amdgpu-ir-lower-kernel-arguments", + cl::desc("Lower kernel argument loads in IR pass"), cl::init(true), + cl::Hidden); + +cl::opt EnableRegReassign( + "amdgpu-reassign-regs", + cl::desc("Enable register reassign optimizations on gfx10+"), + cl::init(true), cl::Hidden); + +cl::opt OptVGPRLiveRange( "amdgpu-opt-vgpr-liverange", cl::desc("Enable VGPR liverange optimizations for if-else structure"), cl::init(true), cl::Hidden); -static cl::opt AMDGPUAtomicOptimizerStrategy( +cl::opt AMDGPUAtomicOptimizerStrategy( "amdgpu-atomic-optimizer-strategy", cl::desc("Select DPP or Iterative strategy for scan"), cl::init(ScanOptions::Iterative), @@ -298,91 +286,85 @@ static cl::opt AMDGPUAtomicOptimizerStrategy( clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer"))); // Enable Mode register optimization -static cl::opt EnableSIModeRegisterPass( - "amdgpu-mode-register", - cl::desc("Enable mode register pass"), - cl::init(true), - cl::Hidden); +cl::opt 
EnableSIModeRegisterPass("amdgpu-mode-register", + cl::desc("Enable mode register pass"), + cl::init(true), cl::Hidden); // Enable GFX11.5+ s_singleuse_vdst insertion -static cl::opt +cl::opt EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst", cl::desc("Enable s_singleuse_vdst insertion"), cl::init(false), cl::Hidden); // Enable GFX11+ s_delay_alu insertion -static cl::opt - EnableInsertDelayAlu("amdgpu-enable-delay-alu", - cl::desc("Enable s_delay_alu insertion"), - cl::init(true), cl::Hidden); +cl::opt EnableInsertDelayAlu("amdgpu-enable-delay-alu", + cl::desc("Enable s_delay_alu insertion"), + cl::init(true), cl::Hidden); // Enable GFX11+ VOPD -static cl::opt - EnableVOPD("amdgpu-enable-vopd", - cl::desc("Enable VOPD, dual issue of VALU in wave32"), - cl::init(true), cl::Hidden); +cl::opt EnableVOPD("amdgpu-enable-vopd", + cl::desc("Enable VOPD, dual issue of VALU in wave32"), + cl::init(true), cl::Hidden); // Option is used in lit tests to prevent deadcoding of patterns inspected. 
-static cl::opt -EnableDCEInRA("amdgpu-dce-in-ra", - cl::init(true), cl::Hidden, - cl::desc("Enable machine DCE inside regalloc")); +cl::opt EnableDCEInRA("amdgpu-dce-in-ra", cl::init(true), cl::Hidden, + cl::desc("Enable machine DCE inside regalloc")); -static cl::opt EnableSetWavePriority("amdgpu-set-wave-priority", - cl::desc("Adjust wave priority"), - cl::init(false), cl::Hidden); +cl::opt EnableSetWavePriority("amdgpu-set-wave-priority", + cl::desc("Adjust wave priority"), + cl::init(false), cl::Hidden); -static cl::opt EnableScalarIRPasses( - "amdgpu-scalar-ir-passes", - cl::desc("Enable scalar IR passes"), - cl::init(true), - cl::Hidden); +cl::opt EnableScalarIRPasses("amdgpu-scalar-ir-passes", + cl::desc("Enable scalar IR passes"), + cl::init(true), cl::Hidden); -static cl::opt EnableStructurizerWorkarounds( +cl::opt EnableStructurizerWorkarounds( "amdgpu-enable-structurizer-workarounds", cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::location(AMDGPUTargetMachine::EnableStructurizerWorkarounds), cl::init(true), cl::Hidden); -static cl::opt EnableLowerModuleLDS( +cl::opt EnableLowerModuleLDS( "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"), cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), cl::Hidden); -static cl::opt EnablePreRAOptimizations( - "amdgpu-enable-pre-ra-optimizations", - cl::desc("Enable Pre-RA optimizations pass"), cl::init(true), - cl::Hidden); +cl::opt + EnablePreRAOptimizations("amdgpu-enable-pre-ra-optimizations", + cl::desc("Enable Pre-RA optimizations pass"), + cl::init(true), cl::Hidden); -static cl::opt EnablePromoteKernelArguments( +cl::opt EnablePromoteKernelArguments( "amdgpu-enable-promote-kernel-arguments", cl::desc("Enable promotion of flat kernel pointer arguments to global"), cl::Hidden, cl::init(true)); -static cl::opt EnableImageIntrinsicOptimizer( +cl::opt EnableImageIntrinsicOptimizer( "amdgpu-enable-image-intrinsic-optimizer", cl::desc("Enable image 
intrinsic optimizer pass"), cl::init(true), cl::Hidden); -static cl::opt +cl::opt EnableLoopPrefetch("amdgpu-loop-prefetch", cl::desc("Enable loop data prefetch on AMDGPU"), cl::Hidden, cl::init(false)); -static cl::opt EnableMaxIlpSchedStrategy( +cl::opt EnableMaxIlpSchedStrategy( "amdgpu-enable-max-ilp-scheduling-strategy", cl::desc("Enable scheduling strategy to maximize ILP for a single wave."), cl::Hidden, cl::init(false)); -static cl::opt EnableRewritePartialRegUses( +cl::opt EnableRewritePartialRegUses( "amdgpu-enable-rewrite-partial-reg-uses", cl::desc("Enable rewrite partial reg uses pass"), cl::init(true), cl::Hidden); -static cl::opt EnableHipStdPar( - "amdgpu-enable-hipstdpar", - cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false), - cl::Hidden); +cl::opt + EnableHipStdPar("amdgpu-enable-hipstdpar", + cl::desc("Enable HIP Standard Parallelism Offload support"), + cl::init(false), cl::Hidden); + +} // namespace llvm::AMDGPU static cl::opt EnableAMDGPUAttributor("amdgpu-attributor-enable", diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 4d39ad2b41505..f01e26a846f43 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -16,12 +16,53 @@ #include "GCNSubtarget.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetMachine.h" #include #include namespace llvm { +enum class ScanOptions; + +namespace AMDGPU { + +extern cl::opt EnableEarlyIfConversion; +extern cl::opt OptExecMaskPreRA; +extern cl::opt LowerCtorDtor; +extern cl::opt EnableLoadStoreVectorizer; +extern cl::opt ScalarizeGlobal; +extern cl::opt InternalizeSymbols; +extern cl::opt EarlyInlineAll; +extern cl::opt RemoveIncompatibleFunctions; +extern cl::opt EnableSDWAPeephole; +extern cl::opt EnableDPPCombine; +extern cl::opt EnableAMDGPUAliasAnalysis; +extern cl::opt LateCFGStructurize; +extern cl::opt 
DisableStructurizer; +extern cl::opt EnableLibCallSimplify; +extern cl::opt EnableLowerKernelArguments; +extern cl::opt EnableRegReassign; +extern cl::opt OptVGPRLiveRange; +extern cl::opt AMDGPUAtomicOptimizerStrategy; +extern cl::opt EnableSIModeRegisterPass; +extern cl::opt EnableInsertSingleUseVDST; +extern cl::opt EnableInsertDelayAlu; +extern cl::opt EnableVOPD; +extern cl::opt EnableDCEInRA; +extern cl::opt EnableSetWavePriority; +extern cl::opt EnableScalarIRPasses; +extern cl::opt EnableStructurizerWorkarounds; +extern cl::opt EnableLowerModuleLDS; +extern cl::opt EnablePreRAOptimizations; +extern cl::opt EnablePromoteKernelArguments; +extern cl::opt EnableImageIntrinsicOptimizer; +extern cl::opt EnableLoopPrefetch; +extern cl::opt EnableMaxIlpSchedStrategy; +extern cl::opt EnableRewritePartialRegUses; +extern cl::opt EnableHipStdPar; +} // namespace AMDGPU + //===----------------------------------------------------------------------===// // AMDGPU Target Machine (R600+) //===----------------------------------------------------------------------===//