diff --git a/.project b/.project
new file mode 100644
index 0000000000000..e964c69ba36e8
--- /dev/null
+++ b/.project
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>llvm-project</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+	</buildSpec>
+	<natures>
+	</natures>
+</projectDescription>
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 27649fd60da50..d4933452f61d8 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -595,6 +595,9 @@ C++2b Feature Support
CUDA/HIP Language Changes in Clang
----------------------------------
+ - Allow the use of ``__noinline__`` as a keyword (instead of ``__attribute__((noinline))``)
+ in lambda declarations.
+
Objective-C Language Changes in Clang
-------------------------------------
diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
index 1a4ec43705bc3..3fa641778d021 100644
--- a/clang/include/clang/AST/OpenMPClause.h
+++ b/clang/include/clang/AST/OpenMPClause.h
@@ -5723,7 +5723,7 @@ class OMPMapClause final : public OMPMappableExprListClause,
  size_t numTrailingObjects(OverloadToken<Expr *>) const {
// There are varlist_size() of expressions, and varlist_size() of
// user-defined mappers.
- return 2 * varlist_size();
+ return 2 * varlist_size() + 1;
}
  size_t numTrailingObjects(OverloadToken<ValueDecl *>) const {
return getUniqueDeclarationsNum();
@@ -5737,7 +5737,7 @@ class OMPMapClause final : public OMPMappableExprListClause,
OpenMPMapModifierKind MapTypeModifiers[NumberOfOMPMapClauseModifiers] = {
OMPC_MAP_MODIFIER_unknown, OMPC_MAP_MODIFIER_unknown,
OMPC_MAP_MODIFIER_unknown, OMPC_MAP_MODIFIER_unknown,
- OMPC_MAP_MODIFIER_unknown};
+ OMPC_MAP_MODIFIER_unknown, OMPC_MAP_MODIFIER_unknown};
/// Location of map-type-modifiers for the 'map' clause.
SourceLocation MapTypeModifiersLoc[NumberOfOMPMapClauseModifiers];
@@ -5838,6 +5838,11 @@ class OMPMapClause final : public OMPMappableExprListClause,
/// Set colon location.
void setColonLoc(SourceLocation Loc) { ColonLoc = Loc; }
+ /// Set iterator modifier.
+ void setIteratorModifier(Expr *IteratorModifier) {
+    getTrailingObjects<Expr *>()[2 * varlist_size()] = IteratorModifier;
+ }
+
public:
/// Creates clause with a list of variables \a VL.
///
@@ -5850,6 +5855,7 @@ class OMPMapClause final : public OMPMappableExprListClause,
/// \param ComponentLists Component lists used in the clause.
/// \param UDMapperRefs References to user-defined mappers associated with
/// expressions used in the clause.
+ /// \param IteratorModifier Iterator modifier.
/// \param MapModifiers Map-type-modifiers.
/// \param MapModifiersLoc Location of map-type-modifiers.
/// \param UDMQualifierLoc C++ nested name specifier for the associated
@@ -5862,7 +5868,7 @@ class OMPMapClause final : public OMPMappableExprListClause,
Create(const ASTContext &C, const OMPVarListLocTy &Locs,
         ArrayRef<Expr *> Vars, ArrayRef<ValueDecl *> Declarations,
         MappableExprComponentListsRef ComponentLists,
-        ArrayRef<Expr *> UDMapperRefs,
+        ArrayRef<Expr *> UDMapperRefs, Expr *IteratorModifier,
         ArrayRef<OpenMPMapModifierKind> MapModifiers,
         ArrayRef<SourceLocation> MapModifiersLoc,
NestedNameSpecifierLoc UDMQualifierLoc, DeclarationNameInfo MapperId,
@@ -5881,6 +5887,11 @@ class OMPMapClause final : public OMPMappableExprListClause,
static OMPMapClause *CreateEmpty(const ASTContext &C,
const OMPMappableExprListSizeTy &Sizes);
+ /// Fetches Expr * of iterator modifier.
+ Expr *getIteratorModifier() {
+    return getTrailingObjects<Expr *>()[2 * varlist_size()];
+ }
+
/// Fetches mapping kind for the clause.
OpenMPMapClauseKind getMapType() const LLVM_READONLY { return MapType; }
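[Reviewer note, not part of the patch] As a source-level sketch of what the new trailing `Expr *` models, the `iterator` map-type-modifier attaches a single iterator expression to a `map` clause, and that expression is what `getIteratorModifier()` returns:

```cpp
// Hedged example of an OpenMP 'iterator' map-type-modifier on a map clause.
void update(double **rows, int n) {
#pragma omp target map(iterator(it = 0 : n), tofrom : rows[it][0 : 8])
  { /* device code using the mapped rows */ }
}
```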
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 16cf932c3760b..eaf4a6db3600e 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -3749,8 +3749,8 @@ def OMPDeclareTargetDecl : InheritableAttr {
let Documentation = [OMPDeclareTargetDocs];
let Args = [
EnumArgument<"MapType", "MapTypeTy",
- [ "to", "link" ],
- [ "MT_To", "MT_Link" ]>,
+ [ "to", "enter", "link" ],
+ [ "MT_To", "MT_Enter", "MT_Link" ]>,
EnumArgument<"DevType", "DevTypeTy",
[ "host", "nohost", "any" ],
[ "DT_Host", "DT_NoHost", "DT_Any" ]>,
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index 28da4ff72bc45..82fc6c047b5da 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -1359,7 +1359,7 @@ def err_omp_unknown_map_type : Error<
"incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'">;
def err_omp_unknown_map_type_modifier : Error<
"incorrect map type modifier, expected one of: 'always', 'close', 'mapper'"
- "%select{|, 'present'}0%select{|, 'ompx_hold'}1">;
+ "%select{|, 'present'|, 'present', 'iterator'}0%select{|, 'ompx_hold'}1">;
def err_omp_map_type_missing : Error<
"missing map type">;
def err_omp_map_type_modifier_missing : Error<
@@ -1383,12 +1383,22 @@ def note_omp_assumption_clause_continue_here
: Note<"the ignored tokens spans until here">;
def err_omp_declare_target_unexpected_clause: Error<
"unexpected '%0' clause, only %select{'device_type'|'to' or 'link'|'to', 'link' or 'device_type'|'device_type', 'indirect'|'to', 'link', 'device_type' or 'indirect'}1 clauses expected">;
+def err_omp_declare_target_unexpected_clause_52: Error<
+ "unexpected '%0' clause, only %select{'device_type'|'enter' or 'link'|'enter', 'link' or 'device_type'|'device_type', 'indirect'|'enter', 'link', 'device_type' or 'indirect'}1 clauses expected">;
def err_omp_begin_declare_target_unexpected_implicit_to_clause: Error<
"unexpected '(', only 'to', 'link' or 'device_type' clauses expected for 'begin declare target' directive">;
-def err_omp_declare_target_unexpected_clause_after_implicit_to: Error<
+def err_omp_declare_target_wrong_clause_after_implicit_to: Error<
"unexpected clause after an implicit 'to' clause">;
+def err_omp_declare_target_wrong_clause_after_implicit_enter: Error<
+ "unexpected clause after an implicit 'enter' clause">;
def err_omp_declare_target_missing_to_or_link_clause: Error<
"expected at least one %select{'to' or 'link'|'to', 'link' or 'indirect'}0 clause">;
+def err_omp_declare_target_missing_enter_or_link_clause: Error<
+ "expected at least one %select{'enter' or 'link'|'enter', 'link' or 'indirect'}0 clause">;
+def err_omp_declare_target_unexpected_to_clause: Error<
+ "unexpected 'to' clause, use 'enter' instead">;
+def err_omp_declare_target_unexpected_enter_clause: Error<
+ "unexpected 'enter' clause, use 'to' instead">;
def err_omp_declare_target_multiple : Error<
"%0 appears multiple times in clauses on the same declare target directive">;
def err_omp_declare_target_indirect_device_type: Error<
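[Reviewer note, not part of the patch] A hedged sketch of how the version-gated diagnostics above are meant to fire; the wording is taken from the definitions, the source and flags are illustrative:

```cpp
// Compiled with -fopenmp -fopenmp-version=52, the old spelling is diagnosed:
//   error: unexpected 'to' clause, use 'enter' instead
// while with an earlier -fopenmp-version, 'enter' draws the converse:
//   error: unexpected 'enter' clause, use 'to' instead
int x;
#pragma omp declare target to(x)
```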
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index a53a830ccc190..9a004945974ee 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10814,6 +10814,8 @@ def err_omp_depend_sink_source_with_modifier : Error<
"depend modifier cannot be used with 'sink' or 'source' depend type">;
def err_omp_depend_modifier_not_iterator : Error<
"expected iterator specification as depend modifier">;
+def err_omp_map_modifier_not_iterator : Error<
+ "expected iterator specification as map modifier">;
def err_omp_linear_ordered : Error<
"'linear' clause cannot be specified along with 'ordered' clause with a parameter">;
def err_omp_unexpected_schedule_modifier : Error<
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 7d4b135c1de3b..1cd1df7e9b0de 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -253,8 +253,10 @@ LANGOPT(OpenMPCUDANumSMs , 32, 0, "Number of SMs for CUDA devices.")
LANGOPT(OpenMPCUDABlocksPerSM , 32, 0, "Number of blocks per SM for CUDA devices.")
LANGOPT(OpenMPCUDAReductionBufNum , 32, 1024, "Number of the reduction records in the intermediate reduction buffer used for the teams reductions.")
LANGOPT(OpenMPGPUThreadsPerTeam, 32, 256, "Number of threads per team for GPUs.")
+LANGOPT(OpenMPTargetXteamReductionBlockSize, 32, 1024, "Number of threads in a block used by cross-team reduction.")
LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading device RTL")
LANGOPT(OpenMPTargetIgnoreEnvVars , 1, 0, "Generate code assuming that device related environment variables can be ignored.")
+LANGOPT(OpenMPTargetBigJumpLoop , 1, 0, "Use big jump loop code generation technique.")
LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def
index 4c0884e0a6424..26153853e09b3 100644
--- a/clang/include/clang/Basic/OpenMPKinds.def
+++ b/clang/include/clang/Basic/OpenMPKinds.def
@@ -131,6 +131,7 @@ OPENMP_MAP_KIND(release)
OPENMP_MAP_MODIFIER_KIND(always)
OPENMP_MAP_MODIFIER_KIND(close)
OPENMP_MAP_MODIFIER_KIND(mapper)
+OPENMP_MAP_MODIFIER_KIND(iterator)
OPENMP_MAP_MODIFIER_KIND(present)
// This is an OpenMP extension for the sake of OpenACC support.
OPENMP_MAP_MODIFIER_KIND(ompx_hold)
diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h
index 82875aa0fafa6..7837c1b4e70c3 100644
--- a/clang/include/clang/Basic/OpenMPKinds.h
+++ b/clang/include/clang/Basic/OpenMPKinds.h
@@ -83,7 +83,7 @@ enum OpenMPMapModifierKind {
OMPC_MAP_MODIFIER_last
};
- /// Number of allowed map-type-modifiers.
+/// Number of allowed map-type-modifiers.
static constexpr unsigned NumberOfOMPMapClauseModifiers =
OMPC_MAP_MODIFIER_last - OMPC_MAP_MODIFIER_unknown - 1;
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 74d495c8dfeea..bb4374bfbdca1 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2616,6 +2616,8 @@ def fopenmp_cuda_teams_reduction_recs_num_EQ : Joined<["-"], "fopenmp-cuda-teams
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
def fopenmp_gpu_threads_per_team_EQ : Joined<["-"], "fopenmp-gpu-threads-per-team=">, Group<f_Group>,
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
+def fopenmp_target_xteam_reduction_blocksize_EQ : Joined<["-"], "fopenmp-target-xteam-reduction-blocksize=">, Group<f_Group>,
+ Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
def fopenmp_target_debug : Flag<["-"], "fopenmp-target-debug">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>,
HelpText<"Enable debugging in the OpenMP offloading device RTL">;
def fno_openmp_target_debug : Flag<["-"], "fno-openmp-target-debug">, Group<f_Group>, Flags<[NoArgumentUnused]>;
@@ -2630,6 +2632,14 @@ def fno_openmp_target_ignore_env_vars : Flag<["-"], "fno-openmp-target-ignore-en
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>,
HelpText<"Assert that device related environment variables cannot be ignored while generating code">,
  MarshallingInfoFlag<LangOpts<"OpenMPTargetIgnoreEnvVars">>;
+def fopenmp_target_big_jump_loop : Flag<["-"], "fopenmp-target-big-jump-loop">, Group<f_Group>,
+  Flags<[CC1Option, NoArgumentUnused, HelpHidden]>,
+  HelpText<"Use the big-jump-loop code generation technique if possible">,
+  MarshallingInfoFlag<LangOpts<"OpenMPTargetBigJumpLoop">>;
+def fno_openmp_target_big_jump_loop : Flag<["-"], "fno-openmp-target-big-jump-loop">, Group<f_Group>,
+  Flags<[CC1Option, NoArgumentUnused, HelpHidden]>,
+  HelpText<"Do not use the big-jump-loop code generation technique">,
+  MarshallingInfoFlag<LangOpts<"OpenMPTargetBigJumpLoop">>;
def fopenmp_assume_teams_oversubscription : Flag<["-"], "fopenmp-assume-teams-oversubscription">,
  Group<f_Group>, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
def fopenmp_assume_threads_oversubscription : Flag<["-"], "fopenmp-assume-threads-oversubscription">,
@@ -3724,12 +3734,12 @@ defm amdgpu_ieee : BoolOption<"m", "amdgpu-ieee",
  NegFlag<SetFalse, [CC1Option]>>, Group<m_Group>;
def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, Group<m_Group>,
- HelpText<"Specify code object ABI version. Defaults to 4. (AMDGPU only)">,
+ HelpText<"Specify code object ABI version. Defaults to 5. (AMDGPU only)">,
Flags<[CC1Option]>,
Values<"none,2,3,4,5">,
NormalizedValuesScope<"TargetOptions">,
NormalizedValues<["COV_None", "COV_2", "COV_3", "COV_4", "COV_5"]>,
- MarshallingInfoEnum, "COV_4">;
+ MarshallingInfoEnum, "COV_5">;
defm code_object_v3_legacy : SimpleMFlag<"code-object-v3",
"Legacy option to specify code object ABI V3",
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index b32dfe158c8f3..ed3a8ebaea417 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -11079,6 +11079,7 @@ class Sema final {
QualType MapperType,
SourceLocation StartLoc,
DeclarationName VN);
+ void ActOnOpenMPIteratorVarDecl(VarDecl *VD);
bool isOpenMPDeclareMapperVarDeclAllowed(const VarDecl *VD) const;
const ValueDecl *getOpenMPDeclareMapperVarName() const;
@@ -11790,6 +11791,7 @@ class Sema final {
/// Data used for processing a list of variables in OpenMP clauses.
struct OpenMPVarListDataTy final {
Expr *DepModOrTailExpr = nullptr;
+ Expr *IteratorExpr = nullptr;
SourceLocation ColonLoc;
SourceLocation RLoc;
CXXScopeSpec ReductionOrMapperIdScopeSpec;
@@ -11916,7 +11918,7 @@ class Sema final {
SourceLocation EndLoc);
/// Called on well-formed 'map' clause.
OMPClause *ActOnOpenMPMapClause(
-      ArrayRef<OpenMPMapModifierKind> MapTypeModifiers,
+      Expr *IteratorModifier, ArrayRef<OpenMPMapModifierKind> MapTypeModifiers,
       ArrayRef<SourceLocation> MapTypeModifiersLoc,
CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId,
OpenMPMapClauseKind MapType, bool IsMapTypeImplicit,
diff --git a/clang/lib/AST/AttrImpl.cpp b/clang/lib/AST/AttrImpl.cpp
index cecbd703ac61e..da842f6b190e7 100644
--- a/clang/lib/AST/AttrImpl.cpp
+++ b/clang/lib/AST/AttrImpl.cpp
@@ -137,7 +137,7 @@ void OMPDeclareTargetDeclAttr::printPrettyPragma(
// Use fake syntax because it is for testing and debugging purpose only.
if (getDevType() != DT_Any)
OS << " device_type(" << ConvertDevTypeTyToStr(getDevType()) << ")";
- if (getMapType() != MT_To)
+ if (getMapType() != MT_To && getMapType() != MT_Enter)
OS << ' ' << ConvertMapTypeTyToStr(getMapType());
if (Expr *E = getIndirectExpr()) {
OS << " indirect(";
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 34e75723b3f30..d4903352873e4 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -16053,9 +16053,13 @@ bool Expr::EvaluateWithSubstitution(APValue &Value, ASTContext &Ctx,
if ((*I)->isValueDependent() ||
!EvaluateCallArg(PVD, *I, Call, Info) ||
Info.EvalStatus.HasSideEffects) {
- // If evaluation fails, throw away the argument entirely.
- if (APValue *Slot = Info.getParamSlot(Call, PVD))
- *Slot = APValue();
+ // If evaluation fails, throw away the argument entirely unless I is
+ // value-dependent. In those cases, the condition above will short-circuit
+ // before calling `EvaluateCallArg` and no param slot is created.
+ if (!(*I)->isValueDependent()) {
+ if (APValue *Slot = Info.getParamSlot(Call, PVD))
+ *Slot = APValue();
+ }
}
// Ignore any side-effects from a failed evaluation. This is safe because
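[Reviewer note, not part of the patch] A hedged sketch of the situation the new guard handles; the function and attribute below are illustrative. `EvaluateWithSubstitution` is used when checking `enable_if` conditions, and an argument that is still value-dependent short-circuits the condition before `EvaluateCallArg` runs, so no parameter slot was ever allocated:

```cpp
// Illustrative only: inside the template, N is value-dependent, so its
// evaluation is skipped and there is no slot to throw away.
__attribute__((enable_if(x >= 0, "requires a non-negative argument")))
void f(int x);

template <int N> void g() { f(N); }
```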
diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
index adfc1d542bb34..096b4e9f7f8aa 100644
--- a/clang/lib/AST/OpenMPClause.cpp
+++ b/clang/lib/AST/OpenMPClause.cpp
@@ -1127,7 +1127,7 @@ OMPMapClause *OMPMapClause::Create(
    const ASTContext &C, const OMPVarListLocTy &Locs, ArrayRef<Expr *> Vars,
    ArrayRef<ValueDecl *> Declarations,
    MappableExprComponentListsRef ComponentLists, ArrayRef<Expr *> UDMapperRefs,
-    ArrayRef<OpenMPMapModifierKind> MapModifiers,
+    Expr *IteratorModifier, ArrayRef<OpenMPMapModifierKind> MapModifiers,
    ArrayRef<SourceLocation> MapModifiersLoc,
NestedNameSpecifierLoc UDMQualifierLoc, DeclarationNameInfo MapperId,
OpenMPMapClauseKind Type, bool TypeIsImplicit, SourceLocation TypeLoc) {
@@ -1150,7 +1150,7 @@ OMPMapClause *OMPMapClause::Create(
void *Mem = C.Allocate(
      totalSizeToAlloc<Expr *, ValueDecl *, unsigned,
                       OMPClauseMappableExprCommon::MappableComponent>(
- 2 * Sizes.NumVars, Sizes.NumUniqueDeclarations,
+ 2 * Sizes.NumVars + 1, Sizes.NumUniqueDeclarations,
Sizes.NumUniqueDeclarations + Sizes.NumComponentLists,
Sizes.NumComponents));
OMPMapClause *Clause = new (Mem)
@@ -1159,6 +1159,7 @@ OMPMapClause *OMPMapClause::Create(
Clause->setVarRefs(Vars);
Clause->setUDMapperRefs(UDMapperRefs);
+ Clause->setIteratorModifier(IteratorModifier);
Clause->setClauseInfo(Declarations, ComponentLists);
Clause->setMapType(Type);
Clause->setMapLoc(TypeLoc);
@@ -1171,10 +1172,12 @@ OMPMapClause::CreateEmpty(const ASTContext &C,
void *Mem = C.Allocate(
      totalSizeToAlloc<Expr *, ValueDecl *, unsigned,
                       OMPClauseMappableExprCommon::MappableComponent>(
- 2 * Sizes.NumVars, Sizes.NumUniqueDeclarations,
+ 2 * Sizes.NumVars + 1, Sizes.NumUniqueDeclarations,
Sizes.NumUniqueDeclarations + Sizes.NumComponentLists,
Sizes.NumComponents));
- return new (Mem) OMPMapClause(Sizes);
+ OMPMapClause *Clause = new (Mem) OMPMapClause(Sizes);
+ Clause->setIteratorModifier(nullptr);
+ return Clause;
}
OMPToClause *OMPToClause::Create(
@@ -2216,16 +2219,27 @@ static void PrintMapper(raw_ostream &OS, T *Node,
OS << Node->getMapperIdInfo() << ')';
}
+template <typename T>
+static void PrintIterator(raw_ostream &OS, T *Node,
+ const PrintingPolicy &Policy) {
+ if (Expr *IteratorModifier = Node->getIteratorModifier())
+ IteratorModifier->printPretty(OS, nullptr, Policy);
+}
+
void OMPClausePrinter::VisitOMPMapClause(OMPMapClause *Node) {
if (!Node->varlist_empty()) {
OS << "map(";
if (Node->getMapType() != OMPC_MAP_unknown) {
for (unsigned I = 0; I < NumberOfOMPMapClauseModifiers; ++I) {
if (Node->getMapTypeModifier(I) != OMPC_MAP_MODIFIER_unknown) {
- OS << getOpenMPSimpleClauseTypeName(OMPC_map,
- Node->getMapTypeModifier(I));
- if (Node->getMapTypeModifier(I) == OMPC_MAP_MODIFIER_mapper)
- PrintMapper(OS, Node, Policy);
+ if (Node->getMapTypeModifier(I) == OMPC_MAP_MODIFIER_iterator) {
+ PrintIterator(OS, Node, Policy);
+ } else {
+ OS << getOpenMPSimpleClauseTypeName(OMPC_map,
+ Node->getMapTypeModifier(I));
+ if (Node->getMapTypeModifier(I) == OMPC_MAP_MODIFIER_mapper)
+ PrintMapper(OS, Node, Policy);
+ }
OS << ',';
}
}
diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
index cf16c320580a9..ebad051cb0d3c 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -189,9 +189,7 @@ bool AMDGPUTargetInfo::initFeatureMap(
case GK_GFX1101:
case GK_GFX1100:
Features["ci-insts"] = true;
- Features["dot1-insts"] = true;
Features["dot5-insts"] = true;
- Features["dot6-insts"] = true;
Features["dot7-insts"] = true;
Features["dot8-insts"] = true;
Features["dl-insts"] = true;
diff --git a/clang/lib/Basic/Targets/AMDGPU.h b/clang/lib/Basic/Targets/AMDGPU.h
index 5e73a3cb8019a..c43d0ba5896f0 100644
--- a/clang/lib/Basic/Targets/AMDGPU.h
+++ b/clang/lib/Basic/Targets/AMDGPU.h
@@ -13,6 +13,7 @@
#ifndef LLVM_CLANG_LIB_BASIC_TARGETS_AMDGPU_H
#define LLVM_CLANG_LIB_BASIC_TARGETS_AMDGPU_H
+#include "clang/Basic/AddressSpaces.h"
#include "clang/Basic/TargetID.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Basic/TargetOptions.h"
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index be9497563621a..72a01fbd629cf 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -2335,8 +2335,13 @@ void CodeGenModule::ConstructAttributeList(StringRef Name,
getLangOpts().Sanitize.has(SanitizerKind::Memory) ||
getLangOpts().Sanitize.has(SanitizerKind::Return);
+ // Enable noundef attribute based on codegen options and
+ // skip adding the attribute to HIP device functions.
+ bool EnableNoundefAttrs = CodeGenOpts.EnableNoundefAttrs &&
+ !(getLangOpts().HIP && getLangOpts().CUDAIsDevice);
+
// Determine if the return type could be partially undef
- if (CodeGenOpts.EnableNoundefAttrs && HasStrictReturn) {
+ if (EnableNoundefAttrs && HasStrictReturn) {
if (!RetTy->isVoidType() && RetAI.getKind() != ABIArgInfo::Indirect &&
DetermineNoUndef(RetTy, getTypes(), DL, RetAI))
RetAttrs.addAttribute(llvm::Attribute::NoUndef);
@@ -2470,8 +2475,7 @@ void CodeGenModule::ConstructAttributeList(StringRef Name,
}
// Decide whether the argument we're handling could be partially undef
- if (CodeGenOpts.EnableNoundefAttrs &&
- DetermineNoUndef(ParamType, getTypes(), DL, AI)) {
+ if (EnableNoundefAttrs && DetermineNoUndef(ParamType, getTypes(), DL, AI)) {
Attrs.addAttribute(llvm::Attribute::NoUndef);
}
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 73d08d8c9e0c8..79a21b8ac499d 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -2495,14 +2495,16 @@ static Address emitDeclTargetVarDeclLValue(CodeGenFunction &CGF,
const VarDecl *VD, QualType T) {
  llvm::Optional<OMPDeclareTargetDeclAttr::MapTypeTy> Res =
OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD);
- // Return an invalid address if variable is MT_To and unified
- // memory is not enabled. For all other cases: MT_Link and
- // MT_To with unified memory, return a valid address.
- if (!Res || (*Res == OMPDeclareTargetDeclAttr::MT_To &&
+ // Return an invalid address if variable is MT_To (or MT_Enter starting with
+ // OpenMP 5.2) and unified memory is not enabled. For all other cases: MT_Link
+ // and MT_To (or MT_Enter) with unified memory, return a valid address.
+ if (!Res || ((*Res == OMPDeclareTargetDeclAttr::MT_To ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
!CGF.CGM.getOpenMPRuntime().hasRequiresUnifiedSharedMemory()))
return Address::invalid();
assert(((*Res == OMPDeclareTargetDeclAttr::MT_Link) ||
- (*Res == OMPDeclareTargetDeclAttr::MT_To &&
+ ((*Res == OMPDeclareTargetDeclAttr::MT_To ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
CGF.CGM.getOpenMPRuntime().hasRequiresUnifiedSharedMemory())) &&
"Expected link clause OR to clause with unified memory enabled.");
QualType PtrTy = CGF.getContext().getPointerType(VD->getType());
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 1abeedae4baf4..9051b1ee9852c 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1665,7 +1665,8 @@ Address CGOpenMPRuntime::getAddrOfDeclareTargetVar(const VarDecl *VD) {
  llvm::Optional<OMPDeclareTargetDeclAttr::MapTypeTy> Res =
OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD);
if (Res && (*Res == OMPDeclareTargetDeclAttr::MT_Link ||
- (*Res == OMPDeclareTargetDeclAttr::MT_To &&
+ ((*Res == OMPDeclareTargetDeclAttr::MT_To ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
HasRequiresUnifiedSharedMemory))) {
SmallString<64> PtrName;
{
@@ -1880,7 +1881,8 @@ bool CGOpenMPRuntime::emitDeclareTargetVarDefinition(const VarDecl *VD,
  Optional<OMPDeclareTargetDeclAttr::MapTypeTy> Res =
OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD);
if (!Res || *Res == OMPDeclareTargetDeclAttr::MT_Link ||
- (*Res == OMPDeclareTargetDeclAttr::MT_To &&
+ ((*Res == OMPDeclareTargetDeclAttr::MT_To ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
HasRequiresUnifiedSharedMemory))
return CGM.getLangOpts().OpenMPIsDevice;
VD = VD->getDefinition(CGM.getContext());
@@ -7715,7 +7717,8 @@ class MappableExprsHandler {
    if (llvm::Optional<OMPDeclareTargetDeclAttr::MapTypeTy> Res =
OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) {
if ((*Res == OMPDeclareTargetDeclAttr::MT_Link) ||
- (*Res == OMPDeclareTargetDeclAttr::MT_To &&
+ ((*Res == OMPDeclareTargetDeclAttr::MT_To ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
CGF.CGM.getOpenMPRuntime().hasRequiresUnifiedSharedMemory())) {
RequiresReference = true;
BP = CGF.CGM.getOpenMPRuntime().getAddrOfDeclareTargetVar(VD);
@@ -10524,6 +10527,10 @@ void CGOpenMPRuntime::scanForTargetRegionsFunctions(const Stmt *S,
CodeGenFunction::EmitOMPTargetTeamsGenericLoopDeviceFunction(
          CGM, ParentName, cast<OMPTargetTeamsGenericLoopDirective>(E));
break;
+ case OMPD_target_parallel_loop:
+ CodeGenFunction::EmitOMPTargetParallelGenericLoopDeviceFunction(
+          CGM, ParentName, cast<OMPTargetParallelGenericLoopDirective>(E));
+ break;
case OMPD_parallel:
case OMPD_for:
case OMPD_parallel_for:
@@ -10676,7 +10683,8 @@ bool CGOpenMPRuntime::emitTargetGlobalVariable(GlobalDecl GD) {
OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(
            cast<VarDecl>(GD.getDecl()));
if (!Res || *Res == OMPDeclareTargetDeclAttr::MT_Link ||
- (*Res == OMPDeclareTargetDeclAttr::MT_To &&
+ ((*Res == OMPDeclareTargetDeclAttr::MT_To ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
HasRequiresUnifiedSharedMemory)) {
      DeferredGlobalVariables.insert(cast<VarDecl>(GD.getDecl()));
return true;
@@ -10713,7 +10721,8 @@ void CGOpenMPRuntime::registerTargetGlobalVariable(const VarDecl *VD,
int64_t VarSize;
llvm::GlobalValue::LinkageTypes Linkage;
- if (*Res == OMPDeclareTargetDeclAttr::MT_To &&
+ if ((*Res == OMPDeclareTargetDeclAttr::MT_To ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
!HasRequiresUnifiedSharedMemory) {
Flags = llvm::OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
VarName = CGM.getMangledName(VD);
@@ -10744,7 +10753,8 @@ void CGOpenMPRuntime::registerTargetGlobalVariable(const VarDecl *VD,
}
} else {
assert(((*Res == OMPDeclareTargetDeclAttr::MT_Link) ||
- (*Res == OMPDeclareTargetDeclAttr::MT_To &&
+ ((*Res == OMPDeclareTargetDeclAttr::MT_To ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
HasRequiresUnifiedSharedMemory)) &&
"Declare target attribute must link or to with unified memory.");
if (*Res == OMPDeclareTargetDeclAttr::MT_Link)
@@ -10781,12 +10791,14 @@ void CGOpenMPRuntime::emitDeferredTargetDecls() const {
OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD);
if (!Res)
continue;
- if (*Res == OMPDeclareTargetDeclAttr::MT_To &&
+ if ((*Res == OMPDeclareTargetDeclAttr::MT_To ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
!HasRequiresUnifiedSharedMemory) {
CGM.EmitGlobal(VD);
} else {
assert((*Res == OMPDeclareTargetDeclAttr::MT_Link ||
- (*Res == OMPDeclareTargetDeclAttr::MT_To &&
+ ((*Res == OMPDeclareTargetDeclAttr::MT_To ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
HasRequiresUnifiedSharedMemory)) &&
"Expected link clause or to clause with unified memory.");
(void)CGM.getOpenMPRuntime().getAddrOfDeclareTargetVar(VD);
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 50dd74653e78b..d7cb2f32b4325 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -665,6 +665,7 @@ static bool supportsSPMDExecutionMode(ASTContext &Ctx,
case OMPD_target_teams:
return hasNestedSPMDDirective(Ctx, D);
case OMPD_target_teams_loop:
+ case OMPD_target_parallel_loop:
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
@@ -1040,59 +1041,6 @@ static int ComputeGenericWorkgroupSize(CodeGenModule &CGM, int WorkgroupSize) {
return WorkgroupSizeWithMaster;
}
-int getWorkGroupSizeSPMDHelper(CodeGenModule &CGM,
- const OMPExecutableDirective &D) {
- // Honor block-size provided by command-line option. This logic must be kept
- // in sync with metadata generation. If this option is not specified on the
- // command line then the value used will be the 256.
- int WorkGroupSz = CGM.getLangOpts().OpenMPGPUThreadsPerTeam;
-
- // Check block-size provided by thread_limit clause. We start with the
- // maximum thread limit and lower it if user requests a lower thread limit.
- int ThreadLimit = CGM.getTarget().getGridValue().GV_Max_WG_Size;
-  const auto *ThreadLimitClause = D.getSingleClause<OMPThreadLimitClause>();
- if (ThreadLimitClause) {
- Expr *ThreadLimitExpr = ThreadLimitClause->getThreadLimit();
- clang::Expr::EvalResult Result;
- if (ThreadLimitExpr->EvaluateAsInt(Result, CGM.getContext())) {
- int ThreadLimitEval = Result.Val.getInt().getExtValue();
- if (ThreadLimitEval > 0 && ThreadLimitEval < ThreadLimit)
- ThreadLimit = ThreadLimitEval;
- }
- }
-
- // If the command line work group size is less than any default or user
- // specified thread limit then it is honored otherwise the thread limit
- // determined above will be used.
- if (WorkGroupSz > ThreadLimit)
- WorkGroupSz = ThreadLimit;
-
- // Set the actual number of threads if the user requests a value different
- // then the default. If the value is greater than the currently computed
- // thread limit then cap the number of threads to the thread limit.
- int NumThreads = CGM.getTarget().getGridValue().GV_Default_WG_Size;
-  const auto *NumThreadsClause = D.getSingleClause<OMPNumThreadsClause>();
- if (NumThreadsClause) {
- Expr *NumThreadsExpr = NumThreadsClause->getNumThreads();
- clang::Expr::EvalResult Result;
- if (NumThreadsExpr->EvaluateAsInt(Result, CGM.getContext())) {
- NumThreads = Result.Val.getInt().getExtValue();
- // Cap the number of threads to the current thread limit.
- if (NumThreads > ThreadLimit)
- NumThreads = ThreadLimit;
- // num_threads clause takes precendence over the command line value:
- WorkGroupSz = NumThreads;
- }
- }
-
- // Sanitize the workgroup size received from the command line. Its default
- // value is GV_Default_WG_Size.
- if (WorkGroupSz < 1 || WorkGroupSz > ThreadLimit)
- WorkGroupSz = CGM.getTarget().getGridValue().GV_Default_WG_Size;
-
- return WorkGroupSz;
-}
-
void CGOpenMPRuntimeGPU::GenerateMetaData(CodeGenModule &CGM,
const OMPExecutableDirective &D,
llvm::Function *&OutlinedFn,
@@ -1109,13 +1057,11 @@ void CGOpenMPRuntimeGPU::GenerateMetaData(CodeGenModule &CGM,
isOpenMPParallelDirective(D.getDirectiveKind()) ||
CGM.isXteamRedKernel(CGM.getSingleForStmt(D.getAssociatedStmt()))) {
// Call the work group size calculation for SPMD mode loops.
- compileTimeThreadLimit = getWorkGroupSizeSPMDHelper(CGM, D);
+ compileTimeThreadLimit = CGM.getWorkGroupSizeSPMDHelper(D);
- // Xteam reduction overrides the command-line option and other settings
- // for now: blocksize hardcoded to 1024.
- // TODO: remove this restriction.
- if (CGM.isXteamRedKernel(CGM.getSingleForStmt(D.getAssociatedStmt())))
- compileTimeThreadLimit = 1024;
+ // Apply Xteam reduction constraints on blocksize.
+ if (CGM.isXteamRedKernel(D))
+ compileTimeThreadLimit = CGM.getXteamRedBlockSize(D);
// Add kernel metadata if ThreadLimit Clause is compile time constant > 0
if (compileTimeThreadLimit > 0) {
@@ -1282,6 +1228,22 @@ void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
Fn->addFnAttr(llvm::Attribute::get(Ctx, "kernel"));
}
+static OMPTgtExecModeFlags
+computeExecutionMode(bool Mode, const Stmt *DirectiveStmt, CodeGenModule &CGM) {
+ if (!Mode)
+ return OMP_TGT_EXEC_MODE_GENERIC;
+ if (DirectiveStmt) {
+ if (CGM.isNoLoopKernel(DirectiveStmt))
+ return OMP_TGT_EXEC_MODE_SPMD_NO_LOOP;
+ if (CGM.isBigJumpLoopKernel(CGM.getSingleForStmt(DirectiveStmt)))
+ return OMP_TGT_EXEC_MODE_SPMD_BIG_JUMP_LOOP;
+ const Stmt *S = CGM.getSingleForStmt(DirectiveStmt);
+ if (S && CGM.isXteamRedKernel(S))
+ return OMP_TGT_EXEC_MODE_XTEAM_RED;
+ }
+ return OMP_TGT_EXEC_MODE_SPMD;
+}
+
void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
const OMPExecutableDirective &D, StringRef ParentName,
llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
@@ -1313,29 +1275,32 @@ void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
}
emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
CodeGen);
- } else
+ } else {
emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
CodeGen);
+ DEBUG_WITH_TYPE(
+ NO_LOOP_XTEAM_RED,
+ CGM.emitNxResult("[No-Loop/Xteam]", D, CodeGenModule::NxNonSPMD));
+ }
- setPropertyExecutionMode(
- CGM, OutlinedFn->getName(),
- Mode ? (DirectiveStmt && CGM.isNoLoopKernel(DirectiveStmt)
- ? OMP_TGT_EXEC_MODE_SPMD_NO_LOOP
- : (DirectiveStmt && CGM.isXteamRedKernel(
- CGM.getSingleForStmt(DirectiveStmt))
- ? OMP_TGT_EXEC_MODE_XTEAM_RED
- : OMP_TGT_EXEC_MODE_SPMD))
- : OMP_TGT_EXEC_MODE_GENERIC);
- // Reset no-loop or xteam reduction kernel metadata if it exists
- if (Mode && DirectiveStmt && CGM.isNoLoopKernel(DirectiveStmt))
- CGM.resetNoLoopKernel(DirectiveStmt);
- else if (Mode && DirectiveStmt &&
- CGM.isXteamRedKernel(CGM.getSingleForStmt(DirectiveStmt)))
- CGM.resetXteamRedKernel(CGM.getSingleForStmt(DirectiveStmt));
+ setPropertyExecutionMode(CGM, OutlinedFn->getName(),
+ computeExecutionMode(Mode, DirectiveStmt, CGM));
+
+ // Reset specialized kernel metadata if it exists
+ if (Mode && DirectiveStmt) {
+ if (CGM.isNoLoopKernel(DirectiveStmt))
+ CGM.resetNoLoopKernel(DirectiveStmt);
+ else if (CGM.isBigJumpLoopKernel(CGM.getSingleForStmt(DirectiveStmt)))
+ CGM.resetBigJumpLoopKernel(CGM.getSingleForStmt(DirectiveStmt));
+ else if (CGM.isXteamRedKernel(CGM.getSingleForStmt(DirectiveStmt)))
+ CGM.resetXteamRedKernel(CGM.getSingleForStmt(DirectiveStmt));
+ }
// Reset cached mode
CGM.setIsSPMDExecutionMode(false);
assert(!CGM.isNoLoopKernel(DirectiveStmt) &&
"No-loop attribute not reset after emit");
+ assert(!CGM.isBigJumpLoopKernel(CGM.getSingleForStmt(DirectiveStmt)) &&
+ "Big jump loop attribute not reset after emit");
assert(!CGM.isXteamRedKernel(CGM.getSingleForStmt(DirectiveStmt)) &&
"Xteam reduction attribute not reset after emit");
}
@@ -4237,13 +4202,12 @@ CGOpenMPRuntimeGPU::getGPUCompleteBlockSize(CodeGenFunction &CGF,
// Get effects of thread-controlling clauses on the current number of threads
// and any command line requests:
- return llvm::ConstantInt::get(CGF.Int32Ty,
- getWorkGroupSizeSPMDHelper(CGM, D));
+ return llvm::ConstantInt::get(CGF.Int32Ty, CGM.getWorkGroupSizeSPMDHelper(D));
}
-llvm::Value *CGOpenMPRuntimeGPU::getXteamRedBlockSize(CodeGenFunction &CGF) {
- // For now, this is hardcoded to 1024
- return llvm::ConstantInt::get(CGF.Int32Ty, 1024);
+llvm::Value *CGOpenMPRuntimeGPU::getXteamRedBlockSize(CodeGenFunction &CGF,
+ int BlockSize) {
+ return llvm::ConstantInt::get(CGF.Int32Ty, BlockSize);
}
llvm::Value *CGOpenMPRuntimeGPU::getGPUNumBlocks(CodeGenFunction &CGF) {
@@ -4306,7 +4270,7 @@ CGOpenMPRuntimeGPU::getXteamRedFunctionPtrs(CodeGenFunction &CGF,
llvm::Value *CGOpenMPRuntimeGPU::getXteamRedSum(
CodeGenFunction &CGF, llvm::Value *Val, llvm::Value *SumPtr,
llvm::Value *DTeamVals, llvm::Value *DTeamsDonePtr,
- llvm::Value *ThreadStartIndex, llvm::Value *NumTeams) {
+ llvm::Value *ThreadStartIndex, llvm::Value *NumTeams, int BlockSize) {
// TODO handle more types
llvm::Type *SumType = Val->getType();
assert(
@@ -4332,29 +4296,121 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamRedSum(
if (SumType->isIntegerTy()) {
if (SumType->getPrimitiveSizeInBits() == 32) {
+ switch (BlockSize) {
+ case 64:
+ return CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+ OMPRTL___kmpc_xteamr_ui_1x64),
+ Args);
+ case 128:
+ return CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+ OMPRTL___kmpc_xteamr_ui_2x64),
+ Args);
+ case 256:
+ return CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+ OMPRTL___kmpc_xteamr_ui_4x64),
+ Args);
+ case 512:
+ return CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+ OMPRTL___kmpc_xteamr_ui_8x64),
+ Args);
+ case 1024:
+ return CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_xteamr_ui_16x64),
+ Args);
+ }
+ }
+ if (SumType->getPrimitiveSizeInBits() == 64) {
+ switch (BlockSize) {
+ case 64:
+ return CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+ OMPRTL___kmpc_xteamr_ul_1x64),
+ Args);
+ case 128:
+ return CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+ OMPRTL___kmpc_xteamr_ul_2x64),
+ Args);
+ case 256:
+ return CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+ OMPRTL___kmpc_xteamr_ul_4x64),
+ Args);
+ case 512:
+ return CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+ OMPRTL___kmpc_xteamr_ul_8x64),
+ Args);
+ case 1024:
+ return CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_xteamr_ul_16x64),
+ Args);
+ }
+ }
+ }
+ if (SumType->isFloatTy()) {
+ switch (BlockSize) {
+ case 64:
return CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
- OMPRTL___kmpc_xteamr_ui_16x64),
+ OMPRTL___kmpc_xteamr_f_1x64),
Args);
- }
- if (SumType->getPrimitiveSizeInBits() == 64) {
+ case 128:
+ return CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+ OMPRTL___kmpc_xteamr_f_2x64),
+ Args);
+ case 256:
return CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
- OMPRTL___kmpc_xteamr_ul_16x64),
+ OMPRTL___kmpc_xteamr_f_4x64),
+ Args);
+ case 512:
+ return CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+ OMPRTL___kmpc_xteamr_f_8x64),
+ Args);
+ case 1024:
+ return CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+ OMPRTL___kmpc_xteamr_f_16x64),
Args);
}
}
- if (SumType->isFloatTy()) {
- return CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
- OMPRTL___kmpc_xteamr_f_16x64),
- Args);
- }
if (SumType->isDoubleTy()) {
- return CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
- OMPRTL___kmpc_xteamr_d_16x64),
- Args);
+ switch (BlockSize) {
+ case 64:
+ return CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+ OMPRTL___kmpc_xteamr_d_1x64),
+ Args);
+ case 128:
+ return CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+ OMPRTL___kmpc_xteamr_d_2x64),
+ Args);
+ case 256:
+ return CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+ OMPRTL___kmpc_xteamr_d_4x64),
+ Args);
+ case 512:
+ return CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+ OMPRTL___kmpc_xteamr_d_8x64),
+ Args);
+ case 1024:
+ return CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+ OMPRTL___kmpc_xteamr_d_16x64),
+ Args);
+ }
}
llvm_unreachable("No support for other types currently.");
}
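[Reviewer note, not part of the patch] The `_<N>x64` suffixes chosen above encode the number of 64-lane wavefronts per workgroup (so a 512-thread block dispatches to the `8x64` entry point), while `ui`, `ul`, `f`, and `d` select the element type. A minimal sketch of that mapping, using a hypothetical helper name:

```cpp
// Hypothetical helper (not in the patch): block size -> wavefront-count
// suffix of the __kmpc_xteamr_* runtime entry points selected above.
static const char *xteamRedSuffix(int BlockSize) {
  switch (BlockSize) {
  case 64:   return "1x64";
  case 128:  return "2x64";
  case 256:  return "4x64";
  case 512:  return "8x64";
  case 1024: return "16x64";
  default:   return nullptr; // unsupported block size
  }
}
```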
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
index 00b2fb5b5b4d5..d2bde52682550 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -199,7 +199,7 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
llvm::Value *getGPUNumBlocks(CodeGenFunction &CGF);
/// Get the number of blocks on the GPU for special reduction
- llvm::Value *getXteamRedBlockSize(CodeGenFunction &CGF);
+ llvm::Value *getXteamRedBlockSize(CodeGenFunction &CGF, int BlockSize);
  std::pair<llvm::Value *, llvm::Value *>
getXteamRedFunctionPtrs(CodeGenFunction &CGF, llvm::Type *RedVarType);
@@ -209,7 +209,7 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
llvm::Value *SumPtr, llvm::Value *DTeamVals,
llvm::Value *DTeamsDonePtr,
llvm::Value *ThreadStartIndex,
- llvm::Value *NumTeams);
+ llvm::Value *NumTeams, int BlockSize);
/// Returns whether the current architecture supports fast FP atomics
bool supportFastFPAtomics() override;
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index d2947e887f58a..d0399e3e7f22e 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -69,9 +69,10 @@ llvm::Value *CodeGenFunction::applyNoLoopInc(const Expr *Inc,
}
std::pair<const VarDecl *, Address>
-CodeGenFunction::EmitXteamRedStartingIndex(const ForStmt &FStmt) {
+CodeGenFunction::EmitBigJumpLoopStartingIndex(const ForStmt &FStmt) {
const CodeGenModule::NoLoopIntermediateStmts &Directives =
- CGM.getXteamRedStmts(&FStmt);
+ CGM.isXteamRedKernel(&FStmt) ? CGM.getXteamRedStmts(&FStmt)
+ : CGM.getBigJumpLoopStmts(&FStmt);
  assert(Directives.size() > 0 && isa<OMPLoopDirective>(Directives.back()) &&
         "Appropriate directive not found");
  const OMPLoopDirective &LD = *(cast<OMPLoopDirective>(Directives.back()));
@@ -86,7 +87,10 @@ CodeGenFunction::EmitXteamRedStartingIndex(const ForStmt &FStmt) {
llvm::Value *GpuThreadId = RT.getGPUThreadID(*this);
// workgroup_size
- llvm::Value *WorkGroupSize = RT.getXteamRedBlockSize(*this);
+ llvm::Value *WorkGroupSize =
+ CGM.isXteamRedKernel(&FStmt)
+ ? RT.getXteamRedBlockSize(*this, CGM.getXteamRedBlockSize(&FStmt))
+ : RT.getXteamRedBlockSize(*this, CGM.getBigJumpLoopBlockSize(&FStmt));
// workgroup_id
llvm::Value *WorkGroupId = RT.getGPUBlockID(*this);
@@ -106,20 +110,22 @@ CodeGenFunction::EmitXteamRedStartingIndex(const ForStmt &FStmt) {
Builder.CreateIntCast(GlobalGpuThreadId, IvAddr.getElementType(), false);
llvm::Value *Iv = Builder.CreateAdd(Gtid, Builder.CreateLoad(IvAddr));
- // Cache the thread specific initial loop iteration value and the number of
- // teams
- CGM.updateXteamRedKernel(&FStmt, Builder.CreateIntCast(Iv, Int64Ty, false),
- RT.getGPUNumBlocks(*this));
-
+ if (CGM.isXteamRedKernel(&FStmt)) {
+ // Cache the thread specific initial loop iteration value and the number of
+ // teams
+ CGM.updateXteamRedKernel(&FStmt, Builder.CreateIntCast(Iv, Int64Ty, false),
+ RT.getGPUNumBlocks(*this));
+ }
// Set the initial value of the loop iteration
Builder.CreateStore(Iv, IvAddr);
return std::make_pair(LoopVD, IvAddr);
}
-void CodeGenFunction::EmitXteamRedUpdates(const ForStmt &FStmt) {
+void CodeGenFunction::EmitBigJumpLoopUpdates(const ForStmt &FStmt) {
const CodeGenModule::NoLoopIntermediateStmts &Directives =
- CGM.getXteamRedStmts(&FStmt);
+ CGM.isXteamRedKernel(&FStmt) ? CGM.getXteamRedStmts(&FStmt)
+ : CGM.getBigJumpLoopStmts(&FStmt);
  assert(Directives.size() > 0 && isa<OMPLoopDirective>(Directives.back()) &&
         "Appropriate directive not found");
  const OMPLoopDirective &LD = *(cast<OMPLoopDirective>(Directives.back()));
@@ -128,18 +134,25 @@ void CodeGenFunction::EmitXteamRedUpdates(const ForStmt &FStmt) {
EmitIgnoredExpr(UE);
}
-void CodeGenFunction::EmitXteamRedInc(const ForStmt &FStmt,
- const VarDecl *LoopVD,
- const Address &NoLoopIvAddr) {
+void CodeGenFunction::EmitBigJumpLoopInc(const ForStmt &FStmt,
+ const VarDecl *LoopVD,
+ const Address &NoLoopIvAddr) {
const CodeGenModule::NoLoopIntermediateStmts &Directives =
- CGM.getXteamRedStmts(&FStmt);
+ CGM.isXteamRedKernel(&FStmt) ? CGM.getXteamRedStmts(&FStmt)
+ : CGM.getBigJumpLoopStmts(&FStmt);
  assert(Directives.size() > 0 && isa<OMPLoopDirective>(Directives.back()) &&
         "Appropriate directive not found");
  const OMPLoopDirective &LD = *(cast<OMPLoopDirective>(Directives.back()));
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
- llvm::Value *BlockSize = RT.getXteamRedBlockSize(*this);
- llvm::Value *NumBlocks = CGM.getXteamRedNumTeams(&FStmt);
+ llvm::Value *BlockSize =
+ CGM.isXteamRedKernel(&FStmt)
+ ? RT.getXteamRedBlockSize(*this, CGM.getXteamRedBlockSize(&FStmt))
+ : RT.getXteamRedBlockSize(*this, CGM.getBigJumpLoopBlockSize(&FStmt));
+
+ llvm::Value *NumBlocks = CGM.isXteamRedKernel(&FStmt)
+ ? CGM.getXteamRedNumTeams(&FStmt)
+ : RT.getGPUNumBlocks(*this);
assert(NumBlocks && "Number of blocks cannot be null");
// prod = block_size * num_blocks
llvm::Value *Prod = Builder.CreateMul(BlockSize, NumBlocks);
@@ -195,7 +208,6 @@ CodeGenFunction::EmitNoLoopIV(const OMPLoopDirective &LD) {
// Emit init of the iteration variable
EmitIgnoredExpr(LD.getInit());
-
return std::make_pair(IVDecl, GetAddrOfLocalVar(IVDecl));
}
@@ -290,6 +302,19 @@ void CodeGenFunction::EmitNoLoopKernel(const OMPExecutableDirective &D,
}
}
+void CodeGenFunction::EmitBigJumpLoopKernel(const OMPExecutableDirective &D,
+ SourceLocation Loc) {
+ if (!HaveInsertPoint())
+ EnsureInsertPoint();
+
+ // We expect one FOR stmt for the OpenMP directive
+ const ForStmt *CapturedForStmt = CGM.getSingleForStmt(D.getAssociatedStmt());
+ assert(CapturedForStmt && "Cannot generate kernel for null captured stmt");
+
+ // The BigJump loop will be generated during the following statement emit.
+ EmitStmt(CapturedForStmt);
+}
+
void CodeGenFunction::EmitXteamRedKernel(
const OMPExecutableDirective &D, const Stmt *S, const FunctionArgList &Args,
const CodeGenModule::NoLoopIntermediateStmts &IntermediateStmts,
@@ -315,7 +340,7 @@ void CodeGenFunction::EmitXteamRedKernel(
EmitStmt(CapturedForStmt);
// Now emit the calls to xteam_sum, one for each reduction variable
- EmitXteamRedSum(CapturedForStmt, Args);
+ EmitXteamRedSum(CapturedForStmt, Args, CGM.getXteamRedBlockSize(D));
// Xteam codegen done
CGM.setCurrentXteamRedStmt(nullptr);
@@ -355,7 +380,8 @@ void CodeGenFunction::EmitXteamLocalAggregator(const ForStmt *FStmt) {
// Emit __kmpc_xteam_sum(*xteam_red_local_addr, red_var_addr) for each reduction
// in the helper map for the given For Stmt
void CodeGenFunction::EmitXteamRedSum(const ForStmt *FStmt,
- const FunctionArgList &Args) {
+ const FunctionArgList &Args,
+ int BlockSize) {
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
const CodeGenModule::XteamRedVarMap &RedVarMap = CGM.getXteamRedVarMap(FStmt);
llvm::Value *ThreadStartIdx = CGM.getXteamRedThreadStartIndex(FStmt);
@@ -379,7 +405,7 @@ void CodeGenFunction::EmitXteamRedSum(const ForStmt *FStmt,
// Pass in OrigRedVarAddr.getPointer to kmpc_xteam_sum
RT.getXteamRedSum(*this, Builder.CreateLoad(RVI.RedVarAddr),
OrigRedVarAddr.getPointer(), DTeamVals, DTeamsDonePtr,
- ThreadStartIdx, NumTeams);
+ ThreadStartIdx, NumTeams, BlockSize);
}
}
@@ -832,8 +858,7 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef Attrs) {
    EmitOMPGenericLoopDirective(cast<OMPGenericLoopDirective>(*S));
break;
case Stmt::OMPTeamsGenericLoopDirectiveClass:
- llvm_unreachable("teams loop directive not supported yet.");
-    // EmitOMPTeamsGenericLoopDirective(cast<OMPTeamsGenericLoopDirective>(*S));
+    EmitOMPTeamsGenericLoopDirective(cast<OMPTeamsGenericLoopDirective>(*S));
break;
case Stmt::OMPTargetTeamsGenericLoopDirectiveClass:
EmitOMPTargetTeamsGenericLoopDirective(
@@ -844,9 +869,8 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef Attrs) {
        cast<OMPTargetTeamsGenericLoopDirective>(*S));
break;
case Stmt::OMPTargetParallelGenericLoopDirectiveClass:
- llvm_unreachable("target parallel loop directive not supported yet.");
- // EmitOMPTargetParallelGenericLoopDirective(
-    //     cast<OMPTargetParallelGenericLoopDirective>(*S));
+ EmitOMPTargetParallelGenericLoopDirective(
+        cast<OMPTargetParallelGenericLoopDirective>(*S));
break;
case Stmt::OMPParallelMaskedDirectiveClass:
llvm_unreachable("parallel masked directive not supported yet.");
@@ -1454,27 +1478,29 @@ void CodeGenFunction::EmitForStmt(const ForStmt &S,
LexicalScope ForScope(*this, S.getSourceRange());
- Address XteamRedIvAddr = Address::invalid();
+ Address BigJumpLoopIvAddr = Address::invalid();
const VarDecl *LoopVar = nullptr;
- const OMPLoopDirective *XteamLD = nullptr;
- if (CGM.getLangOpts().OpenMPIsDevice && CGM.isXteamRedKernel(&S)) {
+ const OMPLoopDirective *BigJumpLoopLD = nullptr;
+ if (CGM.getLangOpts().OpenMPIsDevice &&
+ (CGM.isXteamRedKernel(&S) || CGM.isBigJumpLoopKernel(&S))) {
const CodeGenModule::NoLoopIntermediateStmts &Directives =
- CGM.getXteamRedStmts(&S);
+ CGM.isXteamRedKernel(&S) ? CGM.getXteamRedStmts(&S)
+ : CGM.getBigJumpLoopStmts(&S);
    assert(Directives.size() > 0 && isa<OMPLoopDirective>(Directives.back()) &&
"Appropriate directive not found");
-    XteamLD = cast<OMPLoopDirective>(Directives.back());
+    BigJumpLoopLD = cast<OMPLoopDirective>(Directives.back());
    std::pair<const VarDecl *, Address> LoopVarInfo =
- EmitXteamRedStartingIndex(S);
+ EmitBigJumpLoopStartingIndex(S);
LoopVar = LoopVarInfo.first;
- XteamRedIvAddr = LoopVarInfo.second;
+ BigJumpLoopIvAddr = LoopVarInfo.second;
} else {
// Evaluate the first part before the loop.
if (S.getInit())
EmitStmt(S.getInit());
}
- const Expr *CondExpr = XteamLD ? XteamLD->getCond() : S.getCond();
+ const Expr *CondExpr = BigJumpLoopLD ? BigJumpLoopLD->getCond() : S.getCond();
// Start the loop with a block that tests the condition.
// If there's an increment, the continue scope will be overwritten
@@ -1559,18 +1585,21 @@ void CodeGenFunction::EmitForStmt(const ForStmt &S,
// a compound statement.
RunCleanupsScope BodyScope(*this);
- if (CGM.getLangOpts().OpenMPIsDevice && CGM.isXteamRedKernel(&S)) {
- EmitXteamRedUpdates(S);
- EmitOMPNoLoopBody(*XteamLD);
+ if (CGM.getLangOpts().OpenMPIsDevice &&
+ (CGM.isXteamRedKernel(&S) || CGM.isBigJumpLoopKernel(&S))) {
+ EmitBigJumpLoopUpdates(S);
+ EmitOMPNoLoopBody(*BigJumpLoopLD);
} else {
EmitStmt(S.getBody());
}
}
- if (CGM.getLangOpts().OpenMPIsDevice && CGM.isXteamRedKernel(&S)) {
+ if (CGM.getLangOpts().OpenMPIsDevice &&
+ (CGM.isXteamRedKernel(&S) || CGM.isBigJumpLoopKernel(&S))) {
EmitBlock(Continue.getBlock());
- EmitXteamRedInc(S, LoopVar,
- XteamRedIvAddr); // *iv = *iv + num_teams * num_threads
+ EmitBigJumpLoopInc(
+ S, LoopVar,
+ BigJumpLoopIvAddr); // *iv = *iv + num_teams * num_threads
} else {
// If there is an increment, emit it next.
if (S.getInc()) {
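[Reviewer note, not part of the patch] Conceptually, an illustrative source-level sketch rather than the emitted IR, the rewrite performed by `EmitBigJumpLoopStartingIndex` and `EmitBigJumpLoopInc` turns the captured loop into a grid-stride loop: every thread starts at its global index and advances by the whole grid, matching the `*iv = *iv + num_teams * num_threads` comment above:

```cpp
// Hedged sketch of the big-jump-loop schedule for a captured 'for' loop.
void process(int iv);
void big_jump_loop(int n, int global_tid, int num_teams, int num_threads) {
  for (int iv = global_tid; iv < n; iv += num_teams * num_threads)
    process(iv);
}
```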
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index c7076d35bbb7f..e1c93bf0a0d62 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -848,12 +848,15 @@ llvm::Function *CodeGenFunction::GenerateOpenMPCapturedStmtFunction(
// Generate specialized kernels for device only
if (CGM.getLangOpts().OpenMPIsDevice && D.hasAssociatedStmt() &&
- CGM.isNoLoopKernel(D.getAssociatedStmt())) {
+ (CGM.isNoLoopKernel(D.getAssociatedStmt()) ||
+ (FStmt && CGM.isBigJumpLoopKernel(FStmt)))) {
OMPPrivateScope PrivateScope(*this);
EmitOMPPrivateClause(D, PrivateScope);
(void)PrivateScope.Privatize();
-
- EmitNoLoopKernel(D, Loc);
+ if (CGM.isNoLoopKernel(D.getAssociatedStmt()))
+ EmitNoLoopKernel(D, Loc);
+ else
+ EmitBigJumpLoopKernel(D, Loc);
} else if (CGM.getLangOpts().OpenMPIsDevice && isXteamKernel) {
OMPPrivateScope PrivateScope(*this);
EmitOMPPrivateClause(D, PrivateScope);
@@ -8176,12 +8179,14 @@ void CodeGenFunction::EmitOMPTargetUpdateDirective(
/// A 'loop' construct is supposed to be a work distribution construct by
/// default unless its binding region is the innermost enclosing parallel
-/// region. For now, we are defaulting to work sharing as an experiment to
-/// determine how best to implement 'loop' and its combined forms especially
-/// as part of the 'target teams loop' directive). Note that this code is
-/// equivalent to how 'for' is implemented (when not using OpenMPIRBuilder).
+/// region, in which case it is a worksharing region. Because we currently
+/// have no way to know if this is true, for now emit them as inlined loops.
void CodeGenFunction::EmitOMPGenericLoopDirective(
const OMPLoopDirective &S) {
+#if 0
+ // TODO: A 'loop' construct is worksharing only if its binding region is
+ // the innermost enclosing parallel region. Until we can determine
+ // this, 'loop' should be emitted as inlined.
bool HasLastprivates = false;
auto &&CodeGen = [this, &S, &HasLastprivates]
(CodeGenFunction &CGF, PrePostActionTy &) {
@@ -8199,6 +8204,14 @@ void CodeGenFunction::EmitOMPGenericLoopDirective(
CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getBeginLoc(), OMPD_loop);
// Check for outer lastprivate conditional update.
checkForLastprivateConditionalUpdate(*this, S);
+#else
+ // Just inline the underlying statement for now.
+ auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    CGF.EmitStmt(cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+ };
+ OMPLexicalScope Scope(*this, S, OMPD_unknown);
+ CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_loop, CodeGen);
+#endif
}
/// Equivalent to 'parallel for' except for handling of clauses that don't
@@ -8225,24 +8238,14 @@ void CodeGenFunction::EmitOMPParallelGenericLoopDirective(
/// Emit code for 'teams loop'
void CodeGenFunction::EmitOMPTeamsGenericLoopDirective(
const OMPTeamsGenericLoopDirective &S) {
- auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
- Action.Enter(CGF);
- // FIXME: Should be able to emit with generic loop code, but it doesn't
- // work right now.
- CGF.EmitOMPGenericLoopDirective(S);
- };
- emitCommonOMPTeamsDirective(*this, S, OMPD_loop, CodeGen);
- emitPostUpdateForReductionClause(*this, S,
- [](CodeGenFunction &) { return nullptr; });
-}
-
-/// Emit code for 'target parallel loop'
-void CodeGenFunction::EmitOMPTargetParallelGenericLoopDirective(
- const OMPTargetParallelGenericLoopDirective &S) {
+ // For now, emit as the two combined directives 'parallel' and 'loop'.
+ // This is similar to what we do for 'target teams loop'. Eventually,
+ // 'distribute' will be added so that 'teams loop' fully emulates
+ // 'teams distribute parallel for'.
auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
CGF.EmitOMPParallelGenericLoopDirective(S);
};
- emitCommonOMPTargetDirective(*this, S, CodeGen);
+ emitCommonOMPTeamsDirective(*this, S, OMPD_loop, CodeGen);
}
static void emitTargetTeamsGenericLoopRegion(
@@ -8283,6 +8286,47 @@ void CodeGenFunction::EmitOMPTargetTeamsGenericLoopDeviceFunction(
"Target device function emission failed for 'target teams loop'.");
}
+static void emitTargetParallelGenericLoopRegion(
+ CodeGenFunction &CGF, const OMPTargetParallelGenericLoopDirective &S,
+ PrePostActionTy &Action) {
+ Action.Enter(CGF);
+ // Emit directive as a combined directive that consists of two implicit
+ // directives: 'parallel' with (worksharing) 'loop' directive.
+ auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+ Action.Enter(CGF);
+ CodeGenFunction::OMPCancelStackRAII CancelRegion(
+ CGF, OMPD_target_parallel_loop, /*hasCancel=*/false);
+ CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(), emitForLoopBounds,
+ emitDispatchForLoopBounds);
+ };
+ emitCommonOMPParallelDirective(CGF, S, OMPD_loop, CodeGen,
+ emitEmptyBoundParameters);
+}
+
+void CodeGenFunction::EmitOMPTargetParallelGenericLoopDeviceFunction(
+ CodeGenModule &CGM, StringRef ParentName,
+ const OMPTargetParallelGenericLoopDirective &S) {
+ // Emit target parallel loop region as a standalone region.
+ auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+ emitTargetParallelGenericLoopRegion(CGF, S, Action);
+ };
+ llvm::Function *Fn;
+ llvm::Constant *Addr;
+ // Emit target region as a standalone region.
+ CGM.getOpenMPRuntime().emitTargetOutlinedFunction(
+ S, ParentName, Fn, Addr, /*IsOffloadEntry=*/true, CodeGen);
+ assert(Fn && Addr && "Target device function emission failed.");
+}
+
+/// Emit code for 'target parallel loop'
+void CodeGenFunction::EmitOMPTargetParallelGenericLoopDirective(
+ const OMPTargetParallelGenericLoopDirective &S) {
+ auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+ emitTargetParallelGenericLoopRegion(CGF, S, Action);
+ };
+ emitCommonOMPTargetDirective(*this, S, CodeGen);
+}
+
void CodeGenFunction::EmitSimpleOMPExecutableDirective(
const OMPExecutableDirective &D) {
  if (const auto *SD = dyn_cast<OMPSimdDirective>(&D)) {
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 2af2c924371a5..6cba6927ef4a5 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -404,6 +404,7 @@ class CodeGenFunction : public CodeGenTypeCache {
return PostAllocaInsertPt;
}
+
/// API for captured statement code generation.
class CGCapturedStmtInfo {
public:
@@ -3240,6 +3241,9 @@ class CodeGenFunction : public CodeGenTypeCache {
/// conditions for a no-loop kernel are met.
void EmitNoLoopKernel(const OMPExecutableDirective &D, SourceLocation Loc);
+ void EmitBigJumpLoopKernel(const OMPExecutableDirective &D,
+ SourceLocation Loc);
+
/// EmitXteamRedKernel - For an OpenMP target reduction directive, emit the
/// kernel code assuming that related runtime environment variables can be
/// ignored.
@@ -3255,7 +3259,9 @@ class CodeGenFunction : public CodeGenTypeCache {
/// associated variables. Returns the loop iteration variable and its address.
  std::pair<const VarDecl *, Address> EmitNoLoopIV(const OMPLoopDirective &LD);
- void EmitXteamRedUpdates(const ForStmt &FStmt);
+ /// Emit updates of the original loop indices. Used by both
+ /// BigJumpLoop and Xteam reduction kernel codegen.
+ void EmitBigJumpLoopUpdates(const ForStmt &FStmt);
/// EmitSimpleStmt - Try to emit a "simple" statement which does not
/// necessarily require an insertion point or debug information; typically
@@ -3681,6 +3687,11 @@ class CodeGenFunction : public CodeGenTypeCache {
static void EmitOMPTargetTeamsGenericLoopDeviceFunction(CodeGenModule &CGM,
StringRef ParentName, const OMPTargetTeamsGenericLoopDirective &S);
+ /// Emit device code for the target parallel loop directive.
+ static void EmitOMPTargetParallelGenericLoopDeviceFunction(
+ CodeGenModule &CGM, StringRef ParentName,
+ const OMPTargetParallelGenericLoopDirective &S);
+
/// Emit the Stmt \p S and return its topmost canonical loop, if any.
/// TODO: The \p Depth paramter is not yet implemented and must be 1. In the
/// future it is meant to be the number of loops expected in the loop nests
@@ -4901,12 +4912,17 @@ class CodeGenFunction : public CodeGenTypeCache {
llvm::Value *applyNoLoopInc(const Expr *Inc, const VarDecl *IVDecl,
llvm::Value *CurrVal);
+ /// Emit the starting index of a BigJumpLoop which is used in
+ /// BigJumpLoop and Xteam reduction kernels.
  std::pair<const VarDecl *, Address>
- EmitXteamRedStartingIndex(const ForStmt &FStmt);
- void EmitXteamRedInc(const ForStmt &FStmt, const VarDecl *LoopVar,
- const Address &NoLoopIvAddr);
+ EmitBigJumpLoopStartingIndex(const ForStmt &FStmt);
+ /// Emit the increment of a BigJumpLoop which is used in BigJumpLoop
+ /// and Xteam reduction kernels.
+ void EmitBigJumpLoopInc(const ForStmt &FStmt, const VarDecl *LoopVar,
+ const Address &NoLoopIvAddr);
void EmitXteamLocalAggregator(const ForStmt *FStmt);
- void EmitXteamRedSum(const ForStmt *FStmt, const FunctionArgList &Args);
+ void EmitXteamRedSum(const ForStmt *FStmt, const FunctionArgList &Args,
+ int BlockSize);
bool EmitXteamRedStmt(const Stmt *S);
};
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index a2e8078b93ec3..b4e66ad64a9a9 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -584,6 +584,9 @@ void CodeGenModule::Release() {
"__amdgpu_device_library_preserve_asan_functions_ptr", nullptr,
llvm::GlobalVariable::NotThreadLocal);
addCompilerUsedGlobal(Var);
+ if (!getModule().getModuleFlag("amdgpu_hostcall")) {
+ getModule().addModuleFlag(llvm::Module::Override, "amdgpu_hostcall", 1);
+ }
}
// Emit amdgpu_code_object_version module flag, which is code object version
// times 100.
@@ -3314,12 +3317,14 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) {
bool UnifiedMemoryEnabled =
getOpenMPRuntime().hasRequiresUnifiedSharedMemory();
- if (*Res == OMPDeclareTargetDeclAttr::MT_To &&
+ if ((*Res == OMPDeclareTargetDeclAttr::MT_To ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
!UnifiedMemoryEnabled) {
(void)GetAddrOfGlobalVar(VD);
} else {
assert(((*Res == OMPDeclareTargetDeclAttr::MT_Link) ||
- (*Res == OMPDeclareTargetDeclAttr::MT_To &&
+ ((*Res == OMPDeclareTargetDeclAttr::MT_To ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
UnifiedMemoryEnabled)) &&
"Link clause or to clause with unified memory expected.");
(void)getOpenMPRuntime().getAddrOfDeclareTargetVar(VD);
@@ -7250,6 +7255,9 @@ void CodeGenModule::emitNxResult(std::string StatusMsg,
switch (Status) {
case NxSuccess:
break;
+ case NxNonSPMD:
+ StatusMsg += "Non-SPMD mode not supported";
+ break;
case NxOptionDisabled:
StatusMsg += "Command line option disabled";
break;
@@ -7304,6 +7312,9 @@ void CodeGenModule::emitNxResult(std::string StatusMsg,
case NxNonUnitStaticChunk:
StatusMsg += "Schedule clause with non-unit chunk size";
break;
+ case NxNonConcurrentOrder:
+ StatusMsg += "Non-concurrent order not supported";
+ break;
case NxUnsupportedRedType:
StatusMsg += "Unsupported reduction variable type";
break;
@@ -7329,6 +7340,9 @@ void CodeGenModule::emitNxResult(std::string StatusMsg,
case NxUnsupportedRedExpr:
StatusMsg += "Unsupported reduction expression found";
break;
+ case NxUnsupportedXteamRedThreadLimit:
+ StatusMsg += "Thread Limit less than 256 not supported";
+ break;
}
SourceLocation L = D.getBeginLoc();
@@ -7556,6 +7570,77 @@ CodeGenModule::getNoLoopForStmtStatus(const OMPExecutableDirective &D,
return NxSuccess;
}
+int CodeGenModule::getWorkGroupSizeSPMDHelper(const OMPExecutableDirective &D) {
+  // Honor the block size provided by a command-line option. This logic must
+  // be kept in sync with metadata generation. If the option is not specified
+  // on the command line, the value used will be 256.
+ int WorkGroupSz = getLangOpts().OpenMPGPUThreadsPerTeam;
+
+ // Cross team reduction blocksize default may be specified separately.
+ if (isXteamRedKernel(D))
+ WorkGroupSz = getLangOpts().OpenMPTargetXteamReductionBlockSize;
+
+ // Check block-size provided by thread_limit clause. We start with the
+ // maximum thread limit and lower it if user requests a lower thread limit.
+ int ThreadLimit = getTarget().getGridValue().GV_Max_WG_Size;
+  const auto *ThreadLimitClause = D.getSingleClause<OMPThreadLimitClause>();
+ if (ThreadLimitClause) {
+ Expr *ThreadLimitExpr = ThreadLimitClause->getThreadLimit();
+ clang::Expr::EvalResult Result;
+ if (ThreadLimitExpr->EvaluateAsInt(Result, getContext())) {
+ int ThreadLimitEval = Result.Val.getInt().getExtValue();
+ if (ThreadLimitEval > 0 && ThreadLimitEval < ThreadLimit)
+ ThreadLimit = ThreadLimitEval;
+ }
+ }
+
+  // If the command-line work-group size is less than any default or
+  // user-specified thread limit, it is honored; otherwise the thread limit
+  // determined above is used.
+ if (WorkGroupSz > ThreadLimit)
+ WorkGroupSz = ThreadLimit;
+
+  // Set the actual number of threads if the user requests a value different
+  // from the default. If the value is greater than the currently computed
+  // thread limit, cap the number of threads to the thread limit.
+ int NumThreads = getTarget().getGridValue().GV_Default_WG_Size;
+  const auto *NumThreadsClause = D.getSingleClause<OMPNumThreadsClause>();
+ if (NumThreadsClause) {
+ Expr *NumThreadsExpr = NumThreadsClause->getNumThreads();
+ clang::Expr::EvalResult Result;
+ if (NumThreadsExpr->EvaluateAsInt(Result, getContext())) {
+ NumThreads = Result.Val.getInt().getExtValue();
+ // Cap the number of threads to the current thread limit.
+ if (NumThreads > ThreadLimit)
+ NumThreads = ThreadLimit;
+      // The num_threads clause takes precedence over the command-line value:
+ WorkGroupSz = NumThreads;
+ }
+ }
+
+ // Sanitize the workgroup size received from the command line. Its default
+ // value is GV_Default_WG_Size.
+ if (WorkGroupSz < 1 || WorkGroupSz > ThreadLimit)
+ WorkGroupSz = getTarget().getGridValue().GV_Default_WG_Size;
+
+ return WorkGroupSz;
+}
+
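The precedence implemented by the helper above: start from the command-line work-group size, clamp it by the default or clause-provided thread limit, then let an evaluable num_threads clause override the result, itself capped by the thread limit. A standalone sketch of that chain, with illustrative names that are not part of the patch (the final sanitize-to-default step is omitted):

    #include <algorithm>
    // NumThreads < 0 means "no evaluable num_threads clause".
    static int workGroupSize(int CmdLineSz, int ThreadLimit, int NumThreads) {
      int WG = std::min(CmdLineSz, ThreadLimit); // command line, clamped
      if (NumThreads > 0)
        WG = std::min(NumThreads, ThreadLimit);  // num_threads wins, capped
      return WG;
    }

For example, a command-line size of 512 with thread_limit(256) and num_threads(128) yields 128.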
+int CodeGenModule::computeXteamRedBlockSize(const OMPExecutableDirective &D) {
+ int InitialBlockSize = getWorkGroupSizeSPMDHelper(D);
+ // We support block sizes 64, 128, 256, 512, and 1024 only for Xteam
+ // reduction.
+ if (InitialBlockSize < 128)
+ return 64;
+ if (InitialBlockSize < 256)
+ return 128;
+ if (InitialBlockSize < 512)
+ return 256;
+ if (InitialBlockSize < 1024)
+ return 512;
+ return 1024;
+}
+
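In effect, computeXteamRedBlockSize rounds the SPMD work-group size down to the nearest supported Xteam block size (90 -> 64, 300 -> 256, 1024 -> 1024). An equivalent standalone restatement, with an illustrative name:

    // Round down to the largest supported size <= Initial.
    static int roundDownXteamBlockSize(int Initial) {
      const int Supported[] = {1024, 512, 256, 128};
      for (int S : Supported)
        if (Initial >= S)
          return S;
      return 64; // everything below 128 falls back to the minimum
    }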
CodeGenModule::NoLoopXteamErr
CodeGenModule::getXteamRedForStmtStatus(const OMPExecutableDirective &D,
const Stmt *OMPStmt,
@@ -7617,10 +7702,26 @@ CodeGenModule::getNoLoopCompatibleOrderStatus(const OMPLoopDirective &LD) {
return NxSuccess;
}
+CodeGenModule::NoLoopXteamErr
+CodeGenModule::getXteamRedCompatibleThreadLimitStatus(
+ const OMPLoopDirective &LD) {
+  const auto *ThreadLimitClause = LD.getSingleClause<OMPThreadLimitClause>();
+ if (!ThreadLimitClause)
+ return NxSuccess;
+ Expr *ThreadLimitExpr = ThreadLimitClause->getThreadLimit();
+ clang::Expr::EvalResult Result;
+ if (ThreadLimitExpr->EvaluateAsInt(Result, getContext())) {
+ int ThreadLimitEval = Result.Val.getInt().getExtValue();
+ // We support thread limit >= 64
+ if (ThreadLimitEval > 63)
+ return NxSuccess;
+ }
+ return NxUnsupportedXteamRedThreadLimit;
+}
+
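For illustration, a combined directive that this check rejects, assuming the other Xteam reduction requirements are met:

    // thread_limit below 64 disables Xteam reduction codegen.
    #pragma omp target teams distribute parallel for thread_limit(32) \
        reduction(+ : sum)
    for (int i = 0; i < n; ++i)
      sum += a[i];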
CodeGenModule::NoLoopXteamErr
CodeGenModule::getNoLoopCombinedClausesStatus(const OMPExecutableDirective &D) {
if (D.hasClausesOfKind() ||
-      D.hasClausesOfKind<OMPNumTeamsClause>() ||
D.hasClausesOfKind() ||
D.hasClausesOfKind() ||
D.hasClausesOfKind() ||
@@ -7641,9 +7742,6 @@ CodeGenModule::NoLoopXteamErr CodeGenModule::getXteamRedCombinedClausesStatus(
if (D.hasClausesOfKind() ||
D.hasClausesOfKind() ||
D.hasClausesOfKind() ||
-      D.hasClausesOfKind<OMPNumTeamsClause>() ||
-      D.hasClausesOfKind<OMPNumThreadsClause>() ||
-      D.hasClausesOfKind<OMPThreadLimitClause>() ||
D.hasClausesOfKind() ||
D.hasClausesOfKind() ||
D.hasClausesOfKind() ||
@@ -7653,6 +7751,8 @@ CodeGenModule::NoLoopXteamErr CodeGenModule::getXteamRedCombinedClausesStatus(
return NxNotLoopDirective;
  const OMPLoopDirective &LD = cast<OMPLoopDirective>(D);
NoLoopXteamErr NxStatus = NxSuccess;
+ if ((NxStatus = getXteamRedCompatibleThreadLimitStatus(LD)))
+ return NxStatus;
if ((NxStatus = getNoLoopCompatibleOrderStatus(LD)))
return NxStatus;
return getNoLoopCompatibleSchedStatus(LD);
@@ -7776,7 +7876,8 @@ CodeGenModule::NoLoopXteamErr CodeGenModule::checkAndSetNoLoopTargetConstruct(
CodeGenModule::NoLoopXteamErr
CodeGenModule::checkAndSetNoLoopKernel(const OMPExecutableDirective &D) {
NoLoopXteamErr NxStatus = NxSuccess;
- if (!getLangOpts().OpenMPTargetIgnoreEnvVars)
+ if (!getLangOpts().OpenMPTargetIgnoreEnvVars ||
+ !getLangOpts().OpenMPNoNestedParallelism)
return NxOptionDisabled;
if (D.getDirectiveKind() !=
@@ -7807,7 +7908,21 @@ CodeGenModule::checkAndSetNoLoopKernel(const OMPExecutableDirective &D) {
NoLoopIntermediateStmts IntermediateStmts;
// Push top-level directive
IntermediateStmts.push_back(&D);
- setNoLoopKernel(AssocStmt, IntermediateStmts);
+
+  // Now determine whether this qualifies as a NoLoop or a BigJumpLoop
+  // kernel. BigJumpLoop is enabled whenever NoLoop is enabled. If the
+  // num_teams clause is specified, BigJumpLoop is chosen. If the command
+  // line option to force BigJumpLoop is used, it is preferred over NoLoop.
+  if (D.hasClausesOfKind<OMPNumTeamsClause>() ||
+ getLangOpts().OpenMPTargetBigJumpLoop) {
+ const ForStmt *FStmt = getSingleForStmt(AssocStmt);
+ assert(FStmt && "For stmt cannot be null");
+ BigJumpLoopKernels.insert(std::make_pair(
+ FStmt, BigJumpLoopKernelInfo(getWorkGroupSizeSPMDHelper(D),
+ IntermediateStmts)));
+ } else
+ setNoLoopKernel(AssocStmt, IntermediateStmts);
// All checks passed
return NxSuccess;
@@ -7816,7 +7931,8 @@ CodeGenModule::checkAndSetNoLoopKernel(const OMPExecutableDirective &D) {
CodeGenModule::NoLoopXteamErr
CodeGenModule::checkAndSetXteamRedKernel(const OMPExecutableDirective &D) {
NoLoopXteamErr NxStatus = NxSuccess;
- if (!getLangOpts().OpenMPTargetIgnoreEnvVars)
+ if (!getLangOpts().OpenMPTargetIgnoreEnvVars ||
+ !getLangOpts().OpenMPNoNestedParallelism)
return NxOptionDisabled;
// Allowing only a combined construct for now
@@ -7852,13 +7968,43 @@ CodeGenModule::checkAndSetXteamRedKernel(const OMPExecutableDirective &D) {
assert(FStmt && "For stmt cannot be null");
XteamRedKernels.insert(std::make_pair(
FStmt, XteamRedKernelInfo(/*ThreadStartIndex=*/nullptr,
- /*NumTeams=*/nullptr, IntermediateStmts,
+ /*NumTeams=*/nullptr,
+ /*BlockSize=*/0, IntermediateStmts,
RedVarMapPair.second)));
+  // The block size has to be computed after adding this kernel to the
+  // metadata above, since the computation below depends on that metadata.
+  // Compute the block size during device compilation only.
+ int BlockSize =
+ getLangOpts().OpenMPIsDevice ? computeXteamRedBlockSize(D) : 0;
+ if (BlockSize > 0)
+ updateXteamRedKernel(FStmt, BlockSize);
+
// All checks passed
return NxSuccess;
}
+bool CodeGenModule::isXteamRedKernel(const OMPExecutableDirective &D) {
+ if (!D.hasAssociatedStmt())
+ return false;
+ const ForStmt *FStmt = getSingleForStmt(D.getAssociatedStmt());
+ if (FStmt == nullptr)
+ return false;
+ return isXteamRedKernel(FStmt);
+}
+
+int CodeGenModule::getXteamRedBlockSize(const ForStmt *FStmt) {
+ assert(XteamRedKernels.find(FStmt) != XteamRedKernels.end() &&
+ "Metadata missing for Xteam kernel");
+ return XteamRedKernels.find(FStmt)->second.BlockSize;
+}
+
+int CodeGenModule::getXteamRedBlockSize(const OMPExecutableDirective &D) {
+ assert(isXteamRedKernel(D) && "Expected an Xteam reduction kernel");
+ const ForStmt *FStmt = getSingleForStmt(D.getAssociatedStmt());
+ return getXteamRedBlockSize(FStmt);
+}
+
void CodeGenModule::moveLazyEmissionStates(CodeGenModule *NewBuilder) {
assert(DeferredDeclsToEmit.empty() &&
"Should have emitted all decls deferred to emit.");
diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
index 04b9853989384..1bb0669e53a77 100644
--- a/clang/lib/CodeGen/CodeGenModule.h
+++ b/clang/lib/CodeGen/CodeGenModule.h
@@ -301,6 +301,7 @@ class CodeGenModule : public CodeGenTypeCache {
enum NoLoopXteamErr {
NxSuccess,
+ NxNonSPMD,
NxOptionDisabled,
NxUnsupportedDirective,
NxUnsupportedSplitDirective,
@@ -327,7 +328,8 @@ class CodeGenModule : public CodeGenTypeCache {
NxUnsupportedRedOp,
NxNoRedVar,
NxMultRedVar,
- NxUnsupportedRedExpr
+ NxUnsupportedRedExpr,
+ NxUnsupportedXteamRedThreadLimit
};
/// Top-level and nested OpenMP directives that may use no-loop codegen.
@@ -336,6 +338,16 @@ class CodeGenModule : public CodeGenTypeCache {
/// Map construct statement to the intermediate ones for no-loop codegen
  using NoLoopKernelMap = llvm::DenseMap<const Stmt *, NoLoopIntermediateStmts>;
+ struct BigJumpLoopKernelInfo {
+ BigJumpLoopKernelInfo(int BlkSz, NoLoopIntermediateStmts Stmts)
+ : BlockSize{BlkSz}, BigJumpLoopIntStmts{Stmts} {}
+
+ int BlockSize;
+ NoLoopIntermediateStmts BigJumpLoopIntStmts;
+ };
+  using BigJumpLoopKernelMap =
+      llvm::DenseMap<const Stmt *, BigJumpLoopKernelInfo>;
+
  /// Map a reduction variable to the corresponding metadata. The metadata
  /// contains the reduction expression, the corresponding Xteam local
  /// aggregator var,
@@ -349,16 +361,15 @@ class CodeGenModule : public CodeGenTypeCache {
size_t ArgPos;
};
using XteamRedVarMap = llvm::DenseMap;
- // using XteamRedKernelInfo = std::pair;
struct XteamRedKernelInfo {
- XteamRedKernelInfo(llvm::Value *TSI, llvm::Value *NT,
+ XteamRedKernelInfo(llvm::Value *TSI, llvm::Value *NT, int BlkSz,
NoLoopIntermediateStmts Stmts, XteamRedVarMap RVM)
- : ThreadStartIndex{TSI}, NumTeams{NT}, XteamIntStmts{Stmts},
- XteamRedVars{RVM} {}
+ : ThreadStartIndex{TSI}, NumTeams{NT}, BlockSize{BlkSz},
+ XteamIntStmts{Stmts}, XteamRedVars{RVM} {}
llvm::Value *ThreadStartIndex;
llvm::Value *NumTeams;
+ int BlockSize;
NoLoopIntermediateStmts XteamIntStmts;
XteamRedVarMap XteamRedVars;
};
@@ -410,6 +421,7 @@ class CodeGenModule : public CodeGenTypeCache {
const Stmt *CurrentXteamRedStmt = nullptr;
NoLoopKernelMap NoLoopKernels;
+ BigJumpLoopKernelMap BigJumpLoopKernels;
XteamRedKernelMap XteamRedKernels;
// A set of references that have only been seen via a weakref so far. This is
@@ -1622,6 +1634,9 @@ class CodeGenModule : public CodeGenTypeCache {
/// Given the order clause, can No-Loop code be generated?
NoLoopXteamErr getNoLoopCompatibleOrderStatus(const OMPLoopDirective &LD);
+ NoLoopXteamErr
+ getXteamRedCompatibleThreadLimitStatus(const OMPLoopDirective &LD);
+
/// Helper functions for generating a NoLoop kernel
/// For a captured statement, get the single For statement, if it exists,
/// otherwise return nullptr.
@@ -1661,11 +1676,34 @@ class CodeGenModule : public CodeGenTypeCache {
return NoLoopKernels.find(S) != NoLoopKernels.end();
}
+ /// Given a top-level target construct for BigJumpLoop codegen, get the
+ /// intermediate OpenMP constructs.
+ const NoLoopIntermediateStmts &getBigJumpLoopStmts(const Stmt *S) {
+ assert(isBigJumpLoopKernel(S));
+ return BigJumpLoopKernels.find(S)->second.BigJumpLoopIntStmts;
+ }
+
+  /// Get the cached block size to be used for this BigJumpLoop kernel.
+ int getBigJumpLoopBlockSize(const Stmt *S) {
+ assert(isBigJumpLoopKernel(S));
+ return BigJumpLoopKernels.find(S)->second.BlockSize;
+ }
+
+ /// Erase BigJumpLoop related metadata for the input statement.
+ void resetBigJumpLoopKernel(const Stmt *S) { BigJumpLoopKernels.erase(S); }
+ /// Is a BigJumpLoop kernel generated for the input statement?
+ bool isBigJumpLoopKernel(const Stmt *S) {
+ return BigJumpLoopKernels.find(S) != BigJumpLoopKernels.end();
+ }
+
/// If we are able to generate a Xteam reduction kernel for this directive,
/// return true, otherwise return false. If successful, metadata for the
/// reduction variables are created for subsequent codegen phases to work on.
NoLoopXteamErr checkAndSetXteamRedKernel(const OMPExecutableDirective &D);
+ /// Compute the block size to be used for a kernel
+ int getWorkGroupSizeSPMDHelper(const OMPExecutableDirective &D);
+
/// Given a ForStmt for which Xteam codegen will be done, return the
/// intermediate statements for a split directive.
const NoLoopIntermediateStmts &getXteamRedStmts(const Stmt *S) {
@@ -1715,12 +1753,22 @@ class CodeGenModule : public CodeGenTypeCache {
KernelInfo.NumTeams = NTeams;
}
+ void updateXteamRedKernel(const Stmt *S, int BlkSz) {
+ assert(isXteamRedKernel(S));
+ XteamRedKernels.find(S)->second.BlockSize = BlkSz;
+ }
+
+ // Get the already-computed block size used by Xteam reduction
+ int getXteamRedBlockSize(const ForStmt *FStmt);
+ int getXteamRedBlockSize(const OMPExecutableDirective &D);
+
  /// Erase Xteam reduction related metadata for the input statement.
  void resetXteamRedKernel(const Stmt *S) { XteamRedKernels.erase(S); }
  /// Are we generating an Xteam reduction kernel for the statement?
bool isXteamRedKernel(const Stmt *S) {
return XteamRedKernels.find(S) != XteamRedKernels.end();
}
+ bool isXteamRedKernel(const OMPExecutableDirective &D);
void setCurrentXteamRedStmt(const Stmt *S) { CurrentXteamRedStmt = S; }
const Stmt *getCurrentXteamRedStmt() { return CurrentXteamRedStmt; }
@@ -1924,6 +1972,9 @@ class CodeGenModule : public CodeGenTypeCache {
NoLoopXteamErr getNoLoopForStmtStatus(const OMPExecutableDirective &,
const Stmt *);
+ // Compute the block size used by Xteam reduction
+ int computeXteamRedBlockSize(const OMPExecutableDirective &D);
+
/// Top level checker for xteam reduction of the loop
NoLoopXteamErr getXteamRedForStmtStatus(const OMPExecutableDirective &,
const Stmt *, const XteamRedVarMap &);
diff --git a/clang/lib/Driver/Compilation.cpp b/clang/lib/Driver/Compilation.cpp
index adb9f8028bccf..babd38bb68155 100644
--- a/clang/lib/Driver/Compilation.cpp
+++ b/clang/lib/Driver/Compilation.cpp
@@ -28,6 +28,7 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
+#include <chrono>
#include <string>
#include <system_error>
#include <utility>
@@ -352,7 +353,8 @@ void Compilation::ExecuteJobs(const JobList &Jobs,
const Command *Next = nullptr;
while (!JS.IsDone(Next)) {
if (!Next) {
- std::this_thread::yield();
+      // Sleep, rather than yield, so we do not busy-wait.
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
continue;
}
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 8e2b40ae6b1c2..765d4af90b656 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -6007,15 +6007,16 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
!C.getArgs().hasArg(options::OPT__SLASH_Fo)) ||
CCGenDiagnostics) {
StringRef Name = llvm::sys::path::filename(BaseInput);
- std::pair Split = Name.split('.');
- SmallString<128> fname(Split.first.str().c_str());
+ size_t pos = Name.find_last_of(".");
+ StringRef PrefixName = Name.substr(0, pos);
+ SmallString<128> fname(PrefixName.str().c_str());
if (!BoundArch.empty()) {
fname += "-";
fname.append(BoundArch);
}
SmallString<128> TmpName;
const char *Suffix = nullptr;
- if (Split.second == "a")
+ if (Name.ends_with(".a"))
Suffix = "a";
else
Suffix = types::getTypeTempSuffix(JA.getType(), IsCLMode());
@@ -6035,11 +6036,11 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
}
} else {
if (MultipleArchs && !BoundArch.empty()) {
- TmpName = GetTemporaryDirectory(Split.first);
+ TmpName = GetTemporaryDirectory(PrefixName);
llvm::sys::path::append(TmpName,
- Split.first + "-" + BoundArch + "." + Suffix);
+ PrefixName + "-" + BoundArch + "." + Suffix);
} else {
- TmpName = GetTemporaryPath(Split.first, Suffix);
+ TmpName = GetTemporaryPath(PrefixName, Suffix);
}
}
return C.addTempFile(C.getArgs().MakeArgString(TmpName));
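The switch from split('.') to find_last_of('.') matters for multi-dot inputs: splitting on the first dot turns libfoo.cuda.a into prefix libfoo and suffix cuda.a, while the last-dot form keeps libfoo.cuda and still detects the .a archive suffix. A standalone sketch of the new behavior (simplified, illustrative):

    #include <string>
    // "libfoo.cuda.a" -> "libfoo.cuda"; if there is no dot, find_last_of
    // returns npos and substr(0, npos) keeps the whole name, matching the
    // StringRef logic above.
    static std::string prefixOf(const std::string &Name) {
      return Name.substr(0, Name.find_last_of('.'));
    }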
@@ -6123,7 +6124,11 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
NamedOutput =
MakeCLOutputFilename(C.getArgs(), Val, BaseName, types::TY_Object);
} else {
- const char *Suffix = types::getTypeTempSuffix(JA.getType(), IsCLMode());
+ const char *Suffix = nullptr;
+ if (BaseName.ends_with(".a"))
+ Suffix = "a";
+ else
+ Suffix = types::getTypeTempSuffix(JA.getType(), IsCLMode());
assert(Suffix && "All types used for output should have a suffix.");
std::string::size_type End = std::string::npos;
@@ -6179,9 +6184,10 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
// Must share the same path to conflict.
if (SameFile) {
StringRef Name = llvm::sys::path::filename(BaseInput);
- std::pair Split = Name.split('.');
+ size_t pos = Name.find_last_of(".");
+ StringRef PrefixName = Name.substr(0, pos);
std::string TmpName = GetTemporaryPath(
- Split.first, types::getTypeTempSuffix(JA.getType(), IsCLMode()));
+ PrefixName, types::getTypeTempSuffix(JA.getType(), IsCLMode()));
return C.addTempFile(C.getArgs().MakeArgString(TmpName));
}
}
diff --git a/clang/lib/Driver/ToolChains/AMDFlang.cpp b/clang/lib/Driver/ToolChains/AMDFlang.cpp
index 9c8c5aa9582e1..7c4b3d939d050 100644
--- a/clang/lib/Driver/ToolChains/AMDFlang.cpp
+++ b/clang/lib/Driver/ToolChains/AMDFlang.cpp
@@ -999,6 +999,13 @@ void AMDFlang::ConstructJob(Compilation &C, const JobAction &JA,
// Remove "noinline" attriblute
LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("183"); LowerCmdArgs.push_back("0x10");
+  // With -fopenmp-target-fast, instruct flang2 (via option -x 234) to use
+  // 32 teams for reduction tuning.
+ if (Args.hasFlag(options::OPT_fopenmp_target_fast,
+ options::OPT_fno_openmp_target_fast, false)) {
+ LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("234"); LowerCmdArgs.push_back("32");
+ }
+
// Set a -x flag for second part of Fortran frontend
for (Arg *A : Args.filtered(options::OPT_Mx_EQ)) {
A->claim();
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 96a1856e57682..dd81c408d006f 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -243,7 +243,8 @@ RocmInstallationDetector::getInstallationPathCandidates() {
}
// Some versions of the rocm llvm package install to /opt/rocm/llvm/bin
- if (ParentName == "llvm")
+ // Some versions of the aomp package install to /opt/rocm/aomp/bin
+ if (ParentName == "llvm" || ParentName.startswith("aomp"))
ParentDir = llvm::sys::path::parent_path(ParentDir);
// Some versions of the aomp package install to /opt/rocm/aomp/bin
  // and it seems ParentDir is already pointing to the correct place.
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h
index 0eac7869620b1..006fcae426d8b 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.h
+++ b/clang/lib/Driver/ToolChains/AMDGPU.h
@@ -52,7 +52,7 @@ namespace toolchains {
class LLVM_LIBRARY_VISIBILITY AMDGPUToolChain : public Generic_ELF {
protected:
  const std::map<options::ID, const StringRef> OptionsDefault;
- unsigned CodeObjectVersion = 4;
+ unsigned CodeObjectVersion = 5;
Tool *buildLinker() const override;
StringRef getOptionDefault(options::ID OptID) const {
auto opt = OptionsDefault.find(OptID);
diff --git a/clang/lib/Driver/ToolChains/AmdOptArgs.cpp b/clang/lib/Driver/ToolChains/AmdOptArgs.cpp
index 8f2fff3b4a278..fea33d0dd36c8 100644
--- a/clang/lib/Driver/ToolChains/AmdOptArgs.cpp
+++ b/clang/lib/Driver/ToolChains/AmdOptArgs.cpp
@@ -43,7 +43,6 @@ static bool hasLlvmAoccOption(const ArgList &Args) {
Flags.insert(std::make_pair("-mark-rv-outline", true));
Flags.insert(std::make_pair("-rv-outline", true));
Flags.insert(std::make_pair("-rv-depth", true));
- Flags.insert(std::make_pair("-rv-max-reg-size", true));
Flags.insert(std::make_pair("-enable-branch-combine", true));
Flags.insert(std::make_pair("-simplifycfg-no-storesink", true));
Flags.insert(std::make_pair("-inline-aggressive", true));
@@ -278,11 +277,6 @@ static bool checkForPropOpts(const ToolChain &TC, const Driver &D,
}
ClosedToolChainNeeded = true;
} else if ((MArch == "znver2") || (MArch == "znver3")) {
- // -rv-max-reg-size=256 around 5% gain on nab
- if (!checkOnly) {
- CmdArgs.push_back("-mllvm");
- CmdArgs.push_back("-rv-max-reg-size=256");
- }
ClosedToolChainNeeded = true;
}
}
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index d3157bdacf501..e763e5f3c2bfd 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -768,8 +768,8 @@ static bool isTargetFastUsed(const ArgList &Args) {
options::OPT_fno_openmp_target_fast, isOFastUsed(Args));
}
-/// Ignore possibility of runtime environment variables during kernel code
-/// generation at -O3 (and above) and -Ofast
+/// Ignore the possibility of runtime environment variables if either
+/// -fopenmp-target-fast or -Ofast is used.
static bool shouldIgnoreEnvVars(const ArgList &Args) {
if (Args.hasFlag(options::OPT_fno_openmp_target_fast,
options::OPT_fopenmp_target_fast, false))
@@ -778,29 +778,6 @@ static bool shouldIgnoreEnvVars(const ArgList &Args) {
if (isTargetFastUsed(Args))
return true;
- if (Arg *A = Args.getLastArg(options::OPT_O_Group)) {
- if (A->getOption().matches(options::OPT_O4))
- return true;
-
- if (A->getOption().matches(options::OPT_O0))
- return false;
-
- assert(A->getOption().matches(options::OPT_O) && "Must have a -O flag");
-
- StringRef S(A->getValue());
- if (S == "s")
- return false;
-
- if (S == "z")
- return false;
-
- unsigned OptLevel = 0;
- if (S.getAsInteger(10, OptLevel))
- return false;
-
- return OptLevel > 2;
- }
-
return false;
}
@@ -6257,6 +6234,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
Args.AddAllArgs(CmdArgs,
options::OPT_fopenmp_cuda_teams_reduction_recs_num_EQ);
Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_gpu_threads_per_team_EQ);
+ Args.AddAllArgs(CmdArgs,
+ options::OPT_fopenmp_target_xteam_reduction_blocksize_EQ);
if (Args.hasFlag(options::OPT_fopenmp_optimistic_collapse,
options::OPT_fno_openmp_optimistic_collapse,
/*Default=*/false))
@@ -6270,6 +6249,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
} else
CmdArgs.push_back("-fno-openmp-target-fast");
+
if (Args.hasFlag(options::OPT_fopenmp_target_ignore_env_vars,
options::OPT_fno_openmp_target_ignore_env_vars,
shouldIgnoreEnvVars(Args)))
@@ -6277,6 +6257,12 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
else
CmdArgs.push_back("-fno-openmp-target-ignore-env-vars");
+ if (Args.hasFlag(options::OPT_fopenmp_target_big_jump_loop,
+ options::OPT_fno_openmp_target_big_jump_loop, false))
+ CmdArgs.push_back("-fopenmp-target-big-jump-loop");
+ else
+ CmdArgs.push_back("-fno-openmp-target-big-jump-loop");
+
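A hypothetical invocation exercising both new cc1 forwards (the offload arch is only an example): clang -O3 -fopenmp --offload-arch=gfx90a -fopenmp-target-big-jump-loop -fopenmp-target-xteam-reduction-blocksize=512 test.c.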
// When in OpenMP offloading mode with NVPTX target, forward
// cuda-mode flag
if (Args.hasFlag(options::OPT_fopenmp_cuda_mode,
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 89a3dd6f8901c..5428d3f42705e 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -2267,7 +2267,7 @@ void tools::checkAMDGPUCodeObjectVersion(const Driver &D,
unsigned tools::getAMDGPUCodeObjectVersion(const Driver &D,
const llvm::opt::ArgList &Args) {
- unsigned CodeObjVer = 4; // default
+ unsigned CodeObjVer = 5; // default
if (auto *CodeObjArg = getAMDGPUCodeObjectArgument(D, Args)) {
if (CodeObjArg->getOption().getID() ==
options::OPT_mno_code_object_v3_legacy) {
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index 7315fd89ef0ec..9f78bdc003264 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -111,7 +111,12 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,
const llvm::opt::ArgList &Args) const {
// Construct lld command.
// The output from ld.lld is an HSA code object file.
- ArgStringList LldArgs{"-flavor", "gnu", "--no-undefined", "-shared",
+ ArgStringList LldArgs{"-flavor",
+ "gnu",
+ "-m",
+ "elf64_amdgpu",
+ "--no-undefined",
+ "-shared",
"-plugin-opt=-amdgpu-internalize-symbols"};
auto &TC = getToolChain();
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 0744e8df626cc..48f0bbfcabfb9 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -3465,6 +3465,11 @@ void CompilerInvocation::GenerateLangArgs(const LangOptions &Opts,
else
GenerateArg(Args, OPT_fno_openmp_target_ignore_env_vars, SA);
+ if (Opts.OpenMPTargetBigJumpLoop)
+ GenerateArg(Args, OPT_fopenmp_target_big_jump_loop, SA);
+ else
+ GenerateArg(Args, OPT_fno_openmp_target_big_jump_loop, SA);
+
if (Opts.OpenMPThreadSubscription)
GenerateArg(Args, OPT_fopenmp_assume_threads_oversubscription, SA);
@@ -3501,6 +3506,10 @@ void CompilerInvocation::GenerateLangArgs(const LangOptions &Opts,
GenerateArg(Args, OPT_fopenmp_gpu_threads_per_team_EQ,
Twine(Opts.OpenMPGPUThreadsPerTeam), SA);
+ if (Opts.OpenMPTargetXteamReductionBlockSize != 1024)
+ GenerateArg(Args, OPT_fopenmp_target_xteam_reduction_blocksize_EQ,
+ Twine(Opts.OpenMPTargetXteamReductionBlockSize), SA);
+
if (!Opts.OMPTargetTriples.empty()) {
std::string Targets;
llvm::raw_string_ostream OS(Targets);
@@ -3909,11 +3918,17 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
getLastArgIntValue(Args, options::OPT_fopenmp_gpu_threads_per_team_EQ,
Opts.OpenMPGPUThreadsPerTeam, Diags);
- // Turn ON at -O3 (and above) and -Ofast
+ Opts.OpenMPTargetXteamReductionBlockSize = getLastArgIntValue(
+ Args, options::OPT_fopenmp_target_xteam_reduction_blocksize_EQ,
+ Opts.OpenMPTargetXteamReductionBlockSize, Diags);
+
Opts.OpenMPTargetIgnoreEnvVars =
Args.hasFlag(options::OPT_fopenmp_target_ignore_env_vars,
- options::OPT_fno_openmp_target_ignore_env_vars,
- getOptimizationLevel(Args, IK, Diags) > 2);
+ options::OPT_fno_openmp_target_ignore_env_vars, false);
+
+ Opts.OpenMPTargetBigJumpLoop =
+ Args.hasFlag(options::OPT_fopenmp_target_big_jump_loop,
+ options::OPT_fno_openmp_target_big_jump_loop, false);
// Set the value of the debugging flag used in the new offloading device RTL.
// Set either by a specific value or to a default if not specified.
diff --git a/clang/lib/Headers/__clang_hip_math.h b/clang/lib/Headers/__clang_hip_math.h
index fb5ec3ab9c273..a6b17fc5ab6e9 100644
--- a/clang/lib/Headers/__clang_hip_math.h
+++ b/clang/lib/Headers/__clang_hip_math.h
@@ -166,7 +166,6 @@ uint64_t __make_mantissa(const char *__tagp) {
}
// BEGIN FLOAT
-#if defined(__cplusplus)
__DEVICE__
int abs(int __x) {
int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1);
@@ -182,7 +181,6 @@ long long llabs(long long __x) {
long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1);
return (__x ^ __sgn) - __sgn;
}
-#endif
__DEVICE__
float acosf(float __x) { return __ocml_acos_f32(__x); }
diff --git a/clang/lib/Headers/openmp_wrappers/complex b/clang/lib/Headers/openmp_wrappers/complex
index e785c028fdb8b..5ff428c3f3d76 100644
--- a/clang/lib/Headers/openmp_wrappers/complex
+++ b/clang/lib/Headers/openmp_wrappers/complex
@@ -17,17 +17,15 @@
#endif
// We require std::math functions in the complex builtins below.
-#ifdef __NVPTX__
#include <cmath>
-#define __CUDA__
+
+#ifdef __NVPTX__
#define __OPENMP_NVPTX__
#include <__clang_cuda_complex_builtins.h>
#undef __OPENMP_NVPTX__
#endif // __NVPTX__
#ifdef __AMDGCN__
-#include <__clang_hip_libdevice_declares.h>
-#define __ARCHTYPES__ amdgcn
#define __OPENMP_AMDGCN__
#include <__clang_cuda_complex_builtins.h>
#undef __OPENMP_AMDGCN__
@@ -47,7 +45,6 @@
// arithmetic and calls to non-complex functions, all of which we can then
// handle.
#ifndef _LIBCPP_STD_VER
-#ifndef _GLIBCXX_COMPLEX
#pragma omp begin declare variant match( \
device = {arch(amdgcn, nvptx, nvptx64)}, \
@@ -57,5 +54,4 @@
#pragma omp end declare variant
-#endif // _GLIBCXX_COMPLEX
#endif // _LIBCPP_STD_VER
diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
index e34bd8d7bca40..a768c4da504af 100644
--- a/clang/lib/Parse/ParseExprCXX.cpp
+++ b/clang/lib/Parse/ParseExprCXX.cpp
@@ -1291,7 +1291,22 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer(
if (getLangOpts().CUDA) {
// In CUDA code, GNU attributes are allowed to appear immediately after the
// "[...]", even if there is no "(...)" before the lambda body.
- MaybeParseGNUAttributes(D);
+ //
+ // Note that we support __noinline__ as a keyword in this mode and thus
+ // it has to be separately handled.
+ while (true) {
+ if (Tok.is(tok::kw___noinline__)) {
+ IdentifierInfo *AttrName = Tok.getIdentifierInfo();
+ SourceLocation AttrNameLoc = ConsumeToken();
+ Attr.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+ ParsedAttr::AS_Keyword);
+ } else if (Tok.is(tok::kw___attribute))
+ ParseGNUAttributes(Attr, nullptr, &D);
+ else
+ break;
+ }
+
+ D.takeAttributes(Attr);
}
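The accepted spelling after this change, mirroring the new CodeGenCUDA test added below (illustrative):

    // CUDA/HIP only: __noinline__ may follow the lambda introducer, alone,
    // repeated, or mixed with __attribute__((...)) lists.
    auto r = ([&] __device__ __noinline__ () { return a; })();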
// Helper to emit a warning if we see a CUDA host/device/global attribute
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 6f8a467b9a657..2388fe0393683 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -1801,9 +1801,21 @@ void Parser::ParseOMPDeclareTargetClauses(
<< getOpenMPClauseName(OMPC_indirect) << 0;
break;
}
- bool IsToOrLinkClause =
+ bool IsToEnterOrLinkClause =
OMPDeclareTargetDeclAttr::ConvertStrToMapTypeTy(ClauseName, MT);
- assert((!IsDeviceTypeClause || !IsToOrLinkClause) && "Cannot be both!");
+ assert((!IsDeviceTypeClause || !IsToEnterOrLinkClause) &&
+ "Cannot be both!");
+
+      // Starting with OpenMP 5.2, the `to` clause has been replaced by the
+      // `enter` clause.
+ if (getLangOpts().OpenMP >= 52 && ClauseName == "to") {
+ Diag(Tok, diag::err_omp_declare_target_unexpected_to_clause);
+ break;
+ }
+ if (getLangOpts().OpenMP <= 51 && ClauseName == "enter") {
+ Diag(Tok, diag::err_omp_declare_target_unexpected_enter_clause);
+ break;
+ }
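Concretely, for a global int x the two spellings are version-gated (illustrative):

    #pragma omp declare target to(x)    // OpenMP <= 5.1; error under 5.2
    #pragma omp declare target enter(x) // OpenMP >= 5.2; error under <= 5.1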
if (!IsDeviceTypeClause && !IsIndirectClause &&
DTCI.Kind == OMPD_begin_declare_target) {
@@ -1811,16 +1823,18 @@ void Parser::ParseOMPDeclareTargetClauses(
<< ClauseName << (getLangOpts().OpenMP >= 51 ? 3 : 0);
break;
}
- if (!IsDeviceTypeClause && !IsToOrLinkClause && !IsIndirectClause) {
- Diag(Tok, diag::err_omp_declare_target_unexpected_clause)
+ if (!IsDeviceTypeClause && !IsToEnterOrLinkClause && !IsIndirectClause) {
+ Diag(Tok, getLangOpts().OpenMP >= 52
+ ? diag::err_omp_declare_target_unexpected_clause_52
+ : diag::err_omp_declare_target_unexpected_clause)
<< ClauseName
- << (getLangOpts().OpenMP >= 51 ? 4
- : getLangOpts().OpenMP >= 50 ? 2
- : 1);
+ << (getLangOpts().OpenMP >= 51
+ ? 4
+ : getLangOpts().OpenMP >= 50 ? 2 : 1);
break;
}
- if (IsToOrLinkClause || IsIndirectClause)
+ if (IsToEnterOrLinkClause || IsIndirectClause)
HasToOrLinkOrIndirectClause = true;
if (IsIndirectClause) {
@@ -1884,7 +1898,9 @@ void Parser::ParseOMPDeclareTargetClauses(
}
if (!HasIdentifier && Tok.isNot(tok::annot_pragma_openmp_end)) {
Diag(Tok,
- diag::err_omp_declare_target_unexpected_clause_after_implicit_to);
+ getLangOpts().OpenMP >= 52
+ ? diag::err_omp_declare_target_wrong_clause_after_implicit_enter
+ : diag::err_omp_declare_target_wrong_clause_after_implicit_to);
break;
}
@@ -1899,7 +1915,10 @@ void Parser::ParseOMPDeclareTargetClauses(
  // For declare target, require at least 'to' or 'link' to be present.
if (DTCI.Kind == OMPD_declare_target && RequiresToOrLinkOrIndirectClause &&
!HasToOrLinkOrIndirectClause)
- Diag(DTCI.Loc, diag::err_omp_declare_target_missing_to_or_link_clause)
+ Diag(DTCI.Loc,
+ getLangOpts().OpenMP >= 52
+ ? diag::err_omp_declare_target_missing_enter_or_link_clause
+ : diag::err_omp_declare_target_missing_to_or_link_clause)
<< (getLangOpts().OpenMP >= 51 ? 1 : 0);
SkipUntil(tok::annot_pragma_openmp_end, StopBeforeMatch);
@@ -3970,7 +3989,8 @@ bool Parser::parseMapTypeModifiers(Sema::OpenMPVarListDataTy &Data) {
if (PP.LookAhead(0).is(tok::colon))
return false;
Diag(Tok, diag::err_omp_unknown_map_type_modifier)
- << (getLangOpts().OpenMP >= 51 ? 1 : 0)
+ << (getLangOpts().OpenMP >= 51 ? (getLangOpts().OpenMP >= 52 ? 2 : 1)
+ : 0)
<< getLangOpts().OpenMPExtensions;
ConsumeToken();
}
@@ -4159,6 +4179,7 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
return true;
bool HasIterator = false;
+ bool InvalidIterator = false;
bool NeedRParenForLinear = false;
BalancedDelimiterTracker LinearT(*this, tok::l_paren,
tok::annot_pragma_openmp_end);
@@ -4264,6 +4285,23 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
Data.ColonLoc = ConsumeToken();
}
} else if (Kind == OMPC_map) {
+ // Handle optional iterator map modifier.
+ if (Tok.is(tok::identifier) && PP.getSpelling(Tok) == "iterator") {
+ HasIterator = true;
+ EnterScope(Scope::OpenMPDirectiveScope | Scope::DeclScope);
+ Data.MapTypeModifiers.push_back(OMPC_MAP_MODIFIER_iterator);
+ Data.MapTypeModifiersLoc.push_back(Tok.getLocation());
+ ExprResult IteratorRes = ParseOpenMPIteratorsExpr();
+ Data.IteratorExpr = IteratorRes.get();
+ // Parse ','
+ ExpectAndConsume(tok::comma);
+ if (getLangOpts().OpenMP < 52) {
+ Diag(Tok, diag::err_omp_unknown_map_type_modifier)
+ << (getLangOpts().OpenMP >= 51 ? 1 : 0)
+ << getLangOpts().OpenMPExtensions;
+ InvalidIterator = true;
+ }
+ }
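The syntax being parsed is the OpenMP 5.2 iterator map-type modifier, for example (n and p are assumed declarations):

    // Maps the n elements reached through the iterator variable `it`.
    #pragma omp target map(iterator(it = 0:n), to : p[it])
    { /* ... */ }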
// Handle map type for map clause.
ColonProtectionRAIIObject ColonRAII(*this);
@@ -4293,6 +4331,12 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
}
if (Data.ExtraModifier == OMPC_MAP_unknown) {
Data.ExtraModifier = OMPC_MAP_tofrom;
+ if (getLangOpts().OpenMP >= 52) {
+ if (DKind == OMPD_target_enter_data)
+ Data.ExtraModifier = OMPC_MAP_to;
+ else if (DKind == OMPD_target_exit_data)
+ Data.ExtraModifier = OMPC_MAP_from;
+ }
Data.IsMapTypeImplicit = true;
}
@@ -4455,7 +4499,7 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
ExitScope();
return (Kind != OMPC_depend && Kind != OMPC_map && Vars.empty()) ||
(MustHaveTail && !Data.DepModOrTailExpr) || InvalidReductionId ||
- IsInvalidMapperModifier;
+ IsInvalidMapperModifier || InvalidIterator;
}
/// Parsing of OpenMP clause 'private', 'firstprivate', 'lastprivate',
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 2493b4a76d5e1..194f425626669 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -351,7 +351,8 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, ArrayRef Locs,
// [OpenMP 5.0], 2.19.7.3. declare mapper Directive, Restrictions
// List-items in map clauses on this construct may only refer to the declared
// variable var and entities that could be referenced by a procedure defined
- // at the same location
+ // at the same location.
+  // [OpenMP 5.2] Also allow iterator-declared variables.
  if (LangOpts.OpenMP && isa<VarDecl>(D) &&
      !isOpenMPDeclareMapperVarDeclAllowed(cast<VarDecl>(D))) {
Diag(Loc, diag::err_omp_declare_mapper_wrong_var)
@@ -5415,6 +5416,10 @@ ExprResult Sema::ActOnOMPIteratorExpr(Scope *S, SourceLocation IteratorKwLoc,
} else {
CurContext->addDecl(VD);
}
+
+  // Act on the iterator variable declaration.
+ ActOnOpenMPIteratorVarDecl(VD);
+
Expr *Begin = D.Range.Begin;
if (!IsDeclTyDependent && Begin && !Begin->isTypeDependent()) {
ExprResult BeginRes =
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index c093cf71b6e15..942f817d11c57 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -215,6 +215,7 @@ class DSAStackTy {
llvm::SmallVector
ImplicitDefaultFirstprivateFDs;
Expr *DeclareMapperVar = nullptr;
+    SmallVector<VarDecl *> IteratorVarDecls;
SharingMapTy(OpenMPDirectiveKind DKind, DeclarationNameInfo Name,
Scope *CurScope, SourceLocation Loc)
: Directive(DKind), DirectiveName(Name), CurScope(CurScope),
@@ -1140,6 +1141,22 @@ class DSAStackTy {
const SharingMapTy *Top = getTopOfStackOrNull();
return Top ? Top->DeclareMapperVar : nullptr;
}
+
+ /// Add a new iterator variable.
+ void addIteratorVarDecl(VarDecl *VD) {
+ SharingMapTy &StackElem = getTopOfStack();
+ StackElem.IteratorVarDecls.push_back(VD->getCanonicalDecl());
+ }
+ /// Check if variable declaration is an iterator VarDecl.
+ bool isIteratorVarDecl(const VarDecl *VD) const {
+ const SharingMapTy *Top = getTopOfStackOrNull();
+ if (!Top)
+ return false;
+
+ return llvm::any_of(Top->IteratorVarDecls, [VD](const VarDecl *IteratorVD) {
+ return IteratorVD == VD->getCanonicalDecl();
+ });
+ }
/// get captured field from ImplicitDefaultFirstprivateFDs
VarDecl *getImplicitFDCapExprDecl(const FieldDecl *FD) const {
const_iterator I = begin();
@@ -2702,6 +2719,24 @@ void Sema::finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller,
}
if (!LangOpts.OpenMPIsDevice && !LangOpts.OpenMPOffloadMandatory && DevTy &&
*DevTy == OMPDeclareTargetDeclAttr::DT_NoHost) {
+      // In OpenMP 5.2 or later, if the function has a host variant then allow
+      // that to be called instead.
+ auto &&HasHostAttr = [](const FunctionDecl *Callee) {
+ for (OMPDeclareVariantAttr *A :
+           Callee->specific_attrs<OMPDeclareVariantAttr>()) {
+        auto *DeclRefVariant = cast<DeclRefExpr>(A->getVariantFuncRef());
+        auto *VariantFD = cast<FunctionDecl>(DeclRefVariant->getDecl());
+        Optional<OMPDeclareTargetDeclAttr::DevTypeTy> DevTy =
+ OMPDeclareTargetDeclAttr::getDeviceType(
+ VariantFD->getMostRecentDecl());
+ if (!DevTy || *DevTy == OMPDeclareTargetDeclAttr::DT_Host)
+ return true;
+ }
+ return false;
+ };
+ if (getLangOpts().OpenMP >= 52 &&
+          Callee->hasAttr<OMPDeclareVariantAttr>() && HasHostAttr(Callee))
+ return;
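A sketch of the situation this early return now permits; the variant function fn_host is hypothetical:

    int fn_host(void);
    #pragma omp declare variant(fn_host) match(device = {kind(host)})
    int fn(void);
    #pragma omp declare target enter(fn) device_type(nohost)
    // OpenMP >= 5.2: a host-side call to fn() resolves to fn_host()
    // instead of hitting the nohost diagnostic emitted below.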
// Diagnose nohost function called during host codegen.
StringRef NoHostDevTy = getOpenMPSimpleClauseTypeName(
OMPC_device_type, OMPC_DEVICE_TYPE_nohost);
@@ -6043,7 +6078,7 @@ processImplicitMapsWithDefaultMappers(Sema &S, DSAStackTy *Stack,
CXXScopeSpec MapperIdScopeSpec;
DeclarationNameInfo MapperId;
if (OMPClause *NewClause = S.ActOnOpenMPMapClause(
- C->getMapTypeModifiers(), C->getMapTypeModifiersLoc(),
+ nullptr, C->getMapTypeModifiers(), C->getMapTypeModifiersLoc(),
MapperIdScopeSpec, MapperId, C->getMapType(),
/*IsMapTypeImplicit=*/true, SourceLocation(), SourceLocation(),
SubExprs, OMPVarListLocTy()))
@@ -6185,8 +6220,8 @@ StmtResult Sema::ActOnOpenMPExecutableDirective(
CXXScopeSpec MapperIdScopeSpec;
DeclarationNameInfo MapperId;
if (OMPClause *Implicit = ActOnOpenMPMapClause(
- OMPC_MAP_MODIFIER_unknown, SourceLocation(), MapperIdScopeSpec,
- MapperId, OMPC_MAP_tofrom,
+ nullptr, OMPC_MAP_MODIFIER_unknown, SourceLocation(),
+ MapperIdScopeSpec, MapperId, OMPC_MAP_tofrom,
/*IsMapTypeImplicit=*/true, SourceLocation(), SourceLocation(),
Exprs, OMPVarListLocTy(), /*NoDiagnose=*/true))
ClausesWithImplicit.emplace_back(Implicit);
@@ -6202,7 +6237,7 @@ StmtResult Sema::ActOnOpenMPExecutableDirective(
DeclarationNameInfo MapperId;
      auto Kind = static_cast<OpenMPMapClauseKind>(ClauseKindCnt);
if (OMPClause *Implicit = ActOnOpenMPMapClause(
- ImplicitMapModifiers[I], ImplicitMapModifiersLoc[I],
+ nullptr, ImplicitMapModifiers[I], ImplicitMapModifiersLoc[I],
MapperIdScopeSpec, MapperId, Kind, /*IsMapTypeImplicit=*/true,
SourceLocation(), SourceLocation(), ImplicitMap,
OMPVarListLocTy())) {
@@ -17584,7 +17619,7 @@ OMPClause *Sema::ActOnOpenMPVarListClause(OpenMPClauseKind Kind,
assert(0 <= ExtraModifier && ExtraModifier <= OMPC_MAP_unknown &&
"Unexpected map modifier.");
Res = ActOnOpenMPMapClause(
- Data.MapTypeModifiers, Data.MapTypeModifiersLoc,
+ Data.IteratorExpr, Data.MapTypeModifiers, Data.MapTypeModifiersLoc,
Data.ReductionOrMapperIdScopeSpec, Data.ReductionOrMapperId,
        static_cast<OpenMPMapClauseKind>(ExtraModifier), Data.IsMapTypeImplicit,
ExtraModifierLoc, ColonLoc, VarList, Locs);
@@ -21638,10 +21673,12 @@ static void checkMappableExpressionList(
// target enter data
// OpenMP [2.10.2, Restrictions, p. 99]
// A map-type must be specified in all map clauses and must be either
- // to or alloc.
+  // to or alloc. Starting with OpenMP 5.2, the default map type is `to` if
+ // no map type is present.
OpenMPDirectiveKind DKind = DSAS->getCurrentDirective();
if (DKind == OMPD_target_enter_data &&
- !(MapType == OMPC_MAP_to || MapType == OMPC_MAP_alloc)) {
+ !(MapType == OMPC_MAP_to || MapType == OMPC_MAP_alloc ||
+ SemaRef.getLangOpts().OpenMP >= 52)) {
SemaRef.Diag(StartLoc, diag::err_omp_invalid_map_type_for_directive)
<< (IsMapTypeImplicit ? 1 : 0)
<< getOpenMPSimpleClauseTypeName(OMPC_map, MapType)
@@ -21652,10 +21689,11 @@ static void checkMappableExpressionList(
// target exit_data
// OpenMP [2.10.3, Restrictions, p. 102]
// A map-type must be specified in all map clauses and must be either
- // from, release, or delete.
+  // from, release, or delete. Starting with OpenMP 5.2, the default map
+ // type is `from` if no map type is present.
if (DKind == OMPD_target_exit_data &&
!(MapType == OMPC_MAP_from || MapType == OMPC_MAP_release ||
- MapType == OMPC_MAP_delete)) {
+ MapType == OMPC_MAP_delete || SemaRef.getLangOpts().OpenMP >= 52)) {
SemaRef.Diag(StartLoc, diag::err_omp_invalid_map_type_for_directive)
<< (IsMapTypeImplicit ? 1 : 0)
<< getOpenMPSimpleClauseTypeName(OMPC_map, MapType)
@@ -21744,7 +21782,7 @@ static void checkMappableExpressionList(
}
OMPClause *Sema::ActOnOpenMPMapClause(
-    ArrayRef<OpenMPMapModifierKind> MapTypeModifiers,
+    Expr *IteratorModifier, ArrayRef<OpenMPMapModifierKind> MapTypeModifiers,
    ArrayRef<SourceLocation> MapTypeModifiersLoc,
CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId,
OpenMPMapClauseKind MapType, bool IsMapTypeImplicit, SourceLocation MapLoc,
@@ -21754,9 +21792,14 @@ OMPClause *Sema::ActOnOpenMPMapClause(
OpenMPMapModifierKind Modifiers[] = {
OMPC_MAP_MODIFIER_unknown, OMPC_MAP_MODIFIER_unknown,
OMPC_MAP_MODIFIER_unknown, OMPC_MAP_MODIFIER_unknown,
- OMPC_MAP_MODIFIER_unknown};
+ OMPC_MAP_MODIFIER_unknown, OMPC_MAP_MODIFIER_unknown};
SourceLocation ModifiersLoc[NumberOfOMPMapClauseModifiers];
+ if (IteratorModifier && !IteratorModifier->getType()->isSpecificBuiltinType(
+ BuiltinType::OMPIterator))
+ Diag(IteratorModifier->getExprLoc(),
+ diag::err_omp_map_modifier_not_iterator);
+
// Process map-type-modifiers, flag errors for duplicate modifiers.
unsigned Count = 0;
for (unsigned I = 0, E = MapTypeModifiers.size(); I < E; ++I) {
@@ -21780,11 +21823,11 @@ OMPClause *Sema::ActOnOpenMPMapClause(
// We need to produce a map clause even if we don't have variables so that
// other diagnostics related with non-existing map clauses are accurate.
- return OMPMapClause::Create(Context, Locs, MVLI.ProcessedVarList,
- MVLI.VarBaseDeclarations, MVLI.VarComponents,
- MVLI.UDMapperList, Modifiers, ModifiersLoc,
- MapperIdScopeSpec.getWithLocInContext(Context),
- MapperId, MapType, IsMapTypeImplicit, MapLoc);
+ return OMPMapClause::Create(
+ Context, Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations,
+ MVLI.VarComponents, MVLI.UDMapperList, IteratorModifier, Modifiers,
+ ModifiersLoc, MapperIdScopeSpec.getWithLocInContext(Context), MapperId,
+ MapType, IsMapTypeImplicit, MapLoc);
}
QualType Sema::ActOnOpenMPDeclareReductionType(SourceLocation TyLoc,
@@ -22178,6 +22221,11 @@ Sema::ActOnOpenMPDeclareMapperDirectiveVarDecl(Scope *S, QualType MapperType,
return E;
}
+void Sema::ActOnOpenMPIteratorVarDecl(VarDecl *VD) {
+ if (DSAStack->getDeclareMapperVarRef())
+ DSAStack->addIteratorVarDecl(VD);
+}
+
bool Sema::isOpenMPDeclareMapperVarDeclAllowed(const VarDecl *VD) const {
assert(LangOpts.OpenMP && "Expected OpenMP mode.");
const Expr *Ref = DSAStack->getDeclareMapperVarRef();
@@ -22186,6 +22234,8 @@ bool Sema::isOpenMPDeclareMapperVarDeclAllowed(const VarDecl *VD) const {
return true;
if (VD->isUsableInConstantExpressions(Context))
return true;
+ if (LangOpts.OpenMP >= 52 && DSAStack->isIteratorVarDecl(VD))
+ return true;
return false;
}
return true;
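Together with ActOnOpenMPIteratorVarDecl above, this lets an iterator variable appear inside a declare mapper's map clause, e.g. (illustrative):

    struct Vec { int len; double *data; };
    // OpenMP >= 5.2: `it` is an iterator VarDecl, so referencing it here
    // no longer triggers err_omp_declare_mapper_wrong_var.
    #pragma omp declare mapper(Vec v) map(tofrom : v) \
        map(iterator(it = 0:v.len), tofrom : v.data[it])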
@@ -22664,7 +22714,8 @@ static void checkDeclInTargetContext(SourceLocation SL, SourceRange SR,
(SemaRef.getCurLambda(/*IgnoreNonLambdaCapturingScope=*/true) ||
SemaRef.getCurBlock() || SemaRef.getCurCapturedRegion()) &&
VD->hasGlobalStorage()) {
- if (!MapTy || *MapTy != OMPDeclareTargetDeclAttr::MT_To) {
+ if (!MapTy || (*MapTy != OMPDeclareTargetDeclAttr::MT_To &&
+ *MapTy != OMPDeclareTargetDeclAttr::MT_Enter)) {
// OpenMP 5.0, 2.12.7 declare target Directive, Restrictions
// If a lambda declaration and definition appears between a
// declare target directive and the matching end declare target
@@ -22745,8 +22796,11 @@ void Sema::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D,
IsIndirect = true;
}
auto *A = OMPDeclareTargetDeclAttr::CreateImplicit(
- Context, OMPDeclareTargetDeclAttr::MT_To, DTCI.DT, IndirectE,
- IsIndirect, Level, SourceRange(DTCI.Loc, DTCI.Loc));
+ Context,
+ getLangOpts().OpenMP >= 52 ? OMPDeclareTargetDeclAttr::MT_Enter
+ : OMPDeclareTargetDeclAttr::MT_To,
+ DTCI.DT, IndirectE, IsIndirect, Level,
+ SourceRange(DTCI.Loc, DTCI.Loc));
D->addAttr(A);
if (ASTMutationListener *ML = Context.getASTMutationListener())
ML->DeclarationMarkedOpenMPDeclareTarget(D, A);
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index de2bb7734bc99..baa29bcbae718 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -3680,9 +3680,10 @@ TemplateDeclInstantiator::VisitOMPDeclareMapperDecl(OMPDeclareMapperDecl *D) {
OMPVarListLocTy Locs(OldC->getBeginLoc(), OldC->getLParenLoc(),
OldC->getEndLoc());
OMPClause *NewC = SemaRef.ActOnOpenMPMapClause(
- OldC->getMapTypeModifiers(), OldC->getMapTypeModifiersLoc(), SS,
- NewNameInfo, OldC->getMapType(), OldC->isImplicitMapType(),
- OldC->getMapLoc(), OldC->getColonLoc(), NewVars, Locs);
+ OldC->getIteratorModifier(), OldC->getMapTypeModifiers(),
+ OldC->getMapTypeModifiersLoc(), SS, NewNameInfo, OldC->getMapType(),
+ OldC->isImplicitMapType(), OldC->getMapLoc(), OldC->getColonLoc(),
+ NewVars, Locs);
Clauses.push_back(NewC);
}
SemaRef.EndOpenMPDSABlock(nullptr);
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index f0d3a5ca089a3..358f95fed7f60 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -1988,15 +1988,16 @@ class TreeTransform {
/// By default, performs semantic analysis to build the new OpenMP clause.
/// Subclasses may override this routine to provide different behavior.
OMPClause *RebuildOMPMapClause(
-      ArrayRef<OpenMPMapModifierKind> MapTypeModifiers,
+      Expr *IteratorModifier, ArrayRef<OpenMPMapModifierKind> MapTypeModifiers,
      ArrayRef<SourceLocation> MapTypeModifiersLoc,
CXXScopeSpec MapperIdScopeSpec, DeclarationNameInfo MapperId,
OpenMPMapClauseKind MapType, bool IsMapTypeImplicit,
      SourceLocation MapLoc, SourceLocation ColonLoc, ArrayRef<Expr *> VarList,
      const OMPVarListLocTy &Locs, ArrayRef<UnresolvedSet<8>> UnresolvedMappers) {
return getSema().ActOnOpenMPMapClause(
- MapTypeModifiers, MapTypeModifiersLoc, MapperIdScopeSpec, MapperId,
- MapType, IsMapTypeImplicit, MapLoc, ColonLoc, VarList, Locs,
+ IteratorModifier, MapTypeModifiers, MapTypeModifiersLoc,
+ MapperIdScopeSpec, MapperId, MapType, IsMapTypeImplicit, MapLoc,
+ ColonLoc, VarList, Locs,
/*NoDiagnose=*/false, UnresolvedMappers);
}
@@ -10227,6 +10228,13 @@ template
OMPClause *TreeTransform::TransformOMPMapClause(OMPMapClause *C) {
OMPVarListLocTy Locs(C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc());
  llvm::SmallVector<Expr *, 16> Vars;
+ Expr *IteratorModifier = C->getIteratorModifier();
+ if (IteratorModifier) {
+ ExprResult MapModRes = getDerived().TransformExpr(IteratorModifier);
+ if (MapModRes.isInvalid())
+ return nullptr;
+ IteratorModifier = MapModRes.get();
+ }
CXXScopeSpec MapperIdScopeSpec;
DeclarationNameInfo MapperIdInfo;
  llvm::SmallVector<UnresolvedSet<8>, 16> UnresolvedMappers;
@@ -10234,9 +10242,9 @@ OMPClause *TreeTransform::TransformOMPMapClause(OMPMapClause *C) {
*this, C, Vars, MapperIdScopeSpec, MapperIdInfo, UnresolvedMappers))
return nullptr;
return getDerived().RebuildOMPMapClause(
- C->getMapTypeModifiers(), C->getMapTypeModifiersLoc(), MapperIdScopeSpec,
- MapperIdInfo, C->getMapType(), C->isImplicitMapType(), C->getMapLoc(),
- C->getColonLoc(), Vars, Locs, UnresolvedMappers);
+ IteratorModifier, C->getMapTypeModifiers(), C->getMapTypeModifiersLoc(),
+ MapperIdScopeSpec, MapperIdInfo, C->getMapType(), C->isImplicitMapType(),
+ C->getMapLoc(), C->getColonLoc(), Vars, Locs, UnresolvedMappers);
}
template <typename Derived>
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 954f8ccebb82e..c43a885f88f47 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -10675,10 +10675,13 @@ void OMPClauseReader::VisitOMPDeviceClause(OMPDeviceClause *C) {
void OMPClauseReader::VisitOMPMapClause(OMPMapClause *C) {
C->setLParenLoc(Record.readSourceLocation());
+ bool HasIteratorModifier = false;
for (unsigned I = 0; I < NumberOfOMPMapClauseModifiers; ++I) {
C->setMapTypeModifier(
I, static_cast(Record.readInt()));
C->setMapTypeModifierLoc(I, Record.readSourceLocation());
+ if (C->getMapTypeModifier(I) == OMPC_MAP_MODIFIER_iterator)
+ HasIteratorModifier = true;
}
C->setMapperQualifierLoc(Record.readNestedNameSpecifierLoc());
C->setMapperIdInfo(Record.readDeclarationNameInfo());
@@ -10703,6 +10706,9 @@ void OMPClauseReader::VisitOMPMapClause(OMPMapClause *C) {
UDMappers.push_back(Record.readExpr());
C->setUDMapperRefs(UDMappers);
+ if (HasIteratorModifier)
+ C->setIteratorModifier(Record.readExpr());
+
  SmallVector<ValueDecl *, 16> Decls;
Decls.reserve(UniqueDecls);
for (unsigned i = 0; i < UniqueDecls; ++i)
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index b70eb9526e19b..e740006ca8fc5 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -6617,9 +6617,12 @@ void OMPClauseWriter::VisitOMPMapClause(OMPMapClause *C) {
Record.push_back(C->getTotalComponentListNum());
Record.push_back(C->getTotalComponentsNum());
Record.AddSourceLocation(C->getLParenLoc());
+ bool HasIteratorModifier = false;
for (unsigned I = 0; I < NumberOfOMPMapClauseModifiers; ++I) {
Record.push_back(C->getMapTypeModifier(I));
Record.AddSourceLocation(C->getMapTypeModifierLoc(I));
+ if (C->getMapTypeModifier(I) == OMPC_MAP_MODIFIER_iterator)
+ HasIteratorModifier = true;
}
Record.AddNestedNameSpecifierLoc(C->getMapperQualifierLoc());
Record.AddDeclarationNameInfo(C->getMapperIdInfo());
@@ -6630,6 +6633,8 @@ void OMPClauseWriter::VisitOMPMapClause(OMPMapClause *C) {
Record.AddStmt(E);
for (auto *E : C->mapperlists())
Record.AddStmt(E);
+ if (HasIteratorModifier)
+ Record.AddStmt(C->getIteratorModifier());
for (auto *D : C->all_decls())
Record.AddDeclRef(D);
for (auto N : C->all_num_lists())
diff --git a/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu b/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu
index 16505b34c4a6e..62ccc2bd4d05d 100644
--- a/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu
@@ -1,7 +1,7 @@
// Create module flag for code object version.
// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
-// RUN: -o - %s | FileCheck %s -check-prefix=V4
+// RUN: -o - %s | FileCheck %s -check-prefix=V5
// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
// RUN: -mcode-object-version=2 -o - %s | FileCheck -check-prefix=V2 %s
diff --git a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
index 4d788e6807ab2..847be23ba8e48 100644
--- a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
@@ -1,5 +1,5 @@
// RUN: %clang_cc1 -no-opaque-pointers -triple amdgcn-amd-amdhsa \
-// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
+// RUN: -fcuda-is-device -mcode-object-version=4 -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefix=PRECOV5 %s
diff --git a/clang/test/CodeGenCUDA/builtins-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
index 2278c26f0bcfd..f996aa24e470b 100644
--- a/clang/test/CodeGenCUDA/builtins-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
@@ -195,7 +195,7 @@ __device__ void func(float *x);
// CHECK-NEXT: [[TMP4:%.*]] = call contract float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false)
// CHECK-NEXT: store volatile float [[TMP4]], float* [[X_ASCAST]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load float*, float** [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: call void @_Z4funcPf(float* noundef [[TMP5]]) #[[ATTR8:[0-9]+]]
+// CHECK-NEXT: call void @_Z4funcPf(float* [[TMP5]]) #[[ATTR8:[0-9]+]]
// CHECK-NEXT: ret void
//
__global__ void test_ds_fmin_func(float src, float *__restrict shared) {
diff --git a/clang/test/CodeGenCUDA/lambda-noinline.cu b/clang/test/CodeGenCUDA/lambda-noinline.cu
new file mode 100644
index 0000000000000..de2196e63f074
--- /dev/null
+++ b/clang/test/CodeGenCUDA/lambda-noinline.cu
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -no-opaque-pointers -x hip -emit-llvm -std=c++11 %s -o - \
+// RUN: -triple x86_64-linux-gnu \
+// RUN: | FileCheck -check-prefix=HOST %s
+// RUN: %clang_cc1 -no-opaque-pointers -x hip -emit-llvm -std=c++11 %s -o - \
+// RUN: -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN: | FileCheck -check-prefix=DEV %s
+
+#include "Inputs/cuda.h"
+
+// Checks noinline is correctly added to the lambda function.
+
+// HOST: define{{.*}}@_ZZ4HostvENKUlvE_clEv({{.*}}) #[[ATTR:[0-9]+]]
+// HOST: attributes #[[ATTR]]{{.*}}noinline
+
+// DEV: define{{.*}}@_ZZ6DevicevENKUlvE_clEv({{.*}}) #[[ATTR:[0-9]+]]
+// DEV: attributes #[[ATTR]]{{.*}}noinline
+
+__device__ int a;
+int b;
+
+__device__ int Device() { return ([&] __device__ __noinline__ (){ return a; })(); }
+
+__host__ int Host() { return ([&] __host__ __noinline__ (){ return b; })(); }
diff --git a/clang/test/CodeGenCUDA/lambda.cu b/clang/test/CodeGenCUDA/lambda.cu
index c2012dc963558..01895d50b6810 100644
--- a/clang/test/CodeGenCUDA/lambda.cu
+++ b/clang/test/CodeGenCUDA/lambda.cu
@@ -51,8 +51,8 @@
// DEV-LABEL: define{{.*}} amdgpu_kernel void @_Z1gIZ12test_resolvevEUlvE_EvT_
// DEV: call void @_ZZ12test_resolvevENKUlvE_clEv
// DEV-LABEL: define internal void @_ZZ12test_resolvevENKUlvE_clEv
-// DEV: call noundef i32 @_Z10overloadedIiET_v
-// DEV-LABEL: define linkonce_odr noundef i32 @_Z10overloadedIiET_v
+// DEV: call i32 @_Z10overloadedIiET_v
+// DEV-LABEL: define linkonce_odr i32 @_Z10overloadedIiET_v
// DEV: ret i32 1
__device__ int a;
diff --git a/clang/test/CodeGenCUDA/unnamed-types.cu b/clang/test/CodeGenCUDA/unnamed-types.cu
index 6849df5a184ba..b59d5f448dde2 100644
--- a/clang/test/CodeGenCUDA/unnamed-types.cu
+++ b/clang/test/CodeGenCUDA/unnamed-types.cu
@@ -19,16 +19,16 @@ __device__ float d1(float x) {
}
// DEVICE: amdgpu_kernel void @_Z2k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_(
-// DEVICE: define internal noundef float @_ZZZ2f1PfENKUlS_E_clES_ENKUlfE_clEf(
+// DEVICE: define internal float @_ZZZ2f1PfENKUlS_E_clES_ENKUlfE_clEf(
template <typename F>
__global__ void k0(float *p, F f) {
p[0] = f(p[0]) + d0(p[1]) + d1(p[2]);
}
// DEVICE: amdgpu_kernel void @_Z2k1IZ2f1PfEUlfE_Z2f1S0_EUlffE_Z2f1S0_EUlfE0_EvS0_T_T0_T1_(
-// DEVICE: define internal noundef float @_ZZ2f1PfENKUlfE_clEf(
-// DEVICE: define internal noundef float @_ZZ2f1PfENKUlffE_clEff(
-// DEVICE: define internal noundef float @_ZZ2f1PfENKUlfE0_clEf(
+// DEVICE: define internal float @_ZZ2f1PfENKUlfE_clEf(
+// DEVICE: define internal float @_ZZ2f1PfENKUlffE_clEff(
+// DEVICE: define internal float @_ZZ2f1PfENKUlfE0_clEf(
template <typename F0, typename F1, typename F2>
__global__ void k1(float *p, F0 f0, F1 f1, F2 f2) {
p[0] = f0(p[0]) + f1(p[1], p[2]) + f2(p[3]);
diff --git a/clang/test/CodeGenHIP/debug-info-amdgcn-abi-heterogeneous-dwarf.hip b/clang/test/CodeGenHIP/debug-info-amdgcn-abi-heterogeneous-dwarf.hip
index bdcc1da781d69..4edbbdef72391 100644
--- a/clang/test/CodeGenHIP/debug-info-amdgcn-abi-heterogeneous-dwarf.hip
+++ b/clang/test/CodeGenHIP/debug-info-amdgcn-abi-heterogeneous-dwarf.hip
@@ -169,7 +169,7 @@ __device__ void Test_Func_StructTrivialCopyNoMove(StructTrivialCopyNoMove) {}
// CHECK-NOT: {{.*}}memcpy{{.*}}
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %[[#ALLOCA]]),
__global__ void Test_Kern_StructTrivialCopyNoMove(StructTrivialCopyNoMove) {}
-// CHECK: define dso_local void @_Z28Test_Func_StructNoCopyNoMove18StructNoCopyNoMove(ptr addrspace(5) noundef %[[#ARG:]])
+// CHECK: define dso_local void @_Z28Test_Func_StructNoCopyNoMove18StructNoCopyNoMove(ptr addrspace(5) %[[#ARG:]])
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %[[#ARG]]),
__device__ void Test_Func_StructNoCopyNoMove(StructNoCopyNoMove) {}
// CHECK: define dso_local amdgpu_kernel void @_Z28Test_Kern_StructNoCopyNoMove18StructNoCopyNoMove(i8 %.coerce)
@@ -277,7 +277,7 @@ __device__ void Test_Func_Struct9Bytes(StructNBytes<9>) {}
// CHECK: call void @llvm.memcpy.p0.p4.i64(ptr align 1 %{{.+}}, ptr addrspace(4) align 1 %{{.+}}, i64 9, i1 false)
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %coerce),
__global__ void Test_Kern_Struct9Bytes(StructNBytes<9>) {}
-// CHECK: define dso_local void @_Z23Test_Func_Struct64Bytes12StructNBytesILj64EE(ptr addrspace(5) noundef byval(%struct.StructNBytes.7) align 1 %0)
+// CHECK: define dso_local void @_Z23Test_Func_Struct64Bytes12StructNBytesILj64EE(ptr addrspace(5) byval(%struct.StructNBytes.7) align 1 %0)
// CHECK-NOT: alloca
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %0),
__device__ void Test_Func_Struct64Bytes(StructNBytes<64>) {}
@@ -287,196 +287,196 @@ __device__ void Test_Func_Struct64Bytes(StructNBytes<64>) {}
// CHECK: call void @llvm.memcpy.p0.p4.i64(ptr align 1 %{{.+}}, ptr addrspace(4) align 1 %{{.+}}, i64 64, i1 false)
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %coerce),
__global__ void Test_Kern_Struct64Bytes(StructNBytes<64>) {}
-// CHECK: define dso_local void @_Z15Test_Func_Int8Tc(i8 noundef signext %0)
+// CHECK: define dso_local void @_Z15Test_Func_Int8Tc(i8 signext %0)
// CHECK: %.addr = alloca i8, align 1, addrspace(5)
// CHECK: store i8 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__device__ void Test_Func_Int8T(int8_t) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z15Test_Kern_Int8Tc(i8 noundef %0)
+// CHECK: define dso_local amdgpu_kernel void @_Z15Test_Kern_Int8Tc(i8 %0)
// CHECK: %.addr = alloca i8, align 1, addrspace(5)
// CHECK: store i8 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__global__ void Test_Kern_Int8T(int8_t) {}
-// CHECK: define dso_local void @_Z16Test_Func_UInt8Th(i8 noundef zeroext %0)
+// CHECK: define dso_local void @_Z16Test_Func_UInt8Th(i8 zeroext %0)
// CHECK: %.addr = alloca i8, align 1, addrspace(5)
// CHECK: store i8 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__device__ void Test_Func_UInt8T(uint8_t) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z16Test_Kern_UInt8Th(i8 noundef %0)
+// CHECK: define dso_local amdgpu_kernel void @_Z16Test_Kern_UInt8Th(i8 %0)
// CHECK: %.addr = alloca i8, align 1, addrspace(5)
// CHECK: store i8 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__global__ void Test_Kern_UInt8T(uint8_t) {}
-// CHECK: define dso_local void @_Z16Test_Func_Int16Ts(i16 noundef signext %0)
+// CHECK: define dso_local void @_Z16Test_Func_Int16Ts(i16 signext %0)
// CHECK: %.addr = alloca i16, align 2, addrspace(5)
// CHECK: store i16 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__device__ void Test_Func_Int16T(int16_t) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z16Test_Kern_Int16Ts(i16 noundef %0)
+// CHECK: define dso_local amdgpu_kernel void @_Z16Test_Kern_Int16Ts(i16 %0)
// CHECK: %.addr = alloca i16, align 2, addrspace(5)
// CHECK: store i16 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__global__ void Test_Kern_Int16T(int16_t) {}
-// CHECK: define dso_local void @_Z17Test_Func_UInt16Tt(i16 noundef zeroext %0)
+// CHECK: define dso_local void @_Z17Test_Func_UInt16Tt(i16 zeroext %0)
// CHECK: %.addr = alloca i16, align 2, addrspace(5)
// CHECK: store i16 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__device__ void Test_Func_UInt16T(uint16_t) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z17Test_Kern_UInt16Tt(i16 noundef %0)
+// CHECK: define dso_local amdgpu_kernel void @_Z17Test_Kern_UInt16Tt(i16 %0)
// CHECK: %.addr = alloca i16, align 2, addrspace(5)
// CHECK: store i16 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__global__ void Test_Kern_UInt16T(uint16_t) {}
-// CHECK: define dso_local void @_Z16Test_Func_Int32Ti(i32 noundef %0)
+// CHECK: define dso_local void @_Z16Test_Func_Int32Ti(i32 %0)
// CHECK: %.addr = alloca i32, align 4, addrspace(5)
// CHECK: store i32 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__device__ void Test_Func_Int32T(int32_t) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z16Test_Kern_Int32Ti(i32 noundef %0)
+// CHECK: define dso_local amdgpu_kernel void @_Z16Test_Kern_Int32Ti(i32 %0)
// CHECK: %.addr = alloca i32, align 4, addrspace(5)
// CHECK: store i32 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__global__ void Test_Kern_Int32T(int32_t) {}
-// CHECK: define dso_local void @_Z17Test_Func_UInt32Tj(i32 noundef %0)
+// CHECK: define dso_local void @_Z17Test_Func_UInt32Tj(i32 %0)
// CHECK: %.addr = alloca i32, align 4, addrspace(5)
// CHECK: store i32 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__device__ void Test_Func_UInt32T(uint32_t) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z17Test_Kern_UInt32Tj(i32 noundef %0)
+// CHECK: define dso_local amdgpu_kernel void @_Z17Test_Kern_UInt32Tj(i32 %0)
// CHECK: %.addr = alloca i32, align 4, addrspace(5)
// CHECK: store i32 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__global__ void Test_Kern_UInt32T(uint32_t) {}
-// CHECK: define dso_local void @_Z16Test_Func_Int64Tl(i64 noundef %0)
+// CHECK: define dso_local void @_Z16Test_Func_Int64Tl(i64 %0)
// CHECK: %.addr = alloca i64, align 8, addrspace(5)
// CHECK: store i64 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__device__ void Test_Func_Int64T(int64_t) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z16Test_Kern_Int64Tl(i64 noundef %0)
+// CHECK: define dso_local amdgpu_kernel void @_Z16Test_Kern_Int64Tl(i64 %0)
// CHECK: %.addr = alloca i64, align 8, addrspace(5)
// CHECK: store i64 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__global__ void Test_Kern_Int64T(int64_t) {}
-// CHECK: define dso_local void @_Z17Test_Func_UInt64Tm(i64 noundef %0)
+// CHECK: define dso_local void @_Z17Test_Func_UInt64Tm(i64 %0)
// CHECK: %.addr = alloca i64, align 8, addrspace(5)
// CHECK: store i64 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__device__ void Test_Func_UInt64T(uint64_t) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z17Test_Kern_UInt64Tm(i64 noundef %0)
+// CHECK: define dso_local amdgpu_kernel void @_Z17Test_Kern_UInt64Tm(i64 %0)
// CHECK: %.addr = alloca i64, align 8, addrspace(5)
// CHECK: store i64 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__global__ void Test_Kern_UInt64T(uint64_t) {}
-// CHECK: define dso_local void @_Z19Test_Func_EnumInt8T9EnumInt8T(i8 noundef signext %0)
+// CHECK: define dso_local void @_Z19Test_Func_EnumInt8T9EnumInt8T(i8 signext %0)
// CHECK: %.addr = alloca i8, align 1, addrspace(5)
// CHECK: store i8 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__device__ void Test_Func_EnumInt8T(EnumInt8T) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z19Test_Kern_EnumInt8T9EnumInt8T(i8 noundef %0)
+// CHECK: define dso_local amdgpu_kernel void @_Z19Test_Kern_EnumInt8T9EnumInt8T(i8 %0)
// CHECK: %.addr = alloca i8, align 1, addrspace(5)
// CHECK: store i8 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__global__ void Test_Kern_EnumInt8T(EnumInt8T) {}
-// CHECK: define dso_local void @_Z20Test_Func_EnumUInt8T10EnumUInt8T(i8 noundef zeroext %0)
+// CHECK: define dso_local void @_Z20Test_Func_EnumUInt8T10EnumUInt8T(i8 zeroext %0)
// CHECK: %.addr = alloca i8, align 1, addrspace(5)
// CHECK: store i8 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__device__ void Test_Func_EnumUInt8T(EnumUInt8T) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z20Test_Kern_EnumUInt8T10EnumUInt8T(i8 noundef %0)
+// CHECK: define dso_local amdgpu_kernel void @_Z20Test_Kern_EnumUInt8T10EnumUInt8T(i8 %0)
// CHECK: %.addr = alloca i8, align 1, addrspace(5)
// CHECK: store i8 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__global__ void Test_Kern_EnumUInt8T(EnumUInt8T) {}
-// CHECK: define dso_local void @_Z20Test_Func_EnumInt16T10EnumInt16T(i16 noundef signext %0)
+// CHECK: define dso_local void @_Z20Test_Func_EnumInt16T10EnumInt16T(i16 signext %0)
// CHECK: %.addr = alloca i16, align 2, addrspace(5)
// CHECK: store i16 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__device__ void Test_Func_EnumInt16T(EnumInt16T) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z20Test_Kern_EnumInt16T10EnumInt16T(i16 noundef %0)
+// CHECK: define dso_local amdgpu_kernel void @_Z20Test_Kern_EnumInt16T10EnumInt16T(i16 %0)
// CHECK: %.addr = alloca i16, align 2, addrspace(5)
// CHECK: store i16 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__global__ void Test_Kern_EnumInt16T(EnumInt16T) {}
-// CHECK: define dso_local void @_Z21Test_Func_EnumUInt16T11EnumUInt16T(i16 noundef zeroext %0)
+// CHECK: define dso_local void @_Z21Test_Func_EnumUInt16T11EnumUInt16T(i16 zeroext %0)
// CHECK: %.addr = alloca i16, align 2, addrspace(5)
// CHECK: store i16 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__device__ void Test_Func_EnumUInt16T(EnumUInt16T) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z21Test_Kern_EnumUInt16T11EnumUInt16T(i16 noundef %0)
+// CHECK: define dso_local amdgpu_kernel void @_Z21Test_Kern_EnumUInt16T11EnumUInt16T(i16 %0)
// CHECK: %.addr = alloca i16, align 2, addrspace(5)
// CHECK: store i16 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__global__ void Test_Kern_EnumUInt16T(EnumUInt16T) {}
-// CHECK: define dso_local void @_Z20Test_Func_EnumInt32T10EnumInt32T(i32 noundef %0)
+// CHECK: define dso_local void @_Z20Test_Func_EnumInt32T10EnumInt32T(i32 %0)
// CHECK: %.addr = alloca i32, align 4, addrspace(5)
// CHECK: store i32 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__device__ void Test_Func_EnumInt32T(EnumInt32T) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z20Test_Kern_EnumInt32T10EnumInt32T(i32 noundef %0)
+// CHECK: define dso_local amdgpu_kernel void @_Z20Test_Kern_EnumInt32T10EnumInt32T(i32 %0)
// CHECK: %.addr = alloca i32, align 4, addrspace(5)
// CHECK: store i32 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__global__ void Test_Kern_EnumInt32T(EnumInt32T) {}
-// CHECK: define dso_local void @_Z21Test_Func_EnumUInt32T11EnumUInt32T(i32 noundef %0)
+// CHECK: define dso_local void @_Z21Test_Func_EnumUInt32T11EnumUInt32T(i32 %0)
// CHECK: %.addr = alloca i32, align 4, addrspace(5)
// CHECK: store i32 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__device__ void Test_Func_EnumUInt32T(EnumUInt32T) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z21Test_Kern_EnumUInt32T11EnumUInt32T(i32 noundef %0)
+// CHECK: define dso_local amdgpu_kernel void @_Z21Test_Kern_EnumUInt32T11EnumUInt32T(i32 %0)
// CHECK: %.addr = alloca i32, align 4, addrspace(5)
// CHECK: store i32 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__global__ void Test_Kern_EnumUInt32T(EnumUInt32T) {}
-// CHECK: define dso_local void @_Z20Test_Func_EnumInt64T10EnumInt64T(i64 noundef %0)
+// CHECK: define dso_local void @_Z20Test_Func_EnumInt64T10EnumInt64T(i64 %0)
// CHECK: %.addr = alloca i64, align 8, addrspace(5)
// CHECK: store i64 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__device__ void Test_Func_EnumInt64T(EnumInt64T) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z20Test_Kern_EnumInt64T10EnumInt64T(i64 noundef %0)
+// CHECK: define dso_local amdgpu_kernel void @_Z20Test_Kern_EnumInt64T10EnumInt64T(i64 %0)
// CHECK: %.addr = alloca i64, align 8, addrspace(5)
// CHECK: store i64 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__global__ void Test_Kern_EnumInt64T(EnumInt64T) {}
-// CHECK: define dso_local void @_Z21Test_Func_EnumUInt64T11EnumUInt64T(i64 noundef %0)
+// CHECK: define dso_local void @_Z21Test_Func_EnumUInt64T11EnumUInt64T(i64 %0)
// CHECK: %.addr = alloca i64, align 8, addrspace(5)
// CHECK: store i64 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__device__ void Test_Func_EnumUInt64T(EnumUInt64T) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z21Test_Kern_EnumUInt64T11EnumUInt64T(i64 noundef %0)
+// CHECK: define dso_local amdgpu_kernel void @_Z21Test_Kern_EnumUInt64T11EnumUInt64T(i64 %0)
// CHECK: %.addr = alloca i64, align 8, addrspace(5)
// CHECK: store i64 %0,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__global__ void Test_Kern_EnumUInt64T(EnumUInt64T) {}
-// CHECK: define dso_local void @_Z27Test_Func_PromotableIntegerb(i1 noundef zeroext %0)
+// CHECK: define dso_local void @_Z27Test_Func_PromotableIntegerb(i1 zeroext %0)
// CHECK: %.addr = alloca i8, align 1, addrspace(5)
// CHECK: %frombool = zext i1 %0 to i8
// CHECK: store i8 %frombool,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__device__ void Test_Func_PromotableInteger(bool) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z27Test_Kern_PromotableIntegerb(i1 noundef %0)
+// CHECK: define dso_local amdgpu_kernel void @_Z27Test_Kern_PromotableIntegerb(i1 %0)
// CHECK: %.addr = alloca i8, align 1, addrspace(5)
// CHECK: %frombool = zext i1 %0 to i8
// CHECK: store i8 %frombool,
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__global__ void Test_Kern_PromotableInteger(bool) {}
-// CHECK: define dso_local void @_Z17Test_Func_PointerPi(ptr noundef %0)
+// CHECK: define dso_local void @_Z17Test_Func_PointerPi(ptr %0)
// CHECK: %.addr = alloca ptr, align 8, addrspace(5)
// CHECK: store ptr %0, ptr %.addr.ascast, align 8
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__device__ void Test_Func_Pointer(int32_t *) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z17Test_Kern_PointerPi(ptr addrspace(1) noundef %.coerce)
+// CHECK: define dso_local amdgpu_kernel void @_Z17Test_Kern_PointerPi(ptr addrspace(1) %.coerce)
// CHECK: %.addr = alloca ptr, align 8, addrspace(5)
// FIXME: There is a store, load, store sequence through another alloca here,
// which I don't understand the intent of
// CHECK: store ptr
// call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__global__ void Test_Kern_Pointer(int32_t *) {}
-// CHECK: define dso_local void @_Z19Test_Func_ReferenceRi(ptr noundef nonnull align 4 dereferenceable(4) %0)
+// CHECK: define dso_local void @_Z19Test_Func_ReferenceRi(ptr nonnull align 4 dereferenceable(4) %0)
// CHECK: %.addr = alloca ptr, align 8, addrspace(5)
// CHECK: store ptr %0, ptr %.addr.ascast, align 8
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %.addr),
__device__ void Test_Func_Reference(int32_t &) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z19Test_Kern_ReferenceRi(ptr addrspace(1) noundef nonnull align 4 dereferenceable(4) %.coerce)
+// CHECK: define dso_local amdgpu_kernel void @_Z19Test_Kern_ReferenceRi(ptr addrspace(1) nonnull align 4 dereferenceable(4) %.coerce)
// CHECK: %.addr = alloca ptr, align 8, addrspace(5)
// FIXME: There is a store, load, store sequence through another alloca here,
// which I don't understand the intent of
@@ -504,23 +504,23 @@ __device__ void Test_Func_StructPointerElements(StructPointerElements) {}
// CHECK: call void @llvm.memcpy.p0.p4.i64(ptr align 8 %{{.+}}, ptr addrspace(4) align 8 %{{.+}}, i64 16, i1 false)
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %coerce),
__global__ void Test_Kern_StructPointerElements(StructPointerElements) {}
-// CHECK: define dso_local void @_Z37Test_Func_ParamRegLimitExpandedStructlllllli22StructMultipleElements(i64 noundef %0, i64 noundef %1, i64 noundef %2, i64 noundef %3, i64 noundef %4, i64 noundef %5, i32 noundef %6, i32 %.coerce0, i64 %.coerce1)
+// CHECK: define dso_local void @_Z37Test_Func_ParamRegLimitExpandedStructlllllli22StructMultipleElements(i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i32 %6, i32 %.coerce0, i64 %.coerce1)
// CHECK: %[[#ALLOCA:]] = alloca %struct.StructMultipleElements, align 8, addrspace(5)
// CHECK: store i32 %.coerce0,
// CHECK: store i64 %.coerce1,
// CHECK-NOT: {{.*}}memcpy{{.*}}
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %[[#ALLOCA]]),
__device__ void Test_Func_ParamRegLimitExpandedStruct(int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int32_t, StructMultipleElements) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z37Test_Kern_ParamRegLimitExpandedStructlllllli22StructMultipleElements(i64 noundef %0, i64 noundef %1, i64 noundef %2, i64 noundef %3, i64 noundef %4, i64 noundef %5, i32 noundef %6, ptr addrspace(4) byref(%struct.StructMultipleElements) align 8 %7)
+// CHECK: define dso_local amdgpu_kernel void @_Z37Test_Kern_ParamRegLimitExpandedStructlllllli22StructMultipleElements(i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i32 %6, ptr addrspace(4) byref(%struct.StructMultipleElements) align 8 %7)
// CHECK: %coerce = alloca %struct.StructMultipleElements, align 8, addrspace(5)
// CHECK: call void @llvm.memcpy.p0.p4.i64(ptr align 8 %{{.+}}, ptr addrspace(4) align 8 %{{.+}}, i64 16, i1 false)
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %coerce),
__global__ void Test_Kern_ParamRegLimitExpandedStruct(int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int32_t, StructMultipleElements) {}
-// CHECK: define dso_local void @_Z39Test_Func_ParamRegLimitUnexpandedStructlllllll22StructMultipleElements(i64 noundef %0, i64 noundef %1, i64 noundef %2, i64 noundef %3, i64 noundef %4, i64 noundef %5, i64 noundef %6, ptr addrspace(5) noundef byval(%struct.StructMultipleElements) align 8 %7)
+// CHECK: define dso_local void @_Z39Test_Func_ParamRegLimitUnexpandedStructlllllll22StructMultipleElements(i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, ptr addrspace(5) byval(%struct.StructMultipleElements) align 8 %7)
// CHECK-NOT: {{.*}}memcpy{{.*}}
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %7),
__device__ void Test_Func_ParamRegLimitUnexpandedStruct(int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, StructMultipleElements) {}
-// CHECK: define dso_local amdgpu_kernel void @_Z39Test_Kern_ParamRegLimitUnexpandedStructlllllll22StructMultipleElements(i64 noundef %0, i64 noundef %1, i64 noundef %2, i64 noundef %3, i64 noundef %4, i64 noundef %5, i64 noundef %6, ptr addrspace(4) byref(%struct.StructMultipleElements) align 8 %7)
+// CHECK: define dso_local amdgpu_kernel void @_Z39Test_Kern_ParamRegLimitUnexpandedStructlllllll22StructMultipleElements(i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, ptr addrspace(4) byref(%struct.StructMultipleElements) align 8 %7)
// CHECK: %coerce = alloca %struct.StructMultipleElements, align 8, addrspace(5)
// CHECK: call void @llvm.memcpy.p0.p4.i64(ptr align 8 %{{.+}}, ptr addrspace(4) align 8 %{{.+}}, i64 16, i1 false)
// CHECK: call void @llvm.dbg.def(metadata !{{[0-9]+}}, metadata ptr addrspace(5) %coerce),
diff --git a/clang/test/CodeGenHIP/hipspv-addr-spaces.cpp b/clang/test/CodeGenHIP/hipspv-addr-spaces.cpp
index 486e1606e7ba3..72071973a81de 100644
--- a/clang/test/CodeGenHIP/hipspv-addr-spaces.cpp
+++ b/clang/test/CodeGenHIP/hipspv-addr-spaces.cpp
@@ -25,30 +25,30 @@ __device__ struct foo_t {
// Check literals are placed in address space 1 (CrossWorkGroup/__global).
// CHECK: @.str ={{.*}} unnamed_addr addrspace(1) constant
-// CHECK: define{{.*}} spir_func noundef i32 addrspace(4)* @_Z3barPi(i32 addrspace(4)*
+// CHECK: define{{.*}} spir_func i32 addrspace(4)* @_Z3barPi(i32 addrspace(4)*
__device__ int* bar(int *x) {
return x;
}
-// CHECK: define{{.*}} spir_func noundef i32 addrspace(4)* @_Z5baz_dv()
+// CHECK: define{{.*}} spir_func i32 addrspace(4)* @_Z5baz_dv()
__device__ int* baz_d() {
// CHECK: ret i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @d to i32 addrspace(4)*
return &d;
}
-// CHECK: define{{.*}} spir_func noundef i32 addrspace(4)* @_Z5baz_cv()
+// CHECK: define{{.*}} spir_func i32 addrspace(4)* @_Z5baz_cv()
__device__ int* baz_c() {
// CHECK: ret i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @c to i32 addrspace(4)*
return &c;
}
-// CHECK: define{{.*}} spir_func noundef i32 addrspace(4)* @_Z5baz_sv()
+// CHECK: define{{.*}} spir_func i32 addrspace(4)* @_Z5baz_sv()
__device__ int* baz_s() {
// CHECK: ret i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @s to i32 addrspace(4)*
return &s;
}
-// CHECK: define{{.*}} spir_func noundef i8 addrspace(4)* @_Z3quzv()
+// CHECK: define{{.*}} spir_func i8 addrspace(4)* @_Z3quzv()
__device__ const char* quz() {
return "abc";
}
diff --git a/clang/test/CodeGenHIP/maybe_undef-attr-verify.hip b/clang/test/CodeGenHIP/maybe_undef-attr-verify.hip
index afa461f909529..b5cbbd52497ea 100644
--- a/clang/test/CodeGenHIP/maybe_undef-attr-verify.hip
+++ b/clang/test/CodeGenHIP/maybe_undef-attr-verify.hip
@@ -9,11 +9,11 @@
// CHECK-NEXT: [[TMP4:%.*]] = addrspacecast i32 addrspace(5)* [[TMP2:%.*]] to i32*
// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3:%.*]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = freeze i32 [[TMP5:%.*]]
-// CHECK-NEXT: %call = call noundef i32 @_Z11__shfl_synciii(i32 noundef [[TMP6:%.*]], i32 noundef 64, i32 noundef 0) #4
+// CHECK-NEXT: %call = call i32 @_Z11__shfl_synciii(i32 [[TMP6:%.*]], i32 64, i32 0) #4
// CHECK-NEXT: store i32 %call, i32* [[TMP4:%.*]], align 4
// CHECK-NEXT: ret void
-// CHECK: define linkonce_odr noundef i32 @_Z11__shfl_synciii(i32 noundef [[TMP1:%.*]], i32 noundef [[TMP2:%.*]], i32 noundef [[TMP3:%.*]])
+// CHECK: define linkonce_odr i32 @_Z11__shfl_synciii(i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], i32 [[TMP3:%.*]])
#define __global__ __attribute__((global))
#define __device__ __attribute__((device))
diff --git a/clang/test/CodeGenHIP/noundef-attr-verify.hip b/clang/test/CodeGenHIP/noundef-attr-verify.hip
new file mode 100644
index 0000000000000..985f7a773a8c5
--- /dev/null
+++ b/clang/test/CodeGenHIP/noundef-attr-verify.hip
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -no-opaque-pointers -triple amdgcn-amd-amdhsa -target-cpu gfx906 -x hip -fcuda-is-device -emit-llvm %s \
+// RUN: -o - | FileCheck %s
+
+#define __global__ __attribute__((global))
+#define __device__ __attribute__((device))
+#define WARP_SIZE 64
+
+static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
+
+__device__ static inline unsigned int __lane_id() {
+ return __builtin_amdgcn_mbcnt_hi(
+ -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
+}
+
+__device__
+inline
+int __shfl(int var, int src_lane, int width = warpSize) {
+ int self = __lane_id();
+ int index = src_lane + (self & ~(width-1));
+ return __builtin_amdgcn_ds_bpermute(index<<2, var);
+}
+
+template <typename T>
+static __device__
+T __shfl_sync(unsigned mask, T val, int src_line, int width=WARP_SIZE)
+{
+ return __shfl(val, src_line, width);
+}
+
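+// The CHECK line below only matches if neither the call result nor its
+// arguments carry the noundef attribute.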
+// CHECK-LABEL: @_Z13shufflekernelv(
+// CHECK: call i32 @_ZL11__shfl_syncIiET_jS0_ii(i32 64, i32 %0, i32 0, i32 64)
+
+__global__ void
+shufflekernel()
+{
+ int res, t;
+ res = __shfl_sync(WARP_SIZE, t, 0);
+}
diff --git a/clang/test/CodeGenHIP/unsafe-atomic-ops-gfx90a.hip b/clang/test/CodeGenHIP/unsafe-atomic-ops-gfx90a.hip
new file mode 100644
index 0000000000000..c071d197b336b
--- /dev/null
+++ b/clang/test/CodeGenHIP/unsafe-atomic-ops-gfx90a.hip
@@ -0,0 +1,21 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -munsafe-fp-atomics -target-cpu gfx90a -x hip -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx90a -x hip -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
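+
+// Both RUN lines share the same CHECK prefix: the lowering below is expected
+// with and without -munsafe-fp-atomics.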
+
+#define __global__ __attribute__((global))
+#define __device__ __attribute__((device))
+
+// CHECK-LABEL: @_Z15unsafeAtomicAddPff(ptr %addr, float %value
+__device__ inline float unsafeAtomicAdd(float* addr, float value) {
+ // CHECK: %[[ADDR_ADDR:.*]] = alloca ptr, align 8, addrspace(5)
+ // CHECK: %[[ADDR_ADDR_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[ADDR_ADDR]] to ptr
+ // CHECK: %[[ADDR_PTR:.*]] = load ptr, ptr %[[ADDR_ADDR_ASCAST]], align 8
+ // CHECK: %[[ADDR:.*]] = addrspacecast ptr %[[ADDR_PTR]] to ptr addrspace(3)
+ // CHECK: call contract float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %[[ADDR]]
+ return __builtin_amdgcn_ds_atomic_fadd_f32(addr, value);
+}
+
+__global__ void test_global_atomic_add_f32(float *val){
+ float *rtn;
+ *rtn = unsafeAtomicAdd(val, 1.0);
+}
diff --git a/clang/test/CodeGenOpenCL/amdgpu-debug-info-struct-function-arg.cl b/clang/test/CodeGenOpenCL/amdgpu-debug-info-struct-function-arg.cl
new file mode 100644
index 0000000000000..0f8764ad30c13
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/amdgpu-debug-info-struct-function-arg.cl
@@ -0,0 +1,36 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang -g -target amdgcn-amd-amdhsa -march=gfx900 -O0 -nogpulib %s -c -o - | llvm-dwarfdump -v -debug-info - | FileCheck "%s"
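+// Checks that the by-value struct parameter "data" gets a formal-parameter
+// DIE whose DW_AT_type resolves to the structure type "bar".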
+// CHECK: DW_TAG_subprogram
+// CHECK: DW_AT_name [DW_FORM_strx1]
+// CHECK-SAME: (indexed ({{[0-9a-f]+}}) string = "foo")
+//
+// CHECK: DW_TAG_formal_parameter
+// CHECK: DW_AT_name [DW_FORM_strx1]
+// CHECK-SAME: (indexed ({{[0-9a-f]+}}) string = "data")
+// CHECK: DW_AT_type [DW_FORM_ref4]
+// CHECK-SAME: (cu + 0x{{[0-9a-f]+}} => {0x[[BAR_OFFSET:[0-9a-f]+]]} "bar")
+//
+// CHECK: DW_TAG_variable
+// CHECK: DW_AT_name [DW_FORM_strx1]
+// CHECK-SAME: (indexed ({{[0-9a-f]+}}) string = "offset")
+//
+// CHECK: 0x[[BAR_OFFSET]]: DW_TAG_structure_type
+// CHECK: DW_AT_name [DW_FORM_strx1]
+// CHECK-SAME: (indexed ({{[0-9a-f]+}}) string = "bar")
+//
+// CHECK: DW_TAG_member
+// CHECK: DW_AT_name [DW_FORM_strx1]
+// CHECK-SAME: (indexed ({{[0-9a-f]+}}) string = "C")
+//
+// CHECK: DW_TAG_member
+// CHECK: DW_AT_name [DW_FORM_strx1]
+// CHECK-SAME: (indexed ({{[0-9a-f]+}}) string = "A")
+struct bar {
+ __global unsigned *C;
+ __global unsigned *A;
+};
+
+void foo(struct bar data) {
+ unsigned offset = get_global_id(0);
+ data.C[offset] = data.A[offset];
+}
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index ff288e530d17f..8106788727b8b 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -76,9 +76,9 @@
// GFX1034: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
// GFX1035: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
// GFX1036: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
-// GFX1100: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts"
-// GFX1101: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts"
-// GFX1102: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts"
-// GFX1103: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts"
+// GFX1100: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts"
+// GFX1101: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts"
+// GFX1102: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts"
+// GFX1103: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts"
kernel void test() {}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl
index 068ecb1ee444c..dc7069decaaa6 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl
@@ -14,14 +14,10 @@ typedef unsigned short __attribute__((ext_vector_type(2))) ushort2;
// CHECK: call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, i16 %sC)
// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 false)
// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 true)
-// CHECK: call i32 @llvm.amdgcn.sdot4(i32 %siA, i32 %siB, i32 %siC, i1 false)
-// CHECK: call i32 @llvm.amdgcn.sdot4(i32 %siA, i32 %siB, i32 %siC, i1 true)
// CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false)
// CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 true)
// CHECK: call i32 @llvm.amdgcn.sudot4(i1 true, i32 %A, i1 false, i32 %B, i32 %C, i1 false)
// CHECK: call i32 @llvm.amdgcn.sudot4(i1 false, i32 %A, i1 true, i32 %B, i32 %C, i1 true)
-// CHECK: call i32 @llvm.amdgcn.sdot8(i32 %siA, i32 %siB, i32 %siC, i1 false)
-// CHECK: call i32 @llvm.amdgcn.sdot8(i32 %siA, i32 %siB, i32 %siC, i1 true)
// CHECK: call i32 @llvm.amdgcn.udot8(i32 %uiA, i32 %uiB, i32 %uiC, i1 false)
// CHECK: call i32 @llvm.amdgcn.udot8(i32 %uiA, i32 %uiB, i32 %uiC, i1 true)
// CHECK: call i32 @llvm.amdgcn.sudot8(i1 false, i32 %A, i1 true, i32 %B, i32 %C, i1 false)
@@ -44,18 +40,12 @@ kernel void builtins_amdgcn_dl_insts_err(
fOut[3] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false);
fOut[4] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, true);
- siOut[2] = __builtin_amdgcn_sdot4(siA, siB, siC, false);
- siOut[3] = __builtin_amdgcn_sdot4(siA, siB, siC, true);
-
uiOut[2] = __builtin_amdgcn_udot4(uiA, uiB, uiC, false);
uiOut[3] = __builtin_amdgcn_udot4(uiA, uiB, uiC, true);
iOut[0] = __builtin_amdgcn_sudot4(true, A, false, B, C, false);
iOut[1] = __builtin_amdgcn_sudot4(false, A, true, B, C, true);
- siOut[4] = __builtin_amdgcn_sdot8(siA, siB, siC, false);
- siOut[5] = __builtin_amdgcn_sdot8(siA, siB, siC, true);
-
uiOut[4] = __builtin_amdgcn_udot8(uiA, uiB, uiC, false);
uiOut[5] = __builtin_amdgcn_udot8(uiA, uiB, uiC, true);
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 9696f3536e2f6..4d66bbc574f96 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -583,13 +583,13 @@ void test_get_local_id(int d, global int *out)
}
// CHECK-LABEL: @test_get_workgroup_size(
-// CHECK: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 4
+// CHECK: call align 8 dereferenceable(256) i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 12
// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 4, !range [[$WS_RANGE:![0-9]*]], !invariant.load
-// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 6
+// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 14
// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
-// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 8
-// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 4, !range [[$WS_RANGE:![0-9]*]], !invariant.load
+// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 16
+// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 8, !range [[$WS_RANGE:![0-9]*]], !invariant.load
void test_get_workgroup_size(int d, global int *out)
{
switch (d) {
diff --git a/clang/test/Driver/clang-offload-bundler-asserts-on.c b/clang/test/Driver/clang-offload-bundler-asserts-on.c
index 4b14ad310d2e9..5c7b755c83bfa 100644
--- a/clang/test/Driver/clang-offload-bundler-asserts-on.c
+++ b/clang/test/Driver/clang-offload-bundler-asserts-on.c
@@ -16,13 +16,13 @@
//
// Create few code object bundles and archive them to create an input archive
// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa-gfx906,openmp-amdgcn-amd-amdhsa--gfx908 -input=%t.o -input=%t.tgt1 -input=%t.tgt2 -output=%t.simple.bundle
-// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa--gfx906:sramecc+:xnack+,openmp-amdgcn-amd-amdhsa--gfx908:sramecc+:xnack+ -inputs=%t.o,%t.tgt1,%t.tgt1 -outputs=%t.targetID1.bundle
-// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-,openmp-amdgcn-amd-amdhsa--gfx908:sramecc+:xnack- -inputs=%t.o,%t.tgt1,%t.tgt1 -outputs=%t.targetID2.bundle
-// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa--gfx906:xnack-,openmp-amdgcn-amd-amdhsa--gfx908:xnack- -inputs=%t.o,%t.tgt1,%t.tgt1 -outputs=%t.targetID3.bundle
+// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa--gfx906:sramecc+:xnack+,openmp-amdgcn-amd-amdhsa--gfx908:sramecc+:xnack+ -input=%t.o -input=%t.tgt1 -input=%t.tgt1 -output=%t.targetID1.bundle
+// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-,openmp-amdgcn-amd-amdhsa--gfx908:sramecc+:xnack- -input=%t.o -input=%t.tgt1 -input=%t.tgt1 -output=%t.targetID2.bundle
+// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa--gfx906:xnack-,openmp-amdgcn-amd-amdhsa--gfx908:xnack- -input=%t.o -input=%t.tgt1 -input=%t.tgt1 -output=%t.targetID3.bundle
// RUN: llvm-ar cr %t.input-archive.a %t.simple.bundle %t.targetID1.bundle %t.targetID2.bundle %t.targetID3.bundle
// Tests to check compatibility between Bundle Entry ID formats i.e. between presence/absence of extra hyphen in case of missing environment field
-// RUN: clang-offload-bundler -unbundle -type=a -targets=openmp-amdgcn-amd-amdhsa--gfx906,openmp-amdgcn-amd-amdhsa-gfx908:sramecc+:xnack+ -inputs=%t.input-archive.a -outputs=%t-archive-gfx906-simple.a,%t-archive-gfx908-simple.a -debug-only=CodeObjectCompatibility 2>&1 | FileCheck %s -check-prefix=BUNDLECOMPATIBILITY
+// RUN: clang-offload-bundler -unbundle -type=a -targets=openmp-amdgcn-amd-amdhsa--gfx906,openmp-amdgcn-amd-amdhsa-gfx908:sramecc+:xnack+ -input=%t.input-archive.a -output=%t-archive-gfx906-simple.a -output=%t-archive-gfx908-simple.a -debug-only=CodeObjectCompatibility 2>&1 | FileCheck %s -check-prefix=BUNDLECOMPATIBILITY
// BUNDLECOMPATIBILITY: Compatible: Exact match: [CodeObject: openmp-amdgcn-amd-amdhsa-gfx906] : [Target: openmp-amdgcn-amd-amdhsa--gfx906]
// BUNDLECOMPATIBILITY: Incompatible: Processor mismatch [CodeObject: openmp-amdgcn-amd-amdhsa-gfx906] : [Target: openmp-amdgcn-amd-amdhsa-gfx908:sramecc+:xnack+]
// BUNDLECOMPATIBILITY: Incompatible: Processor mismatch [CodeObject: openmp-amdgcn-amd-amdhsa--gfx908] : [Target: openmp-amdgcn-amd-amdhsa--gfx906]
diff --git a/clang/test/Driver/clang-offload-bundler.c b/clang/test/Driver/clang-offload-bundler.c
index a86fb5ea2d249..c2f2c3cdb2a6e 100644
--- a/clang/test/Driver/clang-offload-bundler.c
+++ b/clang/test/Driver/clang-offload-bundler.c
@@ -471,13 +471,13 @@
//
// Create few code object bundles and archive them to create an input archive
// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa-gfx906,openmp-amdgcn-amd-amdhsa--gfx908 -input=%t.o -input=%t.tgt1 -input=%t.tgt2 -output=%t.simple.bundle
-// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa--gfx906:sramecc+:xnack+,openmp-amdgcn-amd-amdhsa--gfx908:sramecc+:xnack+ -inputs=%t.o,%t.tgt1,%t.tgt1 -outputs=%t.targetID1.bundle
-// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-,openmp-amdgcn-amd-amdhsa--gfx908:sramecc+:xnack- -inputs=%t.o,%t.tgt1,%t.tgt1 -outputs=%t.targetID2.bundle
-// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa--gfx906:xnack-,openmp-amdgcn-amd-amdhsa--gfx908:xnack- -inputs=%t.o,%t.tgt1,%t.tgt1 -outputs=%t.targetID3.bundle
+// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa--gfx906:sramecc+:xnack+,openmp-amdgcn-amd-amdhsa--gfx908:sramecc+:xnack+ -input=%t.o -input=%t.tgt1 -input=%t.tgt1 -output=%t.targetID1.bundle
+// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-,openmp-amdgcn-amd-amdhsa--gfx908:sramecc+:xnack- -input=%t.o -input=%t.tgt1 -input=%t.tgt1 -output=%t.targetID2.bundle
+// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa--gfx906:xnack-,openmp-amdgcn-amd-amdhsa--gfx908:xnack- -input=%t.o -input=%t.tgt1 -input=%t.tgt1 -output=%t.targetID3.bundle
// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,hip-amdgcn-amd-amdhsa--gfx906:xnack-,hip-amdgcn-amd-amdhsa--gfx908:xnack- -inputs=%t.o,%t.tgt1,%t.tgt1 -outputs=%t.targetID4.bundle
// RUN: llvm-ar cr %t.input-archive.a %t.simple.bundle %t.targetID1.bundle %t.targetID2.bundle %t.targetID3.bundle %t.targetID4.bundle
-// RUN: clang-offload-bundler -unbundle -type=a -targets=openmp-amdgcn-amd-amdhsa--gfx906,openmp-amdgcn-amd-amdhsa--gfx908 -inputs=%t.input-archive.a -outputs=%t-archive-gfx906-simple.a,%t-archive-gfx908-simple.a
+// RUN: clang-offload-bundler -unbundle -type=a -targets=openmp-amdgcn-amd-amdhsa--gfx906,openmp-amdgcn-amd-amdhsa--gfx908 -input=%t.input-archive.a -output=%t-archive-gfx906-simple.a -output=%t-archive-gfx908-simple.a
// RUN: llvm-ar t %t-archive-gfx906-simple.a | FileCheck %s -check-prefix=GFX906
// RUN: clang-offload-bundler -unbundle -type=a -targets=openmp-amdgcn-amd-amdhsa-gfx906:xnack+ -input=%t.input-archive.a -output=%t-archive-gfx906-simple.a
// RUN: llvm-ar t %t-archive-gfx906-simple.a | FileCheck %s -check-prefix=GFX906
diff --git a/clang/test/Driver/hip-device-libs.hip b/clang/test/Driver/hip-device-libs.hip
index d276f6ea47244..8535705a09694 100644
--- a/clang/test/Driver/hip-device-libs.hip
+++ b/clang/test/Driver/hip-device-libs.hip
@@ -139,13 +139,13 @@
// Test default code object version.
// RUN: %clang -### --target=x86_64-linux-gnu --offload-arch=gfx900 \
// RUN: --rocm-path=%S/Inputs/rocm %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck %s --check-prefixes=ABI4
+// RUN: 2>&1 | FileCheck %s --check-prefixes=ABI5
// Test default code object version with old device library without abi_version_400.bc
// RUN: %clang -### --target=x86_64-linux-gnu --offload-arch=gfx900 \
// RUN: --hip-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode-no-abi-ver \
// RUN: --rocm-path=%S/Inputs/rocm %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck %s --check-prefixes=NOABI4
+// RUN: 2>&1 | FileCheck %s --check-prefixes=NOABI5
// Test -mcode-object-version=3
// RUN: %clang -### --target=x86_64-linux-gnu --offload-arch=gfx900 \
diff --git a/clang/test/Driver/hip-toolchain-device-only.hip b/clang/test/Driver/hip-toolchain-device-only.hip
index cbc0164069651..9dbb1f21fcc25 100644
--- a/clang/test/Driver/hip-toolchain-device-only.hip
+++ b/clang/test/Driver/hip-toolchain-device-only.hip
@@ -12,7 +12,7 @@
// CHECK-SAME: "-target-cpu" "gfx803"
// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_803:".*o"]] "-x" "hip"
-// CHECK: [[LLD: ".*lld.*"]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
// CHECK-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" [[OBJ_DEV_A_803]]
// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
@@ -21,7 +21,7 @@
// CHECK-SAME: "-target-cpu" "gfx900"
// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_900:".*o"]] "-x" "hip"
-// CHECK: [[LLD]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
// CHECK-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" [[OBJ_DEV_A_900]]
// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip
index 2cd44ca78eb8d..4ae054b62fb7f 100644
--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@@ -59,7 +59,7 @@
// CHECK-NOT: {{".*opt"}}
// CHECK-NOT: {{".*llc"}}
-// CHECK: [[LLD: ".*lld.*"]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
// CHECK-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" [[OBJ_DEV_A_803]]
//
@@ -82,7 +82,7 @@
// CHECK-NOT: {{".*opt"}}
// CHECK-NOT: {{".*llc"}}
-// CHECK: [[LLD]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
// CHECK-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" [[OBJ_DEV_A_900]]
//
@@ -122,7 +122,7 @@
// CHECK-NOT: {{".*opt"}}
// CHECK-NOT: {{".*llc"}}
-// CHECK: [[LLD]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
// CHECK-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" [[OBJ_DEV_B_803]]
//
@@ -145,7 +145,7 @@
// CHECK-NOT: {{".*opt"}}
// CHECK-NOT: {{".*llc"}}
-// CHECK: [[LLD]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
// CHECK-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" [[OBJ_DEV_B_900]]
//
diff --git a/clang/test/Driver/openmp-target-fast-flag.c b/clang/test/Driver/openmp-target-fast-flag.c
index 83ed0bb757249..f1f4b2862001a 100644
--- a/clang/test/Driver/openmp-target-fast-flag.c
+++ b/clang/test/Driver/openmp-target-fast-flag.c
@@ -7,7 +7,7 @@
// RUN: | FileCheck -check-prefixes=TFast,EnV,TState,NestParallel %s
// RUN: %clang -### -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a -O4 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=O4,NoTFast,EnV,NoTState,NoNestParallel %s
+// RUN: | FileCheck -check-prefixes=O4,NoTFast,NoEnV,NoTState,NoNestParallel %s
// RUN: %clang -### -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a -O4 -fno-openmp-target-fast %s 2>&1 \
// RUN: | FileCheck -check-prefixes=O4,NoTFast,NoEnV,NoTState,NoNestParallel %s
@@ -43,4 +43,4 @@
// NestParallel: -fopenmp-assume-no-nested-parallelism
// NestParallel-NOT: -fno-openmp-assume-no-nested-parallelism
// NoNestParallel: -fno-openmp-assume-no-nested-parallelism
-// NoNestParallel-NOT: -fopenmp-assume-no-nested-parallelism
\ No newline at end of file
+// NoNestParallel-NOT: -fopenmp-assume-no-nested-parallelism
diff --git a/clang/test/Driver/rocm-detect.hip b/clang/test/Driver/rocm-detect.hip
index c3ffd21a75b9b..c5f360c5109ac 100644
--- a/clang/test/Driver/rocm-detect.hip
+++ b/clang/test/Driver/rocm-detect.hip
@@ -82,7 +82,6 @@
// SPACK: ROCm installation search path (Spack 4.0.0): [[DIR:.*]]
// SPACK: ROCm installation search path: [[CLANG:.*]]
-// SPACK: ROCm installation search path: [[DIR]]/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z
// SPACK: ROCm installation search path: [[DIR]]/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/lib/clang
// SPACK: ROCm installation search path: /opt/rocm
// SPACK: InstalledDir: [[DIR]]/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/bin
diff --git a/clang/test/OpenMP/big_jump_loop_codegen.cpp b/clang/test/OpenMP/big_jump_loop_codegen.cpp
new file mode 100644
index 0000000000000..b5ae3a8a2c4bf
--- /dev/null
+++ b/clang/test/OpenMP/big_jump_loop_codegen.cpp
@@ -0,0 +1,116 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -fopenmp-target-ignore-env-vars -fopenmp-assume-no-nested-parallelism -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -fopenmp-target-ignore-env-vars -fopenmp-assume-no-nested-parallelism -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+
+int main()
+{
+ int N = 100000;
+
+ int a[N];
+ int b[N];
+
+ int i;
+
+  for (i=0; i<N; i++)
+template <typename T>
+T tmain(T argc, T *argv) {
+ int N = 100;
+ int v[N];
+ #pragma omp target map(iterator(it = 0:N:2), to: v[it])
+ foo();
+ #pragma omp target map(iterator(it = 0:N:4), from: v[it])
+ foo();
+
+ return 0;
+}
+
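+// Note: ast-print spells out the iterator's deduced type, so
+// "iterator(it = ...)" in the source is printed as "iterator(int it = ...)".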
+// OMP52: template <typename T> T tmain(T argc, T *argv) {
+// OMP52-NEXT: int N = 100;
+// OMP52-NEXT: int v[N];
+// OMP52-NEXT: #pragma omp target map(iterator(int it = 0:N:2),to: v[it])
+// OMP52-NEXT: foo()
+// OMP52-NEXT: #pragma omp target map(iterator(int it = 0:N:4),from: v[it])
+// OMP52-NEXT: foo()
+
+// OMP52-LABEL: int main(int argc, char **argv) {
+int main (int argc, char **argv) {
+ int i, j, a[20], always, close;
+// OMP52-NEXT: int i, j, a[20]
+#pragma omp target
+// OMP52-NEXT: #pragma omp target
+ foo();
+// OMP52-NEXT: foo();
+#pragma omp target map(iterator(it = 0:20:2), to: a[it])
+// OMP52-NEXT: #pragma omp target map(iterator(int it = 0:20:2),to: a[it])
+ foo();
+// OMP52-NEXT: foo();
+#pragma omp target map(iterator(it = 0:20:4), from: a[it])
+// OMP52-NEXT: #pragma omp target map(iterator(int it = 0:20:4),from: a[it])
+foo();
+// OMP52-NEXT: foo();
+
+ return tmain(argc, &argc) + tmain(argv[0][0], argv[0]);
+}
+#endif // OMP52
+
#ifdef OMPX
// RUN: %clang_cc1 -DOMPX -verify -fopenmp -fopenmp-extensions -ast-print %s | FileCheck %s --check-prefix=OMPX
diff --git a/clang/test/OpenMP/target_enter_data_ast_print.cpp b/clang/test/OpenMP/target_enter_data_ast_print.cpp
index 0ccafaef5b59a..b11d5de13de67 100644
--- a/clang/test/OpenMP/target_enter_data_ast_print.cpp
+++ b/clang/test/OpenMP/target_enter_data_ast_print.cpp
@@ -6,6 +6,10 @@
// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+
// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
diff --git a/clang/test/OpenMP/target_enter_data_ast_print_openmp52.cpp b/clang/test/OpenMP/target_enter_data_ast_print_openmp52.cpp
new file mode 100644
index 0000000000000..578f9a2542744
--- /dev/null
+++ b/clang/test/OpenMP/target_enter_data_ast_print_openmp52.cpp
@@ -0,0 +1,65 @@
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck --check-prefix=CHECK --check-prefix=CHECK-52 %s
+
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
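+// With -fopenmp-version=52, a map clause on "target enter data" without an
+// explicit map type is printed with the default map type "to".
+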
+template <typename T>
+T tmain(T argc, T *argv) {
+ T i_def, i;
+
+ i = argc;
+
+#pragma omp target enter data map(i_def)
+
+#pragma omp target enter data map(to: i)
+
+ return 0;
+}
+
+// CHECK: template <typename T> T tmain(T argc, T *argv) {
+// CHECK-NEXT: T i_def, i;
+// CHECK-NEXT: i = argc;
+// CHECK-NEXT: #pragma omp target enter data map(to: i_def){{$}}
+// CHECK-NEXT: #pragma omp target enter data map(to: i){{$}}
+
+// CHECK: template<> int tmain<int>(int argc, int *argv) {
+// CHECK-NEXT: int i_def, i;
+// CHECK-NEXT: i = argc;
+// CHECK-NEXT: #pragma omp target enter data map(to: i_def){{$}}
+// CHECK-NEXT: #pragma omp target enter data map(to: i)
+
+// CHECK: template<> char tmain<char>(char argc, char *argv) {
+// CHECK-NEXT: char i_def, i;
+// CHECK-NEXT: i = argc;
+// CHECK-NEXT: #pragma omp target enter data map(to: i_def){{$}}
+// CHECK-NEXT: #pragma omp target enter data map(to: i)
+
+int main (int argc, char **argv) {
+ int b_def, b;
+ static int a_def, a;
+// CHECK: static int a_def, a;
+
+#pragma omp target enter data map(a_def)
+// CHECK: #pragma omp target enter data map(to: a_def)
+ a_def=2;
+// CHECK-NEXT: a_def = 2;
+
+#pragma omp target enter data map(to: a)
+// CHECK: #pragma omp target enter data map(to: a)
+ a=2;
+// CHECK-NEXT: a = 2;
+
+#pragma omp target enter data map(b_def)
+// CHECK-NEXT: #pragma omp target enter data map(to: b_def)
+
+#pragma omp target enter data map(to: b)
+// CHECK-NEXT: #pragma omp target enter data map(to: b)
+
+ return tmain(argc, &argc) + tmain(argv[0][0], argv[0]);
+}
+
+#endif
diff --git a/clang/test/OpenMP/target_exit_data_ast_print.cpp b/clang/test/OpenMP/target_exit_data_ast_print.cpp
index 4b3f65b5835fb..f482f379361bd 100644
--- a/clang/test/OpenMP/target_exit_data_ast_print.cpp
+++ b/clang/test/OpenMP/target_exit_data_ast_print.cpp
@@ -6,6 +6,10 @@
// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+
// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
diff --git a/clang/test/OpenMP/target_exit_data_ast_print_openmp52.cpp b/clang/test/OpenMP/target_exit_data_ast_print_openmp52.cpp
new file mode 100644
index 0000000000000..fbc431eadbccb
--- /dev/null
+++ b/clang/test/OpenMP/target_exit_data_ast_print_openmp52.cpp
@@ -0,0 +1,64 @@
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
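+// With -fopenmp-version=52, a map clause on "target exit data" without an
+// explicit map type is printed with the default map type "from".
+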
+template <typename T>
+T tmain(T argc, T *argv) {
+ T i_def, i;
+
+ i = argc;
+#pragma omp target exit data map(i_def)
+
+#pragma omp target exit data map(from: i)
+
+ return 0;
+}
+
+// CHECK: template <typename T> T tmain(T argc, T *argv) {
+// CHECK-NEXT: T i_def, i;
+// CHECK-NEXT: i = argc;
+// CHECK-NEXT: #pragma omp target exit data map(from: i_def){{$}}
+// CHECK-NEXT: #pragma omp target exit data map(from: i){{$}}
+
+// CHECK: template<> int tmain<int>(int argc, int *argv) {
+// CHECK-NEXT: int i_def, i;
+// CHECK-NEXT: i = argc;
+// CHECK-NEXT: #pragma omp target exit data map(from: i_def)
+// CHECK-NEXT: #pragma omp target exit data map(from: i)
+
+// CHECK: template<> char tmain<char>(char argc, char *argv) {
+// CHECK-NEXT: char i_def, i;
+// CHECK-NEXT: i = argc;
+// CHECK-NEXT: #pragma omp target exit data map(from: i_def)
+// CHECK-NEXT: #pragma omp target exit data map(from: i)
+
+int main (int argc, char **argv) {
+ int b_def, b;
+ static int a_def, a;
+// CHECK: static int a_def, a;
+
+#pragma omp target exit data map(a_def)
+// CHECK: #pragma omp target exit data map(from: a_def)
+ a_def=2;
+// CHECK-NEXT: a_def = 2;
+
+#pragma omp target exit data map(from: a)
+// CHECK: #pragma omp target exit data map(from: a)
+ a=2;
+// CHECK-NEXT: a = 2;
+
+#pragma omp target exit data map(b_def)
+// CHECK-NEXT: #pragma omp target exit data map(from: b_def)
+
+#pragma omp target exit data map(from: b)
+// CHECK-NEXT: #pragma omp target exit data map(from: b)
+
+ return tmain(argc, &argc) + tmain(argv[0][0], argv[0]);
+}
+
+#endif
diff --git a/clang/test/OpenMP/target_map_messages.cpp b/clang/test/OpenMP/target_map_messages.cpp
index ae28a149333fb..703bc9dff80cf 100644
--- a/clang/test/OpenMP/target_map_messages.cpp
+++ b/clang/test/OpenMP/target_map_messages.cpp
@@ -4,6 +4,7 @@
// RUN: %clang_cc1 -verify=expected,lt50,lt51,omp,lt51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=45 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized
// RUN: %clang_cc1 -verify=expected,ge50,lt51,omp,lt51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized
// RUN: %clang_cc1 -verify=expected,ge50,ge51,omp,ge51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=51 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized
+// RUN: %clang_cc1 -verify=expected,ge50,ge51,ge52,omp,ge52-omp -fopenmp -fno-openmp-extensions -fopenmp-version=52 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized
// RUN: %clang_cc1 -DCCODE -verify -fopenmp -fno-openmp-extensions -ferror-limit 300 -x c %s -Wno-openmp -Wuninitialized
// -fopenmp-simd, -fno-openmp-extensions
@@ -158,23 +159,28 @@ struct SA {
// expected-error@+1 {{use of undeclared identifier 'present'}}
#pragma omp target map(present)
{}
+ // ge52-omp-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
// ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
// lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
#pragma omp target map(ompx_hold, tofrom: c,f)
{}
+ // ge52-omp-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
// ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
// lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
#pragma omp target map(ompx_hold, tofrom: c[1:2],f)
{}
+ // ge52-omp-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
// ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
// lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
#pragma omp target map(ompx_hold, tofrom: c,f[1:2])
{}
+ // ge52-omp-error@+4 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
// expected-error@+3 {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
// ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
// lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
#pragma omp target map(ompx_hold, tofrom: c[:],f)
{}
+ // ge52-omp-error@+4 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
// expected-error@+3 {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
// ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
// lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
@@ -191,11 +197,15 @@ struct SA {
// lt51-error@+1 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
#pragma omp target map(present, present, tofrom: a)
{}
+ // ge52-omp-error@+5 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
+ // ge52-omp-error@+4 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
// ompx-error@+3 {{same map type modifier has been specified more than once}}
// ge51-omp-error@+2 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
// lt51-omp-error@+1 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
#pragma omp target map(ompx_hold, ompx_hold, tofrom: a)
{}
+ // ge52-omp-error@+9 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
+ // ge52-omp-error@+8 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
// expected-error@+7 2 {{same map type modifier has been specified more than once}}
// ge51-error@+6 {{same map type modifier has been specified more than once}}
// lt51-ompx-error@+5 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'ompx_hold'}}
@@ -239,6 +249,36 @@ struct SA {
{}
#pragma omp target map(([b[I]][bf])f) // lt50-error {{expected ',' or ']' in lambda capture list}} lt50-error {{expected ')'}} lt50-note {{to match this '('}}
{}
+ // ge51-ompx-error@+4 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'ompx_hold'}}
+ // lt51-ompx-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'ompx_hold'}}
+ // ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
+ // lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
+ #pragma omp target map(iterator(it=0:10), tofrom:a)
+ {}
+ // ompx-error@+8 {{redefinition of 'it'}}
+ // ompx-note@+7 {{previous definition is here}}
+ // omp-error@+6 {{redefinition of 'it'}}
+ // omp-note@+5 {{previous definition is here}}
+ // ge51-ompx-error@+4 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'ompx_hold'}}
+ // lt51-ompx-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'ompx_hold'}}
+ // ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
+ // lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
+ #pragma omp target map(iterator(it=0:10, it=0:20), tofrom:a)
+ {}
+ // ge51-ompx-error@+6 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'ompx_hold'}}
+ // lt51-ompx-error@+5 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'ompx_hold'}}
+ // lt51-error@+4 {{expected '(' after 'iterator'}}
+ // ge51-error@+3 {{expected '(' after 'iterator'}}
+ // ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
+ // lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
+ #pragma omp target map(iterator, tofrom:a)
+ {}
+ // ge51-ompx-error@+4 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'ompx_hold'}}
+ // lt51-ompx-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'ompx_hold'}}
+ // ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
+ // lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
+ #pragma omp target map(iterator(), tofrom:a)
+ {}
return;
}
};
@@ -920,6 +960,24 @@ int main(int argc, char **argv) {
pos(i).y = i+1;
}
+ // ge51-ompx-error@+4 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'ompx_hold'}}
+ // lt51-ompx-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'ompx_hold'}}
+ // ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
+ // lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
+ #pragma omp target map(iterator(it=0:10), tofrom:a[it])
+ {}
+
+ // ompx-error@+8 {{use of undeclared identifier 'itt'; did you mean 'it'?}}
+ // ompx-note@+7 {{'it' declared here}}
+ // omp-error@+6 {{use of undeclared identifier 'itt'; did you mean 'it'?}}
+ // omp-note@+5 {{'it' declared here}}
+ // ge51-ompx-error@+4 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'ompx_hold'}}
+ // lt51-ompx-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'ompx_hold'}}
+ // ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
+ // lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
+ #pragma omp target map(iterator(it=0:10), tofrom:a[itt])
+ {}
+
return tmain(argc)+tmain(argc); // expected-note {{in instantiation of function template specialization 'tmain' requested here}} expected-note {{in instantiation of function template specialization 'tmain' requested here}}
}
#endif
diff --git a/clang/test/OpenMP/xteam_red_codegen.cpp b/clang/test/OpenMP/xteam_red_codegen.cpp
new file mode 100644
index 0000000000000..76951203da303
--- /dev/null
+++ b/clang/test/OpenMP/xteam_red_codegen.cpp
@@ -0,0 +1,1517 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -fopenmp-target-ignore-env-vars -fopenmp-assume-no-nested-parallelism -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -fopenmp-target-ignore-env-vars -fopenmp-assume-no-nested-parallelism -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+
+#include <cstdint>
+
+int main()
+{
+ int N = 100;
+
+ double a[N], b[N];
+ int bint[N];
+ unsigned cint[N];
+
+ int8_t int8_sum = 0;
+ int16_t int16_sum = 0;
+ int32_t int32_sum = 0;
+ uint32_t uint32_sum = 0;
+ int64_t int64_sum = 0;
+ uint64_t uint64_sum = 0;
+
+ for (int i=0; i
+__attribute__((enable_if(true, "")))
+T kaboom(T a, T b) {
+ return b;
+}
+
+struct A {
+ double foo();
+};
+
+template <typename T>
+struct B {
+ A &f;
+
+ void bar() {
+ kaboom(kaboom(0.0, 1.0), f.foo());
+ }
+};
diff --git a/clang/tools/clang-hip/clang-build-select-link/ClangBuildSelectLink.cpp b/clang/tools/clang-hip/clang-build-select-link/ClangBuildSelectLink.cpp
index 7a2dc9f58f1a6..3eef14ad02004 100644
--- a/clang/tools/clang-hip/clang-build-select-link/ClangBuildSelectLink.cpp
+++ b/clang/tools/clang-hip/clang-build-select-link/ClangBuildSelectLink.cpp
@@ -188,22 +188,35 @@ static bool linkFiles(const char *argv0, LLVMContext &Context, Linker &L,
static bool convertExternsToLinkOnce(Module *MOUT, LLVMContext &Ctx) {
// Convert all external functions to LinkOnceODR so they get inlined
- // and removed by the optimizer in the next HIP driver step.
- // After next opt step, only kernels will exist
+ // and removed by the optimizer unless optnone is set
for (Module::iterator i = MOUT->begin(), e = MOUT->end(); i != e; ++i) {
llvm::Function *F = &*i;
if (!i->isDeclaration()) {
if (Verbose)
errs() << "Function attribute cleanup for\'"
<< F->getName().str().c_str() << "\' \n";
- if (i->getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL) {
- F->removeFnAttr(llvm::Attribute::OptimizeNone);
- } else {
+ if (i->getCallingConv() != llvm::CallingConv::AMDGPU_KERNEL) {
+ if (!strncmp(F->getName().str().c_str(), "__ockl_devmem_request",
+ strlen("__ockl_devmem_request")))
+ continue;
+ if (!strncmp(F->getName().str().c_str(), "__ockl_dm_alloc",
+ strlen("__ockl_dm_alloc")))
+ continue;
+ if (!strncmp(F->getName().str().c_str(), "__ockl_dm_dealloc",
+ strlen("__ockl_dm_dealloc")))
+ continue;
+ if (!strncmp(F->getName().str().c_str(), "hostrpc_invoke",
+ strlen("hostrpc_invoke")))
+ continue;
+
+ // all other functions
F->setLinkage(GlobalValue::LinkOnceODRLinkage);
F->setVisibility(GlobalValue::ProtectedVisibility);
- F->removeFnAttr(llvm::Attribute::OptimizeNone);
- F->removeFnAttr(llvm::Attribute::NoInline);
- F->addFnAttr(llvm::Attribute::AlwaysInline);
+ if (!F->hasOptNone()) {
+ F->removeFnAttr(llvm::Attribute::OptimizeNone);
+ F->removeFnAttr(llvm::Attribute::NoInline);
+ F->addFnAttr(llvm::Attribute::AlwaysInline);
+ }
}
}
}
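The prefix checks above are the interesting part of this hunk: named device-runtime entry points must keep their original linkage, or the later opt step would inline and drop them. A minimal sketch of the same test using StringRef (hypothetical helper name; the prefix set is copied from the hunk):

#include "llvm/ADT/StringRef.h"

// Sketch only: same prefixes as the strncmp chain above.
static bool keepOriginalLinkage(llvm::StringRef Name) {
  return Name.startswith("__ockl_devmem_request") ||
         Name.startswith("__ockl_dm_alloc") ||
         Name.startswith("__ockl_dm_dealloc") ||
         Name.startswith("hostrpc_invoke");
}

// Usage at the top of the per-function loop:
//   if (keepOriginalLinkage(F->getName())) continue;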
diff --git a/compiler-rt/lib/asan/asan_allocator.cpp b/compiler-rt/lib/asan/asan_allocator.cpp
index 8a2d83600c73c..6c9b4e1b64168 100644
--- a/compiler-rt/lib/asan/asan_allocator.cpp
+++ b/compiler-rt/lib/asan/asan_allocator.cpp
@@ -96,6 +96,7 @@ class ChunkHeader {
// align < 8 -> 0
// else -> log2(min(align, 512)) - 2
u8 user_requested_alignment_log : 3;
+ u8 device_mem : 1;
private:
u16 user_requested_size_hi;
@@ -562,6 +563,7 @@ struct Allocator {
uptr chunk_beg = user_beg - kChunkHeaderSize;
AsanChunk *m = reinterpret_cast<AsanChunk *>(chunk_beg);
m->alloc_type = alloc_type;
+ m->device_mem = da_info ? 1 : 0;
CHECK(size);
m->SetUsedSize(size);
m->user_requested_alignment_log = user_requested_alignment_log;
@@ -617,9 +619,26 @@ struct Allocator {
if (!atomic_compare_exchange_strong(&m->chunk_state, &old_chunk_state,
CHUNK_QUARANTINE,
memory_order_acquire)) {
- ReportInvalidFree(ptr, old_chunk_state, stack);
- // It's not safe to push a chunk in quarantine on invalid free.
- return false;
+ if (!m->device_mem) {
+ ReportInvalidFree(ptr, old_chunk_state, stack);
+ // It's not safe to push a chunk in quarantine on invalid free.
+ return false;
+ } else {
+ // Temporary patch: atomic_compare_exchange_strong sometimes gives wrong
+ // results for device memory, so use a mutex to guard against the
+ // possible race conditions.
+ //
+ // We need a mutex; borrow fallback_mutex.
+ SpinMutexLock l(&fallback_mutex);
+ old_chunk_state = atomic_load(&m->chunk_state, memory_order_relaxed);
+ if (old_chunk_state == CHUNK_ALLOCATED) {
+ atomic_store(&m->chunk_state, CHUNK_QUARANTINE, memory_order_relaxed);
+ } else {
+ ReportInvalidFree(ptr, old_chunk_state, stack);
+ // It's not safe to push a chunk in quarantine on invalid free.
+ return false;
+ }
+ }
}
CHECK_EQ(CHUNK_ALLOCATED, old_chunk_state);
// It was a user data.
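The fallback above is the standard CAS-then-lock pattern: take the lock-free path first, and only when the hardware CAS cannot be trusted re-check the state under a mutex. A self-contained sketch in plain C++ (names hypothetical, not the sanitizer's internal API):

#include <atomic>
#include <mutex>

enum ChunkState { CHUNK_ALLOCATED_EX = 1, CHUNK_QUARANTINE_EX = 2 };
std::atomic<int> chunk_state{CHUNK_ALLOCATED_EX};
std::mutex fallback_mutex;

// Returns true if this call transitioned the chunk into quarantine.
bool moveToQuarantine(bool cas_is_reliable) {
  int expected = CHUNK_ALLOCATED_EX;
  if (chunk_state.compare_exchange_strong(expected, CHUNK_QUARANTINE_EX))
    return true;
  if (cas_is_reliable)
    return false; // genuine invalid/double free
  // CAS may misbehave for device memory: re-check under the lock.
  std::lock_guard<std::mutex> l(fallback_mutex);
  if (chunk_state.load(std::memory_order_relaxed) != CHUNK_ALLOCATED_EX)
    return false;
  chunk_state.store(CHUNK_QUARANTINE_EX, std::memory_order_relaxed);
  return true;
}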
diff --git a/lld/Common/Args.cpp b/lld/Common/Args.cpp
index 388c15b3db3ec..c0545194610a6 100644
--- a/lld/Common/Args.cpp
+++ b/lld/Common/Args.cpp
@@ -11,8 +11,10 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/TargetParser.h"
using namespace llvm;
using namespace lld;
@@ -20,6 +22,21 @@ using namespace lld;
// TODO(sbc): Remove this once CGOptLevel can be set completely based on bitcode
// function metadata.
CodeGenOpt::Level lld::args::getCGOptLevel(int optLevelLTO) {
+ // TODO(slinder1): Workaround for HeterogeneousDWARF to support `-fgpu-rdc
+ // -O0 -g`. Remove this when we support higher optimization levels.
+ if (llvm::AMDGPU::parseArchAMDGCN(llvm::codegen::getCPUStr())) {
+ switch (optLevelLTO) {
+ case 0:
+ return CodeGenOpt::None;
+ case 1:
+ return CodeGenOpt::Less;
+ case 2:
+ return CodeGenOpt::Default;
+ case 3:
+ return CodeGenOpt::Aggressive;
+ }
+ llvm_unreachable("Invalid optimization level");
+ }
if (optLevelLTO == 3)
return CodeGenOpt::Aggressive;
assert(optLevelLTO < 3);
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 271776ddd32b8..3f775610cd4d0 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -176,12 +176,15 @@ static std::tuple<ELFKind, uint16_t, uint8_t> parseEmulation(StringRef emul) {
.Case("elf_iamcu", {ELF32LEKind, EM_IAMCU})
.Case("elf64_sparc", {ELF64BEKind, EM_SPARCV9})
.Case("msp430elf", {ELF32LEKind, EM_MSP430})
+ .Case("elf64_amdgpu", {ELF64LEKind, EM_AMDGPU})
.Default({ELFNoneKind, EM_NONE});
if (ret.first == ELFNoneKind)
error("unknown emulation: " + emul);
if (ret.second == EM_MSP430)
osabi = ELFOSABI_STANDALONE;
+ else if (ret.second == EM_AMDGPU)
+ osabi = ELFOSABI_AMDGPU_HSA;
return std::make_tuple(ret.first, ret.second, osabi);
}
diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp
index ab04748b76afa..0ca5bf8461af0 100644
--- a/lld/ELF/LTO.cpp
+++ b/lld/ELF/LTO.cpp
@@ -177,8 +177,10 @@ static lto::Config createConfig() {
};
}
- if (config->ltoEmitAsm)
+ if (config->ltoEmitAsm) {
c.CGFileType = CGFT_AssemblyFile;
+ c.Options.MCOptions.AsmVerbose = true;
+ }
if (!config->saveTempsArgs.empty())
checkError(c.addSaveTemps(config->outputFile.str() + ".",
diff --git a/lld/test/ELF/emulation-amdgpu.s b/lld/test/ELF/emulation-amdgpu.s
new file mode 100644
index 0000000000000..329fb1c69b166
--- /dev/null
+++ b/lld/test/ELF/emulation-amdgpu.s
@@ -0,0 +1,36 @@
+# REQUIRES: amdgpu
+
+# RUN: llvm-mc -filetype=obj -triple=amdgcn-amd-amdhsa %s -o %t.o
+# RUN: ld.lld %t.o -o %t
+# RUN: llvm-readobj --file-headers %t | FileCheck %s
+# RUN: ld.lld -m elf64_amdgpu %t.o -o %t
+# RUN: llvm-readobj --file-headers %t | FileCheck %s
+
+# CHECK: ElfHeader {
+# CHECK-NEXT: Ident {
+# CHECK-NEXT: Magic: (7F 45 4C 46)
+# CHECK-NEXT: Class: 64-bit (0x2)
+# CHECK-NEXT: DataEncoding: LittleEndian (0x1)
+# CHECK-NEXT: FileVersion: 1
+# CHECK-NEXT: OS/ABI: AMDGPU_HSA (0x40)
+# CHECK-NEXT: ABIVersion: 3
+# CHECK-NEXT: Unused: (00 00 00 00 00 00 00)
+# CHECK-NEXT: }
+# CHECK-NEXT: Type: Executable (0x2)
+# CHECK-NEXT: Machine: EM_AMDGPU (0xE0)
+# CHECK-NEXT: Version: 1
+# CHECK-NEXT: Entry:
+# CHECK-NEXT: ProgramHeaderOffset: 0x40
+# CHECK-NEXT: SectionHeaderOffset:
+# CHECK-NEXT: Flags [ (0x0)
+# CHECK-NEXT: ]
+# CHECK-NEXT: HeaderSize: 64
+# CHECK-NEXT: ProgramHeaderEntrySize: 56
+# CHECK-NEXT: ProgramHeaderCount:
+# CHECK-NEXT: SectionHeaderEntrySize: 64
+# CHECK-NEXT: SectionHeaderCount:
+# CHECK-NEXT: StringTableSectionIndex:
+# CHECK-NEXT: }
+
+.globl _start
+_start:
diff --git a/lld/test/ELF/lto/amdgcn-oses.ll b/lld/test/ELF/lto/amdgcn-oses.ll
index a2f25cdd57d87..a70b678ac2514 100644
--- a/lld/test/ELF/lto/amdgcn-oses.ll
+++ b/lld/test/ELF/lto/amdgcn-oses.ll
@@ -15,7 +15,7 @@
; RUN: llvm-readobj --file-headers %t/mesa3d.so | FileCheck %s --check-prefixes=GCN,NON-AMDHSA,MESA3D
; AMDHSA: OS/ABI: AMDGPU_HSA (0x40)
-; AMDHSA: ABIVersion: 2
+; AMDHSA: ABIVersion: 3
; AMDPAL: OS/ABI: AMDGPU_PAL (0x41)
; MESA3D: OS/ABI: AMDGPU_MESA3D (0x42)
diff --git a/lld/test/ELF/lto/amdgpu-cg-opt-level.ll b/lld/test/ELF/lto/amdgpu-cg-opt-level.ll
new file mode 100644
index 0000000000000..2e21f032c19b2
--- /dev/null
+++ b/lld/test/ELF/lto/amdgpu-cg-opt-level.ll
@@ -0,0 +1,23 @@
+; REQUIRES: amdgpu
+
+; TODO(slinder1): Workaround for HeterogeneousDWARF to support `-fgpu-rdc
+; -O0 -g`. Remove this when we support higher optimization levels.
+
+; RUN: llvm-as %s -o %t.o
+; RUN: ld.lld -plugin-opt=O0 -plugin-opt=mcpu=gfx90a %t.o -o %t -mllvm -debug-pass=Structure 2>&1 | FileCheck --check-prefix=CHECK-O0 %s
+; RUN: ld.lld -plugin-opt=O1 -plugin-opt=mcpu=gfx90a %t.o -o %t -mllvm -debug-pass=Structure 2>&1 | FileCheck --check-prefix=CHECK-O1 %s
+; RUN: ld.lld -plugin-opt=O2 -plugin-opt=mcpu=gfx90a %t.o -o %t -mllvm -debug-pass=Structure 2>&1 | FileCheck --check-prefix=CHECK-O2 %s
+; RUN: ld.lld -plugin-opt=O3 -plugin-opt=mcpu=gfx90a %t.o -o %t -mllvm -debug-pass=Structure 2>&1 | FileCheck --check-prefix=CHECK-O3 %s
+
+; CHECK-O0: Fast Register Allocator
+; CHECK-O1: Greedy Register Allocator
+; CHECK-O2: Greedy Register Allocator
+; CHECK-O3: Greedy Register Allocator
+
+target triple = "amdgcn-amd-amdhsa"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+define void @_start() {
+entry:
+ ret void
+}
diff --git a/lld/test/ELF/lto/emit-asm.ll b/lld/test/ELF/lto/emit-asm.ll
index d0719411a5bad..3f635b8dbe7f7 100644
--- a/lld/test/ELF/lto/emit-asm.ll
+++ b/lld/test/ELF/lto/emit-asm.ll
@@ -11,14 +11,18 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
+;; Note: we also check for the presence of comments; --lto-emit-asm output should be verbose.
+
+; CHECK-DAG: # -- Begin function f1
; CHECK-DAG: f1:
-; OPT-DAG: define void @f1()
+; OPT: define void @f1()
define void @f1() {
ret void
}
+; CHECK-DAG: # -- Begin function f2
; CHECK-DAG: f2:
-; OPT-DAG: define void @f2()
+; OPT: define void @f2()
define void @f2() {
ret void
}
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 9034ee4747185..4a4104e786fe4 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -1,8 +1,8 @@
# Modifications Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
# Notified per clause 4(b) of the license.
-# See docs/CMake.html for instructions about how to build LLVM with CMake.
+# See docs/CMake.html for instructions about how to build LLVM with CMake.
cmake_minimum_required(VERSION 3.13.4)
set(LLVM_COMMON_CMAKE_UTILS ${CMAKE_CURRENT_SOURCE_DIR}/../cmake)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 403a1f80dc54a..5fedc0afeb1b5 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -3672,6 +3672,11 @@ Code object V5 metadata is the same as
buffer that conforms to the requirements of the malloc/free
device library V1 version implementation.
+ "hidden_heap_v1"
+ A global address space pointer to an initialized memory
+ buffer that conforms to the requirements of the malloc/free
+ device library V1 version implementation.
+
"hidden_private_base"
The high 32 bits of the flat addressing private aperture base.
Only used by GFX8 to allow conversion between private segment
diff --git a/llvm/include/llvm/ADT/PointerIntPair.h b/llvm/include/llvm/ADT/PointerIntPair.h
index 119285087957f..9278ccdb47887 100644
--- a/llvm/include/llvm/ADT/PointerIntPair.h
+++ b/llvm/include/llvm/ADT/PointerIntPair.h
@@ -227,6 +227,32 @@ struct PointerLikeTypeTraits<
PtrTraits::NumLowBitsAvailable - IntBits;
};
+// Allow structured bindings on PointerIntPair.
+template <std::size_t I, typename PointerTy, unsigned IntBits,
+          typename IntType, typename PtrTraits, typename Info>
+decltype(auto)
+get(const PointerIntPair<PointerTy, IntBits, IntType, PtrTraits, Info> &Pair) {
+ static_assert(I < 2);
+ if constexpr (I == 0)
+ return Pair.getPointer();
+ else
+ return Pair.getInt();
+}
+
} // end namespace llvm
+namespace std {
+template <typename PointerTy, unsigned IntBits, typename IntType,
+          typename PtrTraits, typename Info>
+struct tuple_size<
+ llvm::PointerIntPair<PointerTy, IntBits, IntType, PtrTraits, Info>>
+ : std::integral_constant<std::size_t, 2> {};
+
+template <std::size_t I, typename PointerTy, unsigned IntBits,
+          typename IntType, typename PtrTraits, typename Info>
+struct tuple_element<
+ I, llvm::PointerIntPair<PointerTy, IntBits, IntType, PtrTraits, Info>>
+ : std::conditional<I == 0, PointerTy, IntType> {};
+} // namespace std
+
#endif // LLVM_ADT_POINTERINTPAIR_H
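With the get/tuple_size/tuple_element triple above, PointerIntPair satisfies the structured-bindings protocol. A brief usage sketch:

#include "llvm/ADT/PointerIntPair.h"

void bindingExample() {
  int Value = 42;
  llvm::PointerIntPair<int *, 1, bool> Pair(&Value, true);
  auto [Ptr, Flag] = Pair; // Ptr == Pair.getPointer(), Flag == Pair.getInt()
  (void)Ptr;
  (void)Flag;
}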
diff --git a/llvm/include/llvm/ADT/SmallVector.h b/llvm/include/llvm/ADT/SmallVector.h
index bbc5bc8ceeea8..7b1cc24e47bae 100644
--- a/llvm/include/llvm/ADT/SmallVector.h
+++ b/llvm/include/llvm/ADT/SmallVector.h
@@ -705,6 +705,7 @@ class SmallVectorImpl : public SmallVectorTemplateBase<T> {
void append(const SmallVectorImpl &RHS) { append(RHS.begin(), RHS.end()); }
+
void assign(size_type NumElts, ValueParamT Elt) {
// Note that Elt could be an internal reference.
if (NumElts > this->capacity()) {
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index 3a53017a4e1c9..246b5612c544f 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -1275,7 +1275,8 @@ class LegalizationArtifactCombiner {
// Adding Use to ArtifactList.
WrapperObserver.changedInstr(Use);
break;
- case TargetOpcode::COPY: {
+ case TargetOpcode::COPY:
+ case TargetOpcode::PRED_COPY: {
Register Copy = Use.getOperand(0).getReg();
if (Copy.isVirtual())
UpdatedDefs.push_back(Copy);
@@ -1296,6 +1297,7 @@ class LegalizationArtifactCombiner {
static Register getArtifactSrcReg(const MachineInstr &MI) {
switch (MI.getOpcode()) {
case TargetOpcode::COPY:
+ case TargetOpcode::PRED_COPY:
case TargetOpcode::G_TRUNC:
case TargetOpcode::G_ZEXT:
case TargetOpcode::G_ANYEXT:
@@ -1333,8 +1335,7 @@ class LegalizationArtifactCombiner {
MachineInstr *TmpDef = MRI.getVRegDef(PrevRegSrc);
if (MRI.hasOneUse(PrevRegSrc)) {
if (TmpDef != &DefMI) {
- assert((TmpDef->getOpcode() == TargetOpcode::COPY ||
- isArtifactCast(TmpDef->getOpcode())) &&
+ assert((TmpDef->isCopy() || isArtifactCast(TmpDef->getOpcode())) &&
"Expecting copy or artifact cast here");
DeadInsts.push_back(TmpDef);
@@ -1421,7 +1422,8 @@ class LegalizationArtifactCombiner {
using namespace llvm::MIPatternMatch;
Register TmpReg;
- while (mi_match(Reg, MRI, m_Copy(m_Reg(TmpReg)))) {
+ while (mi_match(Reg, MRI, m_Copy(m_Reg(TmpReg))) ||
+ mi_match(Reg, MRI, m_Pred_Copy(m_Reg(TmpReg)))) {
if (MRI.getType(TmpReg).isValid())
Reg = TmpReg;
else
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
index 3879e22552ecb..decfda3da7a5f 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
@@ -629,6 +629,12 @@ inline UnaryOp_match<SrcTy, TargetOpcode::COPY> m_Copy(SrcTy &&Src) {
return UnaryOp_match<SrcTy, TargetOpcode::COPY>(std::forward<SrcTy>(Src));
}
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, TargetOpcode::PRED_COPY> m_Pred_Copy(SrcTy &&Src) {
+ return UnaryOp_match<SrcTy, TargetOpcode::PRED_COPY>(
+ std::forward<SrcTy>(Src));
+}
+
template <typename SrcTy>
inline UnaryOp_match<SrcTy, TargetOpcode::G_FSQRT> m_GFSqrt(const SrcTy &Src) {
return UnaryOp_match<SrcTy, TargetOpcode::G_FSQRT>(Src);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h b/llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h
index d0918485249dc..6b1a973147558 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h
@@ -85,6 +85,7 @@ class Pass;
class raw_ostream;
class TargetPassConfig;
class TargetRegisterInfo;
+class TargetInstrInfo;
/// This pass implements the reg bank selector pass used in the GlobalISel
/// pipeline. At the end of this pass, all register operands have been assigned
@@ -493,6 +494,9 @@ class RegBankSelect : public MachineFunctionPass {
/// Information on the register classes for the current function.
const TargetRegisterInfo *TRI = nullptr;
+ /// Information used to access the description of the opcodes.
+ const TargetInstrInfo *TII = nullptr;
+
/// Get the frequency of blocks.
/// This is required for non-fast mode.
MachineBlockFrequencyInfo *MBFI = nullptr;
diff --git a/llvm/include/llvm/CodeGen/LiveRangeEdit.h b/llvm/include/llvm/CodeGen/LiveRangeEdit.h
index 86ac30e181a6d..f0ed566b1dce9 100644
--- a/llvm/include/llvm/CodeGen/LiveRangeEdit.h
+++ b/llvm/include/llvm/CodeGen/LiveRangeEdit.h
@@ -134,7 +134,7 @@ class LiveRangeEdit : private MachineRegisterInfo::Delegate {
: Parent(parent), NewRegs(newRegs), MRI(MF.getRegInfo()), LIS(lis),
VRM(vrm), TII(*MF.getSubtarget().getInstrInfo()), TheDelegate(delegate),
FirstNew(newRegs.size()), DeadRemats(deadRemats) {
- MRI.setDelegate(this);
+ MRI.addDelegate(this);
}
~LiveRangeEdit() override { MRI.resetDelegate(this); }
diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
index fe4ad270f2a3b..39f0bf4122233 100644
--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -775,10 +775,15 @@ class MachineBasicBlock
/// Return the fallthrough block if the block can implicitly
/// transfer control to the block after it by falling off the end of
- /// it. This should return null if it can reach the block after
- /// it, but it uses an explicit branch to do so (e.g., a table
- /// jump). Non-null return is a conservative answer.
- MachineBasicBlock *getFallThrough();
+ /// it. If an explicit branch to the fallthrough block is not allowed,
+ /// set JumpToFallThrough to be false. Non-null return is a conservative
+ /// answer.
+ MachineBasicBlock *getFallThrough(bool JumpToFallThrough = false);
+
+ /// Return the fallthrough block if the block can implicitly
+ /// transfer control to its successor, whether by a branch or
+ /// a fallthrough. Non-null return is a conservative answer.
+ MachineBasicBlock *getLogicalFallThrough() { return getFallThrough(true); }
/// Return true if the block can implicitly transfer control to the
/// block after it by falling off the end of it. This should return
diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h
index cd3aa938ed870..3cb8d7b35e374 100644
--- a/llvm/include/llvm/CodeGen/MachineFunction.h
+++ b/llvm/include/llvm/CodeGen/MachineFunction.h
@@ -99,9 +99,10 @@ struct MachineFunctionInfo {
/// supplied allocator.
///
/// This function can be overridden in a derived class.
- template <typename Ty>
- static Ty *create(BumpPtrAllocator &Allocator, MachineFunction &MF) {
- return new (Allocator.Allocate<Ty>()) Ty(MF);
+ template <typename FuncInfoTy, typename SubtargetTy>
+ static FuncInfoTy *create(BumpPtrAllocator &Allocator, const Function &F,
+ const SubtargetTy *STI) {
+ return new (Allocator.Allocate<FuncInfoTy>()) FuncInfoTy(F, STI);
}
template <typename Ty>
@@ -280,6 +281,7 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
// Keep track of the function section.
MCSection *Section = nullptr;
+ // Catchpad unwind destination info for wasm EH.
// Keeps track of Wasm exception handling related data. This will be null for
// functions that aren't using a wasm EH personality.
WasmEHFuncInfo *WasmEHInfo = nullptr;
@@ -752,14 +754,12 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
///
template <typename Ty>
Ty *getInfo() {
- if (!MFInfo)
- MFInfo = Ty::template create<Ty>(Allocator, *this);
return static_cast<Ty *>(MFInfo);
}
template <typename Ty>
const Ty *getInfo() const {
- return const_cast<MachineFunction *>(this)->getInfo<Ty>();
+ return static_cast<const Ty *>(MFInfo);
}
template <typename Ty> Ty *cloneInfo(const Ty &Old) {
@@ -768,6 +768,9 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
return static_cast<Ty *>(MFInfo);
}
+ /// Initialize the target specific MachineFunctionInfo
+ void initTargetMachineFunctionInfo(const TargetSubtargetInfo &STI);
+
MachineFunctionInfo *cloneInfoFrom(
const MachineFunction &OrigMF,
const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB) {
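Net effect of the MachineFunction.h hunks: MFInfo is no longer created lazily on the first getInfo<Ty>() call. A sketch of the intended order (call sites assumed):

// 1. The MachineFunction is constructed; MFInfo starts out null.
// 2. MF.initTargetMachineFunctionInfo(STI) runs once, up front, and asks the
//    TargetMachine for the target's info via
//    createMachineFunctionInfo(Allocator, F, &STI).
// 3. Every later MF.getInfo<Ty>() is then a plain static_cast<Ty *>(MFInfo),
//    with no creation on first use.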
diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
index ec0cc763dbac3..942d52f387489 100644
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -1354,7 +1354,8 @@ class MachineInstr
}
bool isCopy() const {
- return getOpcode() == TargetOpcode::COPY;
+ return getOpcode() == TargetOpcode::COPY ||
+ getOpcode() == TargetOpcode::PRED_COPY;
}
bool isFullCopy() const {
@@ -1388,6 +1389,7 @@ class MachineInstr
case TargetOpcode::PHI:
case TargetOpcode::G_PHI:
case TargetOpcode::COPY:
+ case TargetOpcode::PRED_COPY:
case TargetOpcode::INSERT_SUBREG:
case TargetOpcode::SUBREG_TO_REG:
case TargetOpcode::REG_SEQUENCE:
diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index a51f1c753cd02..572217213920d 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -17,6 +17,7 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/IndexedMap.h"
#include "llvm/ADT/PointerUnion.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/iterator_range.h"
@@ -56,11 +57,15 @@ class MachineRegisterInfo {
virtual ~Delegate() = default;
virtual void MRI_NoteNewVirtualRegister(Register Reg) = 0;
+ virtual void MRI_NotecloneVirtualRegister(Register NewReg,
+ Register SrcReg) {
+ MRI_NoteNewVirtualRegister(NewReg);
+ }
};
private:
MachineFunction *MF;
- Delegate *TheDelegate = nullptr;
+ SmallPtrSet<Delegate *, 1> TheDelegates;
/// True if subregister liveness is tracked.
const bool TracksSubRegLiveness;
@@ -154,19 +159,28 @@ class MachineRegisterInfo {
void resetDelegate(Delegate *delegate) {
// Ensure another delegate does not take over unless the current
- // delegate first unattaches itself. If we ever need to multicast
- // notifications, we will need to change to using a list.
- assert(TheDelegate == delegate &&
- "Only the current delegate can perform reset!");
- TheDelegate = nullptr;
+ // delegate first unattaches itself.
+ assert(TheDelegates.count(delegate) &&
+ "Only an existing delegate can perform reset!");
+ TheDelegates.erase(delegate);
}
- void setDelegate(Delegate *delegate) {
- assert(delegate && !TheDelegate &&
- "Attempted to set delegate to null, or to change it without "
+ void addDelegate(Delegate *delegate) {
+ assert(delegate && !TheDelegates.count(delegate) &&
+ "Attempted to add null delegate, or to change it without "
"first resetting it!");
- TheDelegate = delegate;
+ TheDelegates.insert(delegate);
+ }
+
+ void noteNewVirtualRegister(Register Reg) {
+ for (auto *TheDelegate : TheDelegates)
+ TheDelegate->MRI_NoteNewVirtualRegister(Reg);
+ }
+
+ void noteCloneVirtualRegister(Register NewReg, Register SrcReg) {
+ for (auto *TheDelegate : TheDelegates)
+ TheDelegate->MRI_NotecloneVirtualRegister(NewReg, SrcReg);
}
//===--------------------------------------------------------------------===//
@@ -900,6 +914,18 @@ class MachineRegisterInfo {
/// of reserved registers before allocation begins.
void freezeReservedRegs(const MachineFunction&);
+ /// reserveReg -- Mark a register as reserved so checks like isAllocatable
+ /// will not suggest using it. This should not be used during the middle
+ /// of a function walk, or when liveness info is available.
+ void reserveReg(MCRegister PhysReg, const TargetRegisterInfo *TRI) {
+ assert(reservedRegsFrozen() &&
+ "Reserved registers haven't been frozen yet. ");
+ MCRegAliasIterator R(PhysReg, TRI, true);
+
+ for (; R.isValid(); ++R)
+ ReservedRegs.set(*R);
+ }
+
/// reservedRegsFrozen - Returns true after freezeReservedRegs() was called
/// to ensure the set of reserved registers stays constant.
bool reservedRegsFrozen() const {
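Because the single TheDelegate pointer became a set, several listeners can observe virtual-register events at once. A minimal sketch (delegate body hypothetical):

struct VRegLogger final : llvm::MachineRegisterInfo::Delegate {
  void MRI_NoteNewVirtualRegister(llvm::Register Reg) override {
    // e.g. record Reg in a pass-local remap table
  }
};

void attachTwoListeners(llvm::MachineRegisterInfo &MRI) {
  static VRegLogger A, B;
  MRI.addDelegate(&A);
  MRI.addDelegate(&B); // the old setDelegate() asserted on a second listener
  // ... each listener later detaches itself:
  MRI.resetDelegate(&A);
  MRI.resetDelegate(&B);
}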
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 40663f95fa0a8..665222efaeacd 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -98,10 +98,11 @@ struct ExtAddrMode {
class TargetInstrInfo : public MCInstrInfo {
public:
TargetInstrInfo(unsigned CFSetupOpcode = ~0u, unsigned CFDestroyOpcode = ~0u,
- unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u)
+ unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u,
+ unsigned CopyOpcode = TargetOpcode::COPY)
: CallFrameSetupOpcode(CFSetupOpcode),
CallFrameDestroyOpcode(CFDestroyOpcode), CatchRetOpcode(CatchRetOpcode),
- ReturnOpcode(ReturnOpcode) {}
+ ReturnOpcode(ReturnOpcode), CopyOpcode(CopyOpcode) {}
TargetInstrInfo(const TargetInstrInfo &) = delete;
TargetInstrInfo &operator=(const TargetInstrInfo &) = delete;
virtual ~TargetInstrInfo();
@@ -240,6 +241,7 @@ class TargetInstrInfo : public MCInstrInfo {
unsigned getCatchReturnOpcode() const { return CatchRetOpcode; }
unsigned getReturnOpcode() const { return ReturnOpcode; }
+ unsigned getCopyOpcode() const { return CopyOpcode; }
/// Returns the actual stack pointer adjustment made by an instruction
/// as part of a call sequence. By default, only call frame setup/destroy
@@ -1060,24 +1062,36 @@ class TargetInstrInfo : public MCInstrInfo {
/// Store the specified register of the given register class to the specified
/// stack frame index. The store instruction is to be added to the given
/// machine basic block before the specified machine instruction. If isKill
- /// is true, the register operand is the last use and must be marked kill.
+ /// is true, the register operand is the last use and must be marked kill. If
+ /// \p SrcReg is being directly spilled as part of assigning a virtual
+ /// register, \p VReg is the register being assigned. This additional register
+ /// argument is needed for certain targets when invoked from RegAllocFast to
+ /// map the spilled physical register to its virtual register. A null register
+ /// can be passed elsewhere.
virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
Register SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
+ const TargetRegisterInfo *TRI,
+ Register VReg) const {
llvm_unreachable("Target didn't implement "
"TargetInstrInfo::storeRegToStackSlot!");
}
/// Load the specified register of the given register class from the specified
/// stack frame index. The load instruction is to be added to the given
- /// machine basic block before the specified machine instruction.
+ /// machine basic block before the specified machine instruction. If \p
+ /// DestReg is being directly reloaded as part of assigning a virtual
+ /// register, \p VReg is the register being assigned. This additional register
+ /// argument is needed for certain targets when invoked from RegAllocFast to
+ /// map the loaded physical register to its virtual register. A null register
+ /// can be passed elsewhere.
virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
+ const TargetRegisterInfo *TRI,
+ Register VReg) const {
llvm_unreachable("Target didn't implement "
"TargetInstrInfo::loadRegFromStackSlot!");
}
@@ -1899,14 +1913,48 @@ class TargetInstrInfo : public MCInstrInfo {
return false;
}
+ /// Helper function for inserting a COPY to \p Dst at insertion point \p InsPt
+ /// in \p MBB block.
+ MachineInstr *buildCopy(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsPt, const DebugLoc &DL,
+ Register Dst) const {
+ return BuildMI(MBB, InsPt, DL, get(getCopyOpcode()), Dst);
+ }
+
+ /// Helper function for inserting a COPY to \p Dst from \p Src at insertion
+ /// point \p InsPt in \p MBB block.
+ MachineInstr *buildCopy(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsPt, const DebugLoc &DL,
+ Register Dst, Register Src, unsigned Flags = 0,
+ unsigned SubReg = 0) const {
+ return BuildMI(MBB, InsPt, DL, get(getCopyOpcode()), Dst)
+ .addReg(Src, Flags, SubReg);
+ }
+
+ /// Helper function for inserting a COPY to \p Dst from \p Src at insertion
+ /// point \p InsPt in \p MBB block. Get the Debug Location from \p MIMD.
+ MachineInstrBuilder buildCopy(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsPt,
+ const MIMetadata &MIMD, Register Dst,
+ Register Src, unsigned Flags = 0,
+ unsigned SubReg = 0) const {
+ MachineFunction &MF = *MBB.getParent();
+ MachineInstr *MI =
+ MF.CreateMachineInstr(get(getCopyOpcode()), MIMD.getDL());
+ MBB.insert(InsPt, MI);
+ return MachineInstrBuilder(MF, MI)
+ .setPCSections(MIMD.getPCSections())
+ .addReg(Dst, RegState::Define)
+ .addReg(Src, Flags, SubReg);
+ }
+
/// During PHI elimination, lets the target make necessary checks and
/// insert the copy to the PHI destination register in a target-specific
/// manner.
virtual MachineInstr *createPHIDestinationCopy(
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
const DebugLoc &DL, Register Src, Register Dst) const {
- return BuildMI(MBB, InsPt, DL, get(TargetOpcode::COPY), Dst)
- .addReg(Src);
+ return buildCopy(MBB, InsPt, DL, Dst, Src);
}
/// During PHI elimination, lets the target make necessary checks and
@@ -1917,8 +1965,7 @@ class TargetInstrInfo : public MCInstrInfo {
const DebugLoc &DL, Register Src,
unsigned SrcSubReg,
Register Dst) const {
- return BuildMI(MBB, InsPt, DL, get(TargetOpcode::COPY), Dst)
- .addReg(Src, 0, SrcSubReg);
+ return buildCopy(MBB, InsPt, DL, Dst, Src, 0, SrcSubReg);
}
/// Returns a \p outliner::OutlinedFunction struct containing target-specific
@@ -2019,6 +2066,7 @@ class TargetInstrInfo : public MCInstrInfo {
unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode;
unsigned CatchRetOpcode;
unsigned ReturnOpcode;
+ unsigned CopyOpcode;
};
/// Provide DenseMapInfo for TargetInstrInfo::RegSubRegPair.
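The buildCopy() helpers centralize the choice of copy opcode: generic code calls them instead of spelling out TargetOpcode::COPY, and a target whose TargetInstrInfo was constructed with CopyOpcode = PRED_COPY transparently gets the predicated form. A usage sketch:

// Sketch: emit a register-to-register copy without naming the opcode.
llvm::MachineInstr *emitCopy(const llvm::TargetInstrInfo &TII,
                             llvm::MachineBasicBlock &MBB,
                             llvm::MachineBasicBlock::iterator InsPt,
                             const llvm::DebugLoc &DL, llvm::Register Dst,
                             llvm::Register Src) {
  return TII.buildCopy(MBB, InsPt, DL, Dst, Src);
}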
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index a76fb97a14dc5..41f90d5152235 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1049,6 +1049,10 @@ class TargetLoweringBase {
// value representing memory location
PointerUnion<const Value *, const PseudoSourceValue *> ptrVal;
+ // Fallback address space for use if ptrVal is nullptr. None means unknown
+ // address space.
+ Optional<unsigned> fallbackAddressSpace;
+
int offset = 0; // offset off of ptrVal
uint64_t size = 0; // the size of the memory location
// (taken from memVT if zero)
@@ -4050,23 +4054,6 @@ class TargetLowering : public TargetLoweringBase {
return false;
}
- /// Allows the target to handle physreg-carried dependency
- /// in target-specific way. Used from the ScheduleDAGSDNodes to decide whether
- /// to add the edge to the dependency graph.
- /// Def - input: Selection DAG node defininfg physical register
- /// User - input: Selection DAG node using physical register
- /// Op - input: Number of User operand
- /// PhysReg - inout: set to the physical register if the edge is
- /// necessary, unchanged otherwise
- /// Cost - inout: physical register copy cost.
- /// Returns 'true' is the edge is necessary, 'false' otherwise
- virtual bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op,
- const TargetRegisterInfo *TRI,
- const TargetInstrInfo *TII,
- unsigned &PhysReg, int &Cost) const {
- return false;
- }
-
/// Target-specific combining of register parts into its original value
virtual SDValue
joinRegisterPartsIntoValue(SelectionDAG &DAG, const SDLoc &DL,
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index d55f88dd50e57..2e8527b00dcc2 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -905,6 +905,10 @@ class TargetRegisterInfo : public MCRegisterInfo {
/// (3) Bottom-up allocation is no longer guaranteed to optimally color.
virtual bool reverseLocalAssignment() const { return false; }
+ /// Add the allocation priority to global and split ranges as well as the
+ /// local ranges when registers are added to the queue.
+ virtual bool addAllocPriorityToGlobalRanges() const { return false; }
+
/// Allow the target to override the cost of using a callee-saved register for
/// the first time. Default value of 0 means we will use a callee-saved
/// register if it is available.
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
index d37d5c053ca4f..9a6167d0f5887 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -193,6 +193,7 @@ enum OMPTgtExecModeFlags : int8_t {
OMP_TGT_EXEC_MODE_GENERIC_SPMD =
OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD,
OMP_TGT_EXEC_MODE_SPMD_NO_LOOP = 1 << 2,
+ OMP_TGT_EXEC_MODE_SPMD_BIG_JUMP_LOOP = OMP_TGT_EXEC_MODE_SPMD_NO_LOOP | 1,
OMP_TGT_EXEC_MODE_XTEAM_RED = 1 << 3,
LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ OMP_TGT_EXEC_MODE_XTEAM_RED)
};
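For reference, the flag arithmetic, assuming the elided enumerators keep their upstream values (GENERIC == 1 << 0, SPMD == 1 << 1):

// SPMD_NO_LOOP       == 1 << 2       == 0b100
// SPMD_BIG_JUMP_LOOP == (1 << 2) | 1 == 0b101
// The low bit distinguishes the big-jump-loop variant, while any test for
// the SPMD_NO_LOOP bit still matches both modes.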
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index a9fdf97782e8e..6da08a06b46c0 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -531,6 +531,38 @@ __OMP_RTL(__kmpc_xteamr_f_16x64, false, Void, Float, FloatPtr, FloatPtr, Int32Pt
__OMP_RTL(__kmpc_xteamr_d_16x64, false, Void, Double, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32)
+__OMP_RTL(__kmpc_xteamr_ui_8x64, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32)
+
+__OMP_RTL(__kmpc_xteamr_ul_8x64, false, Void, Int64, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32)
+
+__OMP_RTL(__kmpc_xteamr_f_8x64, false, Void, Float, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32)
+
+__OMP_RTL(__kmpc_xteamr_d_8x64, false, Void, Double, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32)
+
+__OMP_RTL(__kmpc_xteamr_ui_4x64, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32)
+
+__OMP_RTL(__kmpc_xteamr_ul_4x64, false, Void, Int64, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32)
+
+__OMP_RTL(__kmpc_xteamr_f_4x64, false, Void, Float, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32)
+
+__OMP_RTL(__kmpc_xteamr_d_4x64, false, Void, Double, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32)
+
+__OMP_RTL(__kmpc_xteamr_ui_2x64, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32)
+
+__OMP_RTL(__kmpc_xteamr_ul_2x64, false, Void, Int64, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32)
+
+__OMP_RTL(__kmpc_xteamr_f_2x64, false, Void, Float, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32)
+
+__OMP_RTL(__kmpc_xteamr_d_2x64, false, Void, Double, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32)
+
+__OMP_RTL(__kmpc_xteamr_ui_1x64, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32)
+
+__OMP_RTL(__kmpc_xteamr_ul_1x64, false, Void, Int64, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32)
+
+__OMP_RTL(__kmpc_xteamr_f_1x64, false, Void, Float, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32)
+
+__OMP_RTL(__kmpc_xteamr_d_1x64, false, Void, Double, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32)
+
__OMP_RTL(__last, false, Void, )
#undef __OMP_RTL
diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h
index 24da08d70b726..9b210c9317035 100644
--- a/llvm/include/llvm/IR/Module.h
+++ b/llvm/include/llvm/IR/Module.h
@@ -516,6 +516,8 @@ class LLVM_EXTERNAL_VISIBILITY Module {
void addModuleFlag(MDNode *Node);
/// Like addModuleFlag but replaces the old module flag if it already exists.
void setModuleFlag(ModFlagBehavior Behavior, StringRef Key, Metadata *Val);
+ void setModuleFlag(ModFlagBehavior Behavior, StringRef Key, Constant *Val);
+ void setModuleFlag(ModFlagBehavior Behavior, StringRef Key, uint32_t Val);
/// @}
/// @name Materialization
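The new overloads save callers from wrapping integers in Metadata by hand. A usage sketch (flag name hypothetical):

#include "llvm/IR/Module.h"

void tagModule(llvm::Module &M) {
  // Resolves to the new uint32_t overload declared above.
  M.setModuleFlag(llvm::Module::Error, "some-abi-version", 500u);
}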
diff --git a/llvm/include/llvm/MC/MCSubtargetInfo.h b/llvm/include/llvm/MC/MCSubtargetInfo.h
index 0f33d3b6a2398..d5d3db0891e3f 100644
--- a/llvm/include/llvm/MC/MCSubtargetInfo.h
+++ b/llvm/include/llvm/MC/MCSubtargetInfo.h
@@ -230,6 +230,10 @@ class MCSubtargetInfo {
return Found != ProcDesc.end() && StringRef(Found->Key) == CPU;
}
+ ArrayRef<SubtargetSubTypeKV> getAllProcessorDescriptions() const {
+ return ProcDesc;
+ }
+
virtual unsigned getHwMode() const { return 0; }
/// Return the cache size in bytes for the given level of cache.
diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
index 5fb7400b7d024..15f968bab91c3 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -117,6 +117,10 @@ HANDLE_TARGET_OPCODE(DBG_KILL)
/// used to copy between subregisters of virtual registers.
HANDLE_TARGET_OPCODE(COPY)
+/// PRED_COPY - Target-independent register copy with predication.
+/// Some targets require it for special handling of certain register copies.
+ HANDLE_TARGET_OPCODE(PRED_COPY)
+
/// BUNDLE - This instruction represents an instruction bundle. Instructions
/// which immediately follow a BUNDLE instruction which are marked with
/// 'InsideBundle' flag are inside the bundle.
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 9f29e9faf385b..08f6418f6810b 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -126,8 +126,8 @@ GIDefMatchData<"std::function">;
def unsigned_matchinfo: GIDefMatchData<"unsigned">;
def copy_prop : GICombineRule<
- (defs root:$d),
- (match (COPY $d, $s):$mi,
+ (defs root:$mi),
+ (match (wip_match_opcode COPY, PRED_COPY):$mi,
[{ return Helper.matchCombineCopy(*${mi}); }]),
(apply [{ Helper.applyCombineCopy(*${mi}); }])>;
diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td
index a425b0d2bbea1..0f9d0f7b05159 100644
--- a/llvm/include/llvm/Target/Target.td
+++ b/llvm/include/llvm/Target/Target.td
@@ -1265,6 +1265,15 @@ def COPY : StandardPseudoInstruction {
let isAsCheapAsAMove = true;
let hasNoSchedulingInfo = false;
}
+def PRED_COPY : StandardPseudoInstruction {
+ let OutOperandList = (outs unknown:$dst);
+ let InOperandList = (ins unknown:$src);
+ let AsmString = "PRED_COPY";
+ let hasSideEffects = false;
+ let isAsCheapAsAMove = true;
+ let hasNoSchedulingInfo = false;
+ let isPredicable = true;
+}
def BUNDLE : StandardPseudoInstruction {
let OutOperandList = (outs);
let InOperandList = (ins variable_ops);
diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index d45be68e8a235..55dadffa88ad2 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -20,6 +20,7 @@
#include "llvm/ADT/Triple.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/Support/Allocator.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/PGOOptions.h"
@@ -66,6 +67,7 @@ class PassManagerBase;
}
using legacy::PassManagerBase;
+struct MachineFunctionInfo;
namespace yaml {
struct MachineFunctionInfo;
}
@@ -139,6 +141,13 @@ class TargetMachine {
return nullptr;
}
+ /// Create the target's instance of MachineFunctionInfo
+ virtual MachineFunctionInfo *
+ createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F,
+ const TargetSubtargetInfo *STI) const {
+ return nullptr;
+ }
+
/// Allocate and return a default initialized instance of the YAML
/// representation for the MachineFunctionInfo.
virtual yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const {
@@ -507,6 +516,9 @@ class LLVMTargetMachine : public TargetMachine {
/// The default variant to use in unqualified `asm` instructions.
/// If this returns 0, `asm "$(foo$|bar$)"` will evaluate to `asm "foo"`.
virtual int unqualifiedInlineAsmVariant() const { return 0; }
+
+ // MachineRegisterInfo callback function
+ virtual void registerMachineRegisterInfoCallback(MachineFunction &MF) const {}
};
/// Helper method for getting the code model, returning Default if
diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index 61c26dfabed0b..5d1ecdc147f15 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
combineOptionalValuesInAAValueLatice(const Optional<Value *> &A,
/// Helper to represent an access offset and size, with logic to deal with
/// uncertainty and check for overlapping accesses.
-struct OffsetAndSize {
+struct RangeTy {
int64_t Offset = Unassigned;
int64_t Size = Unassigned;
- OffsetAndSize(int64_t Offset, int64_t Size) : Offset(Offset), Size(Size) {}
- OffsetAndSize() = default;
- static OffsetAndSize getUnknown() { return OffsetAndSize{Unknown, Unknown}; }
+ RangeTy(int64_t Offset, int64_t Size) : Offset(Offset), Size(Size) {}
+ RangeTy() = default;
+ static RangeTy getUnknown() { return RangeTy{Unknown, Unknown}; }
/// Return true if offset or size are unknown.
bool offsetOrSizeAreUnknown() const {
- return Offset == OffsetAndSize::Unknown || Size == OffsetAndSize::Unknown;
+ return Offset == RangeTy::Unknown || Size == RangeTy::Unknown;
}
/// Return true if offset and size are unknown, thus this is the default
/// unknown object.
bool offsetAndSizeAreUnknown() const {
- return Offset == OffsetAndSize::Unknown && Size == OffsetAndSize::Unknown;
+ return Offset == RangeTy::Unknown && Size == RangeTy::Unknown;
}
/// Return true if the offset and size are unassigned.
bool isUnassigned() const {
- assert((Offset == OffsetAndSize::Unassigned) ==
- (Size == OffsetAndSize::Unassigned) &&
+ assert((Offset == RangeTy::Unassigned) == (Size == RangeTy::Unassigned) &&
"Inconsistent state!");
- return Offset == OffsetAndSize::Unassigned;
+ return Offset == RangeTy::Unassigned;
}
/// Return true if this offset and size pair might describe an address that
- /// overlaps with \p OAS.
- bool mayOverlap(const OffsetAndSize &OAS) const {
+ /// overlaps with \p Range.
+ bool mayOverlap(const RangeTy &Range) const {
// Any unknown value and we are giving up -> overlap.
- if (offsetOrSizeAreUnknown() || OAS.offsetOrSizeAreUnknown())
+ if (offsetOrSizeAreUnknown() || Range.offsetOrSizeAreUnknown())
return true;
// Check if one offset point is in the other interval [offset,
// offset+size].
- return OAS.Offset + OAS.Size > Offset && OAS.Offset < Offset + Size;
+ return Range.Offset + Range.Size > Offset && Range.Offset < Offset + Size;
+ }
+
+ RangeTy &operator&=(const RangeTy &R) {
+ if (Offset == Unassigned)
+ Offset = R.Offset;
+ else if (R.Offset != Unassigned && R.Offset != Offset)
+ Offset = Unknown;
+
+ if (Size == Unassigned)
+ Size = R.Size;
+ else if (Size == Unknown || R.Size == Unknown)
+ Size = Unknown;
+ else if (R.Size != Unassigned)
+ Size = std::max(Size, R.Size);
+
+ return *this;
+ }
+
+ /// Comparison for sorting ranges by offset.
+ ///
+ /// Returns true if the offset \p L is less than that of \p R.
+ inline static bool OffsetLessThan(const RangeTy &L, const RangeTy &R) {
+ return L.Offset < R.Offset;
}
/// Constants used to represent special offsets or sizes.
@@ -258,19 +280,22 @@ struct OffsetAndSize {
static constexpr int64_t Unknown = -2;
};
-inline bool operator==(const OffsetAndSize &A, const OffsetAndSize &B) {
- return A.Offset == B.Offset && A.Size == B.Size;
+inline raw_ostream &operator<<(raw_ostream &OS, const RangeTy &R) {
+ OS << "[" << R.Offset << ", " << R.Size << "]";
+ return OS;
}
-inline bool operator!=(const OffsetAndSize &A, const OffsetAndSize &B) {
- return !(A == B);
+inline bool operator==(const RangeTy &A, const RangeTy &B) {
+ return A.Offset == B.Offset && A.Size == B.Size;
}
+inline bool operator!=(const RangeTy &A, const RangeTy &B) { return !(A == B); }
+
/// Return the initial value of \p Obj with type \p Ty if that is a constant.
Constant *getInitialValueForObj(Value &Obj, Type &Ty,
const TargetLibraryInfo *TLI,
const DataLayout &DL,
- OffsetAndSize *OASPtr = nullptr);
+ RangeTy *RangePtr = nullptr);
/// Collect all potential underlying objects of \p Ptr at position \p CtxI in
/// \p Objects. Assumed information is used and dependences onto \p QueryingAA
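RangeTy::operator&= above is a meet: agreeing offsets are kept, disagreeing offsets degrade to Unknown, and known sizes take the maximum. A worked example (values chosen for illustration):

void rangeMeetExample() {
  llvm::AA::RangeTy A(/*Offset=*/0, /*Size=*/8);
  A &= llvm::AA::RangeTy(0, 16); // same offset, larger size -> A == [0, 16]
  A &= llvm::AA::RangeTy(4, 8);  // offsets disagree -> A.Offset == Unknown,
                                 // A.Size stays max(16, 8) == 16
}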
@@ -1764,7 +1789,10 @@ struct Attributor {
/// Try to simplify \p IRP and in the scope \p S. If successful, true is
/// returned and all potential values \p IRP can take are put into \p Values.
- /// If false is returned no other information is valid.
+ /// If the result in \p Values contains select or PHI instructions it means
+ /// those could not be simplified to a single value. Recursive calls with
+ /// these instructions will yield their respective potential values. If false
+ /// is returned no other information is valid.
bool getAssumedSimplifiedValues(const IRPosition &IRP,
const AbstractAttribute *AA,
SmallVectorImpl<AA::ValueAndContext> &Values,
@@ -4972,7 +5000,7 @@ struct AAPointerInfo : public AbstractAttribute {
AAPointerInfo(const IRPosition &IRP) : AbstractAttribute(IRP) {}
enum AccessKind {
- // First two bits to distinguish may and must accesses
+ // First two bits to distinguish may and must accesses.
AK_MUST = 1 << 0,
AK_MAY = 1 << 1,
@@ -4981,6 +5009,11 @@ struct AAPointerInfo : public AbstractAttribute {
AK_W = 1 << 3,
AK_RW = AK_R | AK_W,
+ // One special case for assumptions about memory content. These
+ // are neither reads nor writes. They are however always modeled
+ // as read to avoid using them for write removal.
+ AK_ASSUMPTION = (1 << 4) | AK_MUST,
+
// Helper for easy access.
AK_MAY_READ = AK_MAY | AK_R,
AK_MAY_WRITE = AK_MAY | AK_W,
@@ -4990,41 +5023,221 @@ struct AAPointerInfo : public AbstractAttribute {
AK_MUST_READ_WRITE = AK_MUST | AK_R | AK_W,
};
+ /// A container for a list of ranges.
+ struct RangeList {
+ // The set of ranges rarely contains more than one element, and is unlikely
+ // to contain more than say four elements. So we find the middle-ground with
+ // a sorted vector. This avoids hard-coding a rarely used number like "four"
+ // into every instance of a SmallSet.
+ using RangeTy = AA::RangeTy;
+ using VecTy = SmallVector<RangeTy>;
+ using iterator = VecTy::iterator;
+ using const_iterator = VecTy::const_iterator;
+ VecTy Ranges;
+
+ RangeList(const RangeTy &R) { Ranges.push_back(R); }
+ RangeList(ArrayRef<int64_t> Offsets, int64_t Size) {
+ Ranges.reserve(Offsets.size());
+ for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
+ assert(((i + 1 == e) || Offsets[i] < Offsets[i + 1]) &&
+ "Expected strictly ascending offsets.");
+ Ranges.emplace_back(Offsets[i], Size);
+ }
+ }
+ RangeList() = default;
+
+ iterator begin() { return Ranges.begin(); }
+ iterator end() { return Ranges.end(); }
+ const_iterator begin() const { return Ranges.begin(); }
+ const_iterator end() const { return Ranges.end(); }
+
+ // Helpers required for std::set_difference
+ using value_type = RangeTy;
+ void push_back(const RangeTy &R) {
+ assert((Ranges.empty() || RangeTy::OffsetLessThan(Ranges.back(), R)) &&
+ "Ensure the last element is the greatest.");
+ Ranges.push_back(R);
+ }
+
+ /// Copy ranges from \p L that are not in \p R, into \p D.
+ static void set_difference(const RangeList &L, const RangeList &R,
+ RangeList &D) {
+ std::set_difference(L.begin(), L.end(), R.begin(), R.end(),
+ std::back_inserter(D), RangeTy::OffsetLessThan);
+ }
+
+ unsigned size() const { return Ranges.size(); }
+
+ bool operator==(const RangeList &OI) const { return Ranges == OI.Ranges; }
+
+ /// Merge the ranges in \p RHS into the current ranges.
+ /// - Merging a list of unknown ranges makes the current list unknown.
+ /// - Ranges with the same offset are merged according to RangeTy::operator&
+ /// \return true if the current RangeList changed.
+ bool merge(const RangeList &RHS) {
+ if (isUnknown())
+ return false;
+ if (RHS.isUnknown()) {
+ setUnknown();
+ return true;
+ }
+
+ if (Ranges.empty()) {
+ Ranges = RHS.Ranges;
+ return true;
+ }
+
+ bool Changed = false;
+ auto LPos = Ranges.begin();
+ for (auto &R : RHS.Ranges) {
+ auto Result = insert(LPos, R);
+ if (isUnknown())
+ return true;
+ LPos = Result.first;
+ Changed |= Result.second;
+ }
+ return Changed;
+ }
+
+ /// Insert \p R at the given iterator \p Pos, and merge if necessary.
+ ///
+ /// This assumes that all ranges before \p Pos are OffsetLessThan \p R, and
+ /// then maintains the sorted order for the suffix list.
+ ///
+ /// \return The place of insertion and true iff anything changed.
+ std::pair<iterator, bool> insert(iterator Pos, const RangeTy &R) {
+ if (isUnknown())
+ return std::make_pair(Ranges.begin(), false);
+ if (R.offsetOrSizeAreUnknown()) {
+ return std::make_pair(setUnknown(), true);
+ }
+
+ // Maintain this as a sorted vector of unique entries.
+ auto LB = std::lower_bound(Pos, Ranges.end(), R, RangeTy::OffsetLessThan);
+ if (LB == Ranges.end() || LB->Offset != R.Offset)
+ return std::make_pair(Ranges.insert(LB, R), true);
+ bool Changed = *LB != R;
+ *LB &= R;
+ if (LB->offsetOrSizeAreUnknown())
+ return std::make_pair(setUnknown(), true);
+ return std::make_pair(LB, Changed);
+ }
+
+ /// Insert the given range \p R, maintaining sorted order.
+ ///
+ /// \return The place of insertion and true iff anything changed.
+ std::pair<iterator, bool> insert(const RangeTy &R) {
+ return insert(Ranges.begin(), R);
+ }
+
+ /// Add the increment \p Inc to the offset of every range.
+ void addToAllOffsets(int64_t Inc) {
+ assert(!isUnassigned() &&
+ "Cannot increment if the offset is not yet computed!");
+ if (isUnknown())
+ return;
+ for (auto &R : Ranges) {
+ R.Offset += Inc;
+ }
+ }
+
+ /// Return true iff there is exactly one range and it is known.
+ bool isUnique() const {
+ return Ranges.size() == 1 && !Ranges.front().offsetOrSizeAreUnknown();
+ }
+
+ /// Return the unique range, assuming it exists.
+ const RangeTy &getUnique() const {
+ assert(isUnique() && "No unique range to return!");
+ return Ranges.front();
+ }
+
+ /// Return true iff the list contains an unknown range.
+ bool isUnknown() const {
+ if (isUnassigned())
+ return false;
+ if (Ranges.front().offsetOrSizeAreUnknown()) {
+ assert(Ranges.size() == 1 && "Unknown is a singleton range.");
+ return true;
+ }
+ return false;
+ }
+
+ /// Discard all ranges and insert a single unknown range.
+ iterator setUnknown() {
+ Ranges.clear();
+ Ranges.push_back(RangeTy::getUnknown());
+ return Ranges.begin();
+ }
+
+ /// Return true if no ranges have been inserted.
+ bool isUnassigned() const { return Ranges.size() == 0; }
+ };
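To make the RangeList invariants above concrete: the list is a vector kept
sorted by offset with unique offsets, and a single unknown range poisons the
whole list into the singleton "unknown" state. A simplified model with
standard containers (Range and SimpleRangeList are illustrative stand-ins,
not the LLVM types, and the conflict handling is cruder than RangeTy's
operator&):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Stand-in for AA::RangeTy; Offset/Size of -1 mean "unknown".
    struct Range {
      int64_t Offset, Size;
      bool isUnknown() const { return Offset == -1 || Size == -1; }
      bool operator<(const Range &R) const { return Offset < R.Offset; }
      bool operator==(const Range &R) const {
        return Offset == R.Offset && Size == R.Size;
      }
    };

    struct SimpleRangeList {
      std::vector<Range> Ranges; // sorted by offset, offsets unique

      bool isUnknown() const {
        return !Ranges.empty() && Ranges.front().isUnknown();
      }
      void setUnknown() { Ranges.assign(1, Range{-1, -1}); }

      // Same shape as RangeList::insert: keep sorted/unique, and let a
      // single unknown range poison the whole list.
      void insert(Range R) {
        if (isUnknown())
          return;
        if (R.isUnknown())
          return setUnknown();
        auto LB = std::lower_bound(Ranges.begin(), Ranges.end(), R);
        if (LB == Ranges.end() || LB->Offset != R.Offset)
          Ranges.insert(LB, R);
        else if (!(*LB == R))
          setUnknown(); // conflicting entries at one offset lose precision
      }
    };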
+
/// An access description.
struct Access {
- Access(Instruction *I, Optional<Value *> Content, AccessKind Kind, Type *Ty)
- : LocalI(I), RemoteI(I), Content(Content), Kind(Kind), Ty(Ty) {
+ Access(Instruction *I, int64_t Offset, int64_t Size,
+ Optional<Value *> Content, AccessKind Kind, Type *Ty)
+ : LocalI(I), RemoteI(I), Content(Content), Ranges(Offset, Size),
+ Kind(Kind), Ty(Ty) {
verify();
}
- Access(Instruction *LocalI, Instruction *RemoteI, Optional<Value *> Content,
- AccessKind Kind, Type *Ty)
- : LocalI(LocalI), RemoteI(RemoteI), Content(Content), Kind(Kind),
- Ty(Ty) {
+ Access(Instruction *LocalI, Instruction *RemoteI, const RangeList &Ranges,
+ Optional<Value *> Content, AccessKind K, Type *Ty)
+ : LocalI(LocalI), RemoteI(RemoteI), Content(Content), Ranges(Ranges),
+ Kind(K), Ty(Ty) {
+ if (Ranges.size() > 1) {
+ Kind = AccessKind(Kind | AK_MAY);
+ Kind = AccessKind(Kind & ~AK_MUST);
+ }
+ verify();
+ }
+ Access(Instruction *LocalI, Instruction *RemoteI, int64_t Offset,
+ int64_t Size, Optional<Value *> Content, AccessKind Kind, Type *Ty)
+ : LocalI(LocalI), RemoteI(RemoteI), Content(Content),
+ Ranges(Offset, Size), Kind(Kind), Ty(Ty) {
verify();
}
Access(const Access &Other) = default;
- Access(const Access &&Other)
- : LocalI(Other.LocalI), RemoteI(Other.RemoteI), Content(Other.Content),
- Kind(Other.Kind), Ty(Other.Ty) {}
Access &operator=(const Access &Other) = default;
bool operator==(const Access &R) const {
- return LocalI == R.LocalI && RemoteI == R.RemoteI &&
+ return LocalI == R.LocalI && RemoteI == R.RemoteI && Ranges == R.Ranges &&
Content == R.Content && Kind == R.Kind;
}
bool operator!=(const Access &R) const { return !(*this == R); }
Access &operator&=(const Access &R) {
assert(RemoteI == R.RemoteI && "Expected same instruction!");
+ assert(LocalI == R.LocalI && "Expected same instruction!");
+
+ // Note that every Access object corresponds to a unique Value, and only
+ // accesses to the same Value are merged. Hence we assume that all ranges
+ // are the same size. If ranges could be of different sizes, the contents
+ // would have to be dropped.
+ Ranges.merge(R.Ranges);
Content =
AA::combineOptionalValuesInAAValueLatice(Content, R.Content, Ty);
+
+ // Combine the access kind, which results in a bitwise union.
+ // If there is more than one range, then this must be a MAY.
+ // If we combine a may and a must access, we clear the must bit.
Kind = AccessKind(Kind | R.Kind);
+ if ((Kind & AK_MAY) || Ranges.size() > 1) {
+ Kind = AccessKind(Kind | AK_MAY);
+ Kind = AccessKind(Kind & ~AK_MUST);
+ }
+ verify();
return *this;
}
void verify() {
assert(isMustAccess() + isMayAccess() == 1 &&
"Expect must or may access, not both.");
+ assert(isAssumption() + isWrite() <= 1 &&
+ "Expect assumption access or write access, never both.");
+ assert((isMayAccess() || Ranges.size() == 1) &&
+ "Cannot be a must access if there are multiple ranges.");
}
/// Return the access kind.
@@ -5036,8 +5249,25 @@ struct AAPointerInfo : public AbstractAttribute {
/// Return true if this is a write access.
bool isWrite() const { return Kind & AK_W; }
- bool isMustAccess() const { return Kind & AK_MUST; }
- bool isMayAccess() const { return Kind & AK_MAY; }
+ /// Return true if this is a write or an assumption access.
+ bool isWriteOrAssumption() const { return isWrite() || isAssumption(); }
+
+ /// Return true if this is an assumption access.
+ bool isAssumption() const { return Kind == AK_ASSUMPTION; }
+
+ bool isMustAccess() const {
+ bool MustAccess = Kind & AK_MUST;
+ assert((!MustAccess || Ranges.size() < 2) &&
+ "Cannot be a must access if there are multiple ranges.");
+ return MustAccess;
+ }
+
+ bool isMayAccess() const {
+ bool MayAccess = Kind & AK_MAY;
+ assert((MayAccess || Ranges.size() < 2) &&
+ "Cannot be a must access if there are multiple ranges.");
+ return MayAccess;
+ }
/// Return the instruction that causes the access with respect to the local
/// scope of the associated attribute.
@@ -5054,18 +5284,43 @@ struct AAPointerInfo : public AbstractAttribute {
return Content.has_value() && !*Content;
}
+ /// Set the value written to nullptr, i.e., unknown.
+ void setWrittenValueUnknown() { Content = nullptr; }
+
/// Return the type associated with the access, if known.
Type *getType() const { return Ty; }
- /// Return the value writen, if any. As long as
- /// isWrittenValueYetUndetermined return true this function shall not be
- /// called.
- Value *getWrittenValue() const { return *Content; }
+ /// Return the value written, if any.
+ Value *getWrittenValue() const {
+ assert(!isWrittenValueYetUndetermined() &&
+ "Value needs to be determined before accessing it.");
+ return *Content;
+ }
/// Return the written value which can be `llvm::null` if it is not yet
/// determined.
Optional<Value *> getContent() const { return Content; }
+ bool hasUniqueRange() const { return Ranges.isUnique(); }
+ const AA::RangeTy &getUniqueRange() const { return Ranges.getUnique(); }
+
+ /// Add a range accessed by this Access.
+ ///
+ /// If there are multiple ranges, then this is a "may access".
+ void addRange(int64_t Offset, int64_t Size) {
+ Ranges.insert({Offset, Size});
+ if (!hasUniqueRange()) {
+ Kind = AccessKind(Kind | AK_MAY);
+ Kind = AccessKind(Kind & ~AK_MUST);
+ }
+ }
+
+ const RangeList &getRanges() const { return Ranges; }
+
+ using const_iterator = RangeList::const_iterator;
+ const_iterator begin() const { return Ranges.begin(); }
+ const_iterator end() const { return Ranges.end(); }
+
private:
/// The instruction responsible for the access with respect to the local
/// scope of the associated attribute.
@@ -5078,6 +5333,9 @@ struct AAPointerInfo : public AbstractAttribute {
/// cannot be determined.
Optional<Value *> Content;
+ /// Set of potential ranges accessed from the base pointer.
+ RangeList Ranges;
+
/// The access kind, e.g., READ, as bitset (could be more than one).
AccessKind Kind;
@@ -5095,13 +5353,12 @@ struct AAPointerInfo : public AbstractAttribute {
/// See AbstractAttribute::getIdAddr()
const char *getIdAddr() const override { return &ID; }
- /// Call \p CB on all accesses that might interfere with \p OAS and return
+ /// Call \p CB on all accesses that might interfere with \p Range and return
/// true if all such accesses were known and the callback returned true for
/// all of them, false otherwise. An access interferes with an offset-size
/// pair if it might read or write that memory region.
virtual bool forallInterferingAccesses(
- AA::OffsetAndSize OAS,
- function_ref<bool(const Access &, bool)> CB) const = 0;
+ AA::RangeTy Range, function_ref<bool(const Access &, bool)> CB) const = 0;
/// Call \p CB on all accesses that might interfere with \p I and
/// return true if all such accesses were known and the callback returned true
@@ -5113,7 +5370,7 @@ struct AAPointerInfo : public AbstractAttribute {
virtual bool forallInterferingAccesses(
Attributor &A, const AbstractAttribute &QueryingAA, Instruction &I,
function_ref<bool(const Access &, bool)> CB, bool &HasBeenWrittenTo,
- AA::OffsetAndSize *OASPtr = nullptr) const = 0;
+ AA::RangeTy &Range) const = 0;
/// This function should return true if the type of the \p AA is AAPointerInfo
static bool classof(const AbstractAttribute *AA) {
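The interface change above swaps the old OffsetAndSize pair for AA::RangeTy
throughout. To show the updated call shape, here is a hypothetical caller
(it assumes a tree carrying this patch; allAccessesAreReadOnly is an
illustrative name, not an API in the patch):

    #include "llvm/Transforms/IPO/Attributor.h"
    using namespace llvm;

    // Visit every access that may interfere with [Offset, Offset + Size)
    // and keep iterating only while nothing writes.
    bool allAccessesAreReadOnly(const AAPointerInfo &PI, int64_t Offset,
                                int64_t Size) {
      AA::RangeTy Range(Offset, Size);
      return PI.forallInterferingAccesses(
          Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            // Returning false aborts the walk early.
            return !Acc.isWriteOrAssumption();
          });
    }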
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
index d4f32e52cdb8d..f79b0b268bdf3 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -938,7 +938,10 @@ void DwarfExprAST::lowerDIOpReferrer(DwarfExprAST::Node *OpNode) {
if (Referrer->isReg() && Referrer->getReg()) {
auto DWARFRegister = TRI->getDwarfRegNum(Referrer->getReg(), false);
- assert(DWARFRegister != -1 && "No DWARF register for referrer");
+ if (DWARFRegister == -1) {
+ IsImplemented = false;
+ return;
+ }
emitReg(DWARFRegister);
} else if (Referrer->isImm()) {
auto I = Referrer->getImm();
diff --git a/llvm/lib/CodeGen/BranchRelaxation.cpp b/llvm/lib/CodeGen/BranchRelaxation.cpp
index 87d5d053318fc..8767377934656 100644
--- a/llvm/lib/CodeGen/BranchRelaxation.cpp
+++ b/llvm/lib/CodeGen/BranchRelaxation.cpp
@@ -432,7 +432,6 @@ bool BranchRelaxation::fixupConditionalBranch(MachineInstr &MI) {
bool BranchRelaxation::fixupUnconditionalBranch(MachineInstr &MI) {
MachineBasicBlock *MBB = MI.getParent();
- MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
unsigned OldBrSize = TII->getInstSizeInBytes(MI);
MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI);
@@ -446,20 +445,6 @@ bool BranchRelaxation::fixupUnconditionalBranch(MachineInstr &MI) {
MachineBasicBlock *BranchBB = MBB;
- auto RemoveBranch = [&](MachineBasicBlock *MBB) {
- unsigned &BBSize = BlockInfo[MBB->getNumber()].Size;
- int RemovedSize = 0;
- TII->removeBranch(*MBB, &RemovedSize);
- BBSize -= RemovedSize;
- };
-
- auto InsertUncondBranch = [&](MachineBasicBlock *MBB,
- MachineBasicBlock *Dst) {
- TII->insertUnconditionalBranch(*MBB, Dst, DebugLoc());
- // Recalculate the block size.
- BlockInfo[MBB->getNumber()].Size = computeBlockSize(*MBB);
- };
-
// If this was an expanded conditional branch, there is already a single
// unconditional branch in a block.
if (!MBB->empty()) {
@@ -500,13 +485,10 @@ bool BranchRelaxation::fixupUnconditionalBranch(MachineInstr &MI) {
MachineBasicBlock *PrevBB = &*std::prev(DestBB->getIterator());
// Fall through only if PrevBB has no unconditional branch as one of its
// terminators.
- if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond))
- report_fatal_error("Could not analyze terminators.");
- if (!FBB) {
- if (!Cond.empty() && TBB && TBB == DestBB)
- RemoveBranch(PrevBB);
- if (!TBB || (TBB && !Cond.empty()))
- InsertUncondBranch(PrevBB, DestBB);
+ if (auto *FT = PrevBB->getLogicalFallThrough()) {
+ assert(FT == DestBB);
+ TII->insertUnconditionalBranch(*PrevBB, FT, DebugLoc());
+ BlockInfo[PrevBB->getNumber()].Size = computeBlockSize(*PrevBB);
}
// Now, RestoreBB could be placed directly before DestBB.
MF->splice(DestBB->getIterator(), RestoreBB->getIterator());
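The rewritten fall-through check leans on getLogicalFallThrough() instead of
re-running analyzeBranch and patching branches by hand. Judging from the
MachineBasicBlock::getFallThrough change later in this patch, the new helper
is presumably a thin wrapper over the added parameter (sketch only; its
declaration is not in this excerpt):

    // Presumed equivalence: a "logical" fall-through also accepts an
    // explicit branch to the textually next block.
    static MachineBasicBlock *logicalFallThrough(MachineBasicBlock &MBB) {
      return MBB.getFallThrough(/*JumpToFallThrough=*/true);
    }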
diff --git a/llvm/lib/CodeGen/CodeGenCommonISel.cpp b/llvm/lib/CodeGen/CodeGenCommonISel.cpp
index 44cdd8275beda..b9ca26aabdc72 100644
--- a/llvm/lib/CodeGen/CodeGenCommonISel.cpp
+++ b/llvm/lib/CodeGen/CodeGenCommonISel.cpp
@@ -203,7 +203,7 @@ unsigned llvm::getInvertedFPClassTest(unsigned Test) {
static MachineOperand *getSalvageOpsForCopy(const MachineRegisterInfo &MRI,
MachineInstr &Copy) {
- assert(Copy.getOpcode() == TargetOpcode::COPY && "Must be a COPY");
+ assert(Copy.isCopy() && "Must be a COPY");
return &Copy.getOperand(1);
}
@@ -234,6 +234,7 @@ static MachineOperand *salvageDebugInfoImpl(const MachineRegisterInfo &MRI,
case TargetOpcode::G_TRUNC:
return getSalvageOpsForTrunc(MRI, MI, Ops);
case TargetOpcode::COPY:
+ case TargetOpcode::PRED_COPY:
return getSalvageOpsForCopy(MRI, MI);
default:
return nullptr;
diff --git a/llvm/lib/CodeGen/DetectDeadLanes.cpp b/llvm/lib/CodeGen/DetectDeadLanes.cpp
index 565c8b405f828..87cb3c8cca2d9 100644
--- a/llvm/lib/CodeGen/DetectDeadLanes.cpp
+++ b/llvm/lib/CodeGen/DetectDeadLanes.cpp
@@ -134,6 +134,7 @@ static bool lowersToCopies(const MachineInstr &MI) {
// are not lowered to a COPY.
switch (MI.getOpcode()) {
case TargetOpcode::COPY:
+ case TargetOpcode::PRED_COPY:
case TargetOpcode::PHI:
case TargetOpcode::INSERT_SUBREG:
case TargetOpcode::REG_SEQUENCE:
@@ -229,6 +230,7 @@ LaneBitmask DetectDeadLanes::transferUsedLanes(const MachineInstr &MI,
switch (MI.getOpcode()) {
case TargetOpcode::COPY:
+ case TargetOpcode::PRED_COPY:
case TargetOpcode::PHI:
return UsedLanes;
case TargetOpcode::REG_SEQUENCE: {
@@ -331,6 +333,7 @@ LaneBitmask DetectDeadLanes::transferDefinedLanes(const MachineOperand &Def,
break;
}
case TargetOpcode::COPY:
+ case TargetOpcode::PRED_COPY:
case TargetOpcode::PHI:
break;
default:
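Starting with this file, the patch repeatedly widens opcode checks so that
generic code accepts the new PRED_COPY alongside COPY. The recurring shape,
as a sketch (isCopyLike is a hypothetical helper; the patch instead edits
each switch in place and, for single-opcode comparisons, routes them through
MachineInstr::isCopy(), which presumably matches both opcodes after this
change):

    static bool isCopyLike(const MachineInstr &MI) {
      switch (MI.getOpcode()) {
      case TargetOpcode::COPY:
      case TargetOpcode::PRED_COPY:
        return true;
      default:
        return false;
      }
    }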
diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp
index c108f0088d43a..dcca111e8fa70 100644
--- a/llvm/lib/CodeGen/EarlyIfConversion.cpp
+++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -618,8 +618,7 @@ void SSAIfConv::replacePHIInstrs() {
if (hasSameValue(*MRI, TII, PI.TReg, PI.FReg)) {
// We do not need the select instruction if both incoming values are
// equal, but we do need a COPY.
- BuildMI(*Head, FirstTerm, HeadDL, TII->get(TargetOpcode::COPY), DstReg)
- .addReg(PI.TReg);
+ TII->buildCopy(*Head, FirstTerm, HeadDL, DstReg, PI.TReg);
} else {
TII->insertSelect(*Head, FirstTerm, HeadDL, DstReg, Cond, PI.TReg,
PI.FReg);
diff --git a/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp b/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
index 086b4a4dcc47b..99edc925835ba 100644
--- a/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -211,6 +211,7 @@ bool ExpandPostRA::runOnMachineFunction(MachineFunction &MF) {
MadeChange |= LowerSubregToReg(&MI);
break;
case TargetOpcode::COPY:
+ case TargetOpcode::PRED_COPY:
MadeChange |= LowerCopy(&MI);
break;
case TargetOpcode::DBG_VALUE:
diff --git a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
index 252910fd94627..55d939de426e3 100644
--- a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
+++ b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
@@ -422,7 +422,7 @@ class StatepointState {
LLVM_DEBUG(dbgs() << "Insert spill before " << *InsertBefore);
TII.storeRegToStackSlot(*MI.getParent(), InsertBefore, Reg, IsKill, FI,
- RC, &TRI);
+ RC, &TRI, Register());
}
}
@@ -431,7 +431,7 @@ class StatepointState {
const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg);
int FI = RegToSlotIdx[Reg];
if (It != MBB->end()) {
- TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, &TRI);
+ TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, &TRI, Register());
return;
}
@@ -439,7 +439,7 @@ class StatepointState {
// and then swap them.
assert(!MBB->empty() && "Empty block");
--It;
- TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, &TRI);
+ TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, &TRI, Register());
MachineInstr *Reload = It->getPrevNode();
int Dummy = 0;
(void)Dummy;
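These hunks thread a new trailing Register argument through the spill and
reload hooks: the virtual register being spilled when the caller knows it
(see the RegAllocFast hunks further down), or Register() otherwise. The call
shape, as a sketch under the assumption of this patch's TargetInstrInfo
signatures (spillAndReload is an illustrative wrapper, not part of the
patch):

    static void spillAndReload(const TargetInstrInfo &TII,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I, Register Phys,
                               int FI, const TargetRegisterClass *RC,
                               const TargetRegisterInfo &TRI,
                               Register VirtReg = Register()) {
      // VirtReg is purely informational for the target hooks.
      TII.storeRegToStackSlot(MBB, I, Phys, /*isKill=*/true, FI, RC, &TRI,
                              VirtReg);
      TII.loadRegFromStackSlot(MBB, I, Phys, FI, RC, &TRI, VirtReg);
    }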
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 79837aa54f234..345d1e3497f9d 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -1012,7 +1012,7 @@ bool CallLowering::parametersInCSRMatch(
// registers. Note that getDefIgnoringCopies does not ignore copies from
// physical registers.
MachineInstr *RegDef = getDefIgnoringCopies(OutInfo.Regs[0], MRI);
- if (!RegDef || RegDef->getOpcode() != TargetOpcode::COPY) {
+ if (!RegDef || !RegDef->isCopy()) {
LLVM_DEBUG(
dbgs()
<< "... Parameter was not copied into a VReg, cannot tail call.\n");
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 1fea2607c061f..40f49ed406829 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -204,7 +204,7 @@ bool CombinerHelper::tryCombineCopy(MachineInstr &MI) {
return false;
}
bool CombinerHelper::matchCombineCopy(MachineInstr &MI) {
- if (MI.getOpcode() != TargetOpcode::COPY)
+ if (!MI.isCopy())
return false;
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
@@ -4223,7 +4223,7 @@ bool CombinerHelper::matchICmpToLHSKnownBits(
LLT LHSTy = MRI.getType(LHS);
unsigned LHSSize = LHSTy.getSizeInBits();
unsigned DstSize = DstTy.getSizeInBits();
- unsigned Op = TargetOpcode::COPY;
+ unsigned Op = Builder.getTII().getCopyOpcode();
if (DstSize != LHSSize)
Op = DstSize < LHSSize ? TargetOpcode::G_TRUNC : TargetOpcode::G_ZEXT;
if (!isLegalOrBeforeLegalizer({Op, {DstTy, LHSTy}}))
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
index e2c34a31d9a1b..180e900490c5a 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
@@ -36,6 +36,7 @@ Align GISelKnownBits::computeKnownAlignment(Register R, unsigned Depth) {
const MachineInstr *MI = MRI.getVRegDef(R);
switch (MI->getOpcode()) {
case TargetOpcode::COPY:
+ case TargetOpcode::PRED_COPY:
return computeKnownAlignment(MI->getOperand(1).getReg(), Depth);
case TargetOpcode::G_ASSERT_ALIGN: {
// TODO: Min with source
@@ -200,6 +201,7 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
break;
}
case TargetOpcode::COPY:
+ case TargetOpcode::PRED_COPY:
case TargetOpcode::G_PHI:
case TargetOpcode::PHI: {
Known.One = APInt::getAllOnes(BitWidth);
@@ -234,7 +236,7 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
MRI.getType(SrcReg).isValid()) {
// For COPYs we don't do anything, don't increase the depth.
computeKnownBitsImpl(SrcReg, Known2, DemandedElts,
- Depth + (Opcode != TargetOpcode::COPY));
+ Depth + (!MI.isCopy()));
Known = KnownBits::commonBits(Known, Known2);
// If we reach a point where we don't know anything
// just stop looking through the operands.
@@ -631,7 +633,8 @@ unsigned GISelKnownBits::computeNumSignBits(Register R,
unsigned FirstAnswer = 1;
switch (Opcode) {
- case TargetOpcode::COPY: {
+ case TargetOpcode::COPY:
+ case TargetOpcode::PRED_COPY: {
MachineOperand &Src = MI.getOperand(1);
if (Src.getReg().isVirtual() && Src.getSubReg() == 0 &&
MRI.getType(Src.getReg()).isValid()) {
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 3dc95e3e9df59..02b9343d0e72e 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2514,8 +2514,16 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
LLT MemTy = Info.memVT.isSimple()
? getLLTForMVT(Info.memVT.getSimpleVT())
: LLT::scalar(Info.memVT.getStoreSizeInBits());
- MIB.addMemOperand(MF->getMachineMemOperand(MachinePointerInfo(Info.ptrVal),
- Info.flags, MemTy, Alignment));
+
+ // TODO: We currently just fall back to address space 0 if getTgtMemIntrinsic
+ // didn't yield anything useful.
+ MachinePointerInfo MPI;
+ if (Info.ptrVal)
+ MPI = MachinePointerInfo(Info.ptrVal, Info.offset);
+ else if (Info.fallbackAddressSpace)
+ MPI = MachinePointerInfo(*Info.fallbackAddressSpace);
+ MIB.addMemOperand(
+ MF->getMachineMemOperand(MPI, Info.flags, MemTy, Alignment, CI.getAAMetadata()));
}
return true;
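Both ISel paths (this hunk and the SelectionDAGBuilder hunk below) now pick
the MachinePointerInfo the same way. The shared decision, factored into an
illustrative helper that is not itself part of the patch:

    static MachinePointerInfo
    pointerInfoForIntrinsic(const TargetLowering::IntrinsicInfo &Info) {
      if (Info.ptrVal)
        return MachinePointerInfo(Info.ptrVal, Info.offset);
      if (Info.fallbackAddressSpace)
        return MachinePointerInfo(*Info.fallbackAddressSpace);
      return MachinePointerInfo(); // last resort: address space 0
    }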
diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
index 14b8a141af437..802d36c2dd4c2 100644
--- a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
@@ -226,7 +226,7 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
ReachedBegin = true;
else
--MII;
- if (MI.getOpcode() != TargetOpcode::COPY)
+ if (!MI.isCopy())
continue;
Register SrcReg = MI.getOperand(1).getReg();
Register DstReg = MI.getOperand(0).getReg();
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 2b9bc22bbb2f3..98afe27039f56 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -7337,7 +7337,8 @@ LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
return UnableToLegalize; // FIXME: handle extension.
// This can be just a plain copy.
Observer.changingInstr(MI);
- MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
+ const TargetInstrInfo &TII = MIRBuilder.getTII();
+ MI.setDesc(TII.get(TII.getCopyOpcode()));
Observer.changedInstr(MI);
return Legalized;
}
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 78a8f85e63f20..5da061fe16331 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -287,7 +287,7 @@ MachineInstrBuilder MachineIRBuilder::buildBrJT(Register TablePtr,
MachineInstrBuilder MachineIRBuilder::buildCopy(const DstOp &Res,
const SrcOp &Op) {
- return buildInstr(TargetOpcode::COPY, Res, Op);
+ return buildInstr(getTII().getCopyOpcode(), Res, Op);
}
MachineInstrBuilder MachineIRBuilder::buildConstant(const DstOp &Res,
@@ -511,7 +511,7 @@ MachineInstrBuilder MachineIRBuilder::buildExtOrTrunc(unsigned ExtOpc,
assert(Res.getLLTTy(*getMRI()).isScalar() ==
Op.getLLTTy(*getMRI()).isScalar());
- unsigned Opcode = TargetOpcode::COPY;
+ unsigned Opcode = getTII().getCopyOpcode();
if (Res.getLLTTy(*getMRI()).getSizeInBits() >
Op.getLLTTy(*getMRI()).getSizeInBits())
Opcode = ExtOpc;
@@ -1114,6 +1114,7 @@ MachineInstrBuilder MachineIRBuilder::buildInstr(unsigned Opc,
break;
}
case TargetOpcode::COPY:
+ case TargetOpcode::PRED_COPY:
assert(DstOps.size() == 1 && "Invalid Dst");
// If the caller wants to add a subreg source it has to be done separately
// so we may not have any SrcOps at this point yet.
diff --git a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
index 07eece77143fe..1b4411bb574d0 100644
--- a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -25,6 +25,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterBank.h"
#include "llvm/CodeGen/RegisterBankInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -83,6 +84,7 @@ void RegBankSelect::init(MachineFunction &MF) {
assert(RBI && "Cannot work without RegisterBankInfo");
MRI = &MF.getRegInfo();
TRI = MF.getSubtarget().getRegisterInfo();
+ TII = MF.getSubtarget().getInstrInfo();
TPC = &getAnalysis<TargetPassConfig>();
if (OptMode != Mode::Fast) {
MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
@@ -160,9 +162,9 @@ bool RegBankSelect::repairReg(
// Build the instruction used to repair, then clone it at the right
// places. Avoiding buildCopy bypasses the check that Src and Dst have the
// same types because the type is a placeholder when this function is called.
- MI = MIRBuilder.buildInstrNoInsert(TargetOpcode::COPY)
- .addDef(Dst)
- .addUse(Src);
+ MI = MIRBuilder.buildInstrNoInsert(TII->getCopyOpcode())
+ .addDef(Dst)
+ .addUse(Src);
LLVM_DEBUG(dbgs() << "Copy: " << printReg(Src) << " to: " << printReg(Dst)
<< '\n');
} else {
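Like the buildCopy changes elsewhere in the patch, the getCopyOpcode() calls
let a target substitute its own copy opcode for the hard-coded
TargetOpcode::COPY. Presumed hook shape (a sketch; the TargetInstrInfo
declaration itself is outside this excerpt, and MyTargetInstrInfo is
hypothetical):

    // Assumed generic default:
    //   virtual unsigned getCopyOpcode() const { return TargetOpcode::COPY; }
    // A target with predicated copies would presumably override it:
    class MyTargetInstrInfo : public TargetInstrInfo {
    public:
      unsigned getCopyOpcode() const override {
        return TargetOpcode::PRED_COPY;
      }
    };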
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index 92368ab9beb78..662b6dd9bca30 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -73,14 +73,11 @@ Register llvm::constrainOperandRegClass(
// FIXME: The copy needs to have the classes constrained for its operands.
// Use operand's regbank to get the class for old register (Reg).
if (RegMO.isUse()) {
- BuildMI(MBB, InsertIt, InsertPt.getDebugLoc(),
- TII.get(TargetOpcode::COPY), ConstrainedReg)
- .addReg(Reg);
+ TII.buildCopy(MBB, InsertIt, InsertPt.getDebugLoc(), ConstrainedReg, Reg);
} else {
assert(RegMO.isDef() && "Must be a definition");
- BuildMI(MBB, std::next(InsertIt), InsertPt.getDebugLoc(),
- TII.get(TargetOpcode::COPY), Reg)
- .addReg(ConstrainedReg);
+ TII.buildCopy(MBB, std::next(InsertIt), InsertPt.getDebugLoc(), Reg,
+ ConstrainedReg);
}
if (GISelChangeObserver *Observer = MF.getObserver()) {
Observer->changingInstr(*RegMO.getParent());
@@ -332,6 +329,7 @@ Optional<ValueAndVReg> getConstantVRegValWithLookThrough(
VReg = MI->getOperand(1).getReg();
break;
case TargetOpcode::COPY:
+ case TargetOpcode::PRED_COPY:
VReg = MI->getOperand(1).getReg();
if (Register::isPhysicalRegister(VReg))
return None;
@@ -446,7 +444,7 @@ llvm::getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI) {
if (!DstTy.isValid())
return None;
unsigned Opc = DefMI->getOpcode();
- while (Opc == TargetOpcode::COPY || isPreISelGenericOptimizationHint(Opc)) {
+ while (DefMI->isCopy() || isPreISelGenericOptimizationHint(Opc)) {
Register SrcReg = DefMI->getOperand(1).getReg();
auto SrcTy = MRI.getType(SrcReg);
if (!SrcTy.isValid())
@@ -751,8 +749,7 @@ Register llvm::getFunctionLiveInPhysReg(MachineFunction &MF,
MRI.setType(LiveIn, RegTy);
}
- BuildMI(EntryMBB, EntryMBB.begin(), DL, TII.get(TargetOpcode::COPY), LiveIn)
- .addReg(PhysReg);
+ TII.buildCopy(EntryMBB, EntryMBB.begin(), DL, LiveIn, PhysReg);
if (!EntryMBB.isLiveIn(PhysReg))
EntryMBB.addLiveIn(PhysReg);
return LiveIn;
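Many hunks in this patch replace the BuildMI(..., TII.get(TargetOpcode::COPY),
Dst).addReg(Src) idiom with TII.buildCopy(...). Reconstructed from the call
sites, its behavior is presumably the following (a sketch under that
assumption; the real declaration ships with this patch's TargetInstrInfo
changes, which this excerpt does not include):

    static MachineInstr *buildCopySketch(const TargetInstrInfo &TII,
                                         MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator I,
                                         const DebugLoc &DL, Register Dst,
                                         Register Src, unsigned Flags = 0,
                                         unsigned SubReg = 0) {
      // Uses the target's copy opcode, so PRED_COPY targets get the
      // right instruction for free.
      return BuildMI(MBB, I, DL, TII.get(TII.getCopyOpcode()), Dst)
          .addReg(Src, Flags, SubReg);
    }

Call sites that pass only a destination and add the source operand
themselves (see the InstrEmitter hunk below) suggest an additional overload
without the Src parameter.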
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index d243cb5c8c58f..22716a1118489 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -417,7 +417,7 @@ bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI,
MachineInstrSpan MIS(MII, MBB);
// Insert spill without kill flag immediately after def.
TII.storeRegToStackSlot(*MBB, MII, SrcReg, false, StackSlot,
- MRI.getRegClass(SrcReg), &TRI);
+ MRI.getRegClass(SrcReg), &TRI, Register());
LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MII);
for (const MachineInstr &MI : make_range(MIS.begin(), MII))
getVDefInterval(MI, LIS);
@@ -993,7 +993,7 @@ void InlineSpiller::insertReload(Register NewVReg,
MachineInstrSpan MIS(MI, &MBB);
TII.loadRegFromStackSlot(MBB, MI, NewVReg, StackSlot,
- MRI.getRegClass(NewVReg), &TRI);
+ MRI.getRegClass(NewVReg), &TRI, Register());
LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MI);
@@ -1030,7 +1030,7 @@ void InlineSpiller::insertSpill(Register NewVReg, bool isKill,
if (IsRealSpill)
TII.storeRegToStackSlot(MBB, SpillBefore, NewVReg, isKill, StackSlot,
- MRI.getRegClass(NewVReg), &TRI);
+ MRI.getRegClass(NewVReg), &TRI, Register());
else
// Don't spill undef value.
// Anything works for undef, in particular keeping the memory
@@ -1596,7 +1596,7 @@ void HoistSpillHelper::hoistAllSpills() {
MachineBasicBlock::iterator MII = IPA.getLastInsertPointIter(OrigLI, *BB);
MachineInstrSpan MIS(MII, BB);
TII.storeRegToStackSlot(*BB, MII, LiveReg, false, Slot,
- MRI.getRegClass(LiveReg), &TRI);
+ MRI.getRegClass(LiveReg), &TRI, Register());
LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MII);
for (const MachineInstr &MI : make_range(MIS.begin(), MII))
getVDefInterval(MI, LIS);
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index 539d5e7524a6a..47173d353241c 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -1743,9 +1743,8 @@ void LiveIntervals::splitSeparateComponents(LiveInterval &LI,
return;
LLVM_DEBUG(dbgs() << " Split " << NumComp << " components: " << LI << '\n');
Register Reg = LI.reg();
- const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
for (unsigned I = 1; I < NumComp; ++I) {
- Register NewVReg = MRI->createVirtualRegister(RegClass);
+ Register NewVReg = MRI->cloneVirtualRegister(Reg);
LiveInterval &NewLI = createEmptyInterval(NewVReg);
SplitLIs.push_back(&NewLI);
}
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index afc04f0045c26..5c8af456fc206 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -33,7 +33,7 @@ void LiveRangeEdit::Delegate::anchor() { }
LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(Register OldReg,
bool createSubRanges) {
- Register VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
+ Register VReg = MRI.cloneVirtualRegister(OldReg);
if (VRM)
VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
@@ -53,7 +53,7 @@ LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(Register OldReg,
}
Register LiveRangeEdit::createFrom(Register OldReg) {
- Register VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
+ Register VReg = MRI.cloneVirtualRegister(OldReg);
if (VRM) {
VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
}
diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index f7684ae67d38c..7242a8fa5c278 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -639,8 +639,7 @@ MachineBasicBlock::addLiveIn(MCRegister PhysReg, const TargetRegisterClass *RC)
// No luck, create a virtual register.
Register VirtReg = MRI.createVirtualRegister(RC);
- BuildMI(*this, I, DebugLoc(), TII.get(TargetOpcode::COPY), VirtReg)
- .addReg(PhysReg, RegState::Kill);
+ TII.buildCopy(*this, I, DebugLoc(), VirtReg, PhysReg, RegState::Kill);
if (!LiveIn)
addLiveIn(PhysReg);
return VirtReg;
@@ -934,7 +933,7 @@ const MachineBasicBlock *MachineBasicBlock::getSingleSuccessor() const {
return Successors.size() == 1 ? Successors[0] : nullptr;
}
-MachineBasicBlock *MachineBasicBlock::getFallThrough() {
+MachineBasicBlock *MachineBasicBlock::getFallThrough(bool JumpToFallThrough) {
MachineFunction::iterator Fallthrough = getIterator();
++Fallthrough;
// If FallthroughBlock is off the end of the function, it can't fall through.
@@ -965,8 +964,8 @@ MachineBasicBlock *MachineBasicBlock::getFallThrough() {
// If there is some explicit branch to the fallthrough block, it can obviously
// reach, even though the branch should get folded to fall through implicitly.
- if (MachineFunction::iterator(TBB) == Fallthrough ||
- MachineFunction::iterator(FBB) == Fallthrough)
+ if (!JumpToFallThrough && (MachineFunction::iterator(TBB) == Fallthrough ||
+ MachineFunction::iterator(FBB) == Fallthrough))
return &*Fallthrough;
// If it's an unconditional branch to some block not the fall through, it
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 78b49c9236629..c6653ae3a6659 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -187,6 +187,7 @@ void MachineFunction::init() {
RegInfo = nullptr;
MFInfo = nullptr;
+
// We can realign the stack if the target supports it and the user hasn't
// explicitly asked us not to.
bool CanRealignSP = STI->getFrameLowering()->isStackRealignable() &&
@@ -232,6 +233,12 @@ void MachineFunction::init() {
PSVManager = std::make_unique<PseudoSourceValueManager>(getTarget());
}
+void MachineFunction::initTargetMachineFunctionInfo(
+ const TargetSubtargetInfo &STI) {
+ assert(!MFInfo && "MachineFunctionInfo already set");
+ MFInfo = Target.createMachineFunctionInfo(Allocator, F, &STI);
+}
+
MachineFunction::~MachineFunction() {
clear();
}
diff --git a/llvm/lib/CodeGen/MachineModuleInfo.cpp b/llvm/lib/CodeGen/MachineModuleInfo.cpp
index 23d55a5df9f57..a610690648bc0 100644
--- a/llvm/lib/CodeGen/MachineModuleInfo.cpp
+++ b/llvm/lib/CodeGen/MachineModuleInfo.cpp
@@ -118,6 +118,11 @@ MachineFunction &MachineModuleInfo::getOrCreateMachineFunction(Function &F) {
// No pre-existing machine function, create a new one.
const TargetSubtargetInfo &STI = *TM.getSubtargetImpl(F);
MF = new MachineFunction(F, TM, STI, NextFnNum++, *this);
+ MF->initTargetMachineFunctionInfo(STI);
+
+ // MRI callback for target specific initializations.
+ TM.registerMachineRegisterInfoCallback(*MF);
+
// Update the set entry.
I.first->second.reset(MF);
} else {
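Together with the MachineFunction.cpp hunk above, machine-function setup
becomes a three-step sequence: construct the function, then create the
target's MachineFunctionInfo with the subtarget in hand, then let the target
attach its MachineRegisterInfo callback. An illustrative recap of the
protocol (createMF is a hypothetical wrapper, not part of the patch):

    MachineFunction *createMF(Function &F, const LLVMTargetMachine &TM,
                              const TargetSubtargetInfo &STI, unsigned Num,
                              MachineModuleInfo &MMI) {
      auto *MF = new MachineFunction(F, TM, STI, Num, MMI); // no MFInfo yet
      MF->initTargetMachineFunctionInfo(STI);       // target MFI, needs STI
      TM.registerMachineRegisterInfoCallback(*MF);  // target MRI delegate
      return MF;
    }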
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 721bd52448ace..bed78b122177a 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -418,9 +418,8 @@ void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
MachineBasicBlock::iterator At = PredB.getFirstTerminator();
const DebugLoc &DL = PredB.findDebugLoc(At);
- auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
- .addReg(RegOp.getReg(), getRegState(RegOp),
- RegOp.getSubReg());
+ auto Copy = TII->buildCopy(PredB, At, DL, NewReg, RegOp.getReg(),
+ getRegState(RegOp), RegOp.getSubReg());
Slots.insertMachineInstrInMaps(*Copy);
RegOp.setReg(NewReg);
RegOp.setSubReg(0);
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index e48f1beaae2be..b24c850957a3c 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -48,6 +48,7 @@ MachineRegisterInfo::MachineRegisterInfo(MachineFunction *MF)
RegAllocHints.reserve(256);
UsedPhysRegMask.resize(NumRegs);
PhysRegUseDefLists.reset(new MachineOperand*[NumRegs]());
+ TheDelegates.clear();
}
/// setRegClass - Set the register class of the specified virtual register.
@@ -79,10 +80,10 @@ constrainRegClass(MachineRegisterInfo &MRI, Register Reg,
return NewRC;
}
-const TargetRegisterClass *MachineRegisterInfo::constrainRegClass(
- Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs) {
- if (Reg.isPhysical())
- return nullptr;
+const TargetRegisterClass *
+MachineRegisterInfo::constrainRegClass(Register Reg,
+ const TargetRegisterClass *RC,
+ unsigned MinNumRegs) {
return ::constrainRegClass(*this, Reg, getRegClass(Reg), RC, MinNumRegs);
}
@@ -162,8 +163,7 @@ MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass,
// New virtual register number.
Register Reg = createIncompleteVirtualRegister(Name);
VRegInfo[Reg].first = RegClass;
- if (TheDelegate)
- TheDelegate->MRI_NoteNewVirtualRegister(Reg);
+ noteNewVirtualRegister(Reg);
return Reg;
}
@@ -172,8 +172,7 @@ Register MachineRegisterInfo::cloneVirtualRegister(Register VReg,
Register Reg = createIncompleteVirtualRegister(Name);
VRegInfo[Reg].first = VRegInfo[VReg].first;
setType(Reg, getType(VReg));
- if (TheDelegate)
- TheDelegate->MRI_NoteNewVirtualRegister(Reg);
+ noteCloneVirtualRegister(Reg, VReg);
return Reg;
}
@@ -189,8 +188,7 @@ MachineRegisterInfo::createGenericVirtualRegister(LLT Ty, StringRef Name) {
// FIXME: Should we use a dummy register class?
VRegInfo[Reg].first = static_cast<RegisterBank *>(nullptr);
setType(Reg, Ty);
- if (TheDelegate)
- TheDelegate->MRI_NoteNewVirtualRegister(Reg);
+ noteNewVirtualRegister(Reg);
return Reg;
}
@@ -479,9 +477,8 @@ MachineRegisterInfo::EmitLiveInCopies(MachineBasicBlock *EntryMBB,
--i; --e;
} else {
// Emit a copy.
- BuildMI(*EntryMBB, EntryMBB->begin(), DebugLoc(),
- TII.get(TargetOpcode::COPY), LiveIns[i].second)
- .addReg(LiveIns[i].first);
+ TII.buildCopy(*EntryMBB, EntryMBB->begin(), DebugLoc(),
+ LiveIns[i].second, LiveIns[i].first);
// Add the register to the entry block live-in set.
EntryMBB->addLiveIn(LiveIns[i].first);
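The TheDelegates rename and the noteNewVirtualRegister/noteCloneVirtualRegister
helpers suggest MachineRegisterInfo now fans notifications out to a list of
delegates, with clones reported separately from fresh registers. A
hypothetical delegate using the presumed hook names (MyDelegate is
illustrative, and MRI_NoteCloneVirtualRegister is assumed to be the
delegate-side counterpart of the helper above):

    struct MyDelegate : public MachineRegisterInfo::Delegate {
      void MRI_NoteNewVirtualRegister(Register Reg) override {
        // A brand-new virtual register was created.
      }
      void MRI_NoteCloneVirtualRegister(Register NewReg,
                                        Register SrcReg) override {
        // NewReg was cloned from SrcReg; propagate target-specific
        // per-vreg state here.
      }
    };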
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index 6ef36d86891a1..e315d77382f39 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -1818,7 +1818,8 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
// Verify properties of various specific instruction types
switch (MI->getOpcode()) {
- case TargetOpcode::COPY: {
+ case TargetOpcode::COPY:
+ case TargetOpcode::PRED_COPY: {
const MachineOperand &DstOp = MI->getOperand(0);
const MachineOperand &SrcOp = MI->getOperand(1);
const Register SrcReg = SrcOp.getReg();
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index c7fde45eba6a6..d75673a4f9afa 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -824,9 +824,7 @@ void ModuloScheduleExpander::splitLifetimes(MachineBasicBlock *KernelBB,
// We split the lifetime when we find the first use.
if (SplitReg == 0) {
SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
- BuildMI(*KernelBB, MI, MI->getDebugLoc(),
- TII->get(TargetOpcode::COPY), SplitReg)
- .addReg(Def);
+ TII->buildCopy(*KernelBB, MI, MI->getDebugLoc(), SplitReg, Def);
}
BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
}
@@ -1191,9 +1189,7 @@ void ModuloScheduleExpander::rewriteScheduledInstr(
UseOp.setReg(ReplaceReg);
else {
Register SplitReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
- BuildMI(*BB, UseMI, UseMI->getDebugLoc(), TII->get(TargetOpcode::COPY),
- SplitReg)
- .addReg(ReplaceReg);
+ TII->buildCopy(*BB, UseMI, UseMI->getDebugLoc(), SplitReg, ReplaceReg);
UseOp.setReg(SplitReg);
}
}
diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index 31e37c4cd7e3e..ceabe03ceee57 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -603,9 +603,8 @@ optimizeExtInstr(MachineInstr &MI, MachineBasicBlock &MBB,
RC = MRI->getRegClass(UseMI->getOperand(0).getReg());
Register NewVR = MRI->createVirtualRegister(RC);
- BuildMI(*UseMBB, UseMI, UseMI->getDebugLoc(),
- TII->get(TargetOpcode::COPY), NewVR)
- .addReg(DstReg, 0, SubIdx);
+ TII->buildCopy(*UseMBB, UseMI, UseMI->getDebugLoc(), NewVR, DstReg, 0,
+ SubIdx);
if (UseSrcSubIdx)
UseMO->setSubReg(0);
@@ -1024,7 +1023,7 @@ class ExtractSubregRewriter : public Rewriter {
// Get rid of the sub-register index.
CopyLike.removeOperand(2);
// Morph the operation into a COPY.
- CopyLike.setDesc(TII.get(TargetOpcode::COPY));
+ CopyLike.setDesc(TII.get(TII.getCopyOpcode()));
return true;
}
CopyLike.getOperand(CurrentSrcIdx + 1).setImm(NewSubReg);
@@ -1112,6 +1111,7 @@ static Rewriter *getCopyRewriter(MachineInstr &MI, const TargetInstrInfo &TII) {
default:
return nullptr;
case TargetOpcode::COPY:
+ case TargetOpcode::PRED_COPY:
return new CopyRewriter(MI);
case TargetOpcode::INSERT_SUBREG:
return new InsertSubregRewriter(MI);
@@ -1253,9 +1253,8 @@ PeepholeOptimizer::rewriteSource(MachineInstr &CopyLike,
Register NewVReg = MRI->createVirtualRegister(DefRC);
MachineInstr *NewCopy =
- BuildMI(*CopyLike.getParent(), &CopyLike, CopyLike.getDebugLoc(),
- TII->get(TargetOpcode::COPY), NewVReg)
- .addReg(NewSrc.Reg, 0, NewSrc.SubReg);
+ TII->buildCopy(*CopyLike.getParent(), &CopyLike, CopyLike.getDebugLoc(),
+ NewVReg, NewSrc.Reg, 0, NewSrc.SubReg);
if (Def.SubReg) {
NewCopy->getOperand(0).setSubReg(Def.SubReg);
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index ec27272127d6f..26cb1a90a6a51 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -594,13 +594,12 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
unsigned Reg = CS.getReg();
if (CS.isSpilledToReg()) {
- BuildMI(SaveBlock, I, DebugLoc(),
- TII.get(TargetOpcode::COPY), CS.getDstReg())
- .addReg(Reg, getKillRegState(true));
+ TII.buildCopy(SaveBlock, I, DebugLoc(), CS.getDstReg(), Reg,
+ getKillRegState(true));
} else {
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC,
- TRI);
+ TRI, Register());
}
}
}
@@ -622,11 +621,12 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
for (const CalleeSavedInfo &CI : reverse(CSI)) {
unsigned Reg = CI.getReg();
if (CI.isSpilledToReg()) {
- BuildMI(RestoreBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY), Reg)
- .addReg(CI.getDstReg(), getKillRegState(true));
+ TII.buildCopy(RestoreBlock, I, DebugLoc(), Reg, CI.getDstReg(),
+ getKillRegState(true));
} else {
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, TRI);
+ TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC,
+ TRI, Register());
assert(I != RestoreBlock.begin() &&
"loadRegFromStackSlot didn't insert any code!");
// Insert in reverse order. loadRegFromStackSlot can insert
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index 3da1cfbb05870..0e7ad506f884c 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -527,7 +527,8 @@ void RegAllocFast::spill(MachineBasicBlock::iterator Before, Register VirtReg,
LLVM_DEBUG(dbgs() << " to stack slot #" << FI << '\n');
const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
- TII->storeRegToStackSlot(*MBB, Before, AssignedReg, Kill, FI, &RC, TRI);
+ TII->storeRegToStackSlot(*MBB, Before, AssignedReg, Kill, FI, &RC, TRI,
+ VirtReg);
++NumStores;
MachineBasicBlock::iterator FirstTerm = MBB->getFirstTerminator();
@@ -592,7 +593,7 @@ void RegAllocFast::reload(MachineBasicBlock::iterator Before, Register VirtReg,
<< printReg(PhysReg, TRI) << '\n');
int FI = getStackSpaceFor(VirtReg);
const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
- TII->loadRegFromStackSlot(*MBB, Before, PhysReg, FI, &RC, TRI);
+ TII->loadRegFromStackSlot(*MBB, Before, PhysReg, FI, &RC, TRI, VirtReg);
++NumLoads;
}
@@ -1024,9 +1025,8 @@ void RegAllocFast::defineLiveThroughVirtReg(MachineInstr &MI, unsigned OpNum,
std::next((MachineBasicBlock::iterator)MI.getIterator());
LLVM_DEBUG(dbgs() << "Copy " << printReg(LRI->PhysReg, TRI) << " to "
<< printReg(PrevReg, TRI) << '\n');
- BuildMI(*MBB, InsertBefore, MI.getDebugLoc(),
- TII->get(TargetOpcode::COPY), PrevReg)
- .addReg(LRI->PhysReg, llvm::RegState::Kill);
+ TII->buildCopy(*MBB, InsertBefore, MI.getDebugLoc(), PrevReg,
+ LRI->PhysReg, llvm::RegState::Kill);
}
MachineOperand &MO = MI.getOperand(OpNum);
if (MO.getSubReg() && !MO.isUndef()) {
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 3310cdd697c47..9a3854c2c063e 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -317,6 +317,7 @@ unsigned DefaultPriorityAdvisor::getPriority(const LiveInterval &LI) const {
} else {
// Giant live ranges fall back to the global assignment heuristic, which
// prevents excessive spilling in pathological cases.
+ bool AddPriorityToGlobal = TRI->addAllocPriorityToGlobalRanges();
const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
bool ForceGlobal = RC.GlobalPriority ||
(!ReverseLocalAssignment &&
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 8865bcf9cd6db..46c94cda5fbe0 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -1186,9 +1186,8 @@ bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP,
<< printMBBReference(*CopyLeftBB) << '\t' << CopyMI);
// Insert new copy to CopyLeftBB.
- MachineInstr *NewCopyMI = BuildMI(*CopyLeftBB, InsPos, CopyMI.getDebugLoc(),
- TII->get(TargetOpcode::COPY), IntB.reg())
- .addReg(IntA.reg());
+ MachineInstr *NewCopyMI = TII->buildCopy(
+ *CopyLeftBB, InsPos, CopyMI.getDebugLoc(), IntB.reg(), IntA.reg());
SlotIndex NewCopyIdx =
LIS->InsertMachineInstrInMaps(*NewCopyMI).getRegSlot();
IntB.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator());
diff --git a/llvm/lib/CodeGen/RegisterScavenging.cpp b/llvm/lib/CodeGen/RegisterScavenging.cpp
index 289d31be2d2d6..e4a9dc827701c 100644
--- a/llvm/lib/CodeGen/RegisterScavenging.cpp
+++ b/llvm/lib/CodeGen/RegisterScavenging.cpp
@@ -499,14 +499,14 @@ RegScavenger::spill(Register Reg, const TargetRegisterClass &RC, int SPAdj,
": Cannot scavenge register without an emergency "
"spill slot!");
}
- TII->storeRegToStackSlot(*MBB, Before, Reg, true, FI, &RC, TRI);
+ TII->storeRegToStackSlot(*MBB, Before, Reg, true, FI, &RC, TRI, Register());
MachineBasicBlock::iterator II = std::prev(Before);
unsigned FIOperandNum = getFrameIndexOperandNum(*II);
TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this);
// Restore the scavenged register before its use (or first terminator).
- TII->loadRegFromStackSlot(*MBB, UseMI, Reg, FI, &RC, TRI);
+ TII->loadRegFromStackSlot(*MBB, UseMI, Reg, FI, &RC, TRI, Register());
II = std::prev(UseMI);
FIOperandNum = getFrameIndexOperandNum(*II);
diff --git a/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp b/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp
index 0f73973c8a51c..11bdf3bb2ba8c 100644
--- a/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp
+++ b/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
#define DEBUG_TYPE "reset-machine-function"
@@ -66,6 +67,12 @@ namespace {
LLVM_DEBUG(dbgs() << "Resetting: " << MF.getName() << '\n');
++NumFunctionsReset;
MF.reset();
+ MF.initTargetMachineFunctionInfo(MF.getSubtarget());
+
+ const LLVMTargetMachine &TM = MF.getTarget();
+ // MRI callback for target specific initializations.
+ TM.registerMachineRegisterInfoCallback(MF);
+
if (EmitFallbackDiag) {
const Function &F = MF.getFunction();
DiagnosticInfoISelFallback DiagFallback(F);
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index 053f825478568..60522597f955c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1504,8 +1504,7 @@ bool FastISel::selectFreeze(const User *I) {
MVT Ty = ETy.getSimpleVT();
const TargetRegisterClass *TyRegClass = TLI.getRegClassFor(Ty);
Register ResultReg = createResultReg(TyRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
- TII.get(TargetOpcode::COPY), ResultReg).addReg(Reg);
+ TII.buildCopy(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, ResultReg, Reg);
updateValueMap(I, ResultReg);
return true;
@@ -1962,8 +1961,7 @@ Register FastISel::constrainOperandRegClass(const MCInstrDesc &II, Register Op,
// If it's not legal to COPY between the register classes, something
// has gone very wrong before we got here.
Register NewOp = createResultReg(RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
- TII.get(TargetOpcode::COPY), NewOp).addReg(Op);
+ TII.buildCopy(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, NewOp, Op);
return NewOp;
}
}
@@ -1992,8 +1990,8 @@ Register FastISel::fastEmitInst_r(unsigned MachineInstOpcode,
else {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
.addReg(Op0);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
- TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+ TII.buildCopy(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, ResultReg,
+ II.ImplicitDefs[0]);
}
return ResultReg;
@@ -2016,8 +2014,8 @@ Register FastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
.addReg(Op0)
.addReg(Op1);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
- TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+ TII.buildCopy(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, ResultReg,
+ II.ImplicitDefs[0]);
}
return ResultReg;
}
@@ -2042,8 +2040,8 @@ Register FastISel::fastEmitInst_rrr(unsigned MachineInstOpcode,
.addReg(Op0)
.addReg(Op1)
.addReg(Op2);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
- TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+ TII.buildCopy(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, ResultReg,
+ II.ImplicitDefs[0]);
}
return ResultReg;
}
@@ -2064,8 +2062,8 @@ Register FastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
.addReg(Op0)
.addImm(Imm);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
- TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+ TII.buildCopy(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, ResultReg,
+ II.ImplicitDefs[0]);
}
return ResultReg;
}
@@ -2088,8 +2086,8 @@ Register FastISel::fastEmitInst_rii(unsigned MachineInstOpcode,
.addReg(Op0)
.addImm(Imm1)
.addImm(Imm2);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
- TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+ TII.buildCopy(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, ResultReg,
+ II.ImplicitDefs[0]);
}
return ResultReg;
}
@@ -2107,8 +2105,8 @@ Register FastISel::fastEmitInst_f(unsigned MachineInstOpcode,
else {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
.addFPImm(FPImm);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
- TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+ TII.buildCopy(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, ResultReg,
+ II.ImplicitDefs[0]);
}
return ResultReg;
}
@@ -2132,8 +2130,8 @@ Register FastISel::fastEmitInst_rri(unsigned MachineInstOpcode,
.addReg(Op0)
.addReg(Op1)
.addImm(Imm);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
- TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+ TII.buildCopy(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, ResultReg,
+ II.ImplicitDefs[0]);
}
return ResultReg;
}
@@ -2148,8 +2146,8 @@ Register FastISel::fastEmitInst_i(unsigned MachineInstOpcode,
.addImm(Imm);
else {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II).addImm(Imm);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
- TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+ TII.buildCopy(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, ResultReg,
+ II.ImplicitDefs[0]);
}
return ResultReg;
}
@@ -2161,8 +2159,7 @@ Register FastISel::fastEmitInst_extractsubreg(MVT RetVT, unsigned Op0,
"Cannot yet extract from physregs");
const TargetRegisterClass *RC = MRI.getRegClass(Op0);
MRI.constrainRegClass(Op0, TRI.getSubClassWithSubReg(RC, Idx));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::COPY),
- ResultReg).addReg(Op0, 0, Idx);
+ TII.buildCopy(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, ResultReg, Op0, 0, Idx);
return ResultReg;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 7b5414aeb1350..3b51f5872773c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -174,8 +174,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
} else {
// Create the reg, emit the copy.
VRBase = MRI->createVirtualRegister(DstRC);
- BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
- VRBase).addReg(SrcReg);
+ TII->buildCopy(*MBB, InsertPos, Node->getDebugLoc(), VRBase, SrcReg);
}
SDValue Op(Node, ResNo);
@@ -332,8 +331,8 @@ InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB,
OpRC = TRI->getAllocatableClass(OpRC);
assert(OpRC && "Constraints cannot be fulfilled for allocation");
Register NewVReg = MRI->createVirtualRegister(OpRC);
- BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(),
- TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg);
+ TII->buildCopy(*MBB, InsertPos, Op.getNode()->getDebugLoc(), NewVReg,
+ VReg);
VReg = NewVReg;
} else {
assert(ConstrainedRC->isAllocatable() &&
@@ -399,8 +398,8 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB,
if (OpRC && IIRC && OpRC != IIRC && Register::isVirtualRegister(VReg)) {
Register NewVReg = MRI->createVirtualRegister(IIRC);
- BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(),
- TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg);
+ TII->buildCopy(*MBB, InsertPos, Op.getNode()->getDebugLoc(), NewVReg,
+ VReg);
VReg = NewVReg;
}
// Turn additional physreg operands into implicit uses on non-variadic
@@ -468,8 +467,7 @@ Register InstrEmitter::ConstrainForSubReg(Register VReg, unsigned SubIdx,
RC = TRI->getSubClassWithSubReg(TLI->getRegClassFor(VT, isDivergent), SubIdx);
assert(RC && "No legal register class for VT supports that SubIdx");
Register NewReg = MRI->createVirtualRegister(RC);
- BuildMI(*MBB, InsertPos, DL, TII->get(TargetOpcode::COPY), NewReg)
- .addReg(VReg);
+ TII->buildCopy(*MBB, InsertPos, DL, NewReg, VReg);
return NewReg;
}
@@ -525,8 +523,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
// to a copy
// r1026 = copy r1024
VRBase = MRI->createVirtualRegister(TRC);
- BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
- TII->get(TargetOpcode::COPY), VRBase).addReg(SrcReg);
+ TII->buildCopy(*MBB, InsertPos, Node->getDebugLoc(), VRBase, SrcReg);
MRI->clearKillFlags(SrcReg);
} else {
// Reg may not support a SubIdx sub-register, and we may need to
@@ -541,9 +538,9 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
VRBase = MRI->createVirtualRegister(TRC);
// Create the extract_subreg machine instruction.
- MachineInstrBuilder CopyMI =
- BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
- TII->get(TargetOpcode::COPY), VRBase);
+ MachineInstrBuilder CopyMI = MachineInstrBuilder(
+ *MBB->getParent(),
+ TII->buildCopy(*MBB, InsertPos, Node->getDebugLoc(), VRBase));
if (Reg.isVirtual())
CopyMI.addReg(Reg, 0, SubIdx);
else
@@ -618,8 +615,7 @@ InstrEmitter::EmitCopyToRegClassNode(SDNode *Node,
const TargetRegisterClass *DstRC =
TRI->getAllocatableClass(TRI->getRegClass(DstRCIdx));
Register NewVReg = MRI->createVirtualRegister(DstRC);
- BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
- NewVReg).addReg(VReg);
+ TII->buildCopy(*MBB, InsertPos, Node->getDebugLoc(), NewVReg, VReg);
SDValue Op(Node, 0);
bool isNew = VRBaseMap.insert(std::make_pair(Op, NewVReg)).second;
@@ -1231,8 +1227,7 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
if (SrcReg == DestReg) // Coalesced away the copy? Ignore.
break;
- BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
- DestReg).addReg(SrcReg);
+ TII->buildCopy(*MBB, InsertPos, Node->getDebugLoc(), DestReg, SrcReg);
break;
}
case ISD::CopyFromReg: {
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index d8eb97a4b47ee..0172d654055d8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -112,15 +112,11 @@ SUnit *ScheduleDAGSDNodes::Clone(SUnit *Old) {
static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op,
const TargetRegisterInfo *TRI,
const TargetInstrInfo *TII,
- const TargetLowering &TLI,
unsigned &PhysReg, int &Cost) {
if (Op != 2 || User->getOpcode() != ISD::CopyToReg)
return;
unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
- if (TLI.checkForPhysRegDependency(Def, User, Op, TRI, TII, PhysReg, Cost))
- return;
-
if (Register::isVirtualRegister(Reg))
return;
@@ -491,8 +487,7 @@ void ScheduleDAGSDNodes::AddSchedEdges() {
unsigned PhysReg = 0;
int Cost = 1;
// Determine if this is a physical register dependency.
- const TargetLowering &TLI = DAG->getTargetLoweringInfo();
- CheckForPhysRegDependency(OpN, N, i, TRI, TII, TLI, PhysReg, Cost);
+ CheckForPhysRegDependency(OpN, N, i, TRI, TII, PhysReg, Cost);
assert((PhysReg == 0 || !isChain) &&
"Chain dependence via physreg data?");
// FIXME: See ScheduleDAGSDNodes::EmitCopyFromReg. For now, scheduler
@@ -828,8 +823,7 @@ EmitPhysRegCopy(SUnit *SU, DenseMap<SUnit *, Register> &VRBaseMap,
break;
}
}
- BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), Reg)
- .addReg(VRI->second);
+ TII->buildCopy(*BB, InsertPos, DebugLoc(), Reg, VRI->second);
} else {
// Copy from physical register.
assert(Pred.getReg() && "Unknown physical register!");
@@ -837,8 +831,7 @@ EmitPhysRegCopy(SUnit *SU, DenseMap &VRBaseMap,
bool isNew = VRBaseMap.insert(std::make_pair(SU, VRBase)).second;
(void)isNew; // Silence compiler warning.
assert(isNew && "Node emitted out of order - early");
- BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), VRBase)
- .addReg(Pred.getReg());
+ TII->buildCopy(*BB, InsertPos, DebugLoc(), VRBase, Pred.getReg());
}
break;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 084f16f897554..7c45da2fc78cb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4873,11 +4873,17 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
TLI.CollectTargetIntrinsicOperands(I, Ops, DAG);
if (IsTgtIntrinsic) {
// This is target intrinsic that touches memory
- Result =
- DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops, Info.memVT,
- MachinePointerInfo(Info.ptrVal, Info.offset),
- Info.align, Info.flags, Info.size,
- I.getAAMetadata());
+ //
+ // TODO: We currently just fall back to address space 0 if getTgtMemIntrinsic
+ // didn't yield anything useful.
+ MachinePointerInfo MPI;
+ if (Info.ptrVal)
+ MPI = MachinePointerInfo(Info.ptrVal, Info.offset);
+ else if (Info.fallbackAddressSpace)
+ MPI = MachinePointerInfo(*Info.fallbackAddressSpace);
+ Result = DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops,
+ Info.memVT, MPI, Info.align, Info.flags,
+ Info.size, I.getAAMetadata());
} else if (!HasChain) {
Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops);
} else if (!I.getType()->isVoidTy()) {
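In isolation, the pointer-info selection introduced above behaves as follows; a minimal sketch, assuming only the IntrinsicInfo fields the hunk itself uses (ptrVal, offset, fallbackAddressSpace):
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

static MachinePointerInfo
selectPointerInfo(const TargetLowering::IntrinsicInfo &Info) {
  if (Info.ptrVal) // a known IR object gives the best aliasing results
    return MachinePointerInfo(Info.ptrVal, Info.offset);
  if (Info.fallbackAddressSpace) // only the address space is known
    return MachinePointerInfo(*Info.fallbackAddressSpace);
  return MachinePointerInfo(); // the TODO above: defaults to address space 0
}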
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 5dd6cc6225573..dd385bd3432d4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -1240,9 +1240,8 @@ bool SelectionDAGISel::PrepareEHLandingPad() {
assert(EHPhysReg && "target lacks exception pointer register");
MBB->addLiveIn(EHPhysReg);
unsigned VReg = FuncInfo->getCatchPadExceptionPointerVReg(CPI, PtrRC);
- BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(),
- TII->get(TargetOpcode::COPY), VReg)
- .addReg(EHPhysReg, RegState::Kill);
+ TII->buildCopy(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), VReg,
+ EHPhysReg, RegState::Kill);
}
}
return true;
@@ -2198,7 +2197,7 @@ void SelectionDAGISel::Select_FREEZE(SDNode *N) {
// TODO: We don't have FREEZE pseudo-instruction in MachineInstr-level now.
// If FREEZE instruction is added later, the code below must be changed as
// well.
- CurDAG->SelectNodeTo(N, TargetOpcode::COPY, N->getValueType(0),
+ CurDAG->SelectNodeTo(N, TII->getCopyOpcode(), N->getValueType(0),
N->getOperand(0));
}
diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp
index 94149f56e7035..d34a8b9c9b3ec 100644
--- a/llvm/lib/CodeGen/SplitKit.cpp
+++ b/llvm/lib/CodeGen/SplitKit.cpp
@@ -518,7 +518,7 @@ void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo &ParentVNI) {
SlotIndex SplitEditor::buildSingleSubRegCopy(Register FromReg, Register ToReg,
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def) {
- const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+ const MCInstrDesc &Desc = TII.get(TII.getCopyOpcode());
bool FirstCopy = !Def.isValid();
MachineInstr *CopyMI = BuildMI(MBB, InsertBefore, DebugLoc(), Desc)
.addReg(ToReg, RegState::Define | getUndefRegState(FirstCopy)
@@ -535,14 +535,14 @@ SlotIndex SplitEditor::buildSingleSubRegCopy(Register FromReg, Register ToReg,
}
SlotIndex SplitEditor::buildCopy(Register FromReg, Register ToReg,
- LaneBitmask LaneMask, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertBefore, bool Late, unsigned RegIdx) {
- const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+ LaneBitmask LaneMask, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertBefore,
+ bool Late, unsigned RegIdx) {
SlotIndexes &Indexes = *LIS.getSlotIndexes();
if (LaneMask.all() || LaneMask == MRI.getMaxLaneMaskForVReg(FromReg)) {
// The full vreg is copied.
MachineInstr *CopyMI =
- BuildMI(MBB, InsertBefore, DebugLoc(), Desc, ToReg).addReg(FromReg);
+ TII.buildCopy(MBB, InsertBefore, DebugLoc(), ToReg, FromReg);
return Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot();
}
diff --git a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp
index 83a7063de112d..4c2b52ef1f472 100644
--- a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp
+++ b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp
@@ -228,9 +228,8 @@ void SwiftErrorValueTracking::propagateVRegs() {
assert(!VRegs.empty() &&
"No predecessors? Is the Calling Convention correct?");
Register DestReg = UUseVReg;
- BuildMI(*MBB, MBB->getFirstNonPHI(), DLoc, TII->get(TargetOpcode::COPY),
- DestReg)
- .addReg(VRegs[0].second);
+ TII->buildCopy(*MBB, MBB->getFirstNonPHI(), DLoc, DestReg,
+ VRegs[0].second);
continue;
}
diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
index 86ba57d09c2a2..62d1db4120a7f 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -444,9 +444,8 @@ void TailDuplicator::duplicateInstruction(
if (NewRC == nullptr)
NewRC = OrigRC;
Register NewReg = MRI->createVirtualRegister(NewRC);
- BuildMI(*PredBB, NewMI, NewMI.getDebugLoc(),
- TII->get(TargetOpcode::COPY), NewReg)
- .addReg(VI->second.Reg, 0, VI->second.SubReg);
+ TII->buildCopy(*PredBB, NewMI, NewMI.getDebugLoc(), NewReg,
+ VI->second.Reg, 0, VI->second.SubReg);
LocalVRMap.erase(VI);
LocalVRMap.insert(std::make_pair(Reg, RegSubRegPair(NewReg, 0)));
MO.setReg(NewReg);
@@ -1034,10 +1033,9 @@ void TailDuplicator::appendCopies(MachineBasicBlock *MBB,
SmallVectorImpl<std::pair<Register, RegSubRegPair>> &CopyInfos,
SmallVectorImpl<MachineInstr *> &Copies) {
MachineBasicBlock::iterator Loc = MBB->getFirstTerminator();
- const MCInstrDesc &CopyD = TII->get(TargetOpcode::COPY);
for (auto &CI : CopyInfos) {
- auto C = BuildMI(*MBB, Loc, DebugLoc(), CopyD, CI.first)
- .addReg(CI.second.Reg, 0, CI.second.SubReg);
+ auto C = TII->buildCopy(*MBB, Loc, DebugLoc(), CI.first, CI.second.Reg, 0,
+ CI.second.SubReg);
Copies.push_back(C);
}
}
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index c14b64c18b214..645de85141315 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -642,9 +642,10 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
MachineBasicBlock::iterator Pos = MI;
if (Flags == MachineMemOperand::MOStore)
- storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI);
+ storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI,
+ Register());
else
- loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, TRI);
+ loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, TRI, Register());
return &*--Pos;
}
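The new trailing Register parameter hands targets the virtual register being spilled or reloaded when the caller tracks one; this generic folding path does not, so it passes a default-constructed (invalid) Register. A hedged usage sketch:
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

static void spillSketch(const TargetInstrInfo &TII, MachineBasicBlock &MBB,
                        MachineBasicBlock::iterator Pos, Register Reg, int FI,
                        const TargetRegisterClass *RC,
                        const TargetRegisterInfo *TRI) {
  // Register() is invalid and tests false; a target can check the parameter
  // and only use it when the caller actually tracked a virtual register.
  TII.storeRegToStackSlot(MBB, Pos, Reg, /*isKill=*/true, FI, RC, TRI,
                          Register());
}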
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index ac346585b0f8f..b311301a72924 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -571,10 +571,14 @@ bool TargetRegisterInfo::getCoveringSubRegIndexes(
break;
}
- // Try to cover as much of the remaining lanes as possible but
- // as few of the already covered lanes as possible.
- int Cover = (SubRegMask & LanesLeft).getNumLanes() -
- (SubRegMask & ~LanesLeft).getNumLanes();
+ // Do not cover already-covered lanes, to avoid creating cycles in copy
+ // bundles (i.e., a bundle that contains multiple copies writing to the
+ // same registers).
+ if ((SubRegMask & ~LanesLeft).any())
+ continue;
+
+ // Try to cover as many of the remaining lanes as possible.
+ const int Cover = (SubRegMask & LanesLeft).getNumLanes();
if (Cover > BestCover) {
BestCover = Cover;
BestIdx = Idx;
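The old heuristic above traded newly covered lanes against re-covered ones; the new rule rejects any candidate touching a covered lane and then maximizes fresh coverage. A toy model of the new rule, using plain bit masks instead of LaneBitmask (hypothetical helper; C++20 for std::popcount):
#include <bit>
#include <cstdint>
#include <vector>

static int pickBestSubRegIdx(uint64_t LanesLeft,
                             const std::vector<uint64_t> &CandidateMasks) {
  int BestIdx = -1;
  int BestCover = 0;
  for (int Idx = 0; Idx < (int)CandidateMasks.size(); ++Idx) {
    uint64_t Mask = CandidateMasks[Idx];
    if (Mask & ~LanesLeft) // touches an already-covered lane: skip outright
      continue;
    int Cover = std::popcount(Mask & LanesLeft);
    if (Cover > BestCover) { // otherwise maximize newly covered lanes
      BestCover = Cover;
      BestIdx = Idx;
    }
  }
  return BestIdx; // -1 when no disjoint candidate remains
}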
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 78bf030132c35..5dfa09fdf5d3d 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1494,8 +1494,9 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
#endif
// Emit a copy.
- MachineInstrBuilder MIB = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII->get(TargetOpcode::COPY), RegA);
+ MachineInstrBuilder MIB = MachineInstrBuilder(
+ *MI->getParent()->getParent(),
+ TII->buildCopy(*MI->getParent(), MI, MI->getDebugLoc(), RegA));
// If this operand is folding a truncation, the truncation now moves to the
// copy so that the register classes remain valid for the operands.
MIB.addReg(RegB, 0, SubRegB);
@@ -1831,7 +1832,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
mi->getOperand(0).setSubReg(SubIdx);
mi->getOperand(0).setIsUndef(mi->getOperand(1).isUndef());
mi->removeOperand(1);
- mi->setDesc(TII->get(TargetOpcode::COPY));
+ mi->setDesc(TII->get(TII->getCopyOpcode()));
LLVM_DEBUG(dbgs() << "\t\tconvert to:\t" << *mi);
// Update LiveIntervals.
@@ -1919,7 +1920,7 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
// Insert the sub-register copy.
MachineInstr *CopyMI = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
- TII->get(TargetOpcode::COPY))
+ TII->get(TII->getCopyOpcode()))
.addReg(DstReg, RegState::Define, SubIdx)
.add(UseMO);
diff --git a/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/llvm/lib/CodeGen/UnreachableBlockElim.cpp
index 5e8514f525e9d..ff3229b747350 100644
--- a/llvm/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/llvm/lib/CodeGen/UnreachableBlockElim.cpp
@@ -182,9 +182,8 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
// insert a COPY instead of simply replacing the output
// with the input.
const TargetInstrInfo *TII = F.getSubtarget().getInstrInfo();
- BuildMI(BB, BB.getFirstNonPHI(), phi->getDebugLoc(),
- TII->get(TargetOpcode::COPY), OutputReg)
- .addReg(InputReg, getRegState(Input), InputSub);
+ TII->buildCopy(BB, BB.getFirstNonPHI(), phi->getDebugLoc(),
+ OutputReg, InputReg, getRegState(Input), InputSub);
}
phi++->eraseFromParent();
}
diff --git a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp
index 88460971338cb..fb691dba216d4 100644
--- a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp
+++ b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp
@@ -122,6 +122,7 @@ bool VLIWResourceModel::isResourceAvailable(SUnit *SU, bool IsTop) {
case TargetOpcode::REG_SEQUENCE:
case TargetOpcode::IMPLICIT_DEF:
case TargetOpcode::COPY:
+ case TargetOpcode::PRED_COPY:
case TargetOpcode::INLINEASM:
case TargetOpcode::INLINEASM_BR:
break;
@@ -172,6 +173,7 @@ bool VLIWResourceModel::reserveResources(SUnit *SU, bool IsTop) {
case TargetOpcode::CFI_INSTRUCTION:
case TargetOpcode::EH_LABEL:
case TargetOpcode::COPY:
+ case TargetOpcode::PRED_COPY:
case TargetOpcode::INLINEASM:
case TargetOpcode::INLINEASM_BR:
break;
diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp
index 069aca742da07..c386ab9352a41 100644
--- a/llvm/lib/CodeGen/VirtRegMap.cpp
+++ b/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -404,6 +404,16 @@ bool VirtRegRewriter::readsUndefSubreg(const MachineOperand &MO) const {
return true;
}
+// Returns true when all the implicit operands of the copy instruction \p MI are
+// reserved registers.
+static bool isCopyWithReservedImplicitOpnds(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI) {
+ for (unsigned I = 2, E = MI.getNumOperands(); I != E; ++I) {
+ if (!MRI.isReserved(MI.getOperand(I).getReg()))
+ return false;
+ }
+ return true;
+}
void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) {
if (!MI.isIdentityCopy())
return;
@@ -424,8 +434,11 @@ void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) {
// %al = COPY %al, implicit-def %eax
// give us additional liveness information: The target (super-)register
// must not be valid before this point. Replace the COPY with a KILL
- // instruction to maintain this information.
- if (MI.getOperand(1).isUndef() || MI.getNumOperands() > 2) {
+ // instruction to maintain this information. Do not insert KILL when the
+ // implicit operands are all reserved registers.
+ if (MI.getOperand(1).isUndef() ||
+ ((MI.getNumOperands() > 2) &&
+ !isCopyWithReservedImplicitOpnds(MI, *MRI))) {
MI.setDesc(TII->get(TargetOpcode::KILL));
LLVM_DEBUG(dbgs() << " replace by: " << MI);
return;
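Putting the whole condition together: an identity COPY is now lowered to KILL only when it still encodes liveness that deleting it would lose. A hypothetical predicate mirroring the patch's logic (isCopyWithReservedImplicitOpnds is the helper added above):
static bool shouldLowerIdentityCopyToKill(const MachineInstr &MI,
                                          const MachineRegisterInfo &MRI) {
  if (MI.getOperand(1).isUndef())
    return true; // an undef read is information worth keeping
  if (MI.getNumOperands() <= 2)
    return false; // no implicit operands: the copy can simply be deleted
  // Implicit operands normally carry liveness information, but liveness is
  // not tracked for reserved registers, so a KILL would add nothing there.
  return !isCopyWithReservedImplicitOpnds(MI, MRI);
}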
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index 6c512023f3051..95e76320729e8 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -1204,6 +1204,13 @@ Instruction *DIBuilder::insertDef(DILifetime *Lifetime, llvm::Value *Referrer,
DefFn = getDefIntrin(M);
trackIfUnresolved(Lifetime);
+
+ // Ideally, the intrinsic would be able to handle any type of
+ // pointer. However, SelectionDAGBuilder::visitIntrinsicCall (for dbg_def) and
+ // InstEmitter::EmitDbgDefKill expect the intrinsic to refer directly to the
+ // alloca / argument and have problems handling addrspacecasts.
+ Referrer = Referrer->stripPointerCasts();
+
Value *Args[] = {MetadataAsValue::get(VMContext, Lifetime),
getDbgIntrinsicValueImpl(VMContext, Referrer)};
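stripPointerCasts walks through bitcasts, address space casts, all-zero GEPs, and aliases, so a referrer that is an addrspacecast of an alloca resolves back to the alloca itself, the form the two consumers named in the comment can handle. A minimal sketch:
#include "llvm/IR/Value.h"

// E.g. for "%p = addrspacecast ptr addrspace(5) %a to ptr", passing %p here
// yields the underlying alloca %a.
static llvm::Value *normalizeReferrer(llvm::Value *Referrer) {
  return Referrer->stripPointerCasts();
}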
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index 5dd114c269ccb..51828b9422bf0 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -389,6 +389,17 @@ void Module::setModuleFlag(ModFlagBehavior Behavior, StringRef Key,
addModuleFlag(Behavior, Key, Val);
}
+void Module::setModuleFlag(ModFlagBehavior Behavior, StringRef Key,
+ Constant *Val) {
+ setModuleFlag(Behavior, Key, ConstantAsMetadata::get(Val));
+}
+
+void Module::setModuleFlag(ModFlagBehavior Behavior, StringRef Key,
+ uint32_t Val) {
+ Type *Int32Ty = Type::getInt32Ty(Context);
+ setModuleFlag(Behavior, Key, ConstantInt::get(Int32Ty, Val));
+}
+
void Module::setDataLayout(StringRef Desc) {
DL.reset(Desc);
}
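Unlike addModuleFlag, setModuleFlag replaces an existing flag with the same key instead of appending a duplicate, which is why the LTO call sites below switch over. Minimal usage of the new uint32_t overload:
#include "llvm/IR/Module.h"

static void markPostLink(llvm::Module &M) {
  // The uint32_t overload wraps the value in an i32 ConstantInt internally
  // and overwrites any existing "LTOPostLink" flag rather than appending a
  // second copy, so re-running the pipeline stays idempotent.
  M.setModuleFlag(llvm::Module::Error, "LTOPostLink", 1);
}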
diff --git a/llvm/lib/IR/TypeFinder.cpp b/llvm/lib/IR/TypeFinder.cpp
index 904af7e737ccf..dbdcb78615443 100644
--- a/llvm/lib/IR/TypeFinder.cpp
+++ b/llvm/lib/IR/TypeFinder.cpp
@@ -176,6 +176,30 @@ void TypeFinder::incorporateMDNode(const MDNode *V) {
return;
}
+ // The operations in a DIExpr are not exposed as operands, so handle such
+ // nodes specifically here.
+ if (const auto *E = dyn_cast<DIExpr>(V)) {
+ for (auto &&Op : E->builder())
+ visit(
+ makeVisitor(
+#define HANDLE_OP0(NAME) [](DIOp::NAME) {},
+#include "llvm/IR/DIExprOps.def"
+ [&](DIOp::Referrer R) { incorporateType(R.getResultType()); },
+ [&](DIOp::Arg A) { incorporateType(A.getResultType()); },
+ [&](DIOp::TypeObject T) { incorporateType(T.getResultType()); },
+ [&](DIOp::Constant C) { incorporateValue(C.getLiteralValue()); },
+ [&](DIOp::Convert C) { incorporateType(C.getResultType()); },
+ [&](DIOp::Reinterpret R) { incorporateType(R.getResultType()); },
+ [&](DIOp::BitOffset B) { incorporateType(B.getResultType()); },
+ [&](DIOp::ByteOffset B) { incorporateType(B.getResultType()); },
+ [&](DIOp::Composite C) { incorporateType(C.getResultType()); },
+ [&](DIOp::Extend) {}, [&](DIOp::AddrOf) {},
+ [&](DIOp::Deref D) { incorporateType(D.getResultType()); },
+ [&](DIOp::PushLane P) { incorporateType(P.getResultType()); }),
+ Op);
+ return;
+ }
+
// Look in operands for types.
for (Metadata *Op : V->operands()) {
if (!Op)
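llvm::makeVisitor (from llvm/ADT/STLExtras.h) fuses the lambdas into one overload set so each DIOp alternative dispatches to its matching callable, with the HANDLE_OP0 macro supplying no-op handlers for the zero-operand ops. A self-contained illustration using std::variant stand-ins for the downstream DIOp types:
#include <variant>
#include "llvm/ADT/STLExtras.h"

struct Referrer {}; // stand-in for an op that carries a result type
struct Extend {};   // stand-in for an op with no type payload
using OpVariant = std::variant<Referrer, Extend>;

static void visitOpSketch(const OpVariant &Op) {
  std::visit(llvm::makeVisitor(
                 [](const Referrer &) { /* incorporate its result type */ },
                 [](const Extend &) { /* nothing to record */ }),
             Op);
}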
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 286d3ca3e2cc0..828203080710e 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1148,7 +1148,7 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
GV->setLinkage(GlobalValue::InternalLinkage);
}
- RegularLTO.CombinedModule->addModuleFlag(Module::Error, "LTOPostLink", 1);
+ RegularLTO.CombinedModule->setModuleFlag(Module::Error, "LTOPostLink", 1);
if (Conf.PostInternalizeModuleHook &&
!Conf.PostInternalizeModuleHook(0, *RegularLTO.CombinedModule))
diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp
index 68ef8d60beac7..6037072d16863 100644
--- a/llvm/lib/LTO/LTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/LTOCodeGenerator.cpp
@@ -606,7 +606,7 @@ bool LTOCodeGenerator::optimize() {
this->applyScopeRestrictions();
// Write LTOPostLink flag for passes that require all the modules.
- MergedModule->addModuleFlag(Module::Error, "LTOPostLink", 1);
+ MergedModule->setModuleFlag(Module::Error, "LTOPostLink", 1);
// Add an appropriate DataLayout instance for this module...
MergedModule->setDataLayout(TargetMach->createDataLayout());
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index eaf22eaa73a1d..e6fe9af415b3e 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -2310,11 +2310,6 @@ void MCAsmStreamer::emitInstruction(const MCInst &Inst,
assert(getCurrentSectionOnly() &&
"Cannot emit contents before setting section!");
- if (!MAI->usesDwarfFileAndLocDirectives())
- // Now that a machine instruction has been assembled into this section, make
- // a line entry for any .loc directive that has been seen.
- MCDwarfLineEntry::make(this, getCurrentSectionOnly());
-
// Show the encoding in a comment if we have a code emitter.
AddEncodingComment(Inst, STI);
diff --git a/llvm/lib/OffloadArch/amdgpu/codename2offloadarch.txt b/llvm/lib/OffloadArch/amdgpu/codename2offloadarch.txt
index 7df4c0a33b62c..eb1dd697ddd29 100644
--- a/llvm/lib/OffloadArch/amdgpu/codename2offloadarch.txt
+++ b/llvm/lib/OffloadArch/amdgpu/codename2offloadarch.txt
@@ -6,14 +6,19 @@ CYAN_SKILLFISH gfx1013
DIMGREY_CAVEFISH gfx1032
FIJI gfx803
HAWAII gfx701
+HOTPINK_BONEFISH gfx1102
NAVI10 gfx1010
NAVI12 gfx1011
NAVI14 gfx1012
NAVY_FLOUNDER gfx1031
+PINK_SARDINE gfx1103
+PLUM_BONITO gfx1100
POLARIS10 gfx803
POLARIS11 gfx803
POLARIS12 gfx803
+RAPHAEL gfx1036
RAVEN gfx902
+REMBRANDT gfx1035
RENOIR gfx90c
SIENNA_CICHLID gfx1030
SPECTRE gfx700
@@ -24,9 +29,5 @@ VEGA10 gfx900
VEGA12 gfx904
VEGA20 gfx906
VEGAM gfx803
-YELLOW_CARP gfx1035
-PLUM_BONITO gfx1100
WHEAT_NAS gfx1101
-HOTPINK_BONEFISH gfx1102
-PINK_SARDINE gfx1103
-PHOENIX gfx1103
+YELLOW_CARP gfx1035
diff --git a/llvm/lib/OffloadArch/amdgpu/pciid2codename.txt b/llvm/lib/OffloadArch/amdgpu/pciid2codename.txt
index b77b9471eb22a..e76bae21d8c61 100644
--- a/llvm/lib/OffloadArch/amdgpu/pciid2codename.txt
+++ b/llvm/lib/OffloadArch/amdgpu/pciid2codename.txt
@@ -177,4 +177,6 @@
1002:743F 0000 0000 BEIGE_GOBY : BEIGE_GOBY
1002:164D 0000 0000 YELLOW_CARP : YELLOW_CARP
1002:1681 0000 0000 YELLOW_CARP : YELLOW_CARP
-1002:DEBF 0000 0000 PLUM_BONITO : PLUM_BONITO
+1002:744C 0000 0000 PLUM_BONITO : PLUM_BONITO
+1002:164d 0000 0000 REMBRANDT : Rembrandt
+1002:164e 0000 0000 RAPHAEL : Raphael
diff --git a/llvm/lib/OffloadArch/generated_offload_arch.h b/llvm/lib/OffloadArch/generated_offload_arch.h
index 96205c15197b7..456e0e59f268d 100644
--- a/llvm/lib/OffloadArch/generated_offload_arch.h
+++ b/llvm/lib/OffloadArch/generated_offload_arch.h
@@ -13,6 +13,7 @@ typedef enum {
AOT_GFX1033,
AOT_GFX1034,
AOT_GFX1035,
+ AOT_GFX1036,
AOT_GFX1100,
AOT_GFX1101,
AOT_GFX1102,
@@ -45,14 +46,19 @@ typedef enum {
AOT_CN_DIMGREY_CAVEFISH,
AOT_CN_FIJI,
AOT_CN_HAWAII,
+ AOT_CN_HOTPINK_BONEFISH,
AOT_CN_NAVI10,
AOT_CN_NAVI12,
AOT_CN_NAVI14,
AOT_CN_NAVY_FLOUNDER,
+ AOT_CN_PINK_SARDINE,
+ AOT_CN_PLUM_BONITO,
AOT_CN_POLARIS10,
AOT_CN_POLARIS11,
AOT_CN_POLARIS12,
+ AOT_CN_RAPHAEL,
AOT_CN_RAVEN,
+ AOT_CN_REMBRANDT,
AOT_CN_RENOIR,
AOT_CN_SIENNA_CICHLID,
AOT_CN_SPECTRE,
@@ -63,12 +69,8 @@ typedef enum {
AOT_CN_VEGA12,
AOT_CN_VEGA20,
AOT_CN_VEGAM,
- AOT_CN_YELLOW_CARP,
- AOT_CN_PLUM_BONITO,
AOT_CN_WHEAT_NAS,
- AOT_CN_HOTPINK_BONEFISH,
- AOT_CN_PINK_SARDINE,
- AOT_CN_PHOENIX,
+ AOT_CN_YELLOW_CARP,
AOT_CN_K4000,
AOT_CN_K4200,
AOT_CN_GTX750,
@@ -108,14 +110,19 @@ extern const AOT_CODENAME_ID_TO_STRING AOT_CODENAMES[] = {
{AOT_CN_DIMGREY_CAVEFISH, "DIMGREY_CAVEFISH"},
{AOT_CN_FIJI, "FIJI"},
{AOT_CN_HAWAII, "HAWAII"},
+ {AOT_CN_HOTPINK_BONEFISH, "HOTPINK_BONEFISH"},
{AOT_CN_NAVI10, "NAVI10"},
{AOT_CN_NAVI12, "NAVI12"},
{AOT_CN_NAVI14, "NAVI14"},
{AOT_CN_NAVY_FLOUNDER, "NAVY_FLOUNDER"},
+ {AOT_CN_PINK_SARDINE, "PINK_SARDINE"},
+ {AOT_CN_PLUM_BONITO, "PLUM_BONITO"},
{AOT_CN_POLARIS10, "POLARIS10"},
{AOT_CN_POLARIS11, "POLARIS11"},
{AOT_CN_POLARIS12, "POLARIS12"},
+ {AOT_CN_RAPHAEL, "RAPHAEL"},
{AOT_CN_RAVEN, "RAVEN"},
+ {AOT_CN_REMBRANDT, "REMBRANDT"},
{AOT_CN_RENOIR, "RENOIR"},
{AOT_CN_SIENNA_CICHLID, "SIENNA_CICHLID"},
{AOT_CN_SPECTRE, "SPECTRE"},
@@ -126,12 +133,8 @@ extern const AOT_CODENAME_ID_TO_STRING AOT_CODENAMES[] = {
{AOT_CN_VEGA12, "VEGA12"},
{AOT_CN_VEGA20, "VEGA20"},
{AOT_CN_VEGAM, "VEGAM"},
- {AOT_CN_YELLOW_CARP, "YELLOW_CARP"},
- {AOT_CN_PLUM_BONITO, "PLUM_BONITO"},
{AOT_CN_WHEAT_NAS, "WHEAT_NAS"},
- {AOT_CN_HOTPINK_BONEFISH, "HOTPINK_BONEFISH"},
- {AOT_CN_PINK_SARDINE, "PINK_SARDINE"},
- {AOT_CN_PHOENIX, "PHOENIX"},
+ {AOT_CN_YELLOW_CARP, "YELLOW_CARP"},
{AOT_CN_K4000, "k4000"},
{AOT_CN_K4200, "k4200"},
{AOT_CN_GTX750, "gtx750"},
@@ -156,6 +159,7 @@ extern const AOT_OFFLOADARCH_TO_STRING AOT_OFFLOADARCHS[] = {
{AOT_GFX1033, "gfx1033"},
{AOT_GFX1034, "gfx1034"},
{AOT_GFX1035, "gfx1035"},
+ {AOT_GFX1036, "gfx1036"},
{AOT_GFX1100, "gfx1100"},
{AOT_GFX1101, "gfx1101"},
{AOT_GFX1102, "gfx1102"},
@@ -214,7 +218,9 @@ extern const AOT_TABLE_ENTRY AOT_TABLE[] = {
{ 0x1002, 0x1638, AOT_CN_RENOIR, AOT_GFX90C },
{ 0x1002, 0x163F, AOT_CN_VANGOGH, AOT_GFX1033 },
{ 0x1002, 0x164C, AOT_CN_RENOIR, AOT_GFX90C },
+{ 0x1002, 0x164d, AOT_CN_REMBRANDT, AOT_GFX1035 },
{ 0x1002, 0x164D, AOT_CN_YELLOW_CARP, AOT_GFX1035 },
+{ 0x1002, 0x164e, AOT_CN_RAPHAEL, AOT_GFX1036 },
{ 0x1002, 0x1681, AOT_CN_YELLOW_CARP, AOT_GFX1035 },
{ 0x1002, 0x66A0, AOT_CN_VEGA20, AOT_GFX906 },
{ 0x1002, 0x66A1, AOT_CN_VEGA20, AOT_GFX906 },
@@ -354,12 +360,12 @@ extern const AOT_TABLE_ENTRY AOT_TABLE[] = {
{ 0x1002, 0x7422, AOT_CN_BEIGE_GOBY, AOT_GFX1034 },
{ 0x1002, 0x7423, AOT_CN_BEIGE_GOBY, AOT_GFX1034 },
{ 0x1002, 0x743F, AOT_CN_BEIGE_GOBY, AOT_GFX1034 },
+{ 0x1002, 0x744C, AOT_CN_PLUM_BONITO, AOT_GFX1100 },
{ 0x1002, 0x9870, AOT_CN_CARRIZO, AOT_GFX801 },
{ 0x1002, 0x9874, AOT_CN_CARRIZO, AOT_GFX801 },
{ 0x1002, 0x9875, AOT_CN_CARRIZO, AOT_GFX801 },
{ 0x1002, 0x9876, AOT_CN_CARRIZO, AOT_GFX801 },
{ 0x1002, 0x9877, AOT_CN_CARRIZO, AOT_GFX801 },
-{ 0x1002, 0xDEBF, AOT_CN_PLUM_BONITO, AOT_GFX1100 },
{ 0x10de, 0x0f02, AOT_CN_GT730, AOT_SM_35 },
{ 0x10de, 0x0f06, AOT_CN_GT730, AOT_SM_35 },
{ 0x10de, 0x0fc9, AOT_CN_GT730, AOT_SM_35 },
diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index 462a86273a4ed..3986c9103754d 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -2500,7 +2500,7 @@ class VersionPrinter {
#ifdef PACKAGE_VENDOR
OS << PACKAGE_VENDOR << " ";
#else
- OS << "AOMP-15.0-61 (http://github.com/ROCm-Developer-Tools/aomp):\n Source ID:15.0-61-595b0d8133fafef5742f7d39f8e6a07b31afff56\n ";
+ OS << "LLVM (http://llvm.org/):\n ";
#endif
OS << PACKAGE_NAME << " version " << PACKAGE_VERSION << "\n ";
#if LLVM_IS_DEBUG_BUILD
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index bc67fa20c60d1..201182b1f0130 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -611,7 +611,7 @@ void AArch64FrameLowering::resetCFIToInitialState(
BuildMI(MBB, InsertPt, DL, CFIDesc).addCFIIndex(CFIIndex);
// Flip the RA sign state.
- if (MFI.shouldSignReturnAddress()) {
+ if (MFI.shouldSignReturnAddress(MF)) {
CFIIndex = MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
BuildMI(MBB, InsertPt, DL, CFIDesc).addCFIIndex(CFIIndex);
}
@@ -1363,7 +1363,7 @@ static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII,
.addImm(-8)
.setMIFlag(MachineInstr::FrameDestroy);
- if (MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo()) {
+ if (MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(MF)) {
unsigned CFIIndex =
MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, 18));
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
@@ -1382,7 +1382,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineModuleInfo &MMI = MF.getMMI();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- bool EmitCFI = AFI->needsDwarfUnwindInfo();
+ bool EmitCFI = AFI->needsDwarfUnwindInfo(MF);
bool HasFP = hasFP(MF);
bool NeedsWinCFI = needsWinCFI(MF);
bool HasWinCFI = false;
@@ -1402,9 +1402,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
if (needsShadowCallStackPrologueEpilogue(MF))
emitShadowCallStackPrologue(*TII, MF, MBB, MBBI, DL, NeedsWinCFI,
- MFnI.needsDwarfUnwindInfo());
+ MFnI.needsDwarfUnwindInfo(MF));
- if (MFnI.shouldSignReturnAddress()) {
+ if (MFnI.shouldSignReturnAddress(MF)) {
unsigned PACI;
if (MFnI.shouldSignWithBKey()) {
BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
@@ -1876,7 +1876,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
static void InsertReturnAddressAuth(MachineFunction &MF, MachineBasicBlock &MBB,
bool NeedsWinCFI, bool *HasWinCFI) {
const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
- if (!MFI.shouldSignReturnAddress())
+ if (!MFI.shouldSignReturnAddress(MF))
return;
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
@@ -1936,7 +1936,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL;
bool NeedsWinCFI = needsWinCFI(MF);
- bool EmitCFI = MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo();
+ bool EmitCFI =
+ MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(MF);
bool HasWinCFI = false;
bool IsFunclet = false;
auto WinCFI = make_scope_exit([&]() { assert(HasWinCFI == MF.hasWinCFI()); });
@@ -3748,11 +3749,11 @@ MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
EndOffset = Instr.Offset + Instr.Size;
}
+ const MachineFunction *MF = MBB->getParent();
// Multiple FP/SP updates in a loop cannot be described by CFI instructions.
- TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */
- !MBB->getParent()
- ->getInfo<AArch64FunctionInfo>()
- ->needsAsyncDwarfUnwindInfo());
+ TSE.emitCode(
+ InsertI, TFI, /*TryMergeSPUpdate = */
+ !MF->getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(*MF));
return InsertI;
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index f558cf2c2cbf6..1dc3980f160a4 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3783,10 +3783,12 @@ static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
.addMemOperand(MMO);
}
-void AArch64InstrInfo::storeRegToStackSlot(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
- bool isKill, int FI, const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
+void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ Register SrcReg, bool isKill, int FI,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI,
+ Register VReg) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -3937,10 +3939,12 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
.addMemOperand(MMO);
}
-void AArch64InstrInfo::loadRegFromStackSlot(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
- int FI, const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
+void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ Register DestReg, int FI,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI,
+ Register VReg) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
@@ -4505,10 +4509,10 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
"Mismatched register size in non subreg COPY");
if (IsSpill)
storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
- getRegClass(SrcReg), &TRI);
+ getRegClass(SrcReg), &TRI, Register());
else
loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
- getRegClass(DstReg), &TRI);
+ getRegClass(DstReg), &TRI, Register());
return &*--InsertPt;
}
@@ -4554,7 +4558,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
if (unsigned WidenedSrcReg =
TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
- FrameIndex, SpillRC, &TRI);
+ FrameIndex, SpillRC, &TRI, Register());
return &*--InsertPt;
}
}
@@ -4589,7 +4593,8 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
TRI.getRegSizeInBits(*FillRC) &&
"Mismatched regclass size on folded subreg COPY");
- loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
+ loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI,
+ Register());
MachineInstr &LoadMI = *--InsertPt;
MachineOperand &LoadDst = LoadMI.getOperand(0);
assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
@@ -7784,7 +7789,7 @@ static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
.addReg(AArch64::SP, RegState::InternalRead);
MI.setMIFlag(MachineInstr::FrameSetup);
- if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo()) {
+ if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
unsigned CFIIndex =
MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION))
@@ -7883,7 +7888,7 @@ void AArch64InstrInfo::buildOutlinedFrame(
.addImm(-16);
It = MBB.insert(It, STRXpre);
- if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo()) {
+ if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
const TargetSubtargetInfo &STI = MF.getSubtarget();
const MCRegisterInfo *MRI = STI.getRegisterInfo();
unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 96e16b0d1ee93..1057b6255e730 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -182,12 +182,14 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
MachineBasicBlock::iterator MBBI, Register SrcReg,
bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const override;
+ const TargetRegisterInfo *TRI,
+ Register VReg) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, Register DestReg,
int FrameIndex, const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const override;
+ const TargetRegisterInfo *TRI,
+ Register VReg) const override;
// This tells target independent code that it is okay to pass instructions
// with subreg operands to foldMemoryOperandImpl.
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index 469e1448602c0..961a19317d666 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -66,12 +66,12 @@ static std::pair<bool, bool> GetSignReturnAddress(const Function &F) {
return {true, false};
}
-static bool ShouldSignWithBKey(const Function &F, const MachineFunction &MF) {
+static bool ShouldSignWithBKey(const Function &F, const AArch64Subtarget &STI) {
if (!F.hasFnAttribute("sign-return-address-key")) {
if (const auto *BKey = mdconst::extract_or_null<ConstantInt>(
F.getParent()->getModuleFlag("sign-return-address-with-bkey")))
return BKey->getZExtValue();
- if (MF.getTarget().getTargetTriple().isOSWindows())
+ if (STI.getTargetTriple().isOSWindows())
return true;
return false;
}
@@ -82,15 +82,14 @@ static bool ShouldSignWithBKey(const Function &F, const MachineFunction &MF) {
return Key.equals_insensitive("b_key");
}
-AArch64FunctionInfo::AArch64FunctionInfo(MachineFunction &MF_) : MF(&MF_) {
+AArch64FunctionInfo::AArch64FunctionInfo(const Function &F,
+ const AArch64Subtarget *STI) {
// If we already know that the function doesn't have a redzone, set
// HasRedZone here.
- if (MF->getFunction().hasFnAttribute(Attribute::NoRedZone))
+ if (F.hasFnAttribute(Attribute::NoRedZone))
HasRedZone = false;
-
- const Function &F = MF->getFunction();
std::tie(SignReturnAddress, SignReturnAddressAll) = GetSignReturnAddress(F);
- SignWithBKey = ShouldSignWithBKey(F, *MF);
+ SignWithBKey = ShouldSignWithBKey(F, *STI);
// TODO: skip functions that have no instrumented allocas for optimization
IsMTETagged = F.hasFnAttribute(Attribute::SanitizeMemTag);
@@ -112,9 +111,7 @@ MachineFunctionInfo *AArch64FunctionInfo::clone(
BumpPtrAllocator &Allocator, MachineFunction &DestMF,
const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
const {
- AArch64FunctionInfo *InfoClone = DestMF.cloneInfo<AArch64FunctionInfo>(*this);
- InfoClone->MF = &DestMF;
- return InfoClone;
+ return DestMF.cloneInfo<AArch64FunctionInfo>(*this);
}
bool AArch64FunctionInfo::shouldSignReturnAddress(bool SpillsLR) const {
@@ -125,27 +122,30 @@ bool AArch64FunctionInfo::shouldSignReturnAddress(bool SpillsLR) const {
return SpillsLR;
}
-bool AArch64FunctionInfo::shouldSignReturnAddress() const {
+bool AArch64FunctionInfo::shouldSignReturnAddress(
+ const MachineFunction &MF) const {
return shouldSignReturnAddress(llvm::any_of(
- MF->getFrameInfo().getCalleeSavedInfo(),
+ MF.getFrameInfo().getCalleeSavedInfo(),
[](const auto &Info) { return Info.getReg() == AArch64::LR; }));
}
-bool AArch64FunctionInfo::needsDwarfUnwindInfo() const {
+bool AArch64FunctionInfo::needsDwarfUnwindInfo(
+ const MachineFunction &MF) const {
if (!NeedsDwarfUnwindInfo)
- NeedsDwarfUnwindInfo = MF->needsFrameMoves() &&
- !MF->getTarget().getMCAsmInfo()->usesWindowsCFI();
+ NeedsDwarfUnwindInfo = MF.needsFrameMoves() &&
+ !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
return *NeedsDwarfUnwindInfo;
}
-bool AArch64FunctionInfo::needsAsyncDwarfUnwindInfo() const {
+bool AArch64FunctionInfo::needsAsyncDwarfUnwindInfo(
+ const MachineFunction &MF) const {
if (!NeedsAsyncDwarfUnwindInfo) {
- const Function &F = MF->getFunction();
+ const Function &F = MF.getFunction();
// The check for "minsize" is because epilogue unwind info is not emitted
// (yet) for homogeneous epilogues, outlined functions, and functions
// that code was outlined from.
- NeedsAsyncDwarfUnwindInfo = needsDwarfUnwindInfo() &&
+ NeedsAsyncDwarfUnwindInfo = needsDwarfUnwindInfo(MF) &&
F.getUWTableKind() == UWTableKind::Async &&
!F.hasMinSize();
}
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index c11506c898fa9..5e4c5926c371a 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -31,14 +31,12 @@ namespace yaml {
struct AArch64FunctionInfo;
} // end namespace yaml
+class AArch64Subtarget;
class MachineInstr;
/// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and
/// contains private AArch64-specific information for each MachineFunction.
class AArch64FunctionInfo final : public MachineFunctionInfo {
- /// Backreference to the machine function.
- MachineFunction *MF;
-
/// Number of bytes of arguments this function has on the stack. If the callee
/// is expected to restore the argument stack this should be a multiple of 16,
/// all usable during a tail call.
@@ -199,7 +197,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
mutable Optional<bool> NeedsAsyncDwarfUnwindInfo;
public:
- explicit AArch64FunctionInfo(MachineFunction &MF);
+ AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI);
MachineFunctionInfo *
clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF,
@@ -433,7 +431,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
CalleeSaveBaseToFrameRecordOffset = Offset;
}
- bool shouldSignReturnAddress() const;
+ bool shouldSignReturnAddress(const MachineFunction &MF) const;
bool shouldSignReturnAddress(bool SpillsLR) const;
bool shouldSignWithBKey() const { return SignWithBKey; }
@@ -451,8 +449,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
}
int getSwiftAsyncContextFrameIdx() const { return SwiftAsyncContextFrameIdx; }
- bool needsDwarfUnwindInfo() const;
- bool needsAsyncDwarfUnwindInfo() const;
+ bool needsDwarfUnwindInfo(const MachineFunction &MF) const;
+ bool needsAsyncDwarfUnwindInfo(const MachineFunction &MF) const;
private:
// Hold the lists of LOHs.
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA53.td b/llvm/lib/Target/AArch64/AArch64SchedA53.td
index e378b043d37e6..d34d567f961b7 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA53.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA53.td
@@ -205,6 +205,7 @@ def : ReadAdvance