[SLP]Reduce number of alternate instruction, where possible #123360
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

Patch tries to remove wide alternate operations. Currently the SLP vectorizer emits something like this:
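```
%0 = add i32
%1 = sub i32
%2 = add i32
%3 = sub i32
%4 = add i32
%5 = sub i32
%6 = add i32
%7 = sub i32
```
which is transformed to
```
%v1 = add <8 x i32>
%v2 = sub <8 x i32>
%res = shuffle %v1, %v2, <0, 9, 2, 11, 4, 13, 6, 15>
```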
i.e. half of the results are just unused. This leads to increased register pressure and potentially doubles the number of operations.

The patch introduces SplitVectorize mode, where it splits the operations by opcodes and produces instead something like this:
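```
%v1 = add <4 x i32>
%v2 = sub <4 x i32>
%res = shuffle %v1, %v2, <0, 4, 1, 5, 2, 6, 3, 7>
```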
It allows to improve the performance by reducing the number of ops. Also, it turns on some other improvements, like improved graph reordering.

Benchmark results (Metric: size..text):
- -O3+LTO, AVX512: Prolangs-C/TimberWolfMC/timberwolfmc - small variations, some code not inlined
- -O3+LTO, march=rva32u64: CINT2017rate/525.x264_r - similar to x86, extra code in pixel_hadamard_ac vectorized
- -O3+LTO, mcpu=sifive-p470: CINT2006/464.h264ref - extra vector code in find_sad_16x16

Patch is 243.92 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123360.diff

21 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b0b8f8249d657b..59063d6b4c9bc4 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1461,6 +1461,7 @@ class BoUpSLP {
VectorizableTree.clear();
ScalarToTreeEntry.clear();
MultiNodeScalars.clear();
+ ScalarsInSplitNodes.clear();
MustGather.clear();
NonScheduledFirst.clear();
EntryToLastInstruction.clear();
@@ -3196,12 +3197,30 @@ class BoUpSLP {
/// \returns Common mask for reorder indices and reused scalars.
SmallVector<int> getCommonMask() const {
+ if (State == TreeEntry::SplitVectorize)
+ return {};
SmallVector<int> Mask;
inversePermutation(ReorderIndices, Mask);
::addMask(Mask, ReuseShuffleIndices);
return Mask;
}
+ /// \returns The mask for split nodes.
+ SmallVector<int> getSplitMask() const {
+ assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
+ "Expected only split vectorize node.");
+ SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
+ unsigned CommonVF = std::max<unsigned>(
+ CombinedEntriesWithIndices.back().second,
+ Scalars.size() - CombinedEntriesWithIndices.back().second);
+ for (auto [Idx, I] : enumerate(ReorderIndices))
+ Mask[I] =
+ Idx + (Idx >= CombinedEntriesWithIndices.back().second
+ ? CommonVF - CombinedEntriesWithIndices.back().second
+ : 0);
+ return Mask;
+ }
+
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
@@ -3293,6 +3312,8 @@ class BoUpSLP {
///< complex node like select/cmp to minmax, mul/add to
///< fma, etc. Must be used for the following nodes in
///< the pattern, not the very first one.
+ SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
+ ///< independently and then combines back.
};
EntryState State;
@@ -3324,7 +3345,7 @@ class BoUpSLP {
/// The index of this treeEntry in VectorizableTree.
unsigned Idx = 0;
- /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
+ /// For gather/buildvector/alt opcode nodes, which are combined from
/// other nodes as a series of insertvector instructions.
SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
@@ -3471,8 +3492,9 @@ class BoUpSLP {
SmallVectorImpl<Value *> *AltScalars = nullptr) const;
/// Return true if this is a non-power-of-2 node.
- bool isNonPowOf2Vec() const {
- bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
+ bool isNonPowOf2Vec(const TargetTransformInfo &TTI) const {
+ bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
+ TTI, getValueType(Scalars.front()), Scalars.size());
return IsNonPowerOf2;
}
@@ -3530,6 +3552,9 @@ class BoUpSLP {
case CombinedVectorize:
dbgs() << "CombinedVectorize\n";
break;
+ case SplitVectorize:
+ dbgs() << "SplitVectorize\n";
+ break;
}
dbgs() << "MainOp: ";
if (MainOp)
@@ -3611,8 +3636,10 @@ class BoUpSLP {
const EdgeInfo &UserTreeIdx,
ArrayRef<int> ReuseShuffleIndices = {},
ArrayRef<unsigned> ReorderIndices = {}) {
- assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
- (Bundle && EntryState != TreeEntry::NeedToGather)) &&
+ assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
+ EntryState == TreeEntry::SplitVectorize)) ||
+ (Bundle && EntryState != TreeEntry::NeedToGather &&
+ EntryState != TreeEntry::SplitVectorize)) &&
"Need to vectorize gather entry?");
// Gathered loads still gathered? Do not create entry, use the original one.
if (GatheredLoadsEntriesFirst.has_value() &&
@@ -3646,12 +3673,29 @@ class BoUpSLP {
return VL[Idx];
});
InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
- if (S)
+ if (S) {
Last->setOperations(S);
+ } else if (EntryState == TreeEntry::SplitVectorize) {
+ auto *MainOp =
+ cast<Instruction>(*find_if(Last->Scalars, IsaPred<Instruction>));
+ auto *AltOp = cast<Instruction>(*find_if(Last->Scalars, [=](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ return I && I->getOpcode() != MainOp->getOpcode();
+ }));
+ Last->setOperations(InstructionsState(MainOp, AltOp));
+ for (Value *V : VL) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ ScalarsInSplitNodes.try_emplace(I, Last);
+ }
+ }
Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
}
- if (!Last->isGather()) {
+ if (!Last->isGather() && Last->State != TreeEntry::SplitVectorize) {
for (Value *V : VL) {
+ if (isa<PoisonValue>(V))
+ continue;
const TreeEntry *TE = getTreeEntry(V);
assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
"Scalar already in tree!");
@@ -3679,7 +3723,7 @@ class BoUpSLP {
}
}
assert(!BundleMember && "Bundle and VL out of sync");
- } else {
+ } else if (Last->isGather()) {
// Build a map for gathered scalars to the nodes where they are used.
bool AllConstsOrCasts = true;
for (Value *V : VL)
@@ -3745,6 +3789,9 @@ class BoUpSLP {
/// nodes.
SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
+ /// Scalars, used in split vectorize nodes.
+ SmallDenseMap<Value *, TreeEntry *> ScalarsInSplitNodes;
+
/// Maps a value to the proposed vectorizable size.
SmallDenseMap<Value *, unsigned> InstrElementSize;
@@ -5648,12 +5695,14 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
}) &&
(TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
return std::nullopt;
- if ((TE.State == TreeEntry::Vectorize ||
- TE.State == TreeEntry::StridedVectorize) &&
- (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
- (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
- assert(!TE.isAltShuffle() && "Alternate instructions are only supported by "
- "BinaryOperator and CastInst.");
+ if (TE.State == TreeEntry::SplitVectorize ||
+ ((TE.State == TreeEntry::Vectorize ||
+ TE.State == TreeEntry::StridedVectorize) &&
+ (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
+ (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
+ assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
+ "Alternate instructions are only supported by "
+ "BinaryOperator and CastInst.");
return TE.ReorderIndices;
}
if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
@@ -5938,7 +5987,7 @@ void BoUpSLP::reorderTopToBottom() {
// Patterns like [fadd,fsub] can be combined into a single instruction in
// x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
// to take into account their order when looking for the most used order.
- if (TE->isAltShuffle()) {
+ if (TE->isAltShuffle() && TE->State != TreeEntry::SplitVectorize) {
VectorType *VecTy =
getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
unsigned Opcode0 = TE->getOpcode();
@@ -5976,7 +6025,8 @@ void BoUpSLP::reorderTopToBottom() {
}
VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::StridedVectorize) ||
+ TE->State == TreeEntry::StridedVectorize ||
+ TE->State == TreeEntry::SplitVectorize) ||
!TE->ReuseShuffleIndices.empty())
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
if (TE->State == TreeEntry::Vectorize &&
@@ -5985,6 +6035,30 @@ void BoUpSLP::reorderTopToBottom() {
}
});
+ auto UpdateSplitUserNode = [&](TreeEntry *UserTE, unsigned Idx,
+ ArrayRef<int> Mask, ArrayRef<int> MaskOrder) {
+ assert(UserTE->State == TreeEntry::SplitVectorize &&
+ "Expected split user node.");
+ SmallVector<int> NewMask(UserTE->getVectorFactor());
+ SmallVector<int> NewMaskOrder(UserTE->getVectorFactor());
+ std::iota(NewMask.begin(), NewMask.end(), 0);
+ std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
+ if (Idx == 0) {
+ copy(Mask, NewMask.begin());
+ copy(MaskOrder, NewMaskOrder.begin());
+ } else {
+ assert(Idx == 1 && "Expected either 0 or 1 index.");
+ unsigned Offset = UserTE->CombinedEntriesWithIndices.back().second;
+ for (unsigned I : seq<unsigned>(Mask.size())) {
+ NewMask[I + Offset] = Mask[I] + Offset;
+ NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
+ }
+ }
+ reorderScalars(UserTE->Scalars, NewMask);
+ reorderOrder(UserTE->ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
+ if (isIdentityOrder(UserTE->ReorderIndices))
+ UserTE->ReorderIndices.clear();
+ };
// Reorder the graph nodes according to their vectorization factor.
for (unsigned VF = VectorizableTree.front()->getVectorFactor();
!VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
@@ -6007,7 +6081,8 @@ void BoUpSLP::reorderTopToBottom() {
for (const TreeEntry *OpTE : OrderedEntries) {
// No need to reorder this nodes, still need to extend and to use shuffle,
// just need to merge reordering shuffle and the reuse shuffle.
- if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
+ if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
+ OpTE->State != TreeEntry::SplitVectorize)
continue;
// Count number of orders uses.
const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
@@ -6114,6 +6189,8 @@ void BoUpSLP::reorderTopToBottom() {
// Just do the reordering for the nodes with the given VF.
if (TE->Scalars.size() != VF) {
if (TE->ReuseShuffleIndices.size() == VF) {
+ assert(TE->State != TreeEntry::SplitVectorize &&
+ "Split vectorized not expected.");
// Need to reorder the reuses masks of the operands with smaller VF to
// be able to find the match between the graph nodes and scalar
// operands of the given node during vectorization/cost estimation.
@@ -6121,7 +6198,8 @@ void BoUpSLP::reorderTopToBottom() {
[VF, &TE](const EdgeInfo &EI) {
return EI.UserTE->Scalars.size() == VF ||
EI.UserTE->Scalars.size() ==
- TE->Scalars.size();
+ TE->Scalars.size() ||
+ EI.UserTE->State == TreeEntry::SplitVectorize;
}) &&
"All users must be of VF size.");
if (SLPReVec) {
@@ -6144,19 +6222,29 @@ void BoUpSLP::reorderTopToBottom() {
// Update ordering of the operands with the smaller VF than the given
// one.
reorderNodeWithReuses(*TE, Mask);
+ // Update orders in user split vectorize nodes.
+ for (EdgeInfo &EI : TE->UserTreeIndices) {
+ if (EI.UserTE->State != TreeEntry::SplitVectorize)
+ continue;
+ UpdateSplitUserNode(EI.UserTE, EI.EdgeIdx, Mask, MaskOrder);
+ }
}
continue;
}
- if ((TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::StridedVectorize) &&
- (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
- InsertElementInst>(TE->getMainOp()) ||
- (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
- assert(!TE->isAltShuffle() &&
- "Alternate instructions are only supported by BinaryOperator "
- "and CastInst.");
- // Build correct orders for extract{element,value}, loads and
- // stores.
+ if ((TE->State == TreeEntry::SplitVectorize &&
+ TE->ReuseShuffleIndices.empty()) ||
+ ((TE->State == TreeEntry::Vectorize ||
+ TE->State == TreeEntry::StridedVectorize) &&
+ (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
+ InsertElementInst>(TE->getMainOp()) ||
+ (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
+ assert(
+ (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
+ TE->ReuseShuffleIndices.empty())) &&
+ "Alternate instructions are only supported by BinaryOperator "
+ "and CastInst.");
+ // Build correct orders for extract{element,value}, loads,
+ // stores and alternate (split) nodes.
reorderOrder(TE->ReorderIndices, Mask);
if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
TE->reorderOperands(Mask);
@@ -6177,6 +6265,12 @@ void BoUpSLP::reorderTopToBottom() {
addMask(NewReuses, TE->ReuseShuffleIndices);
TE->ReuseShuffleIndices.swap(NewReuses);
}
+ // Update orders in user split vectorize nodes.
+ for (EdgeInfo &EI : TE->UserTreeIndices) {
+ if (EI.UserTE->State != TreeEntry::SplitVectorize)
+ continue;
+ UpdateSplitUserNode(EI.UserTE, EI.EdgeIdx, Mask, MaskOrder);
+ }
}
}
}
@@ -6189,7 +6283,8 @@ bool BoUpSLP::canReorderOperands(
if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
return OpData.first == I &&
(OpData.second->State == TreeEntry::Vectorize ||
- OpData.second->State == TreeEntry::StridedVectorize);
+ OpData.second->State == TreeEntry::StridedVectorize ||
+ OpData.second->State == TreeEntry::SplitVectorize);
}))
continue;
if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
@@ -6207,6 +6302,7 @@ bool BoUpSLP::canReorderOperands(
// node, just reorder reuses mask.
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::SplitVectorize &&
TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
GatherOps.push_back(TE);
continue;
@@ -6216,6 +6312,7 @@ bool BoUpSLP::canReorderOperands(
[&Gather, UserTE, I](TreeEntry *TE) {
assert(TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::SplitVectorize &&
"Only non-vectorized nodes are expected.");
if (any_of(TE->UserTreeIndices,
[UserTE, I](const EdgeInfo &EI) {
@@ -6245,13 +6342,15 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
SmallVector<TreeEntry *> NonVectorized;
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
if (TE->State != TreeEntry::Vectorize &&
- TE->State != TreeEntry::StridedVectorize)
+ TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::SplitVectorize)
NonVectorized.push_back(TE.get());
if (std::optional<OrdersType> CurrentOrder =
getReorderingData(*TE, /*TopToBottom=*/false)) {
OrderedEntries.insert(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::StridedVectorize) ||
+ TE->State == TreeEntry::StridedVectorize ||
+ TE->State == TreeEntry::SplitVectorize) ||
!TE->ReuseShuffleIndices.empty())
GathersToOrders.insert(TE.get());
}
@@ -6270,6 +6369,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
for (TreeEntry *TE : OrderedEntries) {
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize ||
+ TE->State == TreeEntry::SplitVectorize ||
(TE->isGather() && GathersToOrders.contains(TE))) ||
TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
!all_of(drop_begin(TE->UserTreeIndices),
@@ -6295,6 +6395,51 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
return Data1.first->Idx > Data2.first->Idx;
});
for (auto &Data : UsersVec) {
+ if (Data.first->State == TreeEntry::SplitVectorize) {
+ assert(
+ Data.second.size() <= 2 &&
+ "Expected not greater than 2 operands for split vectorize node.");
+ if (any_of(Data.second, [](const auto &Op) {
+ return Op.second->UserTreeIndices.size() != 1;
+ }))
+ continue;
+ // Update orders in user split vectorize nodes.
+ for (const auto &P : Data.first->CombinedEntriesWithIndices) {
+ TreeEntry &OpTE = *VectorizableTree[P.first].get();
+ if (OpTE.isGather() || OpTE.ReorderIndices.empty())
+ continue;
+ SmallVector<int> Mask;
+ inversePermutation(OpTE.ReorderIndices, Mask);
+ SmallVector<int> MaskOrder(OpTE.ReorderIndices.size(),
+ PoisonMaskElem);
+ unsigned E = OpTE.ReorderIndices.size();
+ transform(OpTE.ReorderIndices, MaskOrder.begin(), [E](unsigned I) {
+ return I < E ? static_cast<int>(I) : PoisonMaskElem;
+ });
+ SmallVector<int> NewMask(Data.first->getVectorFactor());
+ SmallVector<int> NewMaskOrder(Data.first->getVectorFactor());
+ std::iota(NewMask.begin(), NewMask.end(), 0);
+ std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
+ if (P.second == 0) {
+ copy(Mask, NewMask.begin());
+ copy(MaskOrder, NewMaskOrder.begin());
+ } else {
+ unsigned Offset = P.second;
+ for (unsigned I : seq<unsigned>(Mask.size())) {
+ NewMask[I + Offset] = Mask[I] + Offset;
+ NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
+ }
+ }
+ reorderScalars(Data.first->Scalars, NewMask);
+ reorderOrder(Data.first->ReorderIndices, NewMaskOrder,
+ /*BottomOrder=*/true);
+ if (isIdentityOrder(Data.first->ReorderIndices))
+ Data.first->ReorderIndices.clear();
+ // Clear ordering of the operand.
+ OpTE.ReorderIndices.clear();
+ }
+ continue;
+ }
// Check that operands are used only in the User node.
SmallVector<TreeEntry *> GatherOps;
if (!canReorderOperands(Data.first, Data.second, NonVectorized,
@@ -6451,6 +6596,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// Gathers are processed separately.
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::SplitVectorize &&
(TE->State != TreeEntry::ScatterVectorize ||
TE->ReorderIndices.empty()))
continue;
@@ -6521,7 +6667,7 @@ void BoUpSLP::buildExternalUses(
TreeEntry *Entry = TEPtr.get();
// No need to handle users of gathered values.
- if (Entry->isGather())
+ if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
continue;
// For each lane:
@@ -8227,6 +8373,142 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
+ // Tries to build split node.
+ auto TrySplitNode = [&, &TTI = *TTI](unsigned SmallNodeSize,
+ const InstructionsState &LocalState) {
+ if (VL.size() <= SmallNodeSize)
+ return false;
+
+ // Any value is used in ...
[truncated]
You can test this locally with the following command:

git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 466217eb0334d467ec8e9b96c52ee988aaadc389 238d71529221832bf99d72983c95b29c84c18a02 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll llvm/test/Transforms/SLPVectorizer/addsub.ll llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll

The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. In tests, avoid using undef.

For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information.
✅ With the latest revision this PR passed the C/C++ code formatter.
Created using spr 1.3.5
Undefs in tests are generated by the constant folder in InstructionBuilder.
I'm not clear whether this patch makes it easier or harder to remove the main-alt opcode kludge in the future and allow general splitting - copyable elements, more than 2 subnodes, non-matching instruction types (e.g. binop and intrinsic).
I guess I'm asking - is there a larger plan for this or is it focused on improving existing main-alt opcode performance?
Currently, this is mostly all we can do about main/alternate vectorization.
Also, as the first step to improve alt/main vectorization (and copyable elements), one of our team members is working on #112181. It has some issues with compile time; we are still investigating how we can improve it.
Just want to clarify two points here.
Looks like the current version causes some crashes on AArch64:
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "arm64-apple-macosx13.0.0"
define i32 @test(ptr %c) {
entry:
%bitlen = getelementptr i8, ptr %c, i64 136
%0 = load i64, ptr %bitlen, align 8
%incdec.ptr.4 = getelementptr i8, ptr %c, i64 122
%shr45.4 = lshr i64 %0, 0
%conv43.5 = trunc i64 %shr45.4 to i8
%incdec.ptr.5 = getelementptr i8, ptr %c, i64 121
store i8 %conv43.5, ptr %incdec.ptr.4, align 1
%shr45.5 = lshr i64 %0, 0
%conv43.6 = trunc i64 %shr45.5 to i8
%incdec.ptr.6 = getelementptr i8, ptr %c, i64 120
store i8 %conv43.6, ptr %incdec.ptr.5, align 1
%conv43.7 = trunc i64 %0 to i8
%incdec.ptr.7 = getelementptr i8, ptr %c, i64 119
store i8 %conv43.7, ptr %incdec.ptr.6, align 1
%arrayidx38.1 = getelementptr i8, ptr %c, i64 144
%1 = load i64, ptr %arrayidx38.1, align 8
%conv43.145 = trunc i64 %1 to i8
%incdec.ptr.146 = getelementptr i8, ptr %c, i64 118
store i8 %conv43.145, ptr %incdec.ptr.7, align 1
%shr45.147 = lshr i64 %1, 0
%conv43.1.1 = trunc i64 %shr45.147 to i8
%incdec.ptr.1.1 = getelementptr i8, ptr %c, i64 117
store i8 %conv43.1.1, ptr %incdec.ptr.146, align 1
%shr45.1.1 = lshr i64 %1, 0
%conv43.2.1 = trunc i64 %shr45.1.1 to i8
%incdec.ptr.2.1 = getelementptr i8, ptr %c, i64 116
store i8 %conv43.2.1, ptr %incdec.ptr.1.1, align 1
%shr45.2.1 = lshr i64 %1, 0
%conv43.3.1 = trunc i64 %shr45.2.1 to i8
%incdec.ptr.3.1 = getelementptr i8, ptr %c, i64 115
store i8 %conv43.3.1, ptr %incdec.ptr.2.1, align 1
%shr45.3.1 = lshr i64 %1, 0
%conv43.4.1 = trunc i64 %shr45.3.1 to i8
store i8 %conv43.4.1, ptr %incdec.ptr.3.1, align 1
ret i32 0
}
Assertion failed: ((!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>(VTy)->getNumElements()) && "SK_InsertSubvector index out of range"), function getInsertSubvectorOverhead, file BasicTTIImpl.h, line 166.
Created using spr 1.3.5
Ping!
rebase after #124914?
LGTM with a few minors
Created using spr 1.3.5
Patch tries to remove wide alternate operations. Currently SLP vectorizer emits something like this: ``` %0 = add i32 %1 = sub i32 %2 = add i32 %3 = sub i32 %4 = add i32 %5 = sub i32 %6 = add i32 %7 = sub i32 transformes to %v1 = add <8 x i32> %v2 = sub <8 x i32> %res = shuffle %v1, %v2, <0, 9, 2, 11, 4, 13, 6, 15> ``` i.e. half of the results are just unused. This leads to increased register pressure and potentially doubles number of operations. Patch introduces SplitVectorize mode, where it splits the operations by opcodes and produces instead something like this: ``` %v1 = add <4 x i32> %v2 = sub <4 x i32> %res = shuffle %v1, %v2, <0, 4, 1, 5, 2, 6, 3, 7> ``` It allows to improve the performance by reducing number of ops. Also, it turns on some other improvements, like improved graph reordering. -O3+LTO, AVX512 Metric: size..text Program size..text results results0 diff test-suite :: MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/timberwolfmc.test 277800.00 280536.00 1.0% test-suite :: MultiSource/Benchmarks/FreeBench/pifft/pifft.test 81802.00 82426.00 0.8% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 790552.00 790952.00 0.1% test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test 383795.00 383987.00 0.1% test-suite :: External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 2075541.00 2076501.00 0.0% test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 2075541.00 2076501.00 0.0% test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 312702.00 312766.00 0.0% test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12569783.00 12569751.00 -0.0% test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 2049374.00 2049358.00 -0.0% test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test 1091836.00 1091772.00 -0.0% test-suite :: MultiSource/Applications/JM/lencod/lencod.test 852339.00 852211.00 -0.0% test-suite :: MultiSource/Applications/oggenc/oggenc.test 190651.00 190523.00 -0.1% test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.test 44203.00 44155.00 -0.1% test-suite :: SingleSource/UnitTests/Vector/AVX512BWVL/Vector-AVX512BWVL-mask_set_bw.test 12997.00 12981.00 -0.1% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 668971.00 658427.00 -1.6% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 668971.00 658427.00 -1.6% Prolangs-C/TimberWolfMC/timberwolfmc - small variations, some code not inlined FreeBench/pifft - extra stores <8 x double> vectorized, some other extra vectorizations CINT2006/464.h264ref - some smaller code + changes similar to x264 JM/ldecod - changes similar x264 CINT2017speed/600.perlbench_s CINT2017rate/500.perlbench_r - significantly compact vector code Benchmarks/Bullet - small variations CFP2017rate/526.blender_r - small variations CFP2017rate/510.parest_r - small variations CINT2006/400.perlbench - extra vector code JM/lencod - extra store <16 x i32> and other changes similar x264 Applications/oggenc - extra store <16 x i8>, small variations DOE-ProxyApps-C/miniGMG - small variations Vector/AVX512BWVL/Vector-AVX512BWVL-mask_set_bw - better vector code CINT2017speed/625.x264_s CINT2017rate/525.x264_r - the number of instructions increased, but looks like they are more performant. E.g., for function x264_pixel_satd_8x8, llvm-mca reports better throughput - 84 for the current version and 59 for the new version. 
-O3+LTO, march=rva32u64 CINT2017rate/525.x264_r - similar to x86, extra code in pixel_hadamard_ac function vectorized, idct4x4dc stopped being vectorized (looks like issue with shuffles cost) CINT2006/400.perlbench - better vector code CINT2006/445.gobmk - some variations in vector code CINT2006/464.h264ref - extra code vectorized CINT2017rate/500.perlbench_r - small variations -O3+LTO, mcpu=sifive-p470 Metric: size..text Program size..text results results0 diff test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 587336.00 587668.00 0.1% test-suite :: MultiSource/Applications/JM/lencod/lencod.test 643308.00 643614.00 0.0% test-suite :: MultiSource/Applications/d/make_dparser.test 79678.00 79710.00 0.0% test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 277322.00 277420.00 0.0% test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test 933660.00 933682.00 0.0% test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 9497722.00 9497682.00 -0.0% test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 1767806.00 1767772.00 -0.0% test-suite :: External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 1767806.00 1767772.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test 148038.00 148024.00 -0.0% test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test 283036.00 283008.00 -0.0% test-suite :: MultiSource/Benchmarks/mediabench/g721/g721encode/encode.test 4776.00 4772.00 -0.1% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 540582.00 511772.00 -5.3% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 540582.00 511772.00 -5.3% CINT2006/464.h264ref - extra vector code in find_sad_16x16 JM/lencod - extra vector code in find_sad_16x16 d/make_dparser - smaller vector code Benchmarks/Bullet - small variations CINT2006/400.perlbench - smaller vector code CFP2017rate/526.blender_r - small variations, extra store <8 x float> in the loop, extra store <8 x i8> in loop CINT2017rate/500.perlbench_r CINT2017speed/600.perlbench_s - small variations MiBench/consumer-lame - small variations JM/ldecod - extra vector code mediabench/g721/g721encode - small variations CINT2017rate/525.x264_r CINT2017speed/625.x264_s - reduced number of wide operations and shuffles, saving the registers, similar to X86, extra code in pixel_hadamard_ac vectorized, idct4x4dc not vectorized (issue with some TTI costs) Reviewers: RKSimon, hiraditya Reviewed By: RKSimon Pull Request: llvm/llvm-project#123360
Previous version was reviewed here llvm/llvm-project#123360 It is mostly the same, adjusted after graph-to-tree transformation Patch tries to remove wide alternate operations. Currently SLP vectorizer emits something like this: ``` %0 = add i32 %1 = sub i32 %2 = add i32 %3 = sub i32 %4 = add i32 %5 = sub i32 %6 = add i32 %7 = sub i32 transformes to %v1 = add <8 x i32> %v2 = sub <8 x i32> %res = shuffle %v1, %v2, <0, 9, 2, 11, 4, 13, 6, 15> ``` i.e. half of the results are just unused. This leads to increased register pressure and potentially doubles number of operations. Patch introduces SplitVectorize mode, where it splits the operations by opcodes and produces instead something like this: ``` %v1 = add <4 x i32> %v2 = sub <4 x i32> %res = shuffle %v1, %v2, <0, 4, 1, 5, 2, 6, 3, 7> ``` It allows to improve the performance by reducing number of ops. Also, it turns on some other improvements, like improved graph reordering. -O3+LTO, AVX512 Metric: size..text Program size..text results results0 diff test-suite :: MultiSource/Benchmarks/Olden/tsp/tsp.test 2788.00 2820.00 1.1% test-suite :: MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/timberwolfmc.test 278168.00 280904.00 1.0% test-suite :: MultiSource/Benchmarks/FreeBench/pifft/pifft.test 82682.00 83258.00 0.7% test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 139344.00 139712.00 0.3% test-suite :: MultiSource/Benchmarks/FreeBench/fourinarow/fourinarow.test 27149.00 27197.00 0.2% test-suite :: MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test 1008188.00 1009948.00 0.2% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 39226.00 39290.00 0.2% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 39229.00 39293.00 0.2% test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 2074533.00 2076549.00 0.1% test-suite :: External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 2074533.00 2076549.00 0.1% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 798440.00 798952.00 0.1% test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.test 44123.00 44139.00 0.0% test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 318942.00 319038.00 0.0% test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test 1159880.00 1160152.00 0.0% test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C/miniAMR/miniAMR.test 73595.00 73611.00 0.0% test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test 1146124.00 1146348.00 0.0% test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CLAMR.test 203831.00 203847.00 0.0% test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test 207662.00 207678.00 0.0% test-suite :: External/SPEC/CFP2006/447.dealII/447.dealII.test 589851.00 589883.00 0.0% test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1398543.00 1398559.00 0.0% test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1398543.00 1398559.00 0.0% test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 2050990.00 2051006.00 0.0% test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12559687.00 12559591.00 -0.0% test-suite :: External/SPEC/CINT2006/403.gcc/403.gcc.test 3074157.00 3074125.00 -0.0% test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test 1092252.00 1092188.00 -0.0% test-suite :: External/SPEC/CFP2017rate/508.namd_r/508.namd_r.test 779763.00 779715.00 -0.0% test-suite :: MultiSource/Benchmarks/ASCI_Purple/SMG2000/smg2000.test 
253517.00 253485.00 -0.0%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test 848259.00 848035.00 -0.0%
test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test 93064.00 93016.00 -0.1%
test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test 383747.00 383475.00 -0.1%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 673051.00 662907.00 -1.5%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 673051.00 662907.00 -1.5%

Olden/tsp - small variations
Prolangs-C/TimberWolfMC - small variations, some code not inlined
FreeBench/pifft - extra store <8 x double> vectorized, some other extra vectorizations
CFP2006/433.milc - better vector code
FreeBench/fourinarow - better vector code
Benchmarks/tramp3d-v4 - extra vector code, small variations
mediabench/gsm/toast - small variations
MiBench/telecomm-gsm - small variations
CINT2017rate/500.perlbench_r
CINT2017speed/600.perlbench_s - better vector code, small variations
CINT2006/464.h264ref - some smaller code + changes similar to x264
DOE-ProxyApps-C/miniGMG - small variations
Benchmarks/Bullet - small variations
CFP2017rate/511.povray_r - small variations
DOE-ProxyApps-C/miniAMR - small variations
CFP2006/453.povray - small variations
DOE-ProxyApps-C++/CLAMR - small variations
MiBench/consumer-lame - small variations
CFP2006/447.dealII - small variations
CFP2017rate/538.imagick_r
CFP2017speed/638.imagick_s - small variations
CFP2017rate/510.parest_r - better vector code, small variations
CFP2017rate/526.blender_r - small variations
CINT2006/403.gcc - small variations
CINT2006/400.perlbench - small variations
CFP2017rate/508.namd_r - small variations
ASCI_Purple/SMG2000 - small variations
JM/lencod - extra store <16 x i32>, small variations
DOE-ProxyApps-C++/miniFE - small variations
JM/ldecod - extra vector code, small variations, less shuffles
CINT2017speed/625.x264_s
CINT2017rate/525.x264_r - the number of instructions increased, but
looks like they are more performant. E.g., for function
x264_pixel_satd_8x8, llvm-mca reports better throughput - 84 for the
current version and 59 for the new version.

-O3+LTO, mcpu=sifive-p470
Metric: size..text

results results0 diff
test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 580768.00 581118.00 0.1%
test-suite :: MultiSource/Applications/d/make_dparser.test 78854.00 78894.00 0.1%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test 633448.00 633750.00 0.0%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 277002.00 277080.00 0.0%
test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test 931938.00 931960.00 0.0%
test-suite :: External/SPEC/CINT2006/403.gcc/403.gcc.test 2512806.00 2512822.00 0.0%
test-suite :: External/SPEC/CINT2017speed/602.gcc_s/602.gcc_s.test 7659880.00 7659876.00 -0.0%
test-suite :: External/SPEC/CINT2017rate/502.gcc_r/502.gcc_r.test 7659880.00 7659876.00 -0.0%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 1602448.00 1602434.00 -0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 9496664.00 9496542.00 -0.0%
test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test 147424.00 147422.00 -0.0%
test-suite :: External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 1764608.00 1764578.00 -0.0%
test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 1764608.00 1764578.00 -0.0%
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 841656.00 841632.00 -0.0%
test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test 949026.00 948962.00 -0.0%
test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test 946348.00 946284.00 -0.0%
test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test 279794.00 279764.00 -0.0%
test-suite :: MultiSource/Benchmarks/mediabench/g721/g721encode/encode.test 4776.00 4772.00 -0.1%
test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 25074.00 25028.00 -0.2%
test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 25074.00 25028.00 -0.2%
test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.test 29336.00 29184.00 -0.5%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 535390.00 510124.00 -4.7%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 535390.00 510124.00 -4.7%
test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/ieee/GCC-C-execute-ieee-pr50310.test 886.00 608.00 -31.4%

CINT2006/464.h264ref - extra v16i32 reduction
d/make_dparser - better vector code
JM/lencod - extra v16i32 reduction
Benchmarks/Bullet - smaller vector code
CINT2006/400.perlbench - better vector code
CINT2006/403.gcc - small variations
CINT2017speed/602.gcc_s
CINT2017rate/502.gcc_r - small variations
CFP2017rate/510.parest_r - small variations
CFP2017rate/526.blender_r - small variations
MiBench/consumer-lame - small variations
CINT2017speed/600.perlbench_s
CINT2017rate/500.perlbench_r - small variations
Benchmarks/7zip - small variations
CFP2017rate/511.povray_r - small variations
JM/ldecod - extra vector code
mediabench/g721/g721encode - extra vector code
mediabench/gsm - extra vector code
MiBench/telecomm-gsm - extra vector code
DOE-ProxyApps-C/miniGMG - extra vector code
CINT2017rate/525.x264_r
CINT2017speed/625.x264_s - reduced number of wide operations and
shuffles, saving the registers, similar to X86, extra code in
pixel_hadamard_ac vectorized
ieee/GCC-C-execute-ieee-pr50310 - extra code vectorized
CINT2006/464.h264ref - extra vector code in find_sad_16x16
JM/lencod - extra vector code in find_sad_16x16
d/make_dparser - smaller vector code
Benchmarks/Bullet - small variations
CINT2006/400.perlbench - smaller vector code
CFP2017rate/526.blender_r - small variations, extra store <8 x float> in
the loop, extra store <8 x i8> in loop
CINT2017rate/500.perlbench_r
CINT2017speed/600.perlbench_s - small variations
MiBench/consumer-lame - small variations
JM/ldecod - extra vector code
mediabench/g721/g721encode - small variations

Reviewers: hiraditya

Reviewed By: hiraditya

Pull Request: llvm/llvm-project#128907
The re-land (9d37e61) caused failures. I'll prepare a revert.
This caused failures such as:

Instruction does not dominate all uses!
%29 = insertelement <8 x i64> %28, i64 %xor6.i.5, i64 6
%17 = shufflevector <8 x i64> %29, <8 x i64> poison, <6 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>

see comment on #123360

> Previous version was reviewed here #123360
> It is mostly the same, adjusted after graph-to-tree transformation
> [...]

This reverts commit 9d37e61 as well as the follow-up commit 72bb0a9.
… possible" This caused failures such as: Instruction does not dominate all uses! %29 = insertelement <8 x i64> %28, i64 %xor6.i.5, i64 6 %17 = shufflevector <8 x i64> %29, <8 x i64> poison, <6 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> see comment on llvm/llvm-project#123360 > Previous version was reviewed here llvm/llvm-project#123360 > It is mostly the same, adjusted after graph-to-tree transformation > > Patch tries to remove wide alternate operations. > Currently SLP vectorizer emits something like this: > ``` > %0 = add i32 > %1 = sub i32 > %2 = add i32 > %3 = sub i32 > %4 = add i32 > %5 = sub i32 > %6 = add i32 > %7 = sub i32 > > transformes to > > %v1 = add <8 x i32> > %v2 = sub <8 x i32> > %res = shuffle %v1, %v2, <0, 9, 2, 11, 4, 13, 6, 15> > ``` > i.e. half of the results are just unused. This leads to increased > register pressure and potentially doubles number of operations. > > Patch introduces SplitVectorize mode, where it splits the operations by > opcodes and produces instead something like this: > ``` > %v1 = add <4 x i32> > %v2 = sub <4 x i32> > %res = shuffle %v1, %v2, <0, 4, 1, 5, 2, 6, 3, 7> > ``` > It allows to improve the performance by reducing number of ops. Also, it > turns on some other improvements, like improved graph reordering. > > [...] This reverts commit 9d37e61 as well as the follow-up commit 72bb0a9.
The latest re-land is causing assertion failures. Reproducer:
I'll revert back to green.
This caused assertion failures:

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:16237: Value *llvm::slpvectorizer::BoUpSLP::vectorizeTree(TreeEntry *): Assertion `OpTE1.isSame(ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) && "Expected same first part of scalars."' failed.

See comment on the PR.

> Previous version was reviewed here #123360
> It is mostly the same, adjusted after graph-to-tree transformation

This reverts commit 7de895f.
… possible" This caused assertion failures: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:16237: Value *llvm::slpvectorizer::BoUpSLP::vectorizeTree(TreeEntry *): Assertion `OpTE1.isSame( ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) && "Expected same first part of scalars."' failed. See comment on the PR. > Previous version was reviewed here llvm/llvm-project#123360 > It is mostly the same, adjusted after graph-to-tree transformation This reverts commit 7de895f.
Hi @zmodem ,
Patch tries to remove wide alternate operations.
Currently SLP vectorizer emits something like this:
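```
%0 = add i32
%1 = sub i32
%2 = add i32
%3 = sub i32
%4 = add i32
%5 = sub i32
%6 = add i32
%7 = sub i32

transforms to

%v1 = add <8 x i32>
%v2 = sub <8 x i32>
%res = shuffle %v1, %v2, <0, 9, 2, 11, 4, 13, 6, 15>
```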
i.e. half of the results are just unused. This leads to increased
register pressure and potentially doubles the number of operations.
Patch introduces SplitVectorize mode, where it splits the operations by
opcodes and produces instead something like this:
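```
%v1 = add <4 x i32>
%v2 = sub <4 x i32>
%res = shuffle %v1, %v2, <0, 4, 1, 5, 2, 6, 3, 7>
```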
This improves performance by reducing the number of ops. It also
enables other improvements, such as better graph reordering.
-O3+LTO, AVX512
Metric: size..text
Program size..text
results results0 diff
test-suite :: MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/timberwolfmc.test 277800.00 280536.00 1.0%
test-suite :: MultiSource/Benchmarks/FreeBench/pifft/pifft.test 81802.00 82426.00 0.8%
test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 790552.00 790952.00 0.1%
test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test 383795.00 383987.00 0.1%
test-suite :: External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 2075541.00 2076501.00 0.0%
test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 2075541.00 2076501.00 0.0%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 312702.00 312766.00 0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12569783.00 12569751.00 -0.0%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 2049374.00 2049358.00 -0.0%
test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test 1091836.00 1091772.00 -0.0%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test 852339.00 852211.00 -0.0%
test-suite :: MultiSource/Applications/oggenc/oggenc.test 190651.00 190523.00 -0.1%
test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.test 44203.00 44155.00 -0.1%
test-suite :: SingleSource/UnitTests/Vector/AVX512BWVL/Vector-AVX512BWVL-mask_set_bw.test 12997.00 12981.00 -0.1%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 668971.00 658427.00 -1.6%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 668971.00 658427.00 -1.6%
Prolangs-C/TimberWolfMC/timberwolfmc - small variations, some code not
inlined
FreeBench/pifft - extra stores <8 x double> vectorized, some other extra
vectorizations
CINT2006/464.h264ref - some smaller code + changes similar to x264
JM/ldecod - changes similar to x264
CINT2017speed/600.perlbench_s
CINT2017rate/500.perlbench_r - significantly more compact vector code
Benchmarks/Bullet - small variations
CFP2017rate/526.blender_r - small variations
CFP2017rate/510.parest_r - small variations
CINT2006/400.perlbench - extra vector code
JM/lencod - extra store <16 x i32> and other changes similar to x264
Applications/oggenc - extra store <16 x i8>, small variations
DOE-ProxyApps-C/miniGMG - small variations
Vector/AVX512BWVL/Vector-AVX512BWVL-mask_set_bw - better vector code
CINT2017speed/625.x264_s
CINT2017rate/525.x264_r - the number of instructions increased, but
they look more performant. E.g., for the function x264_pixel_satd_8x8,
llvm-mca reports better throughput - 84 for the current version and 59
for the new version (see the sketch below).
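For reference, a minimal sketch of how such an llvm-mca comparison can be reproduced; the compiler binary names, source file, and -mcpu choice below are illustrative assumptions, not the exact setup behind the numbers above.

```
# Compile the same source with the baseline compiler and with the patched
# compiler, then let llvm-mca estimate throughput for the emitted assembly.
# Compare Total Cycles / IPC / resource pressure between the two reports.
clang-baseline -O3 -march=x86-64-v4 -S pixel.c -o pixel_old.s   # placeholder names
clang-patched  -O3 -march=x86-64-v4 -S pixel.c -o pixel_new.s
llvm-mca -mcpu=skylake-avx512 pixel_old.s > mca_old.txt
llvm-mca -mcpu=skylake-avx512 pixel_new.s > mca_new.txt
diff mca_old.txt mca_new.txt
```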
-O3+LTO, march=rva32u64
CINT2017rate/525.x264_r - similar to x86, extra code in pixel_hadamard_ac
function vectorized, idct4x4dc stopped being vectorized (looks like an
issue with shuffle costs)
CINT2006/400.perlbench - better vector code
CINT2006/445.gobmk - some variations in vector code
CINT2006/464.h264ref - extra code vectorized
CINT2017rate/500.perlbench_r - small variations
-O3+LTO, mcpu=sifive-p470
Metric: size..text
Program size..text
results results0 diff
test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 587336.00 587668.00 0.1%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test 643308.00 643614.00 0.0%
test-suite :: MultiSource/Applications/d/make_dparser.test 79678.00 79710.00 0.0%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 277322.00 277420.00 0.0%
test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test 933660.00 933682.00 0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 9497722.00 9497682.00 -0.0%
test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 1767806.00 1767772.00 -0.0%
test-suite :: External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 1767806.00 1767772.00 -0.0%
test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test 148038.00 148024.00 -0.0%
test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test 283036.00 283008.00 -0.0%
test-suite :: MultiSource/Benchmarks/mediabench/g721/g721encode/encode.test 4776.00 4772.00 -0.1%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 540582.00 511772.00 -5.3%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 540582.00 511772.00 -5.3%
CINT2006/464.h264ref - extra vector code in find_sad_16x16
JM/lencod - extra vector code in find_sad_16x16
d/make_dparser - smaller vector code
Benchmarks/Bullet - small variations
CINT2006/400.perlbench - smaller vector code
CFP2017rate/526.blender_r - small variations, extra store <8 x float> in
the loop, extra store <8 x i8> in the loop
CINT2017rate/500.perlbench_r
CINT2017speed/600.perlbench_s - small variations
MiBench/consumer-lame - small variations
JM/ldecod - extra vector code
mediabench/g721/g721encode - small variations
CINT2017rate/525.x264_r
CINT2017speed/625.x264_s - reduced number of wide operations and
shuffles, saving registers, similar to X86; extra code in
pixel_hadamard_ac vectorized, idct4x4dc not vectorized (an issue with
some TTI costs)