Skip to content

[llvm][ARM]Add widen global arrays pass #107120

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Oct 17, 2024
11 changes: 11 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1819,6 +1819,10 @@ class TargetTransformInfo {
/// \return The maximum number of function arguments the target supports.
unsigned getMaxNumArgs() const;

/// \return The number of padding bytes to append to a global array that is
/// \p Size bytes long and has type \p ArrayType so that it ends on a boundary
/// chosen by the target. The default implementation returns 0 (no padding).
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const;

/// @}

private:
Expand Down Expand Up @@ -2225,6 +2229,8 @@ class TargetTransformInfo::Concept {
getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
virtual bool hasArmWideBranch(bool Thumb) const = 0;
virtual unsigned getMaxNumArgs() const = 0;
// Pure-virtual hook for the global-array padding query; takes the array's
// size in bytes and its type and yields the number of bytes to pad.
virtual unsigned getNumBytesToPadGlobalArray(unsigned Size,
Type *ArrayType) const = 0;
};

template <typename T>
Expand Down Expand Up @@ -3026,6 +3032,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
// Forwards to the wrapped implementation's getMaxNumArgs().
unsigned getMaxNumArgs() const override {
return Impl.getMaxNumArgs();
}

// Forwards the padding query, with both arguments unchanged, to the wrapped
// implementation object.
unsigned getNumBytesToPadGlobalArray(unsigned Size,
Type *ArrayType) const override {
return Impl.getNumBytesToPadGlobalArray(Size, ArrayType);
}
};

template <typename T>
Expand Down
4 changes: 4 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1006,6 +1006,10 @@ class TargetTransformInfoImplBase {

// Default implementation: reports UINT_MAX as the maximum argument count.
unsigned getMaxNumArgs() const { return UINT_MAX; }

// Default implementation: always reports 0 bytes of padding, so global arrays
// are left untouched unless a target provides its own implementation. Both
// parameters are ignored here.
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const {
return 0;
}

protected:
// Obtain the minimum required size to hold the value (without the sign)
// In case of a vector it returns the min required size for one element.
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1383,6 +1383,12 @@ bool TargetTransformInfo::isVectorShiftByScalarCheap(Type *Ty) const {
return TTIImpl->isVectorShiftByScalarCheap(Ty);
}

// Public entry point for the global-array padding query; delegates to the
// implementation object held in TTIImpl without altering the arguments.
unsigned
TargetTransformInfo::getNumBytesToPadGlobalArray(unsigned Size,
Type *ArrayType) const {
return TTIImpl->getNumBytesToPadGlobalArray(Size, ArrayType);
}

TargetTransformInfo::Concept::~Concept() = default;

TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
Expand Down
33 changes: 33 additions & 0 deletions llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ static cl::opt<bool>
AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
cl::desc("Enable the generation of WLS loops"));

// Hidden command-line switch, enabled by default, that gates
// ARMTTIImpl::getNumBytesToPadGlobalArray. Note that the flag is spelled
// "widen-global-strings" even though the variable refers to arrays.
static cl::opt<bool> UseWidenGlobalArrays(
"widen-global-strings", cl::Hidden, cl::init(true),
cl::desc("Enable the widening of global strings to alignment boundaries"));

extern cl::opt<TailPredication::Mode> EnableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;
Expand Down Expand Up @@ -2805,3 +2809,32 @@ bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I,
}
return true;
}

/// Computes how many bytes should be appended to a global integer array so
/// that its size becomes a multiple of 4 bytes.
///
/// \param Size      Current size of the array in bytes.
/// \param ArrayType Type of the array; only array types whose element type is
///                  an integer type are considered for padding.
/// \return The number of padding bytes (1 to 3), or 0 when padding is disabled
///         via -widen-global-strings, \p ArrayType is null or not an integer
///         array, \p Size is already a multiple of 4, or the padded size would
///         exceed the inline memcpy threshold.
unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size,
                                                 Type *ArrayType) const {
  if (!UseWidenGlobalArrays) {
    LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
    // The return type is a byte count, so report "no padding" as 0 rather
    // than relying on an implicit bool-to-unsigned conversion.
    return 0;
  }

  // Don't modify non-integer array types.
  if (!ArrayType || !ArrayType->isArrayTy() ||
      !ArrayType->getArrayElementType()->isIntegerTy())
    return 0;

  // We pad to 4 byte boundaries; nothing to do if already aligned.
  const unsigned Remainder = Size % 4;
  if (Remainder == 0)
    return 0;

  const unsigned NumBytesToPad = 4 - Remainder;
  const unsigned NewSize = Size + NumBytesToPad;

  // Max number of bytes that memcpy allows for lowering to load/stores before
  // it uses library function (__aeabi_memcpy). Padding past this threshold
  // would not help, so leave such arrays alone.
  const unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();
  if (NewSize > MaxMemIntrinsicSize)
    return 0;

  return NumBytesToPad;
}
3 changes: 3 additions & 0 deletions llvm/lib/Target/ARM/ARMTargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,9 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {

bool isProfitableToSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const;

/// \return The number of bytes to append to a global array of \p Size bytes
/// and type \p ArrayType, or 0 if the array should not be padded.
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const;

/// @}
};

Expand Down
165 changes: 165 additions & 0 deletions llvm/lib/Transforms/IPO/GlobalOpt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ STATISTIC(NumInternalFunc, "Number of internal functions");
STATISTIC(NumColdCC, "Number of functions marked coldcc");
STATISTIC(NumIFuncsResolved, "Number of statically resolved IFuncs");
STATISTIC(NumIFuncsDeleted, "Number of IFuncs removed");
STATISTIC(NumGlobalArraysPadded,
"Number of global arrays padded to alignment boundary");

static cl::opt<bool>
EnableColdCCStressTest("enable-coldcc-stress-test",
Expand Down Expand Up @@ -2029,6 +2031,165 @@ OptimizeFunctions(Module &M,
return Changed;
}

/// Returns true if \p CI is non-null and is a direct call to the
/// llvm.memcpy intrinsic.
static bool callInstIsMemcpy(CallInst *CI) {
  if (CI == nullptr)
    return false;

  // Indirect calls have no statically known callee and never qualify.
  const Function *Callee = CI->getCalledFunction();
  return Callee && Callee->isIntrinsic() &&
         Callee->getIntrinsicID() == Intrinsic::memcpy;
}

/// Returns true if the destination (operand 0) of the memcpy call \p CI is a
/// static alloca of array type and the volatile flag (operand 3) is a
/// constant that is not one.
static bool destArrayCanBeWidened(CallInst *CI) {
  auto *Dest = dyn_cast<AllocaInst>(CI->getArgOperand(0));
  auto *VolatileFlag = dyn_cast<ConstantInt>(CI->getArgOperand(3));

  // Both operands must have the expected form and the copy must be
  // non-volatile before the destination is inspected any further.
  const bool KnownNonVolatile = VolatileFlag && !VolatileFlag->isOne();
  if (!Dest || !KnownNonVolatile)
    return false;

  return Dest->isStaticAlloca() && Dest->getAllocatedType()->isArrayTy();
}

/// Creates a constant global that is a padded copy of \p OldVar.
///
/// The raw bytes of \p OldVar's ConstantDataArray initializer are extended
/// with \p NumBytesToPad zero bytes, and the first
/// NumBytesToCopy + NumBytesToPad bytes of that buffer become the initializer
/// of the new global, which is created in the module that owns \p F. The new
/// global copies \p OldVar's attributes and takes over its name.
///
/// \return The new global, or nullptr if \p OldVar has no initializer or its
///         initializer is not a ConstantDataArray.
static GlobalVariable *widenGlobalVariable(GlobalVariable *OldVar, Function *F,
                                           unsigned NumBytesToPad,
                                           unsigned NumBytesToCopy) {
  // Only globals initialised with a ConstantDataArray can be rebuilt here.
  auto *OldInit = OldVar->hasInitializer()
                      ? dyn_cast<ConstantDataArray>(OldVar->getInitializer())
                      : nullptr;
  if (!OldInit)
    return nullptr;

  // Copy the raw initializer bytes and append the zero padding.
  StringRef RawBytes = OldInit->getRawDataValues();
  std::vector<uint8_t> PaddedBytes(RawBytes.begin(), RawBytes.end());
  PaddedBytes.resize(PaddedBytes.size() + NumBytesToPad, 0);

  // Build the replacement initializer from the padded byte buffer.
  ArrayRef<uint8_t> NewBytes(PaddedBytes.data(),
                             NumBytesToCopy + NumBytesToPad);
  Constant *NewInit = ConstantDataArray::get(F->getContext(), NewBytes);

  Module &M = *F->getParent();
  auto *NewGV =
      new GlobalVariable(M, NewInit->getType(), /*isConstant=*/true,
                         OldVar->getLinkage(), NewInit, NewInit->getName());

  // Carry over remaining properties (e.g. unnamed_addr) and the old name.
  NewGV->copyAttributesFrom(OldVar);
  NewGV->takeName(OldVar);
  return NewGV;
}

/// Replaces the alloca that is the destination (operand 0) of the memcpy
/// \p CI with a new array alloca large enough to hold
/// NumBytesToCopy + NumBytesToPad bytes.
///
/// The new element count is the padded byte size divided by the element byte
/// size of \p SourceDataArray, rounded up. The new alloca keeps the original's
/// element type, name and alignment, and the original is erased. Does nothing
/// if the destination is not an alloca.
static void widenDestArray(CallInst *CI, const unsigned NumBytesToPad,
                           const unsigned NumBytesToCopy,
                           ConstantDataArray *SourceDataArray) {
  auto *OldAlloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
  if (!OldAlloca)
    return;

  unsigned ElemBytes = SourceDataArray->getElementByteSize();
  unsigned PaddedBytes = NumBytesToCopy + NumBytesToPad;
  unsigned NumElements = divideCeil(PaddedBytes, ElemBytes);

  // Insert the widened alloca right before the one it replaces.
  IRBuilder<> Builder(OldAlloca);
  Type *ElemTy = OldAlloca->getAllocatedType()->getArrayElementType();
  AllocaInst *Widened =
      Builder.CreateAlloca(ArrayType::get(ElemTy, NumElements));
  Widened->takeName(OldAlloca);
  Widened->setAlignment(OldAlloca->getAlign());

  // Redirect every user to the widened alloca, then drop the original.
  OldAlloca->replaceAllUsesWith(Widened);
  OldAlloca->eraseFromParent();
}

/// Pads \p SourceVar and widens the memcpy calls that copy out of it.
///
/// A padded replacement global is created first. Then every user of
/// \p SourceVar that is a memcpy with a widenable destination and with
/// \p SourceVar as its source operand gets its destination alloca widened and
/// its length operand rewritten to NumBytesToCopy + NumBytesToPad. Finally all
/// uses of \p SourceVar are redirected to the padded global.
///
/// \return true if the global was padded, false if no replacement global
///         could be built.
static bool tryWidenGlobalArrayAndDests(Function *F, GlobalVariable *SourceVar,
                                        const unsigned NumBytesToPad,
                                        const unsigned NumBytesToCopy,
                                        ConstantInt *BytesToCopyOp,
                                        ConstantDataArray *SourceDataArray) {
  GlobalVariable *PaddedGV =
      widenGlobalVariable(SourceVar, F, NumBytesToPad, NumBytesToCopy);
  if (PaddedGV == nullptr)
    return false;

  // Update the arguments of the remaining uses that are memcpys.
  for (auto *U : SourceVar->users()) {
    auto *Memcpy = dyn_cast<CallInst>(U);
    const bool Widenable = callInstIsMemcpy(Memcpy) &&
                           destArrayCanBeWidened(Memcpy) &&
                           Memcpy->getArgOperand(1) == SourceVar;
    if (!Widenable)
      continue;

    widenDestArray(Memcpy, NumBytesToPad, NumBytesToCopy, SourceDataArray);

    // The copy now covers the padding bytes as well.
    Memcpy->setArgOperand(2, ConstantInt::get(BytesToCopyOp->getType(),
                                              NumBytesToCopy + NumBytesToPad));
  }

  SourceVar->replaceAllUsesWith(PaddedGV);

  ++NumGlobalArraysPadded;
  return true;
}

/// Tries to pad the constant global array \p GV, and the local arrays it is
/// memcpy'd into, up to the boundary requested by the target.
///
/// Only constant, local-linkage, unnamed_addr globals whose initializer is a
/// ConstantDataArray are considered. The first memcpy user that has a
/// widenable destination, a constant length, and for which the element count
/// copied equals both the destination and the source element counts is used
/// to query the target; if the target requests padding, the widening is
/// attempted and its result returned.
///
/// \param GV     Candidate global variable.
/// \param GetTTI Callback yielding the TargetTransformInfo for a function.
/// \return true if \p GV was replaced by a padded global.
static bool tryWidenGlobalArraysUsedByMemcpy(
    GlobalVariable *GV,
    function_ref<TargetTransformInfo &(Function &)> GetTTI) {

  if (!GV->hasInitializer() || !GV->isConstant() || !GV->hasLocalLinkage() ||
      !GV->hasGlobalUnnamedAddr())
    return false;

  // The initializer does not depend on the user being inspected, so check it
  // once up front instead of on every loop iteration.
  auto *SourceDataArray = dyn_cast<ConstantDataArray>(GV->getInitializer());
  if (!SourceDataArray)
    return false;

  for (auto *U : GV->users()) {
    auto *CI = dyn_cast<CallInst>(U);
    if (!callInstIsMemcpy(CI) || !destArrayCanBeWidened(CI))
      continue;

    auto *BytesToCopyOp = dyn_cast<ConstantInt>(CI->getArgOperand(2));
    if (!BytesToCopyOp)
      continue;

    unsigned NumBytesToCopy = BytesToCopyOp->getZExtValue();

    // destArrayCanBeWidened() has already established that the destination is
    // an alloca of array type.
    auto *Alloca = cast<AllocaInst>(CI->getArgOperand(0));
    uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements();
    uint64_t SZSize = SourceDataArray->getType()->getNumElements();
    unsigned ElementByteWidth = SourceDataArray->getElementByteSize();
    // Calculate the number of elements to copy while avoiding floored
    // division of integers returning wrong values i.e. copying one byte
    // from an array of i16 would yield 0 elements to copy as opposed to 1.
    unsigned NumElementsToCopy = divideCeil(NumBytesToCopy, ElementByteWidth);

    // For safety purposes lets add a constraint and only pad when
    // NumElementsToCopy == destination array size ==
    // source which is a constant
    if (NumElementsToCopy != DZSize || DZSize != SZSize)
      continue;

    // Query the target info of the function that contains the memcpy, not of
    // the llvm.memcpy intrinsic declaration being called, so that the
    // caller's per-function target attributes are taken into account.
    Function *Caller = CI->getFunction();
    unsigned NumBytesToPad = GetTTI(*Caller).getNumBytesToPadGlobalArray(
        NumBytesToCopy, SourceDataArray->getType());
    if (NumBytesToPad)
      return tryWidenGlobalArrayAndDests(Caller, GV, NumBytesToPad,
                                         NumBytesToCopy, BytesToCopyOp,
                                         SourceDataArray);
  }
  return false;
}

static bool
OptimizeGlobalVars(Module &M,
function_ref<TargetTransformInfo &(Function &)> GetTTI,
Expand Down Expand Up @@ -2058,6 +2219,10 @@ OptimizeGlobalVars(Module &M,
continue;
}

// For global variable arrays called in a memcpy
// we try to pad to nearest valid alignment boundary
Changed |= tryWidenGlobalArraysUsedByMemcpy(&GV, GetTTI);

Changed |= processGlobal(GV, GetTTI, GetTLI, LookupDomTree);
}
return Changed;
Expand Down
39 changes: 39 additions & 0 deletions llvm/test/Transforms/GlobalOpt/ARM/arm-widen-dest-non-array.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s

@.i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1

; Negative test: the memcpy destination is a struct alloca rather than an
; array alloca, so the CHECK lines expect the alloca type, the memcpy length
; (3) and the use of @.i8 to be left unchanged.
define void @memcpy_struct() {
; CHECK-LABEL: define void @memcpy_struct() local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[SOMETHING:%.*]] = alloca { i8, i8, i8 }, align 1
; CHECK-NEXT: [[CALL1:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
; CHECK-NEXT: ret void
;
entry:
%something = alloca {i8, i8, i8}, align 1
%call1 = call i32 @bar(ptr nonnull %something)
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
ret void
}


@.i8_multi = private unnamed_addr constant [2 x [3 x i8]] [[3 x i8] [i8 1, i8 2, i8 3], [3 x i8] [i8 4, i8 5, i8 6]] , align 1

; Negative test: 3 bytes are copied into a [2 x [3 x i8]] alloca from the
; nested-array global @.i8_multi. The CHECK lines expect the alloca type and
; the memcpy length (3) to be left unchanged.
define void @memcpy_array_multidimensional() {
; CHECK-LABEL: define void @memcpy_array_multidimensional() local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [2 x [3 x i8]], align 1
; CHECK-NEXT: [[CALL1:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8_multi, i32 3, i1 false)
; CHECK-NEXT: ret void
;
entry:
%something = alloca [2 x [3 x i8]], align 1
%call1 = call i32 @bar(ptr nonnull %something)
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8_multi, i32 3, i1 false)
ret void
}

declare i32 @bar(...)
28 changes: 28 additions & 0 deletions llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s

; CHECK: [3 x i8]
@other = private unnamed_addr global [3 x i8] [i8 1, i8 2, i8 3] , align 1
; CHECK: [4 x i8]
@.i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1

; @.i8 is copied both into the global @other and into a local [3 x i8] alloca.
; The CHECK lines expect only the alloca destination to be widened (to
; [4 x i8], with a memcpy length of 4); the copy into @other keeps length 3.
define void @memcpy_multiple() {
; CHECK-LABEL: define void @memcpy_multiple() local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [4 x i8], align 1
; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
; CHECK-NEXT: [[CALL3:%.*]] = call i32 @bar(ptr nonnull @other)
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) @other, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false)
; CHECK-NEXT: ret void
;
entry:
%something = alloca [3 x i8], align 1
%call1 = call i32 @bar(ptr nonnull %something)
%call2 = call i32 @bar(ptr nonnull @other)
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) @other, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
ret void
}

declare i32 @bar(...)
22 changes: 22 additions & 0 deletions llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s

@.i16 = private unnamed_addr constant [5 x i16] [i16 1, i16 2, i16 3, i16 4, i16 5] , align 1

; Non-byte element type: a 10-byte copy from a [5 x i16] global. The CHECK
; lines expect the destination alloca to grow to [6 x i16] and the memcpy
; length to become 12.
define void @memcpy_i16_array() {
; CHECK-LABEL: define void @memcpy_i16_array() local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[SOMETHING1:%.*]] = alloca [6 x i16], align 1
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING1]], ptr noundef nonnull align 1 dereferenceable(10) @.i16, i32 12, i1 false)
; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING1]])
; CHECK-NEXT: ret void
;
entry:
%something = alloca [5 x i16], align 1
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) %something, ptr noundef nonnull align 1 dereferenceable(10) @.i16, i32 10, i1 false)
%call2 = call i32 @bar(ptr nonnull %something)
ret void
}


declare i32 @bar(...)
Loading