-
Notifications
You must be signed in to change notification settings - Fork 13.4k
[llvm][ARM]Add widen global arrays pass #107120
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
nasherm
commented
Sep 3, 2024
•
edited
Loading
edited
- Pass optimizes memcpy's by padding out destinations and sources to a full word to make backend generate full word loads instead of loading a single byte (ldrb) and/or half word (ldrh). Only pads destination when it's a stack allocated constant size array and source when it's constant array. Heuristic to decide whether to pad or not is very basic and could be improved to allow more examples to be padded.
- Pass works within GlobalOpt but is disabled by default on all targets except ARM.
@llvm/pr-subscribers-backend-arm @llvm/pr-subscribers-llvm-transforms Author: Nashe Mncube (nasherm) Changes
Patch is 25.19 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/107120.diff 13 Files Affected:
diff --git a/llvm/include/llvm/Transforms/Scalar/ARMWidenStrings.h b/llvm/include/llvm/Transforms/Scalar/ARMWidenStrings.h
new file mode 100755
index 00000000000000..d78f0219c03037
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Scalar/ARMWidenStrings.h
@@ -0,0 +1,28 @@
+//===- ARMWidenStrings.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the interface for the ARMWidenStrings pass.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_ARMWIDENSTRINGS_H
+#define LLVM_TRANSFORMS_SCALAR_ARMWIDENSTRINGS_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class Module;
+
+/// Pass wrapper exposing the ARMWidenStrings transformation through the
+/// PassInfoMixin interface. `run` receives the function to transform and the
+/// function analysis manager, and returns the set of analyses that remain
+/// valid afterwards.
+struct ARMWidenStringsPass : PassInfoMixin<ARMWidenStringsPass> {
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_ARMWIDENSTRINGS_H
\ No newline at end of file
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 1df1449fce597c..6b989231cb9861 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -207,6 +207,7 @@
#include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"
#include "llvm/Transforms/ObjCARC.h"
#include "llvm/Transforms/Scalar/ADCE.h"
+#include "llvm/Transforms/Scalar/ARMWidenStrings.h"
#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
#include "llvm/Transforms/Scalar/AnnotationRemarks.h"
#include "llvm/Transforms/Scalar/BDCE.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 9c3d49cabbd38c..b75612c410f07d 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -80,6 +80,7 @@
#include "llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h"
#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "llvm/Transforms/Scalar/ADCE.h"
+#include "llvm/Transforms/Scalar/ARMWidenStrings.h"
#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
#include "llvm/Transforms/Scalar/AnnotationRemarks.h"
#include "llvm/Transforms/Scalar/BDCE.h"
@@ -1513,6 +1514,11 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
// from the TargetLibraryInfo.
OptimizePM.addPass(InjectTLIMappings());
+ bool IsARM = TM && TM->getTargetTriple().isARM();
+ // Optimizes memcpy by padding arrays to exploit alignment
+ if (IsARM && Level.getSizeLevel() == 0 && Level.getSpeedupLevel() > 1)
+ OptimizePM.addPass(ARMWidenStringsPass());
+
addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false);
// LoopSink pass sinks instructions hoisted by LICM, which serves as a
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index d6067089c6b5c1..55566f43e5435d 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -489,6 +489,7 @@ FUNCTION_PASS("view-dom-only", DomOnlyViewer())
FUNCTION_PASS("view-post-dom", PostDomViewer())
FUNCTION_PASS("view-post-dom-only", PostDomOnlyViewer())
FUNCTION_PASS("wasm-eh-prepare", WasmEHPreparePass())
+FUNCTION_PASS("arm-widen-strings", ARMWidenStringsPass())
#undef FUNCTION_PASS
#ifndef FUNCTION_PASS_WITH_PARAMS
diff --git a/llvm/lib/Transforms/Scalar/ARMWidenStrings.cpp b/llvm/lib/Transforms/Scalar/ARMWidenStrings.cpp
new file mode 100644
index 00000000000000..5a3c470861cf45
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/ARMWidenStrings.cpp
@@ -0,0 +1,236 @@
+// ARMWidenStrings.cpp - Widen strings to word boundaries to speed up
+// programs that use simple strcpy's with constant strings as source
+// and stack allocated array for destination.
+
+#define DEBUG_TYPE "arm-widen-strings"
+
+#include "llvm/Transforms/Scalar/ARMWidenStrings.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+// Command-line switch that turns the ARMWidenStrings transformation off.
+// When set, ARMWidenStrings::run returns immediately without touching the
+// function.
+cl::opt<bool> DisableARMWidenStrings(
+    "disable-arm-widen-strings", cl::init(false),
+    cl::desc("Disable widening of constant strings and their stack "
+             "destinations to word boundaries"));
+
+namespace {
+
+/// Implementation of the string-widening transformation; the pass wrapper
+/// creates one instance and calls run() on it.
+class ARMWidenStrings {
+public:
+  /// Attempts to widen eligible memcpy calls in \p F. The result tells the
+  /// caller whether previously computed analyses may need invalidating.
+  bool run(Function &F);
+
+  /// Largest memcpy size, in bytes, that is still lowered to inline
+  /// loads/stores instead of a call to the library function
+  /// (__aeabi_memcpy). This mirrors the value returned by
+  /// ARMSubtarget::getMaxInlineSizeThreshold; it is duplicated here because
+  /// the subtarget info class is not accessible from the midend.
+  const unsigned MemcpyInliningLimit = 64;
+};
+
+/// Returns true when \p Ty is a non-null array type whose element type is an
+/// 8-bit integer.
+static bool IsCharArray(Type *Ty) {
+  if (!Ty || !Ty->isArrayTy())
+    return false;
+
+  const unsigned CharBits = 8;
+  Type *ElemTy = Ty->getArrayElementType();
+  return ElemTy->isIntegerTy() && ElemTy->getIntegerBitWidth() == CharBits;
+}
+
+/// Widens eligible memcpy calls in \p F to a multiple of four bytes.
+///
+/// A call to the llvm.memcpy intrinsic is rewritten only when all of the
+/// following hold:
+///  - the length operand is a constant that is not a multiple of four,
+///  - the destination operand is a static alloca of an i8 array,
+///  - the source operand is a constant global with an initializer, local
+///    linkage and unnamed_addr, initialised with an i8 ConstantDataArray,
+///  - the copy is not volatile,
+///  - length, destination size and source size are all equal, and
+///  - the padded length does not exceed MemcpyInliningLimit.
+/// For such a call a larger alloca replaces the destination, a zero-padded
+/// copy of the global replaces the source, and the length operand is set to
+/// the padded size.
+///
+/// \returns true if at least one memcpy was rewritten; false otherwise,
+/// including when the pass is disabled or the target is neither ARM nor
+/// Thumb.
+bool ARMWidenStrings::run(Function &F) {
+  if (DisableARMWidenStrings)
+    return false;
+
+  // Only ARM has been benchmarked, so every other target is rejected. Thumb
+  // triples have to be tested separately because Triple::isARM() does not
+  // cover them.
+  const Triple TargetTriple(F.getParent()->getTargetTriple());
+  if (!TargetTriple.isARM() && !TargetTriple.isThumb()) {
+    LLVM_DEBUG(
+        dbgs() << "Pass only runs on ARM as hasn't been benchmarked on other "
+                  "targets\n");
+    return false;
+  }
+  LLVM_DEBUG(dbgs() << "Running ARMWidenStrings on function " << F.getName()
+                    << "\n");
+
+  bool Changed = false;
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      auto *CI = dyn_cast<CallInst>(&I);
+      if (!CI)
+        continue;
+
+      // Only direct calls to the llvm.memcpy intrinsic are of interest.
+      Function *Callee = CI->getCalledFunction();
+      if (Callee == nullptr || !Callee->isIntrinsic() ||
+          Callee->getIntrinsicID() != Intrinsic::memcpy)
+        continue;
+
+      LLVM_DEBUG(dbgs() << "Found call to strcpy/memcpy:\n" << *CI << "\n");
+
+      auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
+      auto *SourceVar = dyn_cast<GlobalVariable>(CI->getArgOperand(1));
+      auto *BytesToCopy = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+      auto *IsVolatile = dyn_cast<ConstantInt>(CI->getArgOperand(3));
+
+      if (!BytesToCopy) {
+        LLVM_DEBUG(dbgs() << "Number of bytes to copy is null\n");
+        continue;
+      }
+
+      const uint64_t NumBytesToCopy = BytesToCopy->getZExtValue();
+
+      if (!Alloca) {
+        LLVM_DEBUG(dbgs() << "Destination isn't a Alloca\n");
+        continue;
+      }
+
+      if (!SourceVar) {
+        LLVM_DEBUG(dbgs() << "Source isn't a global constant variable\n");
+        continue;
+      }
+
+      if (!IsVolatile || IsVolatile->isOne()) {
+        LLVM_DEBUG(
+            dbgs() << "Not widening strings for this memcpy because it's "
+                      "a volatile operations\n");
+        continue;
+      }
+
+      if (NumBytesToCopy % 4 == 0) {
+        LLVM_DEBUG(dbgs() << "Bytes to copy in strcpy/memcpy is already word "
+                             "aligned so nothing to do here.\n");
+        continue;
+      }
+
+      if (!SourceVar->hasInitializer() || !SourceVar->isConstant() ||
+          !SourceVar->hasLocalLinkage() ||
+          !SourceVar->hasGlobalUnnamedAddr()) {
+        LLVM_DEBUG(dbgs() << "Source is not constant global, thus it's "
+                             "mutable therefore it's not safe to pad\n");
+        continue;
+      }
+
+      auto *SourceDataArray =
+          dyn_cast<ConstantDataArray>(SourceVar->getInitializer());
+      if (!SourceDataArray || !IsCharArray(SourceDataArray->getType())) {
+        LLVM_DEBUG(dbgs() << "Source isn't a constant data array\n");
+        continue;
+      }
+
+      if (!Alloca->isStaticAlloca()) {
+        LLVM_DEBUG(dbgs() << "Destination allocation isn't a static "
+                             "constant which is locally allocated in this "
+                             "function, so skipping.\n");
+        continue;
+      }
+
+      // Make sure destination is definitely a char array.
+      if (!IsCharArray(Alloca->getAllocatedType())) {
+        LLVM_DEBUG(dbgs() << "Destination doesn't look like a constant char (8 "
+                             "bits) array\n");
+        continue;
+      }
+      LLVM_DEBUG(dbgs() << "With Alloca: " << *Alloca << "\n");
+
+      const uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements();
+      const uint64_t SZSize = SourceDataArray->getType()->getNumElements();
+
+      // For safety, only pad when the number of bytes to copy, the
+      // destination array size and the constant source string size all match.
+      LLVM_DEBUG(dbgs() << "Number of bytes to copy is: " << NumBytesToCopy
+                        << "\n");
+      LLVM_DEBUG(dbgs() << "Size of destination array is: " << DZSize << "\n");
+      LLVM_DEBUG(dbgs() << "Size of source array is: " << SZSize << "\n");
+      if (NumBytesToCopy != DZSize || DZSize != SZSize) {
+        LLVM_DEBUG(dbgs() << "Size of number of bytes to copy, destination "
+                             "array and source string don't match, so "
+                             "skipping\n");
+        continue;
+      }
+      LLVM_DEBUG(dbgs() << "Going to widen.\n");
+      const uint64_t NumBytesToPad = 4 - (NumBytesToCopy % 4);
+      LLVM_DEBUG(dbgs() << "Number of bytes to pad by is " << NumBytesToPad
+                        << "\n");
+
+      // Compare in 64 bits and without forming the sum first, so that a huge
+      // length can neither be truncated nor wrap around the limit check.
+      // NumBytesToPad is in [1, 3], so the subtraction cannot underflow.
+      if (NumBytesToCopy > MemcpyInliningLimit - NumBytesToPad) {
+        LLVM_DEBUG(
+            dbgs() << "Not going to pad because total number of bytes is "
+                   << (NumBytesToCopy + NumBytesToPad)
+                   << " which be greater than the inlining "
+                      "limit for memcpy which is "
+                   << MemcpyInliningLimit << "\n");
+        continue;
+      }
+      const uint64_t TotalBytes = NumBytesToCopy + NumBytesToPad;
+
+      // Update destination char array to be word aligned
+      // (memcpy(X,...,...)). The new alloca is inserted right before the old
+      // one, takes over its name, alignment and users, and the old one is
+      // removed because nothing refers to it any more.
+      IRBuilder<> BuildAlloca(Alloca);
+      Type *ElemTy = Alloca->getAllocatedType()->getArrayElementType();
+      AllocaInst *NewAlloca =
+          BuildAlloca.CreateAlloca(ArrayType::get(ElemTy, TotalBytes));
+      NewAlloca->takeName(Alloca);
+      NewAlloca->setAlignment(Alloca->getAlign());
+      Alloca->replaceAllUsesWith(NewAlloca);
+      Alloca->eraseFromParent();
+
+      LLVM_DEBUG(dbgs() << "Updating users of destination stack object to use "
+                        << "new size\n");
+
+      // Update source to be word aligned (memcpy(...,X,...)): build a
+      // replacement string with null bytes appended up to TotalBytes.
+      StringRef Data = SourceDataArray->getRawDataValues();
+      std::vector<uint8_t> StrData(Data.begin(), Data.end());
+      StrData.resize(TotalBytes, '\0');
+      ArrayRef<uint8_t> Arr(StrData);
+
+      // Create new padded version of global variable string.
+      Constant *SourceReplace = ConstantDataArray::get(F.getContext(), Arr);
+      auto *NewGV = new GlobalVariable(
+          *F.getParent(), SourceReplace->getType(), /*isConstant=*/true,
+          SourceVar->getLinkage(), SourceReplace, SourceReplace->getName());
+
+      // Copy any other attributes from original global variable string,
+      // e.g. unnamed_addr.
+      NewGV->copyAttributesFrom(SourceVar);
+      NewGV->takeName(SourceVar);
+
+      // Replace intrinsic source.
+      CI->setArgOperand(1, NewGV);
+
+      // Update number of bytes to copy (memcpy(...,...,X)).
+      CI->setArgOperand(2,
+                        ConstantInt::get(BytesToCopy->getType(), TotalBytes));
+      LLVM_DEBUG(dbgs() << "Padded dest/source and increased number of bytes:\n"
+                        << *CI << "\n"
+                        << *NewAlloca << "\n");
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+} // end of anonymous namespace
+
+/// Pass entry point: runs the ARMWidenStrings implementation on \p F. When
+/// the implementation reports no change every analysis is preserved;
+/// otherwise only the CFG analysis set and LoopAnalysis are marked preserved.
+PreservedAnalyses ARMWidenStringsPass::run(Function &F,
+                                           FunctionAnalysisManager &AM) {
+  ARMWidenStrings Impl;
+  if (!Impl.run(F))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet(CFGAnalyses::ID());
+  PA.preserve<LoopAnalysis>();
+  return PA;
+}
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt
index 939a1457239567..a9607e4ebc6583 100644
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -2,6 +2,7 @@ add_llvm_component_library(LLVMScalarOpts
ADCE.cpp
AlignmentFromAssumptions.cpp
AnnotationRemarks.cpp
+ ARMWidenStrings.cpp
BDCE.cpp
CallSiteSplitting.cpp
ConstantHoisting.cpp
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-lengths-dont-match.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-lengths-dont-match.ll
new file mode 100644
index 00000000000000..a34ddc2ae2a29a
--- /dev/null
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-lengths-dont-match.ll
@@ -0,0 +1,29 @@
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -O2 -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O2>" -S | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-arm-none-eabi"
+
+; CHECK: [17 x i8]
+@.str = private unnamed_addr constant [17 x i8] c"aaaaaaaaaaaaaaaa\00", align 1
+
+; Function Attrs: nounwind
+define hidden void @foo() local_unnamed_addr #0 {
+entry:
+ %something = alloca [20 x i8], align 1
+ call void @llvm.lifetime.start(i64 20, ptr nonnull %something) #3
+ call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 nonnull %something, ptr align 1 @.str, i32 17, i1 false)
+ %call2 = call i32 @bar(ptr nonnull %something) #3
+ call void @llvm.lifetime.end(i64 20, ptr nonnull %something) #3
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, ptr nocapture) #1
+
+declare i32 @bar(...) local_unnamed_addr #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, ptr nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1) #1
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-more-than-64-bytes.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-more-than-64-bytes.ll
new file mode 100644
index 00000000000000..15c196b62bc9b2
--- /dev/null
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-more-than-64-bytes.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -O3 -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O3>" -S | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-arm-none-eabi"
+
+; CHECK: [65 x i8]
+; CHECK-NOT: [68 x i8]
+@.str = private unnamed_addr constant [65 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzz\00", align 1
+
+; Function Attrs: nounwind
+define hidden void @foo() local_unnamed_addr #0 {
+entry:
+ %something = alloca [65 x i8], align 1
+ call void @llvm.lifetime.start(i64 65, ptr nonnull %something) #3
+ call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 nonnull %something, ptr align 1 @.str, i32 65, i1 false)
+ %call2 = call i32 @bar(ptr nonnull %something) #3
+ call void @llvm.lifetime.end(i64 65, ptr nonnull %something) #3
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, ptr nocapture) #1
+
+declare i32 @bar(...) local_unnamed_addr #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, ptr nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1) #1
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-ptrtoint.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-ptrtoint.ll
new file mode 100644
index 00000000000000..b4cb1beee92535
--- /dev/null
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-ptrtoint.ll
@@ -0,0 +1,47 @@
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -O2 -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O2>" -S | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7m-arm-none-eabi"
+
+; This test uses ptrtoint, but still should be handled correctly.
+; The [45 x i8] string should be optimised away (i.e unused)
+; CHECK: [48 x i8]
+; CHECK-NOT: [45 x i8]
+@f.string1 = private unnamed_addr constant [45 x i8] c"The quick brown dog jumps over the lazy fox.\00", align 1
+
+; Function Attrs: nounwind
+define hidden i32 @f() {
+entry:
+ %string1 = alloca [45 x i8], align 1
+ %pos = alloca i32, align 4
+ %token = alloca ptr, align 4
+ call void @llvm.lifetime.start.p0i8(i64 45, ptr %string1)
+ call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 %string1, ptr align 1 @f.string1, i32 45, i1 false)
+ call void @llvm.lifetime.start.p0i8(i64 4, ptr %pos)
+ call void @llvm.lifetime.start.p0i8(i64 4, ptr %token)
+ %call = call ptr @strchr(ptr %string1, i32 101)
+ store ptr %call, ptr %token, align 4
+ %0 = load ptr, ptr %token, align 4
+ %sub.ptr.lhs.cast = ptrtoint ptr %0 to i32
+ %sub.ptr.rhs.cast = ptrtoint ptr %string1 to i32
+ %sub.ptr.sub = sub i32 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+ %add = add nsw i32 %sub.ptr.sub, 1
+ store i32 %add, ptr %pos, align 4
+ %1 = load i32, ptr %pos, align 4
+ call void @llvm.lifetime.end.p0i8(i64 4, ptr %token)
+ call void @llvm.lifetime.end.p0i8(i64 4, ptr %pos)
+ call void @llvm.lifetime.end.p0i8(i64 45, ptr %string1)
+ ret i32 %1
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, ptr nocapture)
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1)
+
+; Function Attrs: nounwind
+declare ptr @strchr(ptr, i32)
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, ptr nocapture)
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-struct-test.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-struct-test.ll
new file mode 100644
index 00000000000000..b852944c3f876f
--- /dev/null
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-struct-test.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -O3 -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O3>" -S | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-arm-none-eabi"
+
+%struct.P = type { i32, [13 x i8] }
+
+; CHECK-NOT: [16 x i8]
+@.str = private unnamed_addr constant [13 x i8] c"hello world\0A\00", align 1
+@.str.1 = private unnamed_addr constant [4 x i8] c"%s\0A\00", align 1
+@__ARM_use_no_argv = global i32 1, section ".ARM.use_no_argv", align 4
+@llvm.used = appending global [1 x ptr] [ptr @__ARM_use_no_argv], section "llvm.metadata"
+
+; Function Attrs: nounwind
+define hidden i32 @main() local_unnamed_addr #0 {
+entry:
+ %p = alloca %struct.P, align 4
+ call void @llvm.lifetime.start(i64 20, ptr nonnull %p) #2
+ store i32 10, ptr %p, align 4, !tbaa !3
+ %arraydecay = getelementptr inbounds %struct.P, ptr %p, i32 0, i32 1, i32 0
+ call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 %arraydecay, ptr align 1 @.str, i32 13, i1 false)
+ %puts = call i32 @puts(ptr %arraydecay)
+ call void @llvm.lifetime.end(i64 20, ptr nonnull %p) #2
+ ret i32 0
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, ptr nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, ptr nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr noca...
[truncated]
|
if this is actually arm-specific, please use the some performance numbers in the description would be helpful adding some arm people |
also I would split PRs to implement the pass and add it to the pipeline |
I believe there was talk a long time ago about adding this to an existing pass such as the GlobalOpts pass or CGP. It sounds like CGP is too late for it, could it be a part of GlobalOpt or some other pass? |
I intend to rework the patch to make use of this and benchmark
Sure, no problem |
Can you give a brief example of Arm asm before/after this optimization? I suspect this generalizes to other targets, at least in some cases. Is there some reason we can't pad globals that aren't strings? Padding out strings probably affects string merging in the linker, so the codesize tradeoff here is sort of hard to compute. |
b3bca66
to
cc8bf21
Compare
I've reduced this patch down to adding the pass, as well as tests, without enabling it. With respect to performance gain I've seen a jump of around 1% on some of our benchmarks. I used the following (truncated) IR to show the difference in generated assembly
Optimization off
Optimization on
Diff of assembly for readability
|
I don't think so? But there might be a reason this wasn't investigated. The work in this patch was originally authored by someone no longer at Arm |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Probably worth investigating if we can fit this easily into some pass that's already examining the uses of globals, like GlobalOpt; iterating over the whole module isn't cheap.
My most recent patch addresses the comments.
I've had a look at GlobalOpt briefly and I have a few questions: if this pass were added wouldn't investigation also include seeing if this improves performance on other targets? I can see restricting the pass to ARM cores but it seems like that would make it a poor fit for GlobalOpt. Is there something I'm missing? |
✅ With the latest revision this PR passed the C/C++ code formatter. |
If we're going to make this transform target-independent, we'll need some target-specific tuning from TargetTransformInfo or something like that. Even if the transform is profitable, the exact thresholds where it's profitable are likely to be different. (The maximum size of the global where it's relevant, and whether the best alignment boundary is 2/4/8/16 bytes, is going to vary.) Not sure we need extensive performance measurements for other targets... if you could get measurements for some big x86 or Arm64 core, that would be nice. But you can basically see what happens on other targets by just compiling a simple example. And if we have a TTI hook, targets could easily opt-out. |
f7e220d
to
3b0405b
Compare
I've rewritten the pass to be platform independent and added it to GlobalOpt. By default it's disabled for all targets except ARM. |
I haven't had a chance to investigate performance on AArch64 or x86 machines and will not be able to until next week |
The case in which copying from a global source to a global dest wasn't handled and caused opt to crash. This is now handled and a new test has been added to check Change-Id: Ieb0467797fcee888f6e95e68af4dac9c05d70a4d
Change-Id: I029312362f9dd714b2e9bc206cc002883d761b8b
Change-Id: Idc7b14cc785eb88552dd72947eb0df128baa7e90
- Removed handling of global variable destinations. We simply don't pad these for now - Added check that destination array is an array type and added test. Change-Id: Ifc53051952ef69c4af64827402baf7d69cab4824
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the updates. I ran some tests and as far as I can tell they ran OK now. LGTM if there are no other comments.
bbe246e
to
86ee9ad
Compare
86ee9ad
to
2815d59
Compare
- Added test showing behaviour of attempting to widen non-const globals - Refactoring Change-Id: I566214331bf3d889bd1409d3148aa6eab2530ed5
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/160/builds/6882 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/159/builds/8352 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/144/builds/9579 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/12/builds/7865 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/180/builds/6880 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/3/builds/6308 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/140/builds/8956 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/133/builds/5328 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/190/builds/7798 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/7199 Here is the relevant piece of the build log for the reference
|
Reverts #107120 Unexpected build failures in post-commit pipelines. Needs investigation
Has been reverted due to unexpected buildbot failures |
probably just requires |
This is a recommit of #107120 . The original PR was approved but failed buildbot. The newly added tests should only be run for compilers that support the ARM target. This has been resolved by adding a config file for these tests. - Pass optimizes memcpy's by padding out destinations and sources to a full word to make ARM backend generate full word loads instead of loading a single byte (ldrb) and/or half word (ldrh). Only pads destination when it's a stack allocated constant size array and source when it's constant string. Heuristic to decide whether to pad or not is very basic and could be improved to allow more examples to be padded. - Pass works at the midend level
…3289) This is a recommit of llvm#107120 . The original PR was approved but failed buildbot. The newly added tests should only be run for compilers that support the ARM target. This has been resolved by adding a config file for these tests. - Pass optimizes memcpy's by padding out destinations and sources to a full word to make ARM backend generate full word loads instead of loading a single byte (ldrb) and/or half word (ldrh). Only pads destination when it's a stack allocated constant size array and source when it's constant string. Heuristic to decide whether to pad or not is very basic and could be improved to allow more examples to be padded. - Pass works at the midend level