Commit 0a7ac93

Wei Xiao authored and cherrymui committed
cmd/compile: improve atomic add intrinsics with ARMv8.1 new instruction
ARMv8.1 has added a new instruction (LDADDAL) for atomic memory operations. This CL improves the existing atomic add intrinsics with the new instruction. Since the new instruction is only guaranteed to be present from ARMv8.1 on, we guard its usage with a conditional on a CPU feature flag.

Performance result on ARMv8.1 machine:
name        old time/op  new time/op  delta
Xadd-224    1.05µs ± 6%  0.02µs ± 4%  -98.06%  (p=0.000 n=10+8)
Xadd64-224  1.05µs ± 3%  0.02µs ±13%  -98.10%  (p=0.000 n=9+10)
[Geo mean]  1.05µs       0.02µs       -98.08%

Performance result on ARMv8.0 machine:
name        old time/op  new time/op  delta
Xadd-46     538ns ± 1%   541ns ± 1%   +0.62%  (p=0.000 n=9+9)
Xadd64-46   505ns ± 1%   508ns ± 0%   +0.48%  (p=0.003 n=9+8)
[Geo mean]  521ns        524ns        +0.55%

Change-Id: If4b5d8d0e2d6f84fe1492a4f5de0789910ad0ee9
Reviewed-on: https://go-review.googlesource.com/81877
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
1 parent 1988b3e commit 0a7ac93
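
The guarded expansion reads, conceptually, like the Go sketch below. All names here are hypothetical: in the real compiler the flag is the runtime's internal arm64_support_atomics variable, and both branches are inlined instruction sequences rather than calls.

package main

import "sync/atomic"

// arm64HasAtomics stands in for the runtime's internal flag, set once at
// startup from CPU feature detection.
var arm64HasAtomics bool

// xadd32 models the shape of the code the compiler now emits for
// runtime/internal/atomic.Xadd on arm64: branch on the feature flag, take
// the ARMv8.1 LSE path when available, otherwise fall back to the original
// LL/SC loop. The two helpers are placeholders, not real APIs.
func xadd32(p *uint32, delta uint32) uint32 {
	if arm64HasAtomics {
		return xadd32LSE(p, delta) // lowers to LDADDALW + ADD
	}
	return xadd32LLSC(p, delta) // lowers to an LDAXR/STLXR retry loop
}

// Placeholder bodies so the sketch compiles; the real intrinsic expands to
// machine instructions, not function calls.
func xadd32LSE(p *uint32, delta uint32) uint32  { return atomic.AddUint32(p, delta) }
func xadd32LLSC(p *uint32, delta uint32) uint32 { return atomic.AddUint32(p, delta) }

func main() {
	var x uint32
	_ = xadd32(&x, 1)
}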

16 files changed: +211 -6 lines changed

src/cmd/asm/internal/arch/arm64.go

Lines changed: 2 additions & 1 deletion
@@ -77,7 +77,8 @@ func IsARM64STLXR(op obj.As) bool {
 		arm64.ALDADDB, arm64.ALDADDH, arm64.ALDADDW, arm64.ALDADDD,
 		arm64.ALDANDB, arm64.ALDANDH, arm64.ALDANDW, arm64.ALDANDD,
 		arm64.ALDEORB, arm64.ALDEORH, arm64.ALDEORW, arm64.ALDEORD,
-		arm64.ALDORB, arm64.ALDORH, arm64.ALDORW, arm64.ALDORD:
+		arm64.ALDORB, arm64.ALDORH, arm64.ALDORW, arm64.ALDORD,
+		arm64.ALDADDALD, arm64.ALDADDALW:
 		return true
 	}
 	return false

src/cmd/asm/internal/asm/testdata/arm64.s

Lines changed: 2 additions & 0 deletions
@@ -604,6 +604,8 @@ again:
 	LDORH R5, (RSP), R7 // e7332578
 	LDORB R5, (R6), R7 // c7302538
 	LDORB R5, (RSP), R7 // e7332538
+	LDADDALD R2, (R1), R3 // 2300e2f8
+	LDADDALW R5, (R4), R6 // 8600e5b8
 
 // RET
 //

src/cmd/compile/internal/arm64/ssa.go

Lines changed: 22 additions & 0 deletions
@@ -553,6 +553,28 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p3.From.Reg = arm64.REGTMP
 		p3.To.Type = obj.TYPE_BRANCH
 		gc.Patch(p3, p)
+	case ssa.OpARM64LoweredAtomicAdd64Variant,
+		ssa.OpARM64LoweredAtomicAdd32Variant:
+		// LDADDAL	Rarg1, (Rarg0), Rout
+		// ADD		Rarg1, Rout
+		op := arm64.ALDADDALD
+		if v.Op == ssa.OpARM64LoweredAtomicAdd32Variant {
+			op = arm64.ALDADDALW
+		}
+		r0 := v.Args[0].Reg()
+		r1 := v.Args[1].Reg()
+		out := v.Reg0()
+		p := s.Prog(op)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = r1
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = r0
+		p.RegTo2 = out
+		p1 := s.Prog(arm64.AADD)
+		p1.From.Type = obj.TYPE_REG
+		p1.From.Reg = r1
+		p1.To.Type = obj.TYPE_REG
+		p1.To.Reg = out
 	case ssa.OpARM64LoweredAtomicCas64,
 		ssa.OpARM64LoweredAtomicCas32:
 		// LDAXR	(Rarg0), Rtmp
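
A note on the two-instruction sequence added above: LDADDAL atomically adds Rarg1 to the memory word and leaves the old value in Rout, while Xadd's contract is to return the new value, hence the trailing ADD. A non-atomic Go model of that data flow (illustrative only, hypothetical name):

// xaddModel mirrors the register-level data flow of the emitted sequence.
// On real hardware the load and store below happen as one atomic LDADDAL.
func xaddModel(mem *uint64, delta uint64) uint64 {
	old := *mem        // LDADDAL returns the previous memory value in Rout...
	*mem = old + delta // ...after atomically storing old+delta back
	return old + delta // ADD Rarg1, Rout produces the new value Xadd returns
}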

src/cmd/compile/internal/gc/go.go

Lines changed: 1 addition & 0 deletions
@@ -303,6 +303,7 @@ var (
 	racewriterange,
 	supportPopcnt,
 	supportSSE41,
+	arm64SupportAtomics,
 	typedmemclr,
 	typedmemmove,
 	Udiv,

src/cmd/compile/internal/gc/ssa.go

Lines changed: 45 additions & 2 deletions
@@ -78,6 +78,7 @@ func initssaconfig() {
 	racewriterange = sysfunc("racewriterange")
 	supportPopcnt = sysfunc("support_popcnt")
 	supportSSE41 = sysfunc("support_sse41")
+	arm64SupportAtomics = sysfunc("arm64_support_atomics")
 	typedmemclr = sysfunc("typedmemclr")
 	typedmemmove = sysfunc("typedmemmove")
 	Udiv = sysfunc("udiv")
@@ -2935,14 +2936,56 @@ func init() {
 			s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
 			return s.newValue1(ssa.OpSelect0, types.Types[TUINT32], v)
 		},
-		sys.AMD64, sys.ARM64, sys.S390X, sys.MIPS, sys.MIPS64, sys.PPC64)
+		sys.AMD64, sys.S390X, sys.MIPS, sys.MIPS64, sys.PPC64)
 	addF("runtime/internal/atomic", "Xadd64",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			v := s.newValue3(ssa.OpAtomicAdd64, types.NewTuple(types.Types[TUINT64], types.TypeMem), args[0], args[1], s.mem())
 			s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
 			return s.newValue1(ssa.OpSelect0, types.Types[TUINT64], v)
 		},
-		sys.AMD64, sys.ARM64, sys.S390X, sys.MIPS64, sys.PPC64)
+		sys.AMD64, sys.S390X, sys.MIPS64, sys.PPC64)
+
+	makeXaddARM64 := func(op0 ssa.Op, op1 ssa.Op, ty types.EType) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+		return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+			// Target Atomic feature is identified by dynamic detection
+			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), arm64SupportAtomics, s.sb)
+			v := s.load(types.Types[TBOOL], addr)
+			b := s.endBlock()
+			b.Kind = ssa.BlockIf
+			b.SetControl(v)
+			bTrue := s.f.NewBlock(ssa.BlockPlain)
+			bFalse := s.f.NewBlock(ssa.BlockPlain)
+			bEnd := s.f.NewBlock(ssa.BlockPlain)
+			b.AddEdgeTo(bTrue)
+			b.AddEdgeTo(bFalse)
+			b.Likely = ssa.BranchUnlikely // most machines don't have Atomics nowadays
+
+			// We have atomic instructions - use it directly.
+			s.startBlock(bTrue)
+			v0 := s.newValue3(op1, types.NewTuple(types.Types[ty], types.TypeMem), args[0], args[1], s.mem())
+			s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v0)
+			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[ty], v0)
+			s.endBlock().AddEdgeTo(bEnd)
+
+			// Use original instruction sequence.
+			s.startBlock(bFalse)
+			v1 := s.newValue3(op0, types.NewTuple(types.Types[ty], types.TypeMem), args[0], args[1], s.mem())
+			s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v1)
+			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[ty], v1)
+			s.endBlock().AddEdgeTo(bEnd)
+
+			// Merge results.
+			s.startBlock(bEnd)
+			return s.variable(n, types.Types[ty])
+		}
+	}
+
+	addF("runtime/internal/atomic", "Xadd",
+		makeXaddARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, TUINT32),
+		sys.ARM64)
+	addF("runtime/internal/atomic", "Xadd64",
+		makeXaddARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, TUINT64),
+		sys.ARM64)
 
 	addF("runtime/internal/atomic", "Cas",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {

src/cmd/compile/internal/ssa/gen/ARM64.rules

Lines changed: 3 additions & 0 deletions
@@ -544,6 +544,9 @@
 (AtomicAnd8 ptr val mem) -> (Select1 (LoweredAtomicAnd8 ptr val mem))
 (AtomicOr8 ptr val mem) -> (Select1 (LoweredAtomicOr8 ptr val mem))
 
+(AtomicAdd32Variant ptr val mem) -> (LoweredAtomicAdd32Variant ptr val mem)
+(AtomicAdd64Variant ptr val mem) -> (LoweredAtomicAdd64Variant ptr val mem)
+
 // Write barrier.
 (WB {fn} destptr srcptr mem) -> (LoweredWB {fn} destptr srcptr mem)

src/cmd/compile/internal/ssa/gen/ARM64Ops.go

Lines changed: 7 additions & 0 deletions
@@ -578,6 +578,13 @@ func init() {
 		{name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
 		{name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
 
+		// atomic add variant.
+		// *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
+		// LDADDAL	(Rarg0), Rarg1, Rout
+		// ADD		Rarg1, Rout
+		{name: "LoweredAtomicAdd64Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+		{name: "LoweredAtomicAdd32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+
 		// atomic compare and swap.
 		// arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. auxint must be zero.
 		// if *arg0 == arg1 {

src/cmd/compile/internal/ssa/gen/genericOps.go

Lines changed: 7 additions & 0 deletions
@@ -515,6 +515,13 @@ var genericOps = []opData{
 	{name: "AtomicAnd8", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 &= arg1. arg2=memory. Returns memory.
 	{name: "AtomicOr8", argLength: 3, typ: "Mem", hasSideEffects: true},  // *arg0 |= arg1. arg2=memory. Returns memory.
 
+	// Atomic operation variants
+	// These variants have the same semantics as above atomic operations.
+	// But they are used for generating more efficient code on certain modern machines, with run-time CPU feature detection.
+	// Currently, they are used on ARM64 only.
+	{name: "AtomicAdd32Variant", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory.
+	{name: "AtomicAdd64Variant", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory.
+
 	// Clobber experiment op
 	{name: "Clobber", argLength: 0, typ: "Void", aux: "SymOff", symEffect: "None"}, // write an invalid pointer value to the given pointer slot of a stack variable
 }

src/cmd/compile/internal/ssa/opGen.go

Lines changed: 48 additions & 0 deletions
Generated file; diff not shown.

src/cmd/compile/internal/ssa/rewriteARM64.go

Lines changed: 36 additions & 0 deletions
Generated file; diff not shown.

src/cmd/internal/obj/arm64/a.out.go

Lines changed: 2 additions & 0 deletions
@@ -594,6 +594,8 @@ const (
 	AHVC
 	AIC
 	AISB
+	ALDADDALD
+	ALDADDALW
 	ALDADDB
 	ALDADDH
 	ALDADDW

src/cmd/internal/obj/arm64/anames.go

Lines changed: 2 additions & 0 deletions
@@ -96,6 +96,8 @@ var Anames = []string{
 	"HVC",
 	"IC",
 	"ISB",
+	"LDADDALD",
+	"LDADDALW",
 	"LDADDB",
 	"LDADDH",
 	"LDADDW",

src/cmd/internal/obj/arm64/asm7.go

Lines changed: 9 additions & 3 deletions
@@ -2011,6 +2011,8 @@ func buildop(ctxt *obj.Link) {
 		oprangeset(ASWPB, t)
 		oprangeset(ASWPH, t)
 		oprangeset(ASWPW, t)
+		oprangeset(ALDADDALD, t)
+		oprangeset(ALDADDALW, t)
 		oprangeset(ALDADDB, t)
 		oprangeset(ALDADDH, t)
 		oprangeset(ALDADDW, t)
@@ -3363,9 +3365,9 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		rt := p.RegTo2
 		rb := p.To.Reg
 		switch p.As {
-		case ASWPD, ALDADDD, ALDANDD, ALDEORD, ALDORD: // 64-bit
+		case ASWPD, ALDADDALD, ALDADDD, ALDANDD, ALDEORD, ALDORD: // 64-bit
 			o1 = 3 << 30
-		case ASWPW, ALDADDW, ALDANDW, ALDEORW, ALDORW: // 32-bit
+		case ASWPW, ALDADDALW, ALDADDW, ALDANDW, ALDEORW, ALDORW: // 32-bit
 			o1 = 2 << 30
 		case ASWPH, ALDADDH, ALDANDH, ALDEORH, ALDORH: // 16-bit
 			o1 = 1 << 30
@@ -3377,7 +3379,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		switch p.As {
 		case ASWPD, ASWPW, ASWPH, ASWPB:
 			o1 |= 0x20 << 10
-		case ALDADDD, ALDADDW, ALDADDH, ALDADDB:
+		case ALDADDALD, ALDADDALW, ALDADDD, ALDADDW, ALDADDH, ALDADDB:
 			o1 |= 0x00 << 10
 		case ALDANDD, ALDANDW, ALDANDH, ALDANDB:
 			o1 |= 0x04 << 10
@@ -3386,6 +3388,10 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		case ALDORD, ALDORW, ALDORH, ALDORB:
 			o1 |= 0x0c << 10
 		}
+		switch p.As {
+		case ALDADDALD, ALDADDALW:
+			o1 |= 3 << 22
+		}
 		o1 |= 0x1c1<<21 | uint32(rs&31)<<16 | uint32(rb&31)<<5 | uint32(rt&31)
 
 	case 50: /* sys/sysl */
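
The encodings checked in the arm64.s testdata above (LDADDALD R2, (R1), R3 // 2300e2f8 and LDADDALW R5, (R4), R6 // 8600e5b8) follow directly from this bit assembly: size in bits 31:30, the acquire/release bits 23:22 set by the new 3<<22, the fixed 0x1c1<<21 pattern, and the Rs/Rn/Rt fields. A standalone sketch of that computation (not toolchain code, hypothetical helper name):

package main

import (
	"encoding/binary"
	"fmt"
)

// encodeLDADDAL reproduces the bit layout assembled in asm7.go for the
// LDADDAL variants.
func encodeLDADDAL(is64bit bool, rs, rn, rt uint32) uint32 {
	var o uint32
	if is64bit {
		o = 3 << 30 // 64-bit size (LDADDALD)
	} else {
		o = 2 << 30 // 32-bit size (LDADDALW)
	}
	o |= 0x00 << 10 // LDADD opcode group
	o |= 3 << 22    // acquire + release (the AL suffix)
	o |= 0x1c1<<21 | (rs&31)<<16 | (rn&31)<<5 | (rt & 31)
	return o
}

func main() {
	// LDADDALD R2, (R1), R3 -> little-endian bytes 2300e2f8, as in the testdata.
	var buf [4]byte
	binary.LittleEndian.PutUint32(buf[:], encodeLDADDAL(true, 2, 1, 3))
	fmt.Printf("%x\n", buf) // prints 2300e2f8
}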

src/runtime/internal/atomic/bench_test.go

Lines changed: 20 additions & 0 deletions
@@ -42,3 +42,23 @@ func BenchmarkAtomicStore(b *testing.B) {
 		atomic.Store(&x, 0)
 	}
 }
+
+func BenchmarkXadd(b *testing.B) {
+	var x uint32
+	ptr := &x
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			atomic.Xadd(ptr, 1)
+		}
+	})
+}
+
+func BenchmarkXadd64(b *testing.B) {
+	var x uint64
+	ptr := &x
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			atomic.Xadd64(ptr, 1)
+		}
+	})
+}

src/runtime/proc.go

Lines changed: 2 additions & 0 deletions
@@ -517,6 +517,8 @@ func cpuinit() {
 	support_popcnt = cpu.X86.HasPOPCNT
 	support_sse2 = cpu.X86.HasSSE2
 	support_sse41 = cpu.X86.HasSSE41
+
+	arm64_support_atomics = cpu.ARM64.HasATOMICS
 }
 
 // The bootstrap sequence is:
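
Outside the runtime, the same capability can be probed from ordinary Go code. A minimal sketch, assuming the golang.org/x/sys/cpu package exposes the corresponding ARM64.HasATOMICS flag (it mirrors the internal/cpu field used above):

package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

func main() {
	// On arm64, HasATOMICS reports whether the ARMv8.1 LSE atomic
	// instructions (LDADDAL and friends) are available; on other
	// architectures the field is simply false.
	if cpu.ARM64.HasATOMICS {
		fmt.Println("ARMv8.1 LSE atomics available: LDADDAL fast path can be used")
	} else {
		fmt.Println("no LSE atomics: LL/SC (LDAXR/STLXR) fallback path")
	}
}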
