
Commit 15fbdc2

[AArch64][SVE] Lower unpredicated loads/stores as LDR/STR. (#127837)
Currently, given:

```cpp
svuint8_t foo(uint8_t *x) {
  return svld1(svptrue_b8(), x);
}
```

we generate:

```gas
foo:
  ptrue p0.b
  ld1b  { z0.b }, p0/z, [x0]
  ret
```

However, on little-endian targets with unaligned memory accesses allowed, we could instead use LDR as follows:

```gas
foo:
  ldr z0, [x0]
  ret
```

The second form avoids the predicate dependency. Likewise for other types and stores.
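The same applies on the store side; as an illustrative sketch (my own example, not taken from the commit), an all-true-predicated store of a whole vector should now select STR instead of `ptrue` + `st1b`:

```cpp
#include <arm_sve.h>

// Sketch: store of a full vector through an all-true predicate.
// With this change (little-endian, unaligned accesses allowed), the expected
// lowering is a single "str z0, [x0]" rather than
// "ptrue p0.b" followed by "st1b { z0.b }, p0, [x0]".
void bar(uint8_t *x, svuint8_t v) {
  svst1(svptrue_b8(), x, v);
}
```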
1 parent a5d8b7a commit 15fbdc2

File tree

70 files changed: +2393 / -2597 lines


clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c

+6-3
```diff
@@ -13,9 +13,12 @@
 
 void func(int *restrict a, int *restrict b) {
 // CHECK-LABEL: func
-// CHECK256-COUNT-8: st1w
-// CHECK512-COUNT-4: st1w
-// CHECK1024-COUNT-2: st1w
+// CHECK256-COUNT-1: str
+// CHECK256-COUNT-7: st1w
+// CHECK512-COUNT-1: str
+// CHECK512-COUNT-3: st1w
+// CHECK1024-COUNT-1: str
+// CHECK1024-COUNT-1: st1w
 // CHECK2048-COUNT-1: st1w
 #pragma clang loop vectorize(enable)
 for (int i = 0; i < 64; ++i)
```

llvm/lib/Target/AArch64/AArch64InstrInfo.td

+2
```diff
@@ -393,6 +393,8 @@ def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
                                  SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
                                                       SDTCisInt<1>]>>;
 
+def AllowMisalignedMemAccesses : Predicate<"!Subtarget->requiresStrictAlign()">;
+
 
 //===----------------------------------------------------------------------===//
 // AArch64-specific DAG Nodes.
```

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

+16
```diff
@@ -2993,6 +2993,22 @@ let Predicates = [HasSVE_or_SME] in {
   defm : unpred_loadstore_bitcast<nxv2i64>;
   defm : unpred_loadstore_bitcast<nxv2f64>;
 
+  // Allow using LDR/STR to avoid the predicate dependence.
+  let Predicates = [HasSVE_or_SME, IsLE, AllowMisalignedMemAccesses] in
+  foreach Ty = [ nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv8f16, nxv4f32, nxv2f64, nxv8bf16 ] in {
+    let AddedComplexity = 2 in {
+      def : Pat<(Ty (load (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset))),
+                (LDR_ZXI GPR64sp:$base, simm9:$offset)>;
+      def : Pat<(store Ty:$val, (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset)),
+                (STR_ZXI ZPR:$val, GPR64sp:$base, simm9:$offset)>;
+    }
+
+    def : Pat<(Ty (load GPR64sp:$base)),
+              (LDR_ZXI GPR64sp:$base, (i64 0))>;
+    def : Pat<(store Ty:$val, GPR64sp:$base),
+              (STR_ZXI ZPR:$val, GPR64sp:$base, (i64 0))>;
+  }
+
   multiclass unpred_store_predicate<ValueType Ty, Instruction Store> {
     def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)),
                   (Store PPR:$val, GPR64sp:$base, simm9:$offset)>;
```
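For reference, a minimal hand-written LLVM IR sketch (not one of the commit's test files) of the kind of unpredicated, whole-vector load and store these patterns select; on a little-endian subtarget without strict alignment it should now lower to LDR/STR:

```llvm
; Sketch: plain (unpredicated) load/store of a whole scalable vector.
; Expected lowering with this patch: "ldr z0, [x0]" and "str z0, [x1]".
define void @copy_whole_vector(ptr %src, ptr %dst) {
  %v = load <vscale x 4 x i32>, ptr %src, align 16
  store <vscale x 4 x i32> %v, ptr %dst, align 16
  ret void
}
```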
llvm/lib/Target/AArch64/SVEInstrFormats.td

+1
```diff
@@ -9668,6 +9668,7 @@ multiclass sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm, SDPatter
 let WantsRoot = true in {
   def am_sve_indexed_s4 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-8, 7>">;
   def am_sve_indexed_s6 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-32, 31>">;
+  def am_sve_indexed_s9 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-256, 255>">;
 }
 
 def am_sve_regreg_lsl0 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<0>", []>;
```
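The new `am_sve_indexed_s9` complex pattern models the signed 9-bit, VL-scaled immediate accepted by the SVE LDR/STR (vector) forms; a hand-written sketch of the offset extremes (not taken from the commit's tests):

```gas
// Sketch: LDR/STR (vector) accept an immediate offset in [-256, 255],
// implicitly scaled by the vector length ("mul vl").
ldr z0, [x0, #-256, mul vl]
str z0, [x0, #255, mul vl]
```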
llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll

+24-28
```diff
@@ -13,13 +13,12 @@ define void @array_1D(ptr %addr) #0 {
 ; CHECK-NEXT: addvl sp, sp, #-3
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
 ; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ldr z1, [x0, #2, mul vl]
+; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: str z1, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp, #1, mul vl]
 ; CHECK-NEXT: addvl sp, sp, #3
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -37,8 +36,7 @@ define %my_subtype @array_1D_extract(ptr %addr) #0 {
 ; CHECK-NEXT: addvl sp, sp, #-3
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
 ; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
 ; CHECK-NEXT: addvl sp, sp, #3
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -56,12 +54,11 @@ define void @array_1D_insert(ptr %addr, %my_subtype %elt) #0 {
 ; CHECK-NEXT: addvl sp, sp, #-3
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
 ; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: st1d { z2.d }, p0, [sp]
+; CHECK-NEXT: ldr z1, [x0, #2, mul vl]
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: str z0, [sp, #1, mul vl]
+; CHECK-NEXT: str z1, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp]
 ; CHECK-NEXT: addvl sp, sp, #3
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -80,19 +77,18 @@ define void @array_2D(ptr %addr) #0 {
 ; CHECK-NEXT: addvl sp, sp, #-6
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 48 * VG
 ; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #5, mul vl]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #4, mul vl]
-; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0, #3, mul vl]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: st1d { z1.d }, p0, [sp, #5, mul vl]
-; CHECK-NEXT: st1d { z3.d }, p0, [sp, #4, mul vl]
-; CHECK-NEXT: st1d { z5.d }, p0, [sp, #3, mul vl]
-; CHECK-NEXT: st1d { z4.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ldr z1, [x0, #5, mul vl]
+; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z3, [x0, #4, mul vl]
+; CHECK-NEXT: ldr z4, [x0, #2, mul vl]
+; CHECK-NEXT: ldr z5, [x0, #3, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: str z1, [sp, #5, mul vl]
+; CHECK-NEXT: str z3, [sp, #4, mul vl]
+; CHECK-NEXT: str z5, [sp, #3, mul vl]
+; CHECK-NEXT: str z4, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp, #1, mul vl]
 ; CHECK-NEXT: addvl sp, sp, #6
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
```

llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll

+6-7
```diff
@@ -12,13 +12,12 @@ define void @test(ptr %addr) #0 {
 ; CHECK-NEXT: addvl sp, sp, #-3
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
 ; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ldr z1, [x0, #2, mul vl]
+; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: str z1, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp, #1, mul vl]
 ; CHECK-NEXT: addvl sp, sp, #3
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
```

llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll

+18-18
```diff
@@ -25,11 +25,11 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
 ; CHECK-NEXT: .LBB0_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0]
+; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z3, [x0]
 ; CHECK-NEXT: subs x9, x9, x8
-; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl]
-; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1]
+; CHECK-NEXT: ldr z4, [x1, #1, mul vl]
+; CHECK-NEXT: ldr z5, [x1]
 ; CHECK-NEXT: add x1, x1, x10
 ; CHECK-NEXT: add x0, x0, x10
 ; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0
@@ -114,11 +114,11 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: zip1 z1.d, z1.d, z3.d
 ; CHECK-NEXT: .LBB1_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0]
+; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z3, [x0]
 ; CHECK-NEXT: subs x9, x9, x8
-; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl]
-; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1]
+; CHECK-NEXT: ldr z4, [x1, #1, mul vl]
+; CHECK-NEXT: ldr z5, [x1]
 ; CHECK-NEXT: add x1, x1, x10
 ; CHECK-NEXT: add x0, x0, x10
 ; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0
@@ -196,16 +196,16 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
 ; CHECK-NEXT: mov z3.d, z0.d
 ; CHECK-NEXT: .LBB2_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0]
+; CHECK-NEXT: ldr z4, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z5, [x0]
 ; CHECK-NEXT: subs x9, x9, x8
-; CHECK-NEXT: ld1d { z6.d }, p0/z, [x0, #3, mul vl]
-; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1, #1, mul vl]
-; CHECK-NEXT: ld1d { z16.d }, p0/z, [x1]
-; CHECK-NEXT: ld1d { z17.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ldr z6, [x0, #3, mul vl]
+; CHECK-NEXT: ldr z7, [x1, #1, mul vl]
+; CHECK-NEXT: ldr z16, [x1]
+; CHECK-NEXT: ldr z17, [x0, #2, mul vl]
 ; CHECK-NEXT: add x0, x0, x10
-; CHECK-NEXT: ld1d { z18.d }, p0/z, [x1, #3, mul vl]
-; CHECK-NEXT: ld1d { z19.d }, p0/z, [x1, #2, mul vl]
+; CHECK-NEXT: ldr z18, [x1, #3, mul vl]
+; CHECK-NEXT: ldr z19, [x1, #2, mul vl]
 ; CHECK-NEXT: add x1, x1, x10
 ; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #0
 ; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #0
@@ -321,8 +321,8 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia
 ; CHECK-NEXT: zip1 z1.d, z2.d, z2.d
 ; CHECK-NEXT: .LBB3_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0]
-; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z3, [x0]
+; CHECK-NEXT: ldr z4, [x0, #1, mul vl]
 ; CHECK-NEXT: add x0, x0, x11
 ; CHECK-NEXT: ld1w { z5.d }, p0/z, [x3, x8, lsl #2]
 ; CHECK-NEXT: add x8, x8, x9
```

llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll

+1-2
```diff
@@ -97,8 +97,7 @@ define void @test_concat_fptrunc_v4f64_to_v4f32(ptr %ptr) #1 {
 ; CHECK-LABEL: test_concat_fptrunc_v4f64_to_v4f32:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: fmov z0.s, #1.00000000
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: str z0, [x0]
 ; CHECK-NEXT: ret
 entry:
 %0 = shufflevector <vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i32 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
```

llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll

+8-8
```diff
@@ -103,9 +103,9 @@ define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_nonzero_i8(ptr %a, ptr %
 ; CHECK-NEXT: csel x8, x8, x9, lo
 ; CHECK-NEXT: mov x9, sp
 ; CHECK-NEXT: lsl x8, x8, #1
-; CHECK-NEXT: st1h { z0.h }, p0, [sp]
+; CHECK-NEXT: str z0, [sp]
 ; CHECK-NEXT: str q1, [x9, x8]
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
+; CHECK-NEXT: ldr z0, [sp]
 ; CHECK-NEXT: addvl sp, sp, #1
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -147,9 +147,9 @@ define <vscale x 4 x i16> @vec_scalable_subvec_fixed_idx_nonzero_i16(ptr %a, ptr
 ; CHECK-NEXT: csel x8, x8, x9, lo
 ; CHECK-NEXT: mov x9, sp
 ; CHECK-NEXT: lsl x8, x8, #2
-; CHECK-NEXT: st1w { z0.s }, p0, [sp]
+; CHECK-NEXT: str z0, [sp]
 ; CHECK-NEXT: str q1, [x9, x8]
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
+; CHECK-NEXT: ldr z0, [sp]
 ; CHECK-NEXT: addvl sp, sp, #1
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -191,9 +191,9 @@ define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_i32(ptr %a, ptr
 ; CHECK-NEXT: csel x8, x8, x9, lo
 ; CHECK-NEXT: mov x9, sp
 ; CHECK-NEXT: lsl x8, x8, #3
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
+; CHECK-NEXT: str z0, [sp]
 ; CHECK-NEXT: str q1, [x9, x8]
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT: ldr z0, [sp]
 ; CHECK-NEXT: addvl sp, sp, #1
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -211,10 +211,10 @@ define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_large_i32(ptr %
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: ptrue p1.d, vl8
 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
+; CHECK-NEXT: str z0, [sp]
 ; CHECK-NEXT: ld1w { z0.d }, p1/z, [x1]
 ; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT: ldr z0, [sp]
 ; CHECK-NEXT: addvl sp, sp, #1
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
```
