diff --git a/clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c b/clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c index 692d11d97f486..0ed14b4b3b793 100644 --- a/clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c +++ b/clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c @@ -13,9 +13,12 @@ void func(int *restrict a, int *restrict b) { // CHECK-LABEL: func -// CHECK256-COUNT-8: st1w -// CHECK512-COUNT-4: st1w -// CHECK1024-COUNT-2: st1w +// CHECK256-COUNT-1: str +// CHECK256-COUNT-7: st1w +// CHECK512-COUNT-1: str +// CHECK512-COUNT-3: st1w +// CHECK1024-COUNT-1: str +// CHECK1024-COUNT-1: st1w // CHECK2048-COUNT-1: st1w #pragma clang loop vectorize(enable) for (int i = 0; i < 64; ++i) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index fc86dd4742bc4..c836f3138a45f 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -393,6 +393,8 @@ def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisInt<1>]>>; +def AllowMisalignedMemAccesses : Predicate<"!Subtarget->requiresStrictAlign()">; + //===----------------------------------------------------------------------===// // AArch64-specific DAG Nodes. diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 28aecd14e33fa..4365e573d8b16 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2993,6 +2993,22 @@ let Predicates = [HasSVE_or_SME] in { defm : unpred_loadstore_bitcast; defm : unpred_loadstore_bitcast; + // Allow using LDR/STR to avoid the predicate dependence. + let Predicates = [HasSVE_or_SME, IsLE, AllowMisalignedMemAccesses] in + foreach Ty = [ nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv8f16, nxv4f32, nxv2f64, nxv8bf16 ] in { + let AddedComplexity = 2 in { + def : Pat<(Ty (load (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset))), + (LDR_ZXI GPR64sp:$base, simm9:$offset)>; + def : Pat<(store Ty:$val, (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset)), + (STR_ZXI ZPR:$val, GPR64sp:$base, simm9:$offset)>; + } + + def : Pat<(Ty (load GPR64sp:$base)), + (LDR_ZXI GPR64sp:$base, (i64 0))>; + def : Pat<(store Ty:$val, GPR64sp:$base), + (STR_ZXI ZPR:$val, GPR64sp:$base, (i64 0))>; + } + multiclass unpred_store_predicate { def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)), (Store PPR:$val, GPR64sp:$base, simm9:$offset)>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index e443c5ab150bd..48f71297f8377 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -9668,6 +9668,7 @@ multiclass sve_int_perm_bin_perm_128_zz opc, bit P, string asm, SDPatter let WantsRoot = true in { def am_sve_indexed_s4 : ComplexPattern">; def am_sve_indexed_s6 : ComplexPattern">; + def am_sve_indexed_s9 : ComplexPattern">; } def am_sve_regreg_lsl0 : ComplexPattern", []>; diff --git a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll index 7244ac949ab88..3a808f5a02f0d 100644 --- a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll +++ b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll @@ -13,13 +13,12 @@ define void @array_1D(ptr %addr) #0 { ; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x0, #2, mul vl] +; CHECK-NEXT: ldr z2, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: str z1, [sp, #2, mul vl] +; CHECK-NEXT: str z2, [sp, #1, mul vl] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -37,8 +36,7 @@ define %my_subtype @array_1D_extract(ptr %addr) #0 { ; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ldr z0, [x0, #1, mul vl] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -56,12 +54,11 @@ define void @array_1D_insert(ptr %addr, %my_subtype %elt) #0 { ; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0] -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [sp] +; CHECK-NEXT: ldr z1, [x0, #2, mul vl] +; CHECK-NEXT: ldr z2, [x0] +; CHECK-NEXT: str z0, [sp, #1, mul vl] +; CHECK-NEXT: str z1, [sp, #2, mul vl] +; CHECK-NEXT: str z2, [sp] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -80,19 +77,18 @@ define void @array_2D(ptr %addr) #0 { ; CHECK-NEXT: addvl sp, sp, #-6 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 48 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #5, mul vl] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #4, mul vl] -; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0, #3, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: st1d { z1.d }, p0, [sp, #5, mul vl] -; CHECK-NEXT: st1d { z3.d }, p0, [sp, #4, mul vl] -; CHECK-NEXT: st1d { z5.d }, p0, [sp, #3, mul vl] -; CHECK-NEXT: st1d { z4.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x0, #5, mul vl] +; CHECK-NEXT: ldr z2, [x0, #1, mul vl] +; CHECK-NEXT: ldr z3, [x0, #4, mul vl] +; CHECK-NEXT: ldr z4, [x0, #2, mul vl] +; CHECK-NEXT: ldr z5, [x0, #3, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: str z1, [sp, #5, mul vl] +; CHECK-NEXT: str z3, [sp, #4, mul vl] +; CHECK-NEXT: str z5, [sp, #3, mul vl] +; CHECK-NEXT: str z4, [sp, #2, mul vl] +; CHECK-NEXT: str z2, [sp, #1, mul vl] ; CHECK-NEXT: addvl sp, sp, #6 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll index f03a6f018d34d..e7d8f4ff39cee 100644 --- a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll +++ b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll @@ -12,13 +12,12 @@ define void @test(ptr %addr) #0 { ; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x0, #2, mul vl] +; CHECK-NEXT: ldr z2, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: str z1, [sp, #2, mul vl] +; CHECK-NEXT: str z2, [sp, #1, mul vl] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll index 8e26ef6b87ecc..668dc18df6a0b 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll @@ -25,11 +25,11 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: zip1 z1.d, z1.d, z1.d ; CHECK-NEXT: .LBB0_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] +; CHECK-NEXT: ldr z2, [x0, #1, mul vl] +; CHECK-NEXT: ldr z3, [x0] ; CHECK-NEXT: subs x9, x9, x8 -; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl] -; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1] +; CHECK-NEXT: ldr z4, [x1, #1, mul vl] +; CHECK-NEXT: ldr z5, [x1] ; CHECK-NEXT: add x1, x1, x10 ; CHECK-NEXT: add x0, x0, x10 ; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0 @@ -114,11 +114,11 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: zip1 z1.d, z1.d, z3.d ; CHECK-NEXT: .LBB1_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] +; CHECK-NEXT: ldr z2, [x0, #1, mul vl] +; CHECK-NEXT: ldr z3, [x0] ; CHECK-NEXT: subs x9, x9, x8 -; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl] -; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1] +; CHECK-NEXT: ldr z4, [x1, #1, mul vl] +; CHECK-NEXT: ldr z5, [x1] ; CHECK-NEXT: add x1, x1, x10 ; CHECK-NEXT: add x0, x0, x10 ; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0 @@ -196,16 +196,16 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) { ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: .LBB2_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0] +; CHECK-NEXT: ldr z4, [x0, #1, mul vl] +; CHECK-NEXT: ldr z5, [x0] ; CHECK-NEXT: subs x9, x9, x8 -; CHECK-NEXT: ld1d { z6.d }, p0/z, [x0, #3, mul vl] -; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1, #1, mul vl] -; CHECK-NEXT: ld1d { z16.d }, p0/z, [x1] -; CHECK-NEXT: ld1d { z17.d }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ldr z6, [x0, #3, mul vl] +; CHECK-NEXT: ldr z7, [x1, #1, mul vl] +; CHECK-NEXT: ldr z16, [x1] +; CHECK-NEXT: ldr z17, [x0, #2, mul vl] ; CHECK-NEXT: add x0, x0, x10 -; CHECK-NEXT: ld1d { z18.d }, p0/z, [x1, #3, mul vl] -; CHECK-NEXT: ld1d { z19.d }, p0/z, [x1, #2, mul vl] +; CHECK-NEXT: ldr z18, [x1, #3, mul vl] +; CHECK-NEXT: ldr z19, [x1, #2, mul vl] ; CHECK-NEXT: add x1, x1, x10 ; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #0 ; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #0 @@ -321,8 +321,8 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia ; CHECK-NEXT: zip1 z1.d, z2.d, z2.d ; CHECK-NEXT: .LBB3_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ldr z3, [x0] +; CHECK-NEXT: ldr z4, [x0, #1, mul vl] ; CHECK-NEXT: add x0, x0, x11 ; CHECK-NEXT: ld1w { z5.d }, p0/z, [x3, x8, lsl #2] ; CHECK-NEXT: add x8, x8, x9 diff --git a/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll b/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll index e6d5a2ac0fd79..820bc2c8a417f 100644 --- a/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll +++ b/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll @@ -97,8 +97,7 @@ define void @test_concat_fptrunc_v4f64_to_v4f32(ptr %ptr) #1 { ; CHECK-LABEL: test_concat_fptrunc_v4f64_to_v4f32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fmov z0.s, #1.00000000 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ret entry: %0 = shufflevector insertelement ( poison, double 1.000000e+00, i32 0), poison, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll index 542b2e90ffc15..d5b9d17a98d55 100644 --- a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll +++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll @@ -103,9 +103,9 @@ define @vec_scalable_subvec_fixed_idx_nonzero_i8(ptr %a, ptr % ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #1 -; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: str q1, [x9, x8] -; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] +; CHECK-NEXT: ldr z0, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -147,9 +147,9 @@ define @vec_scalable_subvec_fixed_idx_nonzero_i16(ptr %a, ptr ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #2 -; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: str q1, [x9, x8] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] +; CHECK-NEXT: ldr z0, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -191,9 +191,9 @@ define @vec_scalable_subvec_fixed_idx_nonzero_i32(ptr %a, ptr ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: str q1, [x9, x8] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: ldr z0, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -211,10 +211,10 @@ define @vec_scalable_subvec_fixed_idx_nonzero_large_i32(ptr % ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ptrue p1.d, vl8 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0] -; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: ld1w { z0.d }, p1/z, [x1] ; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: ldr z0, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll index d1171bc312473..69e805d9ca2ee 100644 --- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll +++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll @@ -328,15 +328,14 @@ define @splice_nxv8i32_idx( %a, @splice_nxv16f32_16( %a, @splice_nxv16i8_neg17( %a, @splice_nxv8i16_neg9( %a, @splice_nxv8f16_neg9( %a, @splice_nxv8i32( %a, @splice_nxv16f32_neg17( %a, @test_ldnp_v16f64(ptr %A) { define @test_ldnp_v20f32_vscale(ptr %A) { ; CHECK-LABEL: test_ldnp_v20f32_vscale: ; CHECK: ; %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, #3, mul vl] -; CHECK-NEXT: ld1w { z4.s }, p0/z, [x0, #4, mul vl] +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x0, #1, mul vl] +; CHECK-NEXT: ldr z2, [x0, #2, mul vl] +; CHECK-NEXT: ldr z3, [x0, #3, mul vl] +; CHECK-NEXT: ldr z4, [x0, #4, mul vl] ; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_ldnp_v20f32_vscale: diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll index b77f7b97f797c..e329548f84d24 100644 --- a/llvm/test/CodeGen/AArch64/sinksplat.ll +++ b/llvm/test/CodeGen/AArch64/sinksplat.ll @@ -517,7 +517,7 @@ define @fmul_scalable(ptr %x, ptr %y) "target-features"="+s ; CHECK-NEXT: lsl x8, x8, #2 ; CHECK-NEXT: .LBB15_1: // %l1 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] +; CHECK-NEXT: ldr z2, [x1] ; CHECK-NEXT: subs w9, w9, #1 ; CHECK-NEXT: add x1, x1, x8 ; CHECK-NEXT: fmul z2.s, z2.s, z1.s diff --git a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll index 15bf6a45f7541..f49bb910b5bd1 100644 --- a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll +++ b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll @@ -19,8 +19,6 @@ define void @quux() #1 { ; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #384 -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: mov x19, sp ; CHECK-NEXT: .cfi_def_cfa w29, 96 ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 @@ -55,77 +53,92 @@ define void @quux() #1 { ; CHECK-NEXT: // implicit-def: $x9 ; CHECK-NEXT: mov w9, w0 ; CHECK-NEXT: and x14, x9, #0x70 -; CHECK-NEXT: str x14, [x19, #16] // 8-byte Folded Spill +; CHECK-NEXT: sub x9, x29, #120 +; CHECK-NEXT: stur x14, [x9, #-256] // 8-byte Folded Spill ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: subs x9, x9, x14 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str x9, [x19, #24] // 8-byte Folded Spill +; CHECK-NEXT: sub x10, x29, #112 +; CHECK-NEXT: stur x9, [x10, #-256] // 8-byte Folded Spill ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: subs x9, x9, x14 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str x9, [x19, #32] // 8-byte Folded Spill +; CHECK-NEXT: sub x10, x29, #104 +; CHECK-NEXT: stur x9, [x10, #-256] // 8-byte Folded Spill ; CHECK-NEXT: addvl x9, x8, #1 ; CHECK-NEXT: mov w0, w9 ; CHECK-NEXT: // implicit-def: $x9 ; CHECK-NEXT: mov w9, w0 ; CHECK-NEXT: and x10, x9, #0x3f0 -; CHECK-NEXT: str x10, [x19, #40] // 8-byte Folded Spill +; CHECK-NEXT: sub x9, x29, #96 +; CHECK-NEXT: stur x10, [x9, #-256] // 8-byte Folded Spill ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: subs x9, x9, x10 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str x9, [x19, #48] // 8-byte Folded Spill +; CHECK-NEXT: sub x11, x29, #88 +; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: subs x9, x9, x10 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str x9, [x19, #56] // 8-byte Folded Spill +; CHECK-NEXT: sub x11, x29, #80 +; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: subs x9, x9, x14 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str x9, [x19, #64] // 8-byte Folded Spill +; CHECK-NEXT: sub x11, x29, #72 +; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: subs x9, x9, x14 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str x9, [x19, #72] // 8-byte Folded Spill +; CHECK-NEXT: sub x11, x29, #64 +; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: subs x9, x9, x10 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str x9, [x19, #80] // 8-byte Folded Spill +; CHECK-NEXT: sub x11, x29, #56 +; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: subs x9, x9, x10 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str x9, [x19, #88] // 8-byte Folded Spill +; CHECK-NEXT: sub x11, x29, #48 +; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: subs x9, x9, x14 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str x9, [x19, #96] // 8-byte Folded Spill +; CHECK-NEXT: sub x11, x29, #40 +; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: subs x9, x9, x14 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str x9, [x19, #104] // 8-byte Folded Spill +; CHECK-NEXT: sub x11, x29, #32 +; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: subs x9, x9, x10 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str x9, [x19, #112] // 8-byte Folded Spill +; CHECK-NEXT: sub x11, x29, #24 +; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: subs x9, x9, x10 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str x9, [x19, #120] // 8-byte Folded Spill +; CHECK-NEXT: sub x11, x29, #16 +; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: subs x9, x9, x14 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str x9, [x19, #128] // 8-byte Folded Spill +; CHECK-NEXT: sub x11, x29, #8 +; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: subs x9, x9, x14 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str x9, [x19, #136] // 8-byte Folded Spill +; CHECK-NEXT: stur x9, [x29, #-256] // 8-byte Folded Spill ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: subs x9, x9, x10 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str x9, [x19, #144] // 8-byte Folded Spill +; CHECK-NEXT: stur x9, [x29, #-248] // 8-byte Folded Spill ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: subs x9, x9, x10 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str x9, [x19, #152] // 8-byte Folded Spill +; CHECK-NEXT: stur x9, [x29, #-240] // 8-byte Folded Spill ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: subs x9, x9, #16 ; CHECK-NEXT: mov sp, x9 @@ -159,33 +172,33 @@ define void @quux() #1 { ; CHECK-NEXT: mov x2, sp ; CHECK-NEXT: subs x10, x2, #16 ; CHECK-NEXT: mov sp, x10 -; CHECK-NEXT: str x10, [x19, #160] // 8-byte Folded Spill +; CHECK-NEXT: stur x10, [x29, #-232] // 8-byte Folded Spill ; CHECK-NEXT: mov x10, sp ; CHECK-NEXT: subs x11, x10, x14 ; CHECK-NEXT: mov sp, x11 ; CHECK-NEXT: mov x10, x11 -; CHECK-NEXT: str x10, [x19, #168] // 8-byte Folded Spill +; CHECK-NEXT: stur x10, [x29, #-224] // 8-byte Folded Spill ; CHECK-NEXT: mov x0, sp ; CHECK-NEXT: subs x10, x0, #16 ; CHECK-NEXT: mov sp, x10 -; CHECK-NEXT: str x10, [x19, #176] // 8-byte Folded Spill +; CHECK-NEXT: stur x10, [x29, #-216] // 8-byte Folded Spill ; CHECK-NEXT: mov x17, sp ; CHECK-NEXT: subs x10, x17, #16 ; CHECK-NEXT: mov sp, x10 -; CHECK-NEXT: str x10, [x19, #184] // 8-byte Folded Spill +; CHECK-NEXT: stur x10, [x29, #-208] // 8-byte Folded Spill ; CHECK-NEXT: mov x10, sp ; CHECK-NEXT: subs x10, x10, x14 -; CHECK-NEXT: str x10, [x19, #360] // 8-byte Folded Spill +; CHECK-NEXT: stur x10, [x29, #-32] // 8-byte Folded Spill ; CHECK-NEXT: mov sp, x10 -; CHECK-NEXT: str x10, [x19, #192] // 8-byte Folded Spill +; CHECK-NEXT: stur x10, [x29, #-200] // 8-byte Folded Spill ; CHECK-NEXT: mov x15, sp ; CHECK-NEXT: subs x10, x15, #16 ; CHECK-NEXT: mov sp, x10 -; CHECK-NEXT: str x10, [x19, #200] // 8-byte Folded Spill +; CHECK-NEXT: stur x10, [x29, #-192] // 8-byte Folded Spill ; CHECK-NEXT: mov x13, sp ; CHECK-NEXT: subs x10, x13, #16 ; CHECK-NEXT: mov sp, x10 -; CHECK-NEXT: str x10, [x19, #208] // 8-byte Folded Spill +; CHECK-NEXT: stur x10, [x29, #-184] // 8-byte Folded Spill ; CHECK-NEXT: incw x8 ; CHECK-NEXT: mov w1, w8 ; CHECK-NEXT: // implicit-def: $x8 @@ -195,28 +208,28 @@ define void @quux() #1 { ; CHECK-NEXT: subs x10, x8, x12 ; CHECK-NEXT: mov sp, x10 ; CHECK-NEXT: mov x8, x10 -; CHECK-NEXT: str x8, [x19, #216] // 8-byte Folded Spill +; CHECK-NEXT: stur x8, [x29, #-176] // 8-byte Folded Spill ; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: subs x8, x8, x12 -; CHECK-NEXT: str x8, [x19, #368] // 8-byte Folded Spill +; CHECK-NEXT: stur x8, [x29, #-24] // 8-byte Folded Spill ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: str x8, [x19, #224] // 8-byte Folded Spill +; CHECK-NEXT: stur x8, [x29, #-168] // 8-byte Folded Spill ; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: subs x8, x8, x9 ; CHECK-NEXT: and x8, x8, #0xffffffffffffffe0 ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: str x8, [x19, #232] // 8-byte Folded Spill +; CHECK-NEXT: stur x8, [x29, #-160] // 8-byte Folded Spill ; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: subs x8, x8, x9 ; CHECK-NEXT: and x8, x8, #0xffffffffffffffe0 ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: str x8, [x19, #240] // 8-byte Folded Spill +; CHECK-NEXT: stur x8, [x29, #-152] // 8-byte Folded Spill ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: str x8, [x19, #336] // 8-byte Folded Spill +; CHECK-NEXT: stur x8, [x29, #-56] // 8-byte Folded Spill ; CHECK-NEXT: subs x8, x8, #16 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: str x8, [x19, #344] // 8-byte Folded Spill +; CHECK-NEXT: stur x8, [x29, #-48] // 8-byte Folded Spill ; CHECK-NEXT: subs x8, x8, #16 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: mov x24, sp @@ -244,7 +257,7 @@ define void @quux() #1 { ; CHECK-NEXT: subs x8, x16, #16 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: str x8, [x19, #248] // 8-byte Folded Spill +; CHECK-NEXT: stur x8, [x29, #-144] // 8-byte Folded Spill ; CHECK-NEXT: subs x8, x8, #16 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: mov x5, sp @@ -263,35 +276,35 @@ define void @quux() #1 { ; CHECK-NEXT: subs x8, x30, #16 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: str x8, [x19, #296] // 8-byte Folded Spill +; CHECK-NEXT: stur x8, [x29, #-96] // 8-byte Folded Spill ; CHECK-NEXT: subs x8, x8, #16 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: str x8, [x19, #328] // 8-byte Folded Spill +; CHECK-NEXT: stur x8, [x29, #-64] // 8-byte Folded Spill ; CHECK-NEXT: subs x8, x8, #16 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: str x8, [x19, #264] // 8-byte Folded Spill +; CHECK-NEXT: stur x8, [x29, #-128] // 8-byte Folded Spill ; CHECK-NEXT: subs x8, x8, #16 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: str x8, [x19, #256] // 8-byte Folded Spill +; CHECK-NEXT: stur x8, [x29, #-136] // 8-byte Folded Spill ; CHECK-NEXT: subs x8, x8, #16 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: str x8, [x19, #272] // 8-byte Folded Spill +; CHECK-NEXT: stur x8, [x29, #-120] // 8-byte Folded Spill ; CHECK-NEXT: subs x8, x8, #16 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: str x8, [x19, #312] // 8-byte Folded Spill +; CHECK-NEXT: stur x8, [x29, #-80] // 8-byte Folded Spill ; CHECK-NEXT: subs x8, x8, #16 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: str x8, [x19, #280] // 8-byte Folded Spill +; CHECK-NEXT: stur x8, [x29, #-112] // 8-byte Folded Spill ; CHECK-NEXT: subs x8, x8, #16 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: str x8, [x19, #304] // 8-byte Folded Spill +; CHECK-NEXT: stur x8, [x29, #-88] // 8-byte Folded Spill ; CHECK-NEXT: subs x8, x8, #16 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: mov x6, sp @@ -301,7 +314,7 @@ define void @quux() #1 { ; CHECK-NEXT: subs x8, x21, #16 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: str x8, [x19, #352] // 8-byte Folded Spill +; CHECK-NEXT: stur x8, [x29, #-40] // 8-byte Folded Spill ; CHECK-NEXT: subs x8, x8, #16 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: mov x28, sp @@ -324,12 +337,12 @@ define void @quux() #1 { ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: sturb w8, [x9, #-16] -; CHECK-NEXT: ldr x9, [x19, #248] // 8-byte Folded Reload +; CHECK-NEXT: ldur x9, [x29, #-144] // 8-byte Folded Reload ; CHECK-NEXT: sturb w8, [x9, #-16] -; CHECK-NEXT: ldr x9, [x19, #296] // 8-byte Folded Reload +; CHECK-NEXT: ldur x9, [x29, #-96] // 8-byte Folded Reload ; CHECK-NEXT: sturb w8, [x30, #-16] ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: str x8, [x19, #376] // 8-byte Folded Spill +; CHECK-NEXT: stur x8, [x29, #-16] // 8-byte Folded Spill ; CHECK-NEXT: stur x8, [x9, #-16] ; CHECK-NEXT: ldur x8, [x20, #-16] ; CHECK-NEXT: ldur x9, [x27, #-16] @@ -338,11 +351,11 @@ define void @quux() #1 { ; CHECK-NEXT: subs x8, x8, #1 ; CHECK-NEXT: ldur x9, [x16, #-16] ; CHECK-NEXT: mul x8, x8, x9 -; CHECK-NEXT: ldr x9, [x19, #328] // 8-byte Folded Reload +; CHECK-NEXT: ldur x9, [x29, #-64] // 8-byte Folded Reload ; CHECK-NEXT: add x30, x30, x8, lsl #2 -; CHECK-NEXT: ldr x8, [x19, #296] // 8-byte Folded Reload +; CHECK-NEXT: ldur x8, [x29, #-96] // 8-byte Folded Reload ; CHECK-NEXT: stur x30, [x8, #-16] -; CHECK-NEXT: ldr x8, [x19, #376] // 8-byte Folded Reload +; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload ; CHECK-NEXT: stur x8, [x9, #-16] ; CHECK-NEXT: ldur x8, [x5, #-16] ; CHECK-NEXT: ldur x9, [x26, #-16] @@ -351,11 +364,11 @@ define void @quux() #1 { ; CHECK-NEXT: subs x8, x8, #1 ; CHECK-NEXT: ldur x9, [x12, #-16] ; CHECK-NEXT: mul x8, x8, x9 -; CHECK-NEXT: ldr x9, [x19, #264] // 8-byte Folded Reload +; CHECK-NEXT: ldur x9, [x29, #-128] // 8-byte Folded Reload ; CHECK-NEXT: add x30, x30, x8, lsl #2 -; CHECK-NEXT: ldr x8, [x19, #328] // 8-byte Folded Reload +; CHECK-NEXT: ldur x8, [x29, #-64] // 8-byte Folded Reload ; CHECK-NEXT: stur x30, [x8, #-16] -; CHECK-NEXT: ldr x8, [x19, #376] // 8-byte Folded Reload +; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload ; CHECK-NEXT: stur x8, [x9, #-16] ; CHECK-NEXT: ldur x8, [x22, #-16] ; CHECK-NEXT: ldur x9, [x27, #-16] @@ -364,23 +377,23 @@ define void @quux() #1 { ; CHECK-NEXT: subs x8, x8, #1 ; CHECK-NEXT: ldur x9, [x25, #-16] ; CHECK-NEXT: mul x8, x8, x9 -; CHECK-NEXT: ldr x9, [x19, #256] // 8-byte Folded Reload +; CHECK-NEXT: ldur x9, [x29, #-136] // 8-byte Folded Reload ; CHECK-NEXT: add x30, x30, x8, lsl #2 -; CHECK-NEXT: ldr x8, [x19, #264] // 8-byte Folded Reload +; CHECK-NEXT: ldur x8, [x29, #-128] // 8-byte Folded Reload ; CHECK-NEXT: stur x30, [x8, #-16] -; CHECK-NEXT: ldr x8, [x19, #272] // 8-byte Folded Reload +; CHECK-NEXT: ldur x8, [x29, #-120] // 8-byte Folded Reload ; CHECK-NEXT: mov w30, #32 // =0x20 ; CHECK-NEXT: // kill: def $lr killed $w30 ; CHECK-NEXT: stur x30, [x9, #-16] -; CHECK-NEXT: ldr x9, [x19, #312] // 8-byte Folded Reload +; CHECK-NEXT: ldur x9, [x29, #-80] // 8-byte Folded Reload ; CHECK-NEXT: stur x30, [x8, #-16] -; CHECK-NEXT: ldr x8, [x19, #376] // 8-byte Folded Reload +; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload ; CHECK-NEXT: stur x8, [x9, #-16] ; CHECK-NEXT: ldur x8, [x1, #-16] ; CHECK-NEXT: lsl x8, x8, #5 ; CHECK-NEXT: stur x8, [x9, #-16] -; CHECK-NEXT: ldr x9, [x19, #280] // 8-byte Folded Reload -; CHECK-NEXT: ldr x8, [x19, #376] // 8-byte Folded Reload +; CHECK-NEXT: ldur x9, [x29, #-112] // 8-byte Folded Reload +; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload ; CHECK-NEXT: stur x30, [x16, #-16] ; CHECK-NEXT: stur x8, [x9, #-16] ; CHECK-NEXT: ldur x8, [x27, #-16] @@ -389,17 +402,17 @@ define void @quux() #1 { ; CHECK-NEXT: add x8, x8, #1 ; CHECK-NEXT: stur x8, [x9, #-16] ; CHECK-NEXT: ldur x8, [x20, #-16] -; CHECK-NEXT: str x8, [x19, #288] // 8-byte Folded Spill -; CHECK-NEXT: ldr x8, [x19, #312] // 8-byte Folded Reload +; CHECK-NEXT: stur x8, [x29, #-104] // 8-byte Folded Spill +; CHECK-NEXT: ldur x8, [x29, #-80] // 8-byte Folded Reload ; CHECK-NEXT: ldur x9, [x9, #-16] ; CHECK-NEXT: ldur x8, [x8, #-16] ; CHECK-NEXT: mul x9, x9, x8 -; CHECK-NEXT: ldr x8, [x19, #288] // 8-byte Folded Reload +; CHECK-NEXT: ldur x8, [x29, #-104] // 8-byte Folded Reload ; CHECK-NEXT: add x8, x8, x9, lsl #2 -; CHECK-NEXT: ldr x9, [x19, #296] // 8-byte Folded Reload +; CHECK-NEXT: ldur x9, [x29, #-96] // 8-byte Folded Reload ; CHECK-NEXT: stur x8, [x9, #-16] -; CHECK-NEXT: ldr x9, [x19, #304] // 8-byte Folded Reload -; CHECK-NEXT: ldr x8, [x19, #376] // 8-byte Folded Reload +; CHECK-NEXT: ldur x9, [x29, #-88] // 8-byte Folded Reload +; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload ; CHECK-NEXT: stur x30, [x12, #-16] ; CHECK-NEXT: stur x8, [x9, #-16] ; CHECK-NEXT: ldur x8, [x26, #-16] @@ -408,17 +421,17 @@ define void @quux() #1 { ; CHECK-NEXT: add x8, x8, #1 ; CHECK-NEXT: stur x8, [x9, #-16] ; CHECK-NEXT: ldur x8, [x5, #-16] -; CHECK-NEXT: str x8, [x19, #320] // 8-byte Folded Spill -; CHECK-NEXT: ldr x8, [x19, #312] // 8-byte Folded Reload +; CHECK-NEXT: stur x8, [x29, #-72] // 8-byte Folded Spill +; CHECK-NEXT: ldur x8, [x29, #-80] // 8-byte Folded Reload ; CHECK-NEXT: ldur x9, [x9, #-16] ; CHECK-NEXT: ldur x8, [x8, #-16] ; CHECK-NEXT: mul x9, x9, x8 -; CHECK-NEXT: ldr x8, [x19, #320] // 8-byte Folded Reload +; CHECK-NEXT: ldur x8, [x29, #-72] // 8-byte Folded Reload ; CHECK-NEXT: add x8, x8, x9, lsl #2 -; CHECK-NEXT: ldr x9, [x19, #328] // 8-byte Folded Reload +; CHECK-NEXT: ldur x9, [x29, #-64] // 8-byte Folded Reload ; CHECK-NEXT: stur x8, [x9, #-16] -; CHECK-NEXT: ldr x9, [x19, #352] // 8-byte Folded Reload -; CHECK-NEXT: ldr x8, [x19, #376] // 8-byte Folded Reload +; CHECK-NEXT: ldur x9, [x29, #-40] // 8-byte Folded Reload +; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload ; CHECK-NEXT: stur x8, [x6, #-16] ; CHECK-NEXT: stur x8, [x6, #-16] ; CHECK-NEXT: stur x8, [x21, #-16] @@ -427,23 +440,23 @@ define void @quux() #1 { ; CHECK-NEXT: ldur x8, [x27, #-16] ; CHECK-NEXT: ldur x9, [x21, #-16] ; CHECK-NEXT: subs x8, x8, x9 -; CHECK-NEXT: ldr x9, [x19, #336] // 8-byte Folded Reload +; CHECK-NEXT: ldur x9, [x29, #-56] // 8-byte Folded Reload ; CHECK-NEXT: stur x8, [x9, #-16] -; CHECK-NEXT: ldr x8, [x19, #344] // 8-byte Folded Reload +; CHECK-NEXT: ldur x8, [x29, #-48] // 8-byte Folded Reload ; CHECK-NEXT: stur x30, [x8, #-16] -; CHECK-NEXT: ldr x8, [x19, #352] // 8-byte Folded Reload +; CHECK-NEXT: ldur x8, [x29, #-40] // 8-byte Folded Reload ; CHECK-NEXT: ldur x9, [x9, #-16] ; CHECK-NEXT: stur x9, [x8, #-16] -; CHECK-NEXT: ldr x8, [x19, #376] // 8-byte Folded Reload +; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload ; CHECK-NEXT: stur x8, [x28, #-16] ; CHECK-NEXT: ldur x8, [x26, #-16] ; CHECK-NEXT: ldur x9, [x6, #-16] ; CHECK-NEXT: subs x8, x8, x9 -; CHECK-NEXT: ldr x9, [x19, #360] // 8-byte Folded Reload +; CHECK-NEXT: ldur x9, [x29, #-32] // 8-byte Folded Reload ; CHECK-NEXT: stur x8, [x24, #-16] -; CHECK-NEXT: ldr x8, [x19, #368] // 8-byte Folded Reload +; CHECK-NEXT: ldur x8, [x29, #-24] // 8-byte Folded Reload ; CHECK-NEXT: stur x30, [x7, #-16] -; CHECK-NEXT: ldr x7, [x19, #376] // 8-byte Folded Reload +; CHECK-NEXT: ldur x7, [x29, #-16] // 8-byte Folded Reload ; CHECK-NEXT: ldur x24, [x24, #-16] ; CHECK-NEXT: stur x24, [x28, #-16] ; CHECK-NEXT: ldur x24, [x21, #-16] @@ -520,43 +533,54 @@ define void @quux() #1 { ; CHECK-NEXT: b .LBB0_3 ; CHECK-NEXT: .LBB0_3: // %bb178 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr x9, [x19, #160] // 8-byte Folded Reload -; CHECK-NEXT: ldr x8, [x19, #56] // 8-byte Folded Reload -; CHECK-NEXT: ldr x10, [x19, #48] // 8-byte Folded Reload -; CHECK-NEXT: ldr x11, [x19, #32] // 8-byte Folded Reload -; CHECK-NEXT: ldr x12, [x19, #24] // 8-byte Folded Reload -; CHECK-NEXT: ldr x13, [x19, #240] // 8-byte Folded Reload -; CHECK-NEXT: ldr x14, [x19, #232] // 8-byte Folded Reload -; CHECK-NEXT: ldr x17, [x19, #88] // 8-byte Folded Reload -; CHECK-NEXT: ldr x18, [x19, #80] // 8-byte Folded Reload -; CHECK-NEXT: ldr x0, [x19, #72] // 8-byte Folded Reload -; CHECK-NEXT: ldr x1, [x19, #64] // 8-byte Folded Reload -; CHECK-NEXT: ldr x15, [x19, #224] // 8-byte Folded Reload -; CHECK-NEXT: ldr x2, [x19, #216] // 8-byte Folded Reload -; CHECK-NEXT: ldr x3, [x19, #120] // 8-byte Folded Reload -; CHECK-NEXT: ldr x4, [x19, #112] // 8-byte Folded Reload -; CHECK-NEXT: ldr x5, [x19, #104] // 8-byte Folded Reload -; CHECK-NEXT: ldr x6, [x19, #96] // 8-byte Folded Reload -; CHECK-NEXT: ldr x16, [x19, #152] // 8-byte Folded Reload -; CHECK-NEXT: ldr x7, [x19, #144] // 8-byte Folded Reload -; CHECK-NEXT: ldr x20, [x19, #136] // 8-byte Folded Reload -; CHECK-NEXT: ldr x21, [x19, #128] // 8-byte Folded Reload -; CHECK-NEXT: ldr x23, [x19, #200] // 8-byte Folded Reload -; CHECK-NEXT: ldr x22, [x19, #208] // 8-byte Folded Reload -; CHECK-NEXT: ldr x24, [x19, #192] // 8-byte Folded Reload -; CHECK-NEXT: ldr x26, [x19, #176] // 8-byte Folded Reload -; CHECK-NEXT: ldr x25, [x19, #184] // 8-byte Folded Reload -; CHECK-NEXT: ldr x27, [x19, #168] // 8-byte Folded Reload +; CHECK-NEXT: ldur x9, [x29, #-232] // 8-byte Folded Reload +; CHECK-NEXT: sub x8, x29, #80 +; CHECK-NEXT: ldur x8, [x8, #-256] // 8-byte Folded Reload +; CHECK-NEXT: sub x10, x29, #88 +; CHECK-NEXT: ldur x10, [x10, #-256] // 8-byte Folded Reload +; CHECK-NEXT: sub x11, x29, #104 +; CHECK-NEXT: ldur x11, [x11, #-256] // 8-byte Folded Reload +; CHECK-NEXT: sub x12, x29, #112 +; CHECK-NEXT: ldur x12, [x12, #-256] // 8-byte Folded Reload +; CHECK-NEXT: ldur x13, [x29, #-152] // 8-byte Folded Reload +; CHECK-NEXT: ldur x14, [x29, #-160] // 8-byte Folded Reload +; CHECK-NEXT: sub x15, x29, #48 +; CHECK-NEXT: ldur x17, [x15, #-256] // 8-byte Folded Reload +; CHECK-NEXT: sub x15, x29, #56 +; CHECK-NEXT: ldur x18, [x15, #-256] // 8-byte Folded Reload +; CHECK-NEXT: sub x15, x29, #64 +; CHECK-NEXT: ldur x0, [x15, #-256] // 8-byte Folded Reload +; CHECK-NEXT: sub x15, x29, #72 +; CHECK-NEXT: ldur x1, [x15, #-256] // 8-byte Folded Reload +; CHECK-NEXT: ldur x15, [x29, #-168] // 8-byte Folded Reload +; CHECK-NEXT: ldur x2, [x29, #-176] // 8-byte Folded Reload +; CHECK-NEXT: sub x16, x29, #16 +; CHECK-NEXT: ldur x3, [x16, #-256] // 8-byte Folded Reload +; CHECK-NEXT: sub x16, x29, #24 +; CHECK-NEXT: ldur x4, [x16, #-256] // 8-byte Folded Reload +; CHECK-NEXT: sub x16, x29, #32 +; CHECK-NEXT: ldur x5, [x16, #-256] // 8-byte Folded Reload +; CHECK-NEXT: sub x16, x29, #40 +; CHECK-NEXT: ldur x6, [x16, #-256] // 8-byte Folded Reload +; CHECK-NEXT: ldur x16, [x29, #-240] // 8-byte Folded Reload +; CHECK-NEXT: ldur x7, [x29, #-248] // 8-byte Folded Reload +; CHECK-NEXT: ldur x20, [x29, #-256] // 8-byte Folded Reload +; CHECK-NEXT: sub x21, x29, #8 +; CHECK-NEXT: ldur x21, [x21, #-256] // 8-byte Folded Reload +; CHECK-NEXT: ldur x23, [x29, #-192] // 8-byte Folded Reload +; CHECK-NEXT: ldur x22, [x29, #-184] // 8-byte Folded Reload +; CHECK-NEXT: ldur x24, [x29, #-200] // 8-byte Folded Reload +; CHECK-NEXT: ldur x26, [x29, #-216] // 8-byte Folded Reload +; CHECK-NEXT: ldur x25, [x29, #-208] // 8-byte Folded Reload +; CHECK-NEXT: ldur x27, [x29, #-224] // 8-byte Folded Reload ; CHECK-NEXT: ldr p0, [x27] ; CHECK-NEXT: ldr x27, [x26] ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x27] ; CHECK-NEXT: mov z0.d, z16.d ; CHECK-NEXT: mov z1.d, z24.d -; CHECK-NEXT: ptrue p2.s -; CHECK-NEXT: str p2, [x29, #-1, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1w { z1.s }, p2, [x14, #1, mul vl] -; CHECK-NEXT: st1w { z0.s }, p2, [x14] +; CHECK-NEXT: str z1, [x14, #1, mul vl] +; CHECK-NEXT: str z0, [x14] ; CHECK-NEXT: ldr x27, [x25] ; CHECK-NEXT: ldr x25, [x26] ; CHECK-NEXT: add x25, x25, x27, lsl #2 @@ -567,66 +591,66 @@ define void @quux() #1 { ; CHECK-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x24] ; CHECK-NEXT: mov z0.d, z16.d ; CHECK-NEXT: mov z1.d, z24.d -; CHECK-NEXT: st1w { z1.s }, p2, [x13, #1, mul vl] -; CHECK-NEXT: st1w { z0.s }, p2, [x13] +; CHECK-NEXT: str z1, [x13, #1, mul vl] +; CHECK-NEXT: str z0, [x13] ; CHECK-NEXT: ldr x24, [x22] ; CHECK-NEXT: ldr x22, [x23] ; CHECK-NEXT: add x22, x22, x24, lsl #2 ; CHECK-NEXT: str x22, [x23] ; CHECK-NEXT: ldr p1, [x2] ; CHECK-NEXT: ldr p0, [x15] -; CHECK-NEXT: ld1w { z1.s }, p2/z, [x14] -; CHECK-NEXT: ld1w { z0.s }, p2/z, [x13] +; CHECK-NEXT: ldr z1, [x14] +; CHECK-NEXT: ldr z0, [x13] ; CHECK-NEXT: str p1, [x21] ; CHECK-NEXT: str p0, [x20] -; CHECK-NEXT: st1w { z1.s }, p2, [x7] -; CHECK-NEXT: st1w { z0.s }, p2, [x16] +; CHECK-NEXT: str z1, [x7] +; CHECK-NEXT: str z0, [x16] ; CHECK-NEXT: ldr p0, [x21] ; CHECK-NEXT: ldr p1, [x20] -; CHECK-NEXT: ld1w { z0.s }, p2/z, [x7] -; CHECK-NEXT: ld1w { z1.s }, p2/z, [x16] +; CHECK-NEXT: ldr z0, [x7] +; CHECK-NEXT: ldr z1, [x16] ; CHECK-NEXT: fmopa za0.s, p0/m, p1/m, z0.s, z1.s ; CHECK-NEXT: mov x16, x2 ; CHECK-NEXT: incd x16 ; CHECK-NEXT: ldr p1, [x16] ; CHECK-NEXT: ldr p0, [x15] -; CHECK-NEXT: ld1w { z1.s }, p2/z, [x14, #1, mul vl] -; CHECK-NEXT: ld1w { z0.s }, p2/z, [x13] +; CHECK-NEXT: ldr z1, [x14, #1, mul vl] +; CHECK-NEXT: ldr z0, [x13] ; CHECK-NEXT: str p1, [x6] ; CHECK-NEXT: str p0, [x5] -; CHECK-NEXT: st1w { z1.s }, p2, [x4] -; CHECK-NEXT: st1w { z0.s }, p2, [x3] +; CHECK-NEXT: str z1, [x4] +; CHECK-NEXT: str z0, [x3] ; CHECK-NEXT: ldr p0, [x6] ; CHECK-NEXT: ldr p1, [x5] -; CHECK-NEXT: ld1w { z0.s }, p2/z, [x4] -; CHECK-NEXT: ld1w { z1.s }, p2/z, [x3] +; CHECK-NEXT: ldr z0, [x4] +; CHECK-NEXT: ldr z1, [x3] ; CHECK-NEXT: fmopa za1.s, p0/m, p1/m, z0.s, z1.s ; CHECK-NEXT: ldr p1, [x2] ; CHECK-NEXT: incd x15 ; CHECK-NEXT: ldr p0, [x15] -; CHECK-NEXT: ld1w { z1.s }, p2/z, [x14] -; CHECK-NEXT: ld1w { z0.s }, p2/z, [x13, #1, mul vl] +; CHECK-NEXT: ldr z1, [x14] +; CHECK-NEXT: ldr z0, [x13, #1, mul vl] ; CHECK-NEXT: str p1, [x1] ; CHECK-NEXT: str p0, [x0] -; CHECK-NEXT: st1w { z1.s }, p2, [x18] -; CHECK-NEXT: st1w { z0.s }, p2, [x17] +; CHECK-NEXT: str z1, [x18] +; CHECK-NEXT: str z0, [x17] ; CHECK-NEXT: ldr p0, [x1] ; CHECK-NEXT: ldr p1, [x0] -; CHECK-NEXT: ld1w { z0.s }, p2/z, [x18] -; CHECK-NEXT: ld1w { z1.s }, p2/z, [x17] +; CHECK-NEXT: ldr z0, [x18] +; CHECK-NEXT: ldr z1, [x17] ; CHECK-NEXT: fmopa za2.s, p0/m, p1/m, z0.s, z1.s ; CHECK-NEXT: ldr p1, [x16] ; CHECK-NEXT: ldr p0, [x15] -; CHECK-NEXT: ld1w { z1.s }, p2/z, [x14, #1, mul vl] -; CHECK-NEXT: ld1w { z0.s }, p2/z, [x13, #1, mul vl] +; CHECK-NEXT: ldr z1, [x14, #1, mul vl] +; CHECK-NEXT: ldr z0, [x13, #1, mul vl] ; CHECK-NEXT: str p1, [x12] ; CHECK-NEXT: str p0, [x11] -; CHECK-NEXT: st1w { z1.s }, p2, [x10] -; CHECK-NEXT: st1w { z0.s }, p2, [x8] +; CHECK-NEXT: str z1, [x10] +; CHECK-NEXT: str z0, [x8] ; CHECK-NEXT: ldr p0, [x12] ; CHECK-NEXT: ldr p1, [x11] -; CHECK-NEXT: ld1w { z0.s }, p2/z, [x10] -; CHECK-NEXT: ld1w { z1.s }, p2/z, [x8] +; CHECK-NEXT: ldr z0, [x10] +; CHECK-NEXT: ldr z1, [x8] ; CHECK-NEXT: fmopa za3.s, p0/m, p1/m, z0.s, z1.s ; CHECK-NEXT: ldr x8, [x9] ; CHECK-NEXT: subs x8, x8, #1 diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll index 83437c9eb076e..6ea2267cd22e6 100644 --- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll +++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll @@ -385,8 +385,7 @@ define void @test11(ptr %p) nounwind "aarch64_pstate_sm_enabled" { ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart sm ; CHECK-NEXT: mov z0.b, #0 // =0x0 -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart sm @@ -487,9 +486,8 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" { ; CHECK-NEXT: bl callee_farg_fret ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll index de6d59801b078..5ea5e3e7766e8 100644 --- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll +++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll @@ -29,9 +29,8 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload @@ -63,9 +62,8 @@ define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload @@ -97,9 +95,8 @@ define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload @@ -131,9 +128,8 @@ define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload @@ -170,10 +166,9 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_f16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload @@ -211,10 +206,9 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: bl use_f32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload @@ -252,10 +246,9 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_f64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload @@ -298,10 +291,9 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload @@ -340,10 +332,9 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload @@ -382,10 +373,9 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload @@ -424,10 +414,9 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload @@ -466,10 +455,9 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload @@ -508,10 +496,9 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v4f32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload @@ -550,10 +537,9 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload @@ -596,10 +582,9 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload @@ -637,10 +622,9 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload @@ -678,10 +662,9 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload @@ -719,10 +702,9 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload @@ -760,10 +742,9 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload @@ -801,10 +782,9 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8bf16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload @@ -842,10 +822,9 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4f32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload @@ -883,10 +862,9 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload @@ -967,10 +945,9 @@ define void @dont_coalesce_res_i8(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload @@ -997,10 +974,9 @@ define void @dont_coalesce_res_i16(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload @@ -1027,10 +1003,9 @@ define void @dont_coalesce_res_i32(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload @@ -1057,10 +1032,9 @@ define void @dont_coalesce_res_i64(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fmov d0, x0 ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload @@ -1090,10 +1064,9 @@ define void @dont_coalesce_res_f16(ptr %ptr) #0 { ; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload @@ -1123,10 +1096,9 @@ define void @dont_coalesce_res_f32(ptr %ptr) #0 { ; CHECK-NEXT: bl get_f32 ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload @@ -1156,10 +1128,9 @@ define void @dont_coalesce_res_f64(ptr %ptr) #0 { ; CHECK-NEXT: bl get_f64 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload @@ -1193,10 +1164,9 @@ define void @dont_coalesce_res_v1i8(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v1i8 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload @@ -1227,10 +1197,9 @@ define void @dont_coalesce_res_v1i16(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v1i16 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload @@ -1261,10 +1230,9 @@ define void @dont_coalesce_res_v1i32(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v1i32 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload @@ -1295,10 +1263,9 @@ define void @dont_coalesce_res_v1i64(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v1i64 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload @@ -1330,10 +1297,9 @@ define void @dont_coalesce_res_v1f16(ptr %ptr) #0 { ; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload @@ -1364,10 +1330,9 @@ define void @dont_coalesce_res_v1f32(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v1f32 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload @@ -1398,10 +1363,9 @@ define void @dont_coalesce_res_v1f64(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v1f64 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload @@ -1437,10 +1401,9 @@ define void @dont_coalesce_res_v16i8(ptr %ptr) #0 { ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload @@ -1471,10 +1434,9 @@ define void @dont_coalesce_res_v8i16(ptr %ptr) #0 { ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload @@ -1505,10 +1467,9 @@ define void @dont_coalesce_res_v4i32(ptr %ptr) #0 { ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload @@ -1539,10 +1500,9 @@ define void @dont_coalesce_res_v2i64(ptr %ptr) #0 { ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload @@ -1573,10 +1533,9 @@ define void @dont_coalesce_res_v8f16(ptr %ptr) #0 { ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload @@ -1607,10 +1566,9 @@ define void @dont_coalesce_res_v4f32(ptr %ptr) #0 { ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload @@ -1641,10 +1599,9 @@ define void @dont_coalesce_res_v2f64(ptr %ptr) #0 { ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll index cd133e946f04c..438b941198449 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll @@ -385,8 +385,7 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl foo ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp, #2, mul vl] +; CHECK-NEXT: ldr z0, [sp, #2, mul vl] ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-faminmax.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-faminmax.ll index ecfbb47ba5571..eab0adf70f8c8 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-faminmax.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-faminmax.ll @@ -55,12 +55,11 @@ define { , , , , , , , , , , , , , , , , , , %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: fdot_multi_za32_f16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ldr z27, [x1] ; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d ; CHECK-NEXT: mov z4.d, z1.d @@ -142,13 +141,12 @@ define void @bfdot_multi_za32_bf16_vg1x2(i32 %slice, %unused, define void @fdot_multi_za32_bf16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: fdot_multi_za32_bf16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ldr z27, [x1] ; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d ; CHECK-NEXT: mov z4.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll index fba81eac905e2..ae561016e58b1 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll @@ -48,13 +48,12 @@ entry: define void @udot_multi_za32_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: udot_multi_za32_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ldr z27, [x1] ; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d ; CHECK-NEXT: mov z4.d, z1.d @@ -138,13 +137,12 @@ define void @udot_multi_za32_u8_vg1x2(i32 %slice, %unused, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: udot_multi_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ldr z27, [x1] ; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d ; CHECK-NEXT: mov z4.d, z1.d @@ -180,13 +178,12 @@ define void @udot_multi_za64_u16_vg1x2(i32 %slice, %unused, < define void @udot_multi_za64_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: udot_multi_za64_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ldr z27, [x1] ; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d ; CHECK-NEXT: mov z4.d, z1.d @@ -244,13 +241,12 @@ entry: define void @usdot_multi_za32_u8_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: usdot_multi_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ldr z27, [x1] ; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d ; CHECK-NEXT: mov z4.d, z1.d @@ -358,13 +354,12 @@ entry: define void @sdot_multi_za32_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: sdot_multi_za32_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ldr z27, [x1] ; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d ; CHECK-NEXT: mov z4.d, z1.d @@ -448,13 +443,12 @@ define void @sdot_multi_za32_u8_vg1x2(i32 %slice, %unused, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: sdot_multi_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ldr z27, [x1] ; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d ; CHECK-NEXT: mov z4.d, z1.d @@ -513,13 +507,12 @@ entry: define void @sdot_multi_za64_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: sdot_multi_za64_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ldr z27, [x1] ; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d ; CHECK-NEXT: mov z4.d, z1.d @@ -1370,7 +1363,6 @@ define void @udot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0] ; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x0, x1] @@ -1379,7 +1371,7 @@ define void @udot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -1458,7 +1450,6 @@ define void @udot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 ; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill @@ -1484,7 +1475,7 @@ define void @udot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -1676,7 +1667,6 @@ define void @usdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0] ; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x0, x1] @@ -1685,7 +1675,7 @@ define void @usdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -1764,7 +1754,6 @@ define void @usdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 ; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill @@ -1790,7 +1779,7 @@ define void @usdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -1932,7 +1921,6 @@ define void @sdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0] ; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x0, x1] @@ -1941,7 +1929,7 @@ define void @sdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -2020,7 +2008,6 @@ define void @sdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 ; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill @@ -2046,7 +2033,7 @@ define void @sdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -2188,7 +2175,6 @@ define void @sudot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0] ; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x0, x1] @@ -2197,7 +2183,7 @@ define void @sudot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -2276,7 +2262,6 @@ define void @sudot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 ; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill @@ -2302,7 +2287,7 @@ define void @sudot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll index 9d865b1e74471..e7c9a0a2f5913 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll @@ -575,12 +575,11 @@ define { , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; CHECK-NEXT: add x9, x0, x1 ; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x0] ; CHECK-NEXT: ld1h { z3.h, z11.h }, pn8/z, [x9] @@ -118,7 +117,7 @@ define void @svdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -197,7 +196,6 @@ define void @svdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 ; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill @@ -223,7 +221,7 @@ define void @svdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -338,7 +336,6 @@ define void @uvdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: add x9, x0, x1 ; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x0] ; CHECK-NEXT: ld1h { z3.h, z11.h }, pn8/z, [x9] @@ -347,7 +344,7 @@ define void @uvdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -426,7 +423,6 @@ define void @uvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 ; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill @@ -452,7 +448,7 @@ define void @uvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -566,7 +562,6 @@ define void @suvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %local1 = alloca load volatile , ptr %local0 @@ -93,8 +93,8 @@ define @fill_signed_nxv2i8() { define void @fill_nxv8i16() { ; CHECK-LABEL: fill_nxv8i16 -; CHECK-DAG: ld1h { z{{[01]}}.h }, p0/z, [sp] -; CHECK-DAG: ld1h { z{{[01]}}.h }, p0/z, [sp, #1, mul vl] +; CHECK-DAG: ldr z{{[01]}}, [sp] +; CHECK-DAG: ldr z{{[01]}}, [sp, #1, mul vl] %local0 = alloca %local1 = alloca load volatile , ptr %local0 @@ -154,8 +154,8 @@ define @fill_signed_nxv2i16() { define void @fill_nxv4i32() { ; CHECK-LABEL: fill_nxv4i32 -; CHECK-DAG: ld1w { z{{[01]}}.s }, p0/z, [sp] -; CHECK-DAG: ld1w { z{{[01]}}.s }, p0/z, [sp, #1, mul vl] +; CHECK-DAG: ldr z{{[01]}}, [sp] +; CHECK-DAG: ldr z{{[01]}}, [sp, #1, mul vl] %local0 = alloca %local1 = alloca load volatile , ptr %local0 @@ -190,8 +190,8 @@ define @fill_signed_nxv2i32() { define void @fill_nxv2i64() { ; CHECK-LABEL: fill_nxv2i64 -; CHECK-DAG: ld1d { z{{[01]}}.d }, p0/z, [sp] -; CHECK-DAG: ld1d { z{{[01]}}.d }, p0/z, [sp, #1, mul vl] +; CHECK-DAG: ldr z{{[01]}}, [sp] +; CHECK-DAG: ldr z{{[01]}}, [sp, #1, mul vl] %local0 = alloca %local1 = alloca load volatile , ptr %local0 @@ -201,8 +201,8 @@ define void @fill_nxv2i64() { define void @fill_nxv8bf16() { ; CHECK-LABEL: fill_nxv8bf16 -; CHECK-DAG: ld1h { z{{[01]}}.h }, p0/z, [sp] -; CHECK-DAG: ld1h { z{{[01]}}.h }, p0/z, [sp, #1, mul vl] +; CHECK-DAG: ldr z{{[01]}}, [sp] +; CHECK-DAG: ldr z{{[01]}}, [sp, #1, mul vl] %local0 = alloca %local1 = alloca load volatile , ptr %local0 @@ -212,8 +212,8 @@ define void @fill_nxv8bf16() { define void @fill_nxv8f16() { ; CHECK-LABEL: fill_nxv8f16 -; CHECK-DAG: ld1h { z{{[01]}}.h }, p0/z, [sp] -; CHECK-DAG: ld1h { z{{[01]}}.h }, p0/z, [sp, #1, mul vl] +; CHECK-DAG: ldr z{{[01]}}, [sp] +; CHECK-DAG: ldr z{{[01]}}, [sp, #1, mul vl] %local0 = alloca %local1 = alloca load volatile , ptr %local0 @@ -223,8 +223,8 @@ define void @fill_nxv8f16() { define void @fill_nxv4f32() { ; CHECK-LABEL: fill_nxv4f32 -; CHECK-DAG: ld1w { z{{[01]}}.s }, p0/z, [sp] -; CHECK-DAG: ld1w { z{{[01]}}.s }, p0/z, [sp, #1, mul vl] +; CHECK-DAG: ldr z{{[01]}}, [sp] +; CHECK-DAG: ldr z{{[01]}}, [sp, #1, mul vl] %local0 = alloca %local1 = alloca load volatile , ptr %local0 @@ -234,8 +234,8 @@ define void @fill_nxv4f32() { define void @fill_nxv2f64() { ; CHECK-LABEL: fill_nxv2f64 -; CHECK-DAG: ld1d { z{{[01]}}.d }, p0/z, [sp] -; CHECK-DAG: ld1d { z{{[01]}}.d }, p0/z, [sp, #1, mul vl] +; CHECK-DAG: ldr z{{[01]}}, [sp] +; CHECK-DAG: ldr z{{[01]}}, [sp, #1, mul vl] %local0 = alloca %local1 = alloca load volatile , ptr %local0 @@ -248,8 +248,8 @@ define void @fill_nxv2f64() { define void @spill_nxv16i8( %v0, %v1) { ; CHECK-LABEL: spill_nxv16i8 -; CHECK-DAG: st1b { z{{[01]}}.b }, p0, [sp] -; CHECK-DAG: st1b { z{{[01]}}.b }, p0, [sp, #1, mul vl] +; CHECK-DAG: str z{{[01]}}, [sp] +; CHECK-DAG: str z{{[01]}}, [sp, #1, mul vl] %local0 = alloca %local1 = alloca store volatile %v0, ptr %local0 @@ -292,8 +292,8 @@ define void @spill_nxv2i8( %v0, %v1) { define void @spill_nxv8i16( %v0, %v1) { ; CHECK-LABEL: spill_nxv8i16 -; CHECK-DAG: st1h { z{{[01]}}.h }, p0, [sp] -; CHECK-DAG: st1h { z{{[01]}}.h }, p0, [sp, #1, mul vl] +; CHECK-DAG: str z{{[01]}}, [sp] +; CHECK-DAG: str z{{[01]}}, [sp, #1, mul vl] %local0 = alloca %local1 = alloca store volatile %v0, ptr %local0 @@ -325,8 +325,8 @@ define void @spill_nxv2i16( %v0, %v1) { define void @spill_nxv4i32( %v0, %v1) { ; CHECK-LABEL: spill_nxv4i32 -; CHECK-DAG: st1w { z{{[01]}}.s }, p0, [sp] -; CHECK-DAG: st1w { z{{[01]}}.s }, p0, [sp, #1, mul vl] +; CHECK-DAG: str z{{[01]}}, [sp] +; CHECK-DAG: str z{{[01]}}, [sp, #1, mul vl] %local0 = alloca %local1 = alloca store volatile %v0, ptr %local0 @@ -347,8 +347,8 @@ define void @spill_nxv2i32( %v0, %v1) { define void @spill_nxv2i64( %v0, %v1) { ; CHECK-LABEL: spill_nxv2i64 -; CHECK-DAG: st1d { z{{[01]}}.d }, p0, [sp] -; CHECK-DAG: st1d { z{{[01]}}.d }, p0, [sp, #1, mul vl] +; CHECK-DAG: str z{{[01]}}, [sp] +; CHECK-DAG: str z{{[01]}}, [sp, #1, mul vl] %local0 = alloca %local1 = alloca store volatile %v0, ptr %local0 @@ -358,8 +358,8 @@ define void @spill_nxv2i64( %v0, %v1) { define void @spill_nxv8f16( %v0, %v1) { ; CHECK-LABEL: spill_nxv8f16 -; CHECK-DAG: st1h { z{{[01]}}.h }, p0, [sp] -; CHECK-DAG: st1h { z{{[01]}}.h }, p0, [sp, #1, mul vl] +; CHECK-DAG: str z{{[01]}}, [sp] +; CHECK-DAG: str z{{[01]}}, [sp, #1, mul vl] %local0 = alloca %local1 = alloca store volatile %v0, ptr %local0 @@ -369,8 +369,8 @@ define void @spill_nxv8f16( %v0, %v1) { define void @spill_nxv8bf16( %v0, %v1) { ; CHECK-LABEL: spill_nxv8bf16 -; CHECK-DAG: st1h { z{{[01]}}.h }, p0, [sp] -; CHECK-DAG: st1h { z{{[01]}}.h }, p0, [sp, #1, mul vl] +; CHECK-DAG: str z{{[01]}}, [sp] +; CHECK-DAG: str z{{[01]}}, [sp, #1, mul vl] %local0 = alloca %local1 = alloca store volatile %v0, ptr %local0 @@ -380,8 +380,8 @@ define void @spill_nxv8bf16( %v0, %v1 define void @spill_nxv4f32( %v0, %v1) { ; CHECK-LABEL: spill_nxv4f32 -; CHECK-DAG: st1w { z{{[01]}}.s }, p0, [sp] -; CHECK-DAG: st1w { z{{[01]}}.s }, p0, [sp, #1, mul vl] +; CHECK-DAG: str z{{[01]}}, [sp] +; CHECK-DAG: str z{{[01]}}, [sp, #1, mul vl] %local0 = alloca %local1 = alloca store volatile %v0, ptr %local0 @@ -391,8 +391,8 @@ define void @spill_nxv4f32( %v0, %v1) { define void @spill_nxv2f64( %v0, %v1) { ; CHECK-LABEL: spill_nxv2f64 -; CHECK-DAG: st1d { z{{[01]}}.d }, p0, [sp] -; CHECK-DAG: st1d { z{{[01]}}.d }, p0, [sp, #1, mul vl] +; CHECK-DAG: str z{{[01]}}, [sp] +; CHECK-DAG: str z{{[01]}}, [sp, #1, mul vl] %local0 = alloca %local1 = alloca store volatile %v0, ptr %local0 diff --git a/llvm/test/CodeGen/AArch64/split-vector-insert.ll b/llvm/test/CodeGen/AArch64/split-vector-insert.ll index b8ab9a00c6981..555e38a3df205 100644 --- a/llvm/test/CodeGen/AArch64/split-vector-insert.ll +++ b/llvm/test/CodeGen/AArch64/split-vector-insert.ll @@ -25,28 +25,27 @@ define @test_nxv2i64_v8i64( %a, <8 x i64> % ; CHECK-LEGALIZATION-NEXT: mov x10, sp ; CHECK-LEGALIZATION-NEXT: cmp x8, #2 ; CHECK-LEGALIZATION-NEXT: mov z0.d, p0/m, z1.d -; CHECK-LEGALIZATION-NEXT: ptrue p0.d ; CHECK-LEGALIZATION-NEXT: csel x9, x8, x9, lo ; CHECK-LEGALIZATION-NEXT: cmp x8, #4 ; CHECK-LEGALIZATION-NEXT: lsl x9, x9, #3 -; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-LEGALIZATION-NEXT: str z0, [sp] ; CHECK-LEGALIZATION-NEXT: str q2, [x10, x9] ; CHECK-LEGALIZATION-NEXT: mov w9, #4 // =0x4 ; CHECK-LEGALIZATION-NEXT: addvl x10, sp, #1 -; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-LEGALIZATION-NEXT: ldr z0, [sp] ; CHECK-LEGALIZATION-NEXT: csel x9, x8, x9, lo ; CHECK-LEGALIZATION-NEXT: cmp x8, #6 ; CHECK-LEGALIZATION-NEXT: lsl x9, x9, #3 -; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] +; CHECK-LEGALIZATION-NEXT: str z0, [sp, #1, mul vl] ; CHECK-LEGALIZATION-NEXT: str q3, [x10, x9] ; CHECK-LEGALIZATION-NEXT: mov w9, #6 // =0x6 -; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-LEGALIZATION-NEXT: ldr z0, [sp, #1, mul vl] ; CHECK-LEGALIZATION-NEXT: csel x8, x8, x9, lo ; CHECK-LEGALIZATION-NEXT: addvl x9, sp, #2 ; CHECK-LEGALIZATION-NEXT: lsl x8, x8, #3 -; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] +; CHECK-LEGALIZATION-NEXT: str z0, [sp, #2, mul vl] ; CHECK-LEGALIZATION-NEXT: str q4, [x9, x8] -; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] +; CHECK-LEGALIZATION-NEXT: ldr z0, [sp, #2, mul vl] ; CHECK-LEGALIZATION-NEXT: addvl sp, sp, #3 ; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-LEGALIZATION-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -69,28 +68,27 @@ define @test_nxv2i64_v8i64( %a, <8 x i64> % ; CHECK-NEXT: mov x10, sp ; CHECK-NEXT: cmp x8, #2 ; CHECK-NEXT: mov z0.d, p0/m, z1.d -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: csel x9, x8, x9, lo ; CHECK-NEXT: cmp x8, #4 ; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: str q2, [x10, x9] ; CHECK-NEXT: mov w9, #4 // =0x4 ; CHECK-NEXT: addvl x10, sp, #1 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: ldr z0, [sp] ; CHECK-NEXT: csel x9, x8, x9, lo ; CHECK-NEXT: cmp x8, #6 ; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp, #1, mul vl] ; CHECK-NEXT: str q3, [x10, x9] ; CHECK-NEXT: mov w9, #6 // =0x6 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp, #1, mul vl] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: addvl x9, sp, #2 ; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: str z0, [sp, #2, mul vl] ; CHECK-NEXT: str q4, [x9, x8] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] +; CHECK-NEXT: ldr z0, [sp, #2, mul vl] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -122,28 +120,27 @@ define @test_nxv2f64_v8f64( %a, <8 x ; CHECK-LEGALIZATION-NEXT: mov x10, sp ; CHECK-LEGALIZATION-NEXT: cmp x8, #2 ; CHECK-LEGALIZATION-NEXT: mov z0.d, p0/m, z1.d -; CHECK-LEGALIZATION-NEXT: ptrue p0.d ; CHECK-LEGALIZATION-NEXT: csel x9, x8, x9, lo ; CHECK-LEGALIZATION-NEXT: cmp x8, #4 ; CHECK-LEGALIZATION-NEXT: lsl x9, x9, #3 -; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-LEGALIZATION-NEXT: str z0, [sp] ; CHECK-LEGALIZATION-NEXT: str q2, [x10, x9] ; CHECK-LEGALIZATION-NEXT: mov w9, #4 // =0x4 ; CHECK-LEGALIZATION-NEXT: addvl x10, sp, #1 -; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-LEGALIZATION-NEXT: ldr z0, [sp] ; CHECK-LEGALIZATION-NEXT: csel x9, x8, x9, lo ; CHECK-LEGALIZATION-NEXT: cmp x8, #6 ; CHECK-LEGALIZATION-NEXT: lsl x9, x9, #3 -; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] +; CHECK-LEGALIZATION-NEXT: str z0, [sp, #1, mul vl] ; CHECK-LEGALIZATION-NEXT: str q3, [x10, x9] ; CHECK-LEGALIZATION-NEXT: mov w9, #6 // =0x6 -; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-LEGALIZATION-NEXT: ldr z0, [sp, #1, mul vl] ; CHECK-LEGALIZATION-NEXT: csel x8, x8, x9, lo ; CHECK-LEGALIZATION-NEXT: addvl x9, sp, #2 ; CHECK-LEGALIZATION-NEXT: lsl x8, x8, #3 -; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] +; CHECK-LEGALIZATION-NEXT: str z0, [sp, #2, mul vl] ; CHECK-LEGALIZATION-NEXT: str q4, [x9, x8] -; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] +; CHECK-LEGALIZATION-NEXT: ldr z0, [sp, #2, mul vl] ; CHECK-LEGALIZATION-NEXT: addvl sp, sp, #3 ; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-LEGALIZATION-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -166,28 +163,27 @@ define @test_nxv2f64_v8f64( %a, <8 x ; CHECK-NEXT: mov x10, sp ; CHECK-NEXT: cmp x8, #2 ; CHECK-NEXT: mov z0.d, p0/m, z1.d -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: csel x9, x8, x9, lo ; CHECK-NEXT: cmp x8, #4 ; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: str q2, [x10, x9] ; CHECK-NEXT: mov w9, #4 // =0x4 ; CHECK-NEXT: addvl x10, sp, #1 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: ldr z0, [sp] ; CHECK-NEXT: csel x9, x8, x9, lo ; CHECK-NEXT: cmp x8, #6 ; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp, #1, mul vl] ; CHECK-NEXT: str q3, [x10, x9] ; CHECK-NEXT: mov w9, #6 // =0x6 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp, #1, mul vl] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: addvl x9, sp, #2 ; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: str z0, [sp, #2, mul vl] ; CHECK-NEXT: str q4, [x9, x8] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] +; CHECK-NEXT: ldr z0, [sp, #2, mul vl] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/stack-guard-sve.ll b/llvm/test/CodeGen/AArch64/stack-guard-sve.ll index e3308e95e46a0..e719e6d9d25b2 100644 --- a/llvm/test/CodeGen/AArch64/stack-guard-sve.ll +++ b/llvm/test/CodeGen/AArch64/stack-guard-sve.ll @@ -9,7 +9,7 @@ declare dso_local void @ptr_fn(ptr) ; CHECK-NOT: mov x19, sp ; CHECK: addvl sp, sp, #-1 ; CHECK-NOT: __stack_chk_guard -; CHECK: st1w { {{z[0-9]+.s}} }, {{p[0-9]+}}, [x29, #-1, mul vl] +; CHECK: str {{z[0-9]+}}, [x29, #-1, mul vl] define void @call_value() #0 { entry: %x = alloca , align 16 @@ -23,7 +23,7 @@ entry: ; CHECK-NOT: mov x19, sp ; CHECK: addvl sp, sp, #-1 ; CHECK-NOT: __stack_chk_guard -; CHECK: st1w { {{z[0-9]+.s}} }, {{p[0-9]+}}, [x29, #-1, mul vl] +; CHECK: str {{z[0-9]+}}, [x29, #-1, mul vl] define void @call_value_strong() #1 { entry: %x = alloca , align 16 @@ -70,7 +70,7 @@ entry: ; CHECK: mov x29, sp ; CHECK: addvl sp, sp, #-2 ; CHECK-NOT: __stack_chk_guard -; CHECK: st1w { {{z[0-9]+.s}} }, {{p[0-9]+}}, [x29, #-1, mul vl] +; CHECK: str {{z[0-9]+}}, [x29, #-1, mul vl] ; CHECK: bl val_fn ; CHECK: addvl x0, x29, #-2 ; CHECK: bl ptr_fn @@ -91,7 +91,7 @@ entry: ; CHECK-DAG: addvl [[ADDR:x[0-9]+]], x29, #-1 ; CHECK-DAG: ldr [[VAL:x[0-9]+]], [{{x[0-9]+}}, :lo12:__stack_chk_guard] ; CHECK-DAG: str [[VAL]], [[[ADDR]]] -; CHECK-DAG: st1w { {{z[0-9]+.s}} }, {{p[0-9]+}}, [x29, #-2, mul vl] +; CHECK-DAG: str {{z[0-9]+}}, [x29, #-2, mul vl] ; CHECK: bl val_fn ; CHECK: addvl x0, x29, #-3 ; CHECK: bl ptr_fn @@ -115,8 +115,7 @@ entry: ; CHECK-NOT: mov x29, sp ; CHECK: addvl sp, sp, #-1 ; CHECK-NOT: __stack_chk_guard -; CHECK: addvl [[REG:x[0-9]+]], x29, #-11 -; CHECK: st1w { {{z[0-9]+.s}} }, {{p[0-9]+}}, [[[REG]], #-8, mul vl] +; CHECK: str {{z[0-9]+}}, [x29, #-19, mul vl] define void @callee_save( %x) #0 { entry: %x.addr = alloca , align 16 @@ -133,8 +132,7 @@ entry: ; CHECK-DAG: addvl [[ADDR:x[0-9]+]], x29, #-19 ; CHECK-DAG: ldr [[VAL:x[0-9]+]], [{{x[0-9]+}}, :lo12:__stack_chk_guard] ; CHECK-DAG: str [[VAL]], [[[ADDR]]] -; CHECK-DAG: addvl [[ADDR2:x[0-9]+]], x29, #-12 -; CHECK-DAG: st1w { z0.s }, p0, [[[ADDR2]], #-8, mul vl] +; CHECK-DAG: str z0, [x29, #-20, mul vl] define void @callee_save_strong( %x) #1 { entry: %x.addr = alloca , align 16 diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll index a4c2b30566a95..791d7580c327d 100644 --- a/llvm/test/CodeGen/AArch64/stack-hazard.ll +++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll @@ -392,11 +392,10 @@ define i32 @csr_d8_allocnxv4i32(i64 %d) "aarch64_pstate_sm_compatible" { ; CHECK0-NEXT: .cfi_offset w29, -8 ; CHECK0-NEXT: .cfi_offset b8, -16 ; CHECK0-NEXT: mov z0.s, #0 // =0x0 -; CHECK0-NEXT: ptrue p0.s ; CHECK0-NEXT: mov w0, wzr ; CHECK0-NEXT: //APP ; CHECK0-NEXT: //NO_APP -; CHECK0-NEXT: st1w { z0.s }, p0, [sp] +; CHECK0-NEXT: str z0, [sp] ; CHECK0-NEXT: addvl sp, sp, #1 ; CHECK0-NEXT: ldr x29, [sp, #8] // 8-byte Folded Reload ; CHECK0-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload @@ -412,12 +411,11 @@ define i32 @csr_d8_allocnxv4i32(i64 %d) "aarch64_pstate_sm_compatible" { ; CHECK64-NEXT: .cfi_offset w29, -8 ; CHECK64-NEXT: .cfi_offset b8, -80 ; CHECK64-NEXT: mov z0.s, #0 // =0x0 -; CHECK64-NEXT: ptrue p0.s ; CHECK64-NEXT: add x8, sp, #64 ; CHECK64-NEXT: mov w0, wzr ; CHECK64-NEXT: //APP ; CHECK64-NEXT: //NO_APP -; CHECK64-NEXT: st1w { z0.s }, p0, [x8] +; CHECK64-NEXT: str z0, [x8] ; CHECK64-NEXT: addvl sp, sp, #1 ; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: ldr x29, [sp, #72] // 8-byte Folded Reload @@ -435,12 +433,11 @@ define i32 @csr_d8_allocnxv4i32(i64 %d) "aarch64_pstate_sm_compatible" { ; CHECK1024-NEXT: .cfi_offset w29, -8 ; CHECK1024-NEXT: .cfi_offset b8, -1040 ; CHECK1024-NEXT: mov z0.s, #0 // =0x0 -; CHECK1024-NEXT: ptrue p0.s ; CHECK1024-NEXT: add x8, sp, #1024 ; CHECK1024-NEXT: mov w0, wzr ; CHECK1024-NEXT: //APP ; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: st1w { z0.s }, p0, [x8] +; CHECK1024-NEXT: str z0, [x8] ; CHECK1024-NEXT: addvl sp, sp, #1 ; CHECK1024-NEXT: add sp, sp, #1024 ; CHECK1024-NEXT: ldr x29, [sp, #1032] // 8-byte Folded Reload @@ -1231,11 +1228,10 @@ define i32 @svecc_csr_d8_allocnxv4i32(i64 %d, %vs) "aarch64_p ; CHECK0-NEXT: .cfi_offset w29, -16 ; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; CHECK0-NEXT: mov z0.s, #0 // =0x0 -; CHECK0-NEXT: ptrue p0.s ; CHECK0-NEXT: mov w0, wzr ; CHECK0-NEXT: //APP ; CHECK0-NEXT: //NO_APP -; CHECK0-NEXT: st1w { z0.s }, p0, [sp] +; CHECK0-NEXT: str z0, [sp] ; CHECK0-NEXT: addvl sp, sp, #1 ; CHECK0-NEXT: ldr z8, [sp] // 16-byte Folded Reload ; CHECK0-NEXT: addvl sp, sp, #1 @@ -1254,12 +1250,11 @@ define i32 @svecc_csr_d8_allocnxv4i32(i64 %d, %vs) "aarch64_p ; CHECK64-NEXT: .cfi_offset w29, -16 ; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xb0, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 80 - 8 * VG ; CHECK64-NEXT: mov z0.s, #0 // =0x0 -; CHECK64-NEXT: ptrue p0.s ; CHECK64-NEXT: add x8, sp, #64 ; CHECK64-NEXT: mov w0, wzr ; CHECK64-NEXT: //APP ; CHECK64-NEXT: //NO_APP -; CHECK64-NEXT: st1w { z0.s }, p0, [x8] +; CHECK64-NEXT: str z0, [x8] ; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #1 ; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload @@ -1280,12 +1275,11 @@ define i32 @svecc_csr_d8_allocnxv4i32(i64 %d, %vs) "aarch64_p ; CHECK1024-NEXT: .cfi_offset w29, -16 ; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1040 - 8 * VG ; CHECK1024-NEXT: mov z0.s, #0 // =0x0 -; CHECK1024-NEXT: ptrue p0.s ; CHECK1024-NEXT: add x8, sp, #1024 ; CHECK1024-NEXT: mov w0, wzr ; CHECK1024-NEXT: //APP ; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: st1w { z0.s }, p0, [x8] +; CHECK1024-NEXT: str z0, [x8] ; CHECK1024-NEXT: add sp, sp, #1024 ; CHECK1024-NEXT: addvl sp, sp, #1 ; CHECK1024-NEXT: ldr z8, [sp] // 16-byte Folded Reload @@ -3071,8 +3065,7 @@ define i32 @sve_stack_object_and_vla(double %d, i64 %sz) "aarch64_pstate_sm_comp ; CHECK0-NEXT: sub x0, x8, x9 ; CHECK0-NEXT: mov sp, x0 ; CHECK0-NEXT: mov z0.s, #0 // =0x0 -; CHECK0-NEXT: ptrue p0.s -; CHECK0-NEXT: st1w { z0.s }, p0, [x29, #-1, mul vl] +; CHECK0-NEXT: str z0, [x29, #-1, mul vl] ; CHECK0-NEXT: bl bar ; CHECK0-NEXT: mov w0, wzr ; CHECK0-NEXT: mov sp, x29 @@ -3101,9 +3094,8 @@ define i32 @sve_stack_object_and_vla(double %d, i64 %sz) "aarch64_pstate_sm_comp ; CHECK64-NEXT: sub x0, x8, x9 ; CHECK64-NEXT: mov sp, x0 ; CHECK64-NEXT: mov z0.s, #0 // =0x0 -; CHECK64-NEXT: ptrue p0.s ; CHECK64-NEXT: sub x8, x29, #64 -; CHECK64-NEXT: st1w { z0.s }, p0, [x8, #-1, mul vl] +; CHECK64-NEXT: str z0, [x8, #-1, mul vl] ; CHECK64-NEXT: bl bar ; CHECK64-NEXT: mov w0, wzr ; CHECK64-NEXT: sub sp, x29, #64 @@ -3135,9 +3127,8 @@ define i32 @sve_stack_object_and_vla(double %d, i64 %sz) "aarch64_pstate_sm_comp ; CHECK1024-NEXT: sub x0, x8, x9 ; CHECK1024-NEXT: mov sp, x0 ; CHECK1024-NEXT: mov z0.s, #0 // =0x0 -; CHECK1024-NEXT: ptrue p0.s ; CHECK1024-NEXT: sub x8, x29, #1024 -; CHECK1024-NEXT: st1w { z0.s }, p0, [x8, #-1, mul vl] +; CHECK1024-NEXT: str z0, [x8, #-1, mul vl] ; CHECK1024-NEXT: bl bar ; CHECK1024-NEXT: mov w0, wzr ; CHECK1024-NEXT: sub sp, x29, #1024 diff --git a/llvm/test/CodeGen/AArch64/sve-aliasing.ll b/llvm/test/CodeGen/AArch64/sve-aliasing.ll index a83dc494b3bd2..a27429a256250 100644 --- a/llvm/test/CodeGen/AArch64/sve-aliasing.ll +++ b/llvm/test/CodeGen/AArch64/sve-aliasing.ll @@ -6,17 +6,17 @@ define void @scalable_v16i8(ptr noalias nocapture noundef %l0) { ; CHECK-LABEL: scalable_v16i8: ; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x0, #1, mul vl] ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, #1, mul vl] ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: mul z2.b, p0/m, z2.b, z0.b ; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: mul z3.b, p0/m, z3.b, z1.b ; CHECK-NEXT: eor z0.d, z2.d, z0.d ; CHECK-NEXT: eor z1.d, z3.d, z1.d -; CHECK-NEXT: st1b { z0.b }, p0, [x0] -; CHECK-NEXT: st1b { z1.b }, p0, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: str z1, [x0, #1, mul vl] ; CHECK-NEXT: ret %l3 = load , ptr %l0, align 16 %l5 = mul %l3, %l3 @@ -35,17 +35,17 @@ define void @scalable_v16i8(ptr noalias nocapture noundef %l0) { define void @scalable_v8i16(ptr noalias nocapture noundef %l0) { ; CHECK-LABEL: scalable_v8i16: ; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x0, #1, mul vl] ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, #1, mul vl] ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: mul z2.h, p0/m, z2.h, z0.h ; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: mul z3.h, p0/m, z3.h, z1.h ; CHECK-NEXT: eor z0.d, z2.d, z0.d ; CHECK-NEXT: eor z1.d, z3.d, z1.d -; CHECK-NEXT: st1h { z0.h }, p0, [x0] -; CHECK-NEXT: st1h { z1.h }, p0, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: str z1, [x0, #1, mul vl] ; CHECK-NEXT: ret %l3 = load , ptr %l0, align 16 %l5 = mul %l3, %l3 @@ -64,17 +64,17 @@ define void @scalable_v8i16(ptr noalias nocapture noundef %l0) { define void @scalable_v4i32(ptr noalias nocapture noundef %l0) { ; CHECK-LABEL: scalable_v4i32: ; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x0, #1, mul vl] ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, #1, mul vl] ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: mul z2.s, p0/m, z2.s, z0.s ; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: mul z3.s, p0/m, z3.s, z1.s ; CHECK-NEXT: eor z0.d, z2.d, z0.d ; CHECK-NEXT: eor z1.d, z3.d, z1.d -; CHECK-NEXT: st1w { z0.s }, p0, [x0] -; CHECK-NEXT: st1w { z1.s }, p0, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: str z1, [x0, #1, mul vl] ; CHECK-NEXT: ret %l3 = load , ptr %l0, align 16 %l5 = mul %l3, %l3 @@ -93,17 +93,17 @@ define void @scalable_v4i32(ptr noalias nocapture noundef %l0) { define void @scalable_v2i64(ptr noalias nocapture noundef %l0) { ; CHECK-LABEL: scalable_v2i64: ; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x0, #1, mul vl] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #1, mul vl] ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: mul z2.d, p0/m, z2.d, z0.d ; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: mul z3.d, p0/m, z3.d, z1.d ; CHECK-NEXT: eor z0.d, z2.d, z0.d ; CHECK-NEXT: eor z1.d, z3.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] -; CHECK-NEXT: st1d { z1.d }, p0, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: str z1, [x0, #1, mul vl] ; CHECK-NEXT: ret %l3 = load , ptr %l0, align 16 %l5 = mul %l3, %l3 @@ -320,13 +320,13 @@ define void @scalable_v2i32(ptr noalias nocapture noundef %l0) { define void @negative_tooshort_v16i8(ptr noalias nocapture noundef %l0) { ; CHECK-LABEL: negative_tooshort_v16i8: ; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cnth x8 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: mul z1.b, p0/m, z1.b, z0.b ; CHECK-NEXT: eor z0.d, z1.d, z0.d -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: mul z1.b, p0/m, z1.b, z0.b @@ -455,10 +455,10 @@ define void @negative_scalable_v2i32(ptr noalias nocapture noundef %l0) { define void @triple_v16i8(ptr noalias nocapture noundef %l0) { ; CHECK-LABEL: triple_v16i8: ; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x0, #1, mul vl] ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ldr z2, [x0, #2, mul vl] ; CHECK-NEXT: movprfx z3, z0 ; CHECK-NEXT: mul z3.b, p0/m, z3.b, z0.b ; CHECK-NEXT: movprfx z4, z1 @@ -468,9 +468,9 @@ define void @triple_v16i8(ptr noalias nocapture noundef %l0) { ; CHECK-NEXT: eor z0.d, z3.d, z0.d ; CHECK-NEXT: eor z1.d, z4.d, z1.d ; CHECK-NEXT: eor z2.d, z5.d, z2.d -; CHECK-NEXT: st1b { z0.b }, p0, [x0] -; CHECK-NEXT: st1b { z1.b }, p0, [x0, #1, mul vl] -; CHECK-NEXT: st1b { z2.b }, p0, [x0, #2, mul vl] +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: str z1, [x0, #1, mul vl] +; CHECK-NEXT: str z2, [x0, #2, mul vl] ; CHECK-NEXT: ret %l3 = load , ptr %l0, align 16 %l5 = mul %l3, %l3 @@ -494,13 +494,13 @@ define void @triple_v16i8(ptr noalias nocapture noundef %l0) { define void @negative_tripletooshort_v16i8(ptr noalias nocapture noundef %l0) { ; CHECK-LABEL: negative_tripletooshort_v16i8: ; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cntw x8 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: mul z1.b, p0/m, z1.b, z0.b ; CHECK-NEXT: eor z0.d, z1.d, z0.d -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: mul z1.b, p0/m, z1.b, z0.b diff --git a/llvm/test/CodeGen/AArch64/sve-alloca.ll b/llvm/test/CodeGen/AArch64/sve-alloca.ll index d227538043fce..2520095cce62e 100644 --- a/llvm/test/CodeGen/AArch64/sve-alloca.ll +++ b/llvm/test/CodeGen/AArch64/sve-alloca.ll @@ -61,9 +61,8 @@ define void @foo( %dst, i1 %cond) { ; CHECK-NEXT: sub x8, x8, x9 ; CHECK-NEXT: and x0, x8, #0xffffffffffffffe0 ; CHECK-NEXT: mov sp, x0 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z1.d }, p0, [x0, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: str z1, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: bl bar ; CHECK-NEXT: addvl sp, x29, #-18 ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll index 34532ddd9a2b0..7bddd1d70aaa8 100644 --- a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll @@ -10,8 +10,7 @@ define aarch64_sve_vector_pcs @callee_with_many_sve_arg( %z0, %z1, %z2, %z3, %z4, %z5, %z6, %z7, %z8, %z9) { ; CHECK: name: callee_with_many_sve_arg ; CHECK-DAG: [[BASE:%[0-9]+]]:gpr64common = COPY $x1 -; CHECK-DAG: [[PTRUE:%[0-9]+]]:ppr_3b = PTRUE_S 31 -; CHECK-DAG: [[RES:%[0-9]+]]:zpr = LD1W_IMM killed [[PTRUE]], [[BASE]] +; CHECK-DAG: [[RES:%[0-9]+]]:zpr = LDR_ZXI [[BASE]], 0 ; CHECK-DAG: $z0 = COPY [[RES]] ; CHECK: RET_ReallyLR implicit $z0 ret %z9 @@ -25,9 +24,8 @@ define aarch64_sve_vector_pcs @caller_with_many_sve_arg( @callee_with_many_gpr_sve_arg(i ; CHECK: fixedStack: ; CHECK: - { id: 0, type: default, offset: 8, size: 8, alignment: 8, stack-id: default, ; CHECK-DAG: [[BASE:%[0-9]+]]:gpr64common = LDRXui %fixed-stack.0, 0 -; CHECK-DAG: [[PTRUE:%[0-9]+]]:ppr_3b = PTRUE_S 31 -; CHECK-DAG: [[RES:%[0-9]+]]:zpr = LD1W_IMM killed [[PTRUE]], killed [[BASE]] +; CHECK-DAG: [[RES:%[0-9]+]]:zpr = LDR_ZXI killed [[BASE]] ; CHECK-DAG: $z0 = COPY [[RES]] ; CHECK: RET_ReallyLR implicit $z0 ret %z9 @@ -303,10 +300,8 @@ define aarch64_sve_vector_pcs @caller_with_many_gpr_sve_arg(i ; CHECK-NEXT: stack-id: scalable-vector ; CHECK: - { id: 1, name: '', type: default, offset: 0, size: 16, alignment: 16, ; CHECK-NEXT: stack-id: scalable-vector -; CHECK-DAG: [[PTRUE_S:%[0-9]+]]:ppr_3b = PTRUE_S 31 -; CHECK-DAG: [[PTRUE_D:%[0-9]+]]:ppr_3b = PTRUE_D 31 -; CHECK-DAG: ST1D_IMM %{{[0-9]+}}, killed [[PTRUE_D]], %stack.0, 0 -; CHECK-DAG: ST1W_IMM %{{[0-9]+}}, killed [[PTRUE_S]], %stack.1, 0 +; CHECK-DAG: STR_ZXI %{{[0-9]+}}, %stack.0, 0 +; CHECK-DAG: STR_ZXI %{{[0-9]+}}, %stack.1, 0 ; CHECK-DAG: [[BASE1:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0 ; CHECK-DAG: [[BASE2:%[0-9]+]]:gpr64common = ADDXri %stack.1, 0 ; CHECK-DAG: [[SP:%[0-9]+]]:gpr64sp = COPY $sp diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll index aa08dc7e21582..5e4c8916cbbdb 100644 --- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll @@ -17,11 +17,10 @@ define float @foo1(ptr %x0, ptr %x1, ptr %x2) nounwind { ; CHECK-NEXT: mov x0, sp ; CHECK-NEXT: ld4d { z16.d - z19.d }, p0/z, [x1] ; CHECK-NEXT: ld1d { z5.d }, p0/z, [x2] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z19.d }, p0, [sp, #3, mul vl] -; CHECK-NEXT: st1d { z18.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1d { z17.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1d { z16.d }, p0, [sp] +; CHECK-NEXT: str z19, [sp, #3, mul vl] +; CHECK-NEXT: str z18, [sp, #2, mul vl] +; CHECK-NEXT: str z17, [sp, #1, mul vl] +; CHECK-NEXT: str z16, [sp] ; CHECK-NEXT: bl callee1 ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload @@ -61,23 +60,21 @@ define float @foo2(ptr %x0, ptr %x1) nounwind { ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: fmov s0, #1.00000000 ; CHECK-NEXT: add x8, sp, #16 -; CHECK-NEXT: add x9, sp, #16 ; CHECK-NEXT: mov w2, #2 // =0x2 ; CHECK-NEXT: mov w3, #3 // =0x3 +; CHECK-NEXT: mov w4, #4 // =0x4 ; CHECK-NEXT: ld4d { z1.d - z4.d }, p0/z, [x0] ; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: mov w4, #4 // =0x4 ; CHECK-NEXT: mov w5, #5 // =0x5 ; CHECK-NEXT: mov w6, #6 // =0x6 ; CHECK-NEXT: mov w7, #7 // =0x7 ; CHECK-NEXT: ld4d { z16.d - z19.d }, p0/z, [x1] -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w1, #1 // =0x1 -; CHECK-NEXT: st1d { z19.d }, p0, [x8, #3, mul vl] -; CHECK-NEXT: st1d { z18.d }, p0, [x8, #2, mul vl] -; CHECK-NEXT: st1d { z17.d }, p0, [x8, #1, mul vl] -; CHECK-NEXT: st1d { z16.d }, p0, [x9] ; CHECK-NEXT: str x8, [sp] +; CHECK-NEXT: str z19, [x8, #3, mul vl] +; CHECK-NEXT: str z18, [x8, #2, mul vl] +; CHECK-NEXT: str z17, [x8, #1, mul vl] +; CHECK-NEXT: str z16, [x8] ; CHECK-NEXT: bl callee2 ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: add sp, sp, #16 @@ -120,10 +117,9 @@ define float @foo3(ptr %x0, ptr %x1, ptr %x2) nounwind { ; CHECK-NEXT: mov x0, sp ; CHECK-NEXT: ld3d { z16.d - z18.d }, p0/z, [x1] ; CHECK-NEXT: ld1d { z6.d }, p0/z, [x2] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z18.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1d { z17.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1d { z16.d }, p0, [sp] +; CHECK-NEXT: str z18, [sp, #2, mul vl] +; CHECK-NEXT: str z17, [sp, #1, mul vl] +; CHECK-NEXT: str z16, [sp] ; CHECK-NEXT: bl callee3 ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload @@ -157,20 +153,19 @@ entry: define double @foo4(double %x0, ptr %ptr1, ptr %ptr2, ptr %ptr3, %x1, %x2, %x3) nounwind { ; CHECK-LABEL: foo4: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z6.d }, p0/z, [x3, #1, mul vl] -; CHECK-NEXT: ld1d { z7.d }, p0/z, [x3] -; CHECK-NEXT: ld1d { z24.d }, p0/z, [x3, #3, mul vl] -; CHECK-NEXT: ld1d { z25.d }, p0/z, [x3, #2, mul vl] -; CHECK-NEXT: st1d { z4.d }, p0, [x0, #3, mul vl] -; CHECK-NEXT: st1d { z3.d }, p0, [x0, #2, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [x0, #1, mul vl] -; CHECK-NEXT: st1d { z1.d }, p0, [x0] -; CHECK-NEXT: st1d { z25.d }, p0, [x1, #2, mul vl] -; CHECK-NEXT: st1d { z24.d }, p0, [x1, #3, mul vl] -; CHECK-NEXT: st1d { z7.d }, p0, [x1] -; CHECK-NEXT: st1d { z6.d }, p0, [x1, #1, mul vl] -; CHECK-NEXT: st1d { z5.d }, p0, [x2] +; CHECK-NEXT: ldr z6, [x3, #1, mul vl] +; CHECK-NEXT: ldr z7, [x3] +; CHECK-NEXT: ldr z24, [x3, #3, mul vl] +; CHECK-NEXT: ldr z25, [x3, #2, mul vl] +; CHECK-NEXT: str z4, [x0, #3, mul vl] +; CHECK-NEXT: str z3, [x0, #2, mul vl] +; CHECK-NEXT: str z2, [x0, #1, mul vl] +; CHECK-NEXT: str z1, [x0] +; CHECK-NEXT: str z25, [x1, #2, mul vl] +; CHECK-NEXT: str z24, [x1, #3, mul vl] +; CHECK-NEXT: str z7, [x1] +; CHECK-NEXT: str z6, [x1, #1, mul vl] +; CHECK-NEXT: str z5, [x2] ; CHECK-NEXT: ret entry: store volatile %x1, ptr %ptr1 @@ -183,19 +178,18 @@ define double @foo5(i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, ptr %p ; CHECK-LABEL: foo5: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr x8, [sp] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z5.d }, p0/z, [x8, #1, mul vl] -; CHECK-NEXT: ld1d { z6.d }, p0/z, [x8] -; CHECK-NEXT: ld1d { z7.d }, p0/z, [x8, #3, mul vl] -; CHECK-NEXT: ld1d { z24.d }, p0/z, [x8, #2, mul vl] -; CHECK-NEXT: st1d { z4.d }, p0, [x6, #3, mul vl] -; CHECK-NEXT: st1d { z3.d }, p0, [x6, #2, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [x6, #1, mul vl] -; CHECK-NEXT: st1d { z1.d }, p0, [x6] -; CHECK-NEXT: st1d { z24.d }, p0, [x7, #2, mul vl] -; CHECK-NEXT: st1d { z7.d }, p0, [x7, #3, mul vl] -; CHECK-NEXT: st1d { z6.d }, p0, [x7] -; CHECK-NEXT: st1d { z5.d }, p0, [x7, #1, mul vl] +; CHECK-NEXT: ldr z5, [x8, #1, mul vl] +; CHECK-NEXT: ldr z6, [x8] +; CHECK-NEXT: ldr z7, [x8, #3, mul vl] +; CHECK-NEXT: ldr z24, [x8, #2, mul vl] +; CHECK-NEXT: str z4, [x6, #3, mul vl] +; CHECK-NEXT: str z3, [x6, #2, mul vl] +; CHECK-NEXT: str z2, [x6, #1, mul vl] +; CHECK-NEXT: str z1, [x6] +; CHECK-NEXT: str z24, [x7, #2, mul vl] +; CHECK-NEXT: str z7, [x7, #3, mul vl] +; CHECK-NEXT: str z6, [x7] +; CHECK-NEXT: str z5, [x7, #1, mul vl] ; CHECK-NEXT: ret entry: store volatile %x1, ptr %ptr1 @@ -206,17 +200,16 @@ entry: define double @foo6(double %x0, double %x1, ptr %ptr1, ptr %ptr2, %x2, %x3) nounwind { ; CHECK-LABEL: foo6: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x2] -; CHECK-NEXT: ld1d { z6.d }, p0/z, [x2, #2, mul vl] -; CHECK-NEXT: ld1d { z7.d }, p0/z, [x2, #1, mul vl] -; CHECK-NEXT: st1d { z5.d }, p0, [x0, #3, mul vl] -; CHECK-NEXT: st1d { z4.d }, p0, [x0, #2, mul vl] -; CHECK-NEXT: st1d { z3.d }, p0, [x0, #1, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [x0] -; CHECK-NEXT: st1d { z7.d }, p0, [x1, #1, mul vl] -; CHECK-NEXT: st1d { z6.d }, p0, [x1, #2, mul vl] -; CHECK-NEXT: st1d { z1.d }, p0, [x1] +; CHECK-NEXT: ldr z1, [x2] +; CHECK-NEXT: ldr z6, [x2, #2, mul vl] +; CHECK-NEXT: ldr z7, [x2, #1, mul vl] +; CHECK-NEXT: str z5, [x0, #3, mul vl] +; CHECK-NEXT: str z4, [x0, #2, mul vl] +; CHECK-NEXT: str z3, [x0, #1, mul vl] +; CHECK-NEXT: str z2, [x0] +; CHECK-NEXT: str z7, [x1, #1, mul vl] +; CHECK-NEXT: str z6, [x1, #2, mul vl] +; CHECK-NEXT: str z1, [x1] ; CHECK-NEXT: ret entry: store volatile %x2, ptr %ptr1 @@ -230,18 +223,17 @@ define void @aavpcs1(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, i32 % ; CHECK-LABEL: aavpcs1: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldp x8, x9, [sp] -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1w { z24.s }, p0/z, [x7] -; CHECK-NEXT: ld1w { z3.s }, p0/z, [x8] -; CHECK-NEXT: st1w { z0.s }, p0, [x9] -; CHECK-NEXT: st1w { z1.s }, p0, [x9] -; CHECK-NEXT: st1w { z2.s }, p0, [x9] -; CHECK-NEXT: st1w { z4.s }, p0, [x9] -; CHECK-NEXT: st1w { z5.s }, p0, [x9] -; CHECK-NEXT: st1w { z6.s }, p0, [x9] -; CHECK-NEXT: st1w { z7.s }, p0, [x9] -; CHECK-NEXT: st1w { z24.s }, p0, [x9] -; CHECK-NEXT: st1w { z3.s }, p0, [x9] +; CHECK-NEXT: ldr z24, [x7] +; CHECK-NEXT: ldr z3, [x8] +; CHECK-NEXT: str z0, [x9] +; CHECK-NEXT: str z1, [x9] +; CHECK-NEXT: str z2, [x9] +; CHECK-NEXT: str z4, [x9] +; CHECK-NEXT: str z5, [x9] +; CHECK-NEXT: str z6, [x9] +; CHECK-NEXT: str z7, [x9] +; CHECK-NEXT: str z24, [x9] +; CHECK-NEXT: str z3, [x9] ; CHECK-NEXT: ret entry: store volatile %s7, ptr %ptr @@ -262,24 +254,23 @@ define void @aavpcs2(float %s0, float %s1, float %s2, float %s3, float %s4, floa ; CHECK-LABEL: aavpcs2: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldp x8, x9, [sp] -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x7] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z3.s }, p0/z, [x6] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] -; CHECK-NEXT: ld1w { z4.s }, p0/z, [x5] -; CHECK-NEXT: ld1w { z5.s }, p0/z, [x1] -; CHECK-NEXT: ld1w { z6.s }, p0/z, [x4] -; CHECK-NEXT: ld1w { z24.s }, p0/z, [x3] -; CHECK-NEXT: st1w { z7.s }, p0, [x9] -; CHECK-NEXT: st1w { z2.s }, p0, [x9] -; CHECK-NEXT: st1w { z5.s }, p0, [x9] -; CHECK-NEXT: st1w { z24.s }, p0, [x9] -; CHECK-NEXT: st1w { z6.s }, p0, [x9] -; CHECK-NEXT: st1w { z4.s }, p0, [x9] -; CHECK-NEXT: st1w { z3.s }, p0, [x9] -; CHECK-NEXT: st1w { z1.s }, p0, [x9] -; CHECK-NEXT: st1w { z0.s }, p0, [x9] +; CHECK-NEXT: ldr z1, [x7] +; CHECK-NEXT: ldr z2, [x0] +; CHECK-NEXT: ldr z3, [x6] +; CHECK-NEXT: ldr z4, [x5] +; CHECK-NEXT: ldr z5, [x1] +; CHECK-NEXT: ldr z6, [x4] +; CHECK-NEXT: ldr z24, [x3] +; CHECK-NEXT: ldr z0, [x8] +; CHECK-NEXT: str z7, [x9] +; CHECK-NEXT: str z2, [x9] +; CHECK-NEXT: str z5, [x9] +; CHECK-NEXT: str z24, [x9] +; CHECK-NEXT: str z6, [x9] +; CHECK-NEXT: str z4, [x9] +; CHECK-NEXT: str z3, [x9] +; CHECK-NEXT: str z1, [x9] +; CHECK-NEXT: str z0, [x9] ; CHECK-NEXT: ret entry: store volatile %s7, ptr %ptr @@ -300,26 +291,25 @@ define void @aavpcs3(float %s0, float %s1, float %s2, float %s3, float %s4, floa ; CHECK-LABEL: aavpcs3: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr x8, [sp] -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x7] -; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] -; CHECK-NEXT: ld1w { z4.s }, p0/z, [x6] -; CHECK-NEXT: ld1w { z5.s }, p0/z, [x5] -; CHECK-NEXT: ld1w { z6.s }, p0/z, [x2] -; CHECK-NEXT: ld1w { z7.s }, p0/z, [x4] -; CHECK-NEXT: ld1w { z24.s }, p0/z, [x3] +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z2, [x7] +; CHECK-NEXT: ldr z3, [x1] +; CHECK-NEXT: ldr z4, [x6] +; CHECK-NEXT: ldr z5, [x5] +; CHECK-NEXT: ldr z1, [x8] +; CHECK-NEXT: ldr z6, [x2] +; CHECK-NEXT: ldr z7, [x4] +; CHECK-NEXT: ldr z24, [x3] ; CHECK-NEXT: ldr x8, [sp, #16] -; CHECK-NEXT: st1w { z1.s }, p0, [x8] -; CHECK-NEXT: st1w { z3.s }, p0, [x8] -; CHECK-NEXT: st1w { z6.s }, p0, [x8] -; CHECK-NEXT: st1w { z24.s }, p0, [x8] -; CHECK-NEXT: st1w { z7.s }, p0, [x8] -; CHECK-NEXT: st1w { z5.s }, p0, [x8] -; CHECK-NEXT: st1w { z4.s }, p0, [x8] -; CHECK-NEXT: st1w { z2.s }, p0, [x8] -; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: str z0, [x8] +; CHECK-NEXT: str z3, [x8] +; CHECK-NEXT: str z6, [x8] +; CHECK-NEXT: str z24, [x8] +; CHECK-NEXT: str z7, [x8] +; CHECK-NEXT: str z5, [x8] +; CHECK-NEXT: str z4, [x8] +; CHECK-NEXT: str z2, [x8] +; CHECK-NEXT: str z1, [x8] ; CHECK-NEXT: ret entry: store volatile %s8, ptr %ptr @@ -340,18 +330,17 @@ define void @aavpcs4(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, i32 % ; CHECK-LABEL: aavpcs4: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr x8, [sp] -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr x9, [sp, #16] -; CHECK-NEXT: ld1w { z24.s }, p0/z, [x8] -; CHECK-NEXT: st1w { z0.s }, p0, [x9] -; CHECK-NEXT: st1w { z1.s }, p0, [x9] -; CHECK-NEXT: st1w { z2.s }, p0, [x9] -; CHECK-NEXT: st1w { z3.s }, p0, [x9] -; CHECK-NEXT: st1w { z4.s }, p0, [x9] -; CHECK-NEXT: st1w { z5.s }, p0, [x9] -; CHECK-NEXT: st1w { z6.s }, p0, [x9] -; CHECK-NEXT: st1w { z7.s }, p0, [x9] -; CHECK-NEXT: st1w { z24.s }, p0, [x9] +; CHECK-NEXT: ldr z24, [x8] +; CHECK-NEXT: str z0, [x9] +; CHECK-NEXT: str z1, [x9] +; CHECK-NEXT: str z2, [x9] +; CHECK-NEXT: str z3, [x9] +; CHECK-NEXT: str z4, [x9] +; CHECK-NEXT: str z5, [x9] +; CHECK-NEXT: str z6, [x9] +; CHECK-NEXT: str z7, [x9] +; CHECK-NEXT: str z24, [x9] ; CHECK-NEXT: ret entry: store volatile %s8, ptr %ptr @@ -372,26 +361,25 @@ define @aavpcs5(float %s0, float %s1, float %s2, float %s3, ; CHECK-LABEL: aavpcs5: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr x8, [sp] -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x7] -; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] -; CHECK-NEXT: ld1w { z4.s }, p0/z, [x6] -; CHECK-NEXT: ld1w { z5.s }, p0/z, [x5] -; CHECK-NEXT: ld1w { z6.s }, p0/z, [x2] -; CHECK-NEXT: ld1w { z7.s }, p0/z, [x4] -; CHECK-NEXT: ld1w { z24.s }, p0/z, [x3] +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z2, [x7] +; CHECK-NEXT: ldr z3, [x1] +; CHECK-NEXT: ldr z4, [x6] +; CHECK-NEXT: ldr z5, [x5] +; CHECK-NEXT: ldr z1, [x8] +; CHECK-NEXT: ldr z6, [x2] +; CHECK-NEXT: ldr z7, [x4] +; CHECK-NEXT: ldr z24, [x3] ; CHECK-NEXT: ldr x8, [sp, #16] -; CHECK-NEXT: st1w { z0.s }, p0, [x8] -; CHECK-NEXT: st1w { z3.s }, p0, [x8] -; CHECK-NEXT: st1w { z6.s }, p0, [x8] -; CHECK-NEXT: st1w { z24.s }, p0, [x8] -; CHECK-NEXT: st1w { z7.s }, p0, [x8] -; CHECK-NEXT: st1w { z5.s }, p0, [x8] -; CHECK-NEXT: st1w { z4.s }, p0, [x8] -; CHECK-NEXT: st1w { z2.s }, p0, [x8] -; CHECK-NEXT: st1w { z1.s }, p0, [x8] +; CHECK-NEXT: str z0, [x8] +; CHECK-NEXT: str z3, [x8] +; CHECK-NEXT: str z6, [x8] +; CHECK-NEXT: str z24, [x8] +; CHECK-NEXT: str z7, [x8] +; CHECK-NEXT: str z5, [x8] +; CHECK-NEXT: str z4, [x8] +; CHECK-NEXT: str z2, [x8] +; CHECK-NEXT: str z1, [x8] ; CHECK-NEXT: ret entry: store volatile %s8, ptr %ptr @@ -410,26 +398,25 @@ define void @aapcs1(float %s0, float %s1, float %s2, float %s3, float %s4, float ; CHECK-LABEL: aapcs1: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr x8, [sp] -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x7] -; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] -; CHECK-NEXT: ld1w { z4.s }, p0/z, [x6] -; CHECK-NEXT: ld1w { z5.s }, p0/z, [x5] -; CHECK-NEXT: ld1w { z6.s }, p0/z, [x2] -; CHECK-NEXT: ld1w { z7.s }, p0/z, [x4] -; CHECK-NEXT: ld1w { z16.s }, p0/z, [x3] +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z2, [x7] +; CHECK-NEXT: ldr z3, [x1] +; CHECK-NEXT: ldr z4, [x6] +; CHECK-NEXT: ldr z5, [x5] +; CHECK-NEXT: ldr z1, [x8] +; CHECK-NEXT: ldr z6, [x2] +; CHECK-NEXT: ldr z7, [x4] +; CHECK-NEXT: ldr z16, [x3] ; CHECK-NEXT: ldr x8, [sp, #16] -; CHECK-NEXT: st1w { z1.s }, p0, [x8] -; CHECK-NEXT: st1w { z3.s }, p0, [x8] -; CHECK-NEXT: st1w { z6.s }, p0, [x8] -; CHECK-NEXT: st1w { z16.s }, p0, [x8] -; CHECK-NEXT: st1w { z7.s }, p0, [x8] -; CHECK-NEXT: st1w { z5.s }, p0, [x8] -; CHECK-NEXT: st1w { z4.s }, p0, [x8] -; CHECK-NEXT: st1w { z2.s }, p0, [x8] -; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: str z0, [x8] +; CHECK-NEXT: str z3, [x8] +; CHECK-NEXT: str z6, [x8] +; CHECK-NEXT: str z16, [x8] +; CHECK-NEXT: str z7, [x8] +; CHECK-NEXT: str z5, [x8] +; CHECK-NEXT: str z4, [x8] +; CHECK-NEXT: str z2, [x8] +; CHECK-NEXT: str z1, [x8] ; CHECK-NEXT: ret entry: store volatile %s8, ptr %ptr @@ -480,21 +467,20 @@ define void @non_sve_caller_high_range_non_sve_callee_high_range(float %f0, floa ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: ldr z16, [x0] +; CHECK-NEXT: ldr z17, [x1] ; CHECK-NEXT: fmov s1, #1.00000000 ; CHECK-NEXT: fmov s2, #2.00000000 ; CHECK-NEXT: fmov s3, #3.00000000 ; CHECK-NEXT: fmov s4, #4.00000000 -; CHECK-NEXT: ld1w { z16.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z17.s }, p0/z, [x1] -; CHECK-NEXT: addvl x0, sp, #1 ; CHECK-NEXT: fmov s5, #5.00000000 +; CHECK-NEXT: addvl x0, sp, #1 ; CHECK-NEXT: fmov s6, #6.00000000 -; CHECK-NEXT: mov x1, sp ; CHECK-NEXT: fmov s7, #7.00000000 -; CHECK-NEXT: st1w { z17.s }, p0, [sp] -; CHECK-NEXT: st1w { z16.s }, p0, [sp, #1, mul vl] +; CHECK-NEXT: mov x1, sp +; CHECK-NEXT: str z17, [sp] +; CHECK-NEXT: str z16, [sp, #1, mul vl] ; CHECK-NEXT: bl non_sve_callee_high_range ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload @@ -559,11 +545,10 @@ define @sve_caller_non_sve_callee_high_range( %val, ptr %a, %val, ptr %a, %mask) { ; CHECK-LABEL: dead_masked_store_alltrue_same: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ret call void @llvm.masked.store.nxv4i32( %val, ptr %a, i32 4, %mask) call void @llvm.masked.store.nxv4i32( %val, ptr %a, i32 4, splat(i1 true)) @@ -25,8 +24,7 @@ define void @dead_masked_store_alltrue_same( %val, ptr %a, %val, %val1, ptr %a, %mask) { ; CHECK-LABEL: dead_masked_store_alltrue_bigger: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: st1w { z1.s }, p0, [x0] +; CHECK-NEXT: str z1, [x0] ; CHECK-NEXT: ret call void @llvm.masked.store.nxv4i16( %val, ptr %a, i32 4, %mask) call void @llvm.masked.store.nxv4i32( %val1, ptr %a, i32 4, splat(i1 true)) diff --git a/llvm/test/CodeGen/AArch64/sve-extload-icmp.ll b/llvm/test/CodeGen/AArch64/sve-extload-icmp.ll index ad3e0b58028a6..d5f74e6b9e0c4 100644 --- a/llvm/test/CodeGen/AArch64/sve-extload-icmp.ll +++ b/llvm/test/CodeGen/AArch64/sve-extload-icmp.ll @@ -19,8 +19,8 @@ define @extload_icmp_nxv8i8(ptr %in) #0 { define @extload_icmp_nxv16i8(ptr %in) #0 { ; CHECK-LABEL: extload_icmp_nxv16i8: ; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: cnot z0.b, p0/m, z0.b ; CHECK-NEXT: ret %ld = load , ptr %in @@ -45,8 +45,8 @@ define @extload_icmp_nxv4i16(ptr %in) #0 { define @extload_icmp_nxv8i16(ptr %in) #0 { ; CHECK-LABEL: extload_icmp_nxv8i16: ; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: cnot z0.h, p0/m, z0.h ; CHECK-NEXT: ret %ld = load , ptr %in @@ -71,8 +71,8 @@ define @extload_icmp_nxv2i32(ptr %in) #0 { define @extload_icmp_nxv4i32(ptr %in) #0 { ; CHECK-LABEL: extload_icmp_nxv4i32: ; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: cnot z0.s, p0/m, z0.s ; CHECK-NEXT: ret %ld = load , ptr %in @@ -84,8 +84,8 @@ define @extload_icmp_nxv4i32(ptr %in) #0 { define @extload_icmp_nxv2i64(ptr %in) #0 { ; CHECK-LABEL: extload_icmp_nxv2i64: ; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: cnot z0.d, p0/m, z0.d ; CHECK-NEXT: ret %ld = load , ptr %in diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll index 5b7522856e2da..d02aa061b25d9 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll @@ -10,11 +10,10 @@ define <4 x i32> @extract_v4i32_nxv16i32_12( %arg) { ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: st1w { z3.s }, p0, [sp, #3, mul vl] -; CHECK-NEXT: st1w { z2.s }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: str z3, [sp, #3, mul vl] +; CHECK-NEXT: str z2, [sp, #2, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: ldr q0, [sp, #48] ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -30,9 +29,8 @@ define <8 x i16> @extract_v8i16_nxv32i16_8( %arg) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: ldr q0, [sp, #16] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -48,11 +46,10 @@ define <4 x i16> @extract_v4i16_nxv32i16_8( %arg) { ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: st1h { z3.h }, p0, [sp, #3, mul vl] -; CHECK-NEXT: st1h { z2.h }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: str z3, [sp, #3, mul vl] +; CHECK-NEXT: str z2, [sp, #2, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: ldr d0, [sp, #32] ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -70,17 +67,16 @@ define <2 x i16> @extract_v2i16_nxv32i16_8( %arg) { ; CHECK-NEXT: addvl sp, sp, #-8 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str z3, [sp, #3, mul vl] +; CHECK-NEXT: str z2, [sp, #2, mul vl] ; CHECK-NEXT: add x8, x8, #32 -; CHECK-NEXT: st1h { z3.h }, p0, [sp, #3, mul vl] -; CHECK-NEXT: st1h { z2.h }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1h { z0.h }, p0, [sp] -; CHECK-NEXT: st1h { z3.h }, p0, [sp, #7, mul vl] -; CHECK-NEXT: st1h { z2.h }, p0, [sp, #6, mul vl] -; CHECK-NEXT: st1h { z1.h }, p0, [sp, #5, mul vl] -; CHECK-NEXT: st1h { z0.h }, p0, [sp, #4, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: str z3, [sp, #7, mul vl] +; CHECK-NEXT: str z2, [sp, #6, mul vl] +; CHECK-NEXT: str z1, [sp, #5, mul vl] +; CHECK-NEXT: str z0, [sp, #4, mul vl] ; CHECK-NEXT: ld1 { v0.h }[0], [x8] ; CHECK-NEXT: addvl x8, sp, #4 ; CHECK-NEXT: add x8, x8, #34 @@ -102,16 +98,15 @@ define <2 x i64> @extract_v2i64_nxv8i64_8( %arg) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: cnth x8 ; CHECK-NEXT: mov w9, #8 // =0x8 -; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: str z3, [sp, #3, mul vl] ; CHECK-NEXT: sub x8, x8, #2 +; CHECK-NEXT: str z2, [sp, #2, mul vl] ; CHECK-NEXT: cmp x8, #8 -; CHECK-NEXT: st1d { z3.d }, p0, [sp, #3, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] ; CHECK-NEXT: csel x8, x8, x9, lo -; CHECK-NEXT: st1d { z2.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: ldr q0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -127,11 +122,10 @@ define <4 x float> @extract_v4f32_nxv16f32_12( %arg) { ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: st1w { z3.s }, p0, [sp, #3, mul vl] -; CHECK-NEXT: st1w { z2.s }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: str z3, [sp, #3, mul vl] +; CHECK-NEXT: str z2, [sp, #2, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: ldr q0, [sp, #48] ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -179,16 +173,15 @@ define <4 x i1> @extract_v4i1_nxv32i1_16( %arg) { ; CHECK-NEXT: mov z0.b, p1/z, #1 // =0x1 ; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: ptrue p2.b ; CHECK-NEXT: add x8, x8, #16 -; CHECK-NEXT: st1b { z0.b }, p2, [sp, #1, mul vl] -; CHECK-NEXT: st1b { z1.b }, p2, [sp] -; CHECK-NEXT: st1b { z0.b }, p2, [sp, #3, mul vl] -; CHECK-NEXT: st1b { z1.b }, p2, [sp, #2, mul vl] -; CHECK-NEXT: st1b { z0.b }, p2, [sp, #5, mul vl] -; CHECK-NEXT: st1b { z1.b }, p2, [sp, #4, mul vl] -; CHECK-NEXT: st1b { z0.b }, p2, [sp, #7, mul vl] -; CHECK-NEXT: st1b { z1.b }, p2, [sp, #6, mul vl] +; CHECK-NEXT: str z0, [sp, #1, mul vl] +; CHECK-NEXT: str z1, [sp] +; CHECK-NEXT: str z0, [sp, #3, mul vl] +; CHECK-NEXT: str z1, [sp, #2, mul vl] +; CHECK-NEXT: str z0, [sp, #5, mul vl] +; CHECK-NEXT: str z1, [sp, #4, mul vl] +; CHECK-NEXT: str z0, [sp, #7, mul vl] +; CHECK-NEXT: str z1, [sp, #6, mul vl] ; CHECK-NEXT: ld1 { v0.b }[0], [x8] ; CHECK-NEXT: addvl x8, sp, #2 ; CHECK-NEXT: add x8, x8, #17 @@ -233,17 +226,16 @@ define <4 x i3> @extract_v4i3_nxv32i3_16( %arg) { ; CHECK-NEXT: addvl sp, sp, #-8 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: add x8, x8, #16 -; CHECK-NEXT: st1b { z1.b }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1b { z0.b }, p0, [sp] -; CHECK-NEXT: st1b { z1.b }, p0, [sp, #3, mul vl] -; CHECK-NEXT: st1b { z0.b }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1b { z1.b }, p0, [sp, #5, mul vl] -; CHECK-NEXT: st1b { z0.b }, p0, [sp, #4, mul vl] -; CHECK-NEXT: st1b { z1.b }, p0, [sp, #7, mul vl] -; CHECK-NEXT: st1b { z0.b }, p0, [sp, #6, mul vl] +; CHECK-NEXT: str z1, [sp, #3, mul vl] +; CHECK-NEXT: str z0, [sp, #2, mul vl] +; CHECK-NEXT: str z1, [sp, #5, mul vl] +; CHECK-NEXT: str z0, [sp, #4, mul vl] +; CHECK-NEXT: str z1, [sp, #7, mul vl] +; CHECK-NEXT: str z0, [sp, #6, mul vl] ; CHECK-NEXT: ld1 { v0.b }[0], [x8] ; CHECK-NEXT: addvl x8, sp, #2 ; CHECK-NEXT: add x8, x8, #17 @@ -281,9 +273,8 @@ define <4 x i64> @extract_v4i64_nxv8i64_0( %arg) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ldr q1, [sp, #16] ; CHECK-NEXT: addvl sp, sp, #2 diff --git a/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll b/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll index 02e613f88a0aa..8620c9a34b5d6 100644 --- a/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll +++ b/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll @@ -4,8 +4,7 @@ define @sti64ldi64(ptr nocapture %P, %v) { ; CHECK-LABEL: sti64ldi64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [x0, #1, mul vl] ; CHECK-NEXT: ret entry: %arrayidx0 = getelementptr inbounds , ptr %P, i64 1 @@ -18,8 +17,7 @@ entry: define @stf64ldf64(ptr nocapture %P, %v) { ; CHECK-LABEL: stf64ldf64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [x0, #1, mul vl] ; CHECK-NEXT: ret entry: %arrayidx0 = getelementptr inbounds , ptr %P, i64 1 @@ -48,9 +46,8 @@ entry: define <2 x i64> @sti64ldfixedi64(ptr nocapture %P, %v) { ; CHECK-LABEL: sti64ldfixedi64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: st1d { z0.d }, p0, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [x0, #1, mul vl] ; CHECK-NEXT: ldr q0, [x0, x8] ; CHECK-NEXT: ret entry: @@ -64,10 +61,8 @@ entry: define @sti64ldi32(ptr nocapture %P, %v) { ; CHECK-LABEL: sti64ldi32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ptrue p1.s -; CHECK-NEXT: st1d { z0.d }, p0, [x0, #1, mul vl] -; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [x0, #1, mul vl] +; CHECK-NEXT: ldr z0, [x0, #1, mul vl] ; CHECK-NEXT: ret entry: %0 = bitcast ptr %P to ptr @@ -81,9 +76,8 @@ entry: define @stf64ldi64(ptr nocapture %P, %v) { ; CHECK-LABEL: stf64ldi64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0, #1, mul vl] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [x0, #1, mul vl] +; CHECK-NEXT: ldr z0, [x0, #1, mul vl] ; CHECK-NEXT: ret entry: %0 = bitcast ptr %P to ptr diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll index b0b6a6a530dda..4b93900c7d272 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll @@ -52,13 +52,13 @@ define half @fadda_nxv6f16( %v, half %s) { ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: mov w8, #32768 // =0x8000 -; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: mov z2.h, w8 -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: fmov s0, s1 -; CHECK-NEXT: st1h { z2.d }, p1, [sp, #3, mul vl] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [sp] +; CHECK-NEXT: st1h { z2.d }, p0, [sp, #3, mul vl] +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr z2, [sp] ; CHECK-NEXT: fadda h0, p0, h0, z2.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: addvl sp, sp, #1 @@ -78,19 +78,19 @@ define half @fadda_nxv10f16( %v, half %s) { ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $h2 killed $h2 def $z2 ; CHECK-NEXT: mov w8, #32768 // =0x8000 +; CHECK-NEXT: str z1, [sp] ; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: fadda h2, p0, h2, z0.h -; CHECK-NEXT: st1h { z1.h }, p0, [sp] ; CHECK-NEXT: mov z0.h, w8 ; CHECK-NEXT: addvl x8, sp, #1 ; CHECK-NEXT: st1h { z0.d }, p1, [sp, #1, mul vl] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [sp] -; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ldr z1, [sp] +; CHECK-NEXT: str z1, [sp, #1, mul vl] ; CHECK-NEXT: st1h { z0.d }, p1, [sp, #6, mul vl] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [sp, #1, mul vl] -; CHECK-NEXT: st1h { z1.h }, p0, [sp, #2, mul vl] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: str z1, [sp, #2, mul vl] ; CHECK-NEXT: st1h { z0.d }, p1, [x8, #7, mul vl] -; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp, #2, mul vl] +; CHECK-NEXT: ldr z0, [sp, #2, mul vl] ; CHECK-NEXT: fadda h2, p0, h2, z0.h ; CHECK-NEXT: fmov s0, s2 ; CHECK-NEXT: addvl sp, sp, #3 diff --git a/llvm/test/CodeGen/AArch64/sve-fp.ll b/llvm/test/CodeGen/AArch64/sve-fp.ll index a3fc6ded5f9fa..2f3f99ce54544 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp.ll @@ -513,9 +513,8 @@ define void @scalar_to_vector(ptr %outval, %pred, , ptr %P1, align 16 store %A, ptr %P2, align 16 diff --git a/llvm/test/CodeGen/AArch64/sve-fpext-load.ll b/llvm/test/CodeGen/AArch64/sve-fpext-load.ll index aec2150124c3f..9812c3775d416 100644 --- a/llvm/test/CodeGen/AArch64/sve-fpext-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-fpext-load.ll @@ -37,8 +37,7 @@ define @ext4_f16_f64(ptr %ptr, i64 %index) { define @ext8_f16_f64(ptr %ptr, i64 %index) { ; CHECK-LABEL: ext8_f16_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z1.s, z0.h ; CHECK-NEXT: uunpkhi z0.s, z0.h @@ -76,8 +75,7 @@ define @ext2_f32_f64(ptr %ptr, i64 %index) { define @ext4_f32_f64(ptr %ptr, i64 %index) { ; CHECK-LABEL: ext4_f32_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s diff --git a/llvm/test/CodeGen/AArch64/sve-fptrunc-store.ll b/llvm/test/CodeGen/AArch64/sve-fptrunc-store.ll index d813294a0c415..573958771658c 100644 --- a/llvm/test/CodeGen/AArch64/sve-fptrunc-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-fptrunc-store.ll @@ -4,8 +4,8 @@ define void @fptrunc2_f64_f32(ptr %dst, ptr %src) { ; CHECK-LABEL: fptrunc2_f64_f32: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x1] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] ; CHECK-NEXT: fcvt z0.s, p0/m, z0.d ; CHECK-NEXT: st1w { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -19,8 +19,8 @@ entry: define void @fptrunc2_f64_f16(ptr %dst, ptr %src) { ; CHECK-LABEL: fptrunc2_f64_f16: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x1] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] ; CHECK-NEXT: fcvt z0.h, p0/m, z0.d ; CHECK-NEXT: st1h { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -34,8 +34,8 @@ entry: define void @fptrunc4_f32_f16(ptr %dst, ptr %src) { ; CHECK-LABEL: fptrunc4_f32_f16: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x1] ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1] ; CHECK-NEXT: fcvt z0.h, p0/m, z0.s ; CHECK-NEXT: st1h { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -64,20 +64,19 @@ entry: define void @fptrunc8_f64_f16(ptr %dst, ptr %src) { ; CHECK-LABEL: fptrunc8_f64_f16: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x1, #3, mul vl] +; CHECK-NEXT: ldr z1, [x1] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1, #3, mul vl] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1, #2, mul vl] +; CHECK-NEXT: ldr z2, [x1, #1, mul vl] +; CHECK-NEXT: ldr z3, [x1, #2, mul vl] ; CHECK-NEXT: fcvt z0.h, p0/m, z0.d -; CHECK-NEXT: fcvt z2.h, p0/m, z2.d ; CHECK-NEXT: fcvt z1.h, p0/m, z1.d ; CHECK-NEXT: fcvt z3.h, p0/m, z3.d -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fcvt z2.h, p0/m, z2.d ; CHECK-NEXT: uzp1 z0.s, z3.s, z0.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s ; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ret entry: %0 = load , ptr %src, align 8 diff --git a/llvm/test/CodeGen/AArch64/sve-insert-element.ll b/llvm/test/CodeGen/AArch64/sve-insert-element.ll index 7cd85ab506172..7f558e32ae397 100644 --- a/llvm/test/CodeGen/AArch64/sve-insert-element.ll +++ b/llvm/test/CodeGen/AArch64/sve-insert-element.ll @@ -598,11 +598,11 @@ define @test_predicate_insert_32xi1( %val, ; CHECK-NEXT: cmp x9, x8 ; CHECK-NEXT: csel x8, x9, x8, lo ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: st1b { z0.b }, p1, [sp, #1, mul vl] -; CHECK-NEXT: st1b { z1.b }, p1, [sp] +; CHECK-NEXT: str z0, [sp, #1, mul vl] +; CHECK-NEXT: str z1, [sp] ; CHECK-NEXT: strb w0, [x9, x8] -; CHECK-NEXT: ld1b { z0.b }, p1/z, [sp] -; CHECK-NEXT: ld1b { z1.b }, p1/z, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] ; CHECK-NEXT: and z0.b, z0.b, #0x1 ; CHECK-NEXT: and z1.b, z1.b, #0x1 ; CHECK-NEXT: cmpne p0.b, p1/z, z0.b, #0 diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll index 581c163388985..14948647c2f8d 100644 --- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll @@ -19,15 +19,14 @@ define @insert_v2i64_nxv2i64_idx2( %vec, <2 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: cntd x8 ; CHECK-NEXT: mov w9, #2 // =0x2 -; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: sub x8, x8, #2 ; CHECK-NEXT: cmp x8, #2 -; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #3 ; CHECK-NEXT: str q1, [x9, x8] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: ldr z0, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -53,15 +52,14 @@ define @insert_v4i32_nxv4i32_idx4( %vec, <4 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: cntw x8 ; CHECK-NEXT: mov w9, #4 // =0x4 -; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: sub x8, x8, #4 ; CHECK-NEXT: cmp x8, #4 -; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #2 ; CHECK-NEXT: str q1, [x9, x8] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] +; CHECK-NEXT: ldr z0, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -87,15 +85,14 @@ define @insert_v8i16_nxv8i16_idx8( %vec, <8 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: cnth x8 ; CHECK-NEXT: mov w9, #8 // =0x8 -; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: sub x8, x8, #8 ; CHECK-NEXT: cmp x8, #8 -; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #1 ; CHECK-NEXT: str q1, [x9, x8] -; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] +; CHECK-NEXT: ldr z0, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -120,15 +117,14 @@ define @insert_v16i8_nxv16i8_idx16( %vec, < ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w9, #16 // =0x10 +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: sub x8, x8, #16 ; CHECK-NEXT: cmp x8, #16 -; CHECK-NEXT: st1b { z0.b }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: str q1, [x9, x8] -; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp] +; CHECK-NEXT: ldr z0, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -142,15 +138,14 @@ define @insert_v16i8_nxv16i8_idx16( %vec, < define void @insert_nxv8i64_nxv16i64( %sv0, %sv1, ptr %out) { ; CHECK-LABEL: insert_nxv8i64_nxv16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z7.d }, p0, [x0, #7, mul vl] -; CHECK-NEXT: st1d { z6.d }, p0, [x0, #6, mul vl] -; CHECK-NEXT: st1d { z5.d }, p0, [x0, #5, mul vl] -; CHECK-NEXT: st1d { z4.d }, p0, [x0, #4, mul vl] -; CHECK-NEXT: st1d { z3.d }, p0, [x0, #3, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [x0, #2, mul vl] -; CHECK-NEXT: st1d { z1.d }, p0, [x0, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: str z7, [x0, #7, mul vl] +; CHECK-NEXT: str z6, [x0, #6, mul vl] +; CHECK-NEXT: str z5, [x0, #5, mul vl] +; CHECK-NEXT: str z4, [x0, #4, mul vl] +; CHECK-NEXT: str z3, [x0, #3, mul vl] +; CHECK-NEXT: str z2, [x0, #2, mul vl] +; CHECK-NEXT: str z1, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ret %v0 = call @llvm.vector.insert.nxv8i64.nxv16i64( poison, %sv0, i64 0) %v = call @llvm.vector.insert.nxv8i64.nxv16i64( %v0, %sv1, i64 8) @@ -161,11 +156,10 @@ define void @insert_nxv8i64_nxv16i64( %sv0, define void @insert_nxv8i64_nxv16i64_lo( %sv0, ptr %out) { ; CHECK-LABEL: insert_nxv8i64_nxv16i64_lo: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z3.d }, p0, [x0, #3, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [x0, #2, mul vl] -; CHECK-NEXT: st1d { z1.d }, p0, [x0, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: str z3, [x0, #3, mul vl] +; CHECK-NEXT: str z2, [x0, #2, mul vl] +; CHECK-NEXT: str z1, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ret %v = call @llvm.vector.insert.nxv8i64.nxv16i64( poison, %sv0, i64 0) store %v, ptr %out @@ -175,11 +169,10 @@ define void @insert_nxv8i64_nxv16i64_lo( %sv0, ptr %out) { define void @insert_nxv8i64_nxv16i64_hi( %sv0, ptr %out) { ; CHECK-LABEL: insert_nxv8i64_nxv16i64_hi: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z3.d }, p0, [x0, #7, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [x0, #6, mul vl] -; CHECK-NEXT: st1d { z1.d }, p0, [x0, #5, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [x0, #4, mul vl] +; CHECK-NEXT: str z3, [x0, #7, mul vl] +; CHECK-NEXT: str z2, [x0, #6, mul vl] +; CHECK-NEXT: str z1, [x0, #5, mul vl] +; CHECK-NEXT: str z0, [x0, #4, mul vl] ; CHECK-NEXT: ret %v = call @llvm.vector.insert.nxv8i64.nxv16i64( poison, %sv0, i64 8) store %v, ptr %out @@ -194,18 +187,17 @@ define void @insert_v2i64_nxv16i64(<2 x i64> %sv0, <2 x i64> %sv1, ptr %out) uwt ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: str q1, [sp, #32] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #3, mul vl] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [sp, #2, mul vl] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [sp, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p0/z, [sp] -; CHECK-NEXT: st1d { z0.d }, p0, [x0, #3, mul vl] -; CHECK-NEXT: st1d { z1.d }, p0, [x0, #2, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [x0, #1, mul vl] -; CHECK-NEXT: st1d { z3.d }, p0, [x0] +; CHECK-NEXT: ldr z0, [sp, #3, mul vl] +; CHECK-NEXT: ldr z1, [sp, #2, mul vl] +; CHECK-NEXT: ldr z2, [sp, #1, mul vl] +; CHECK-NEXT: ldr z3, [sp] +; CHECK-NEXT: str z0, [x0, #3, mul vl] +; CHECK-NEXT: str z1, [x0, #2, mul vl] +; CHECK-NEXT: str z2, [x0, #1, mul vl] +; CHECK-NEXT: str z3, [x0] ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -221,9 +213,8 @@ define void @insert_v2i64_nxv16i64(<2 x i64> %sv0, <2 x i64> %sv1, ptr %out) uwt define void @insert_v2i64_nxv16i64_lo0(ptr %psv, ptr %out) { ; CHECK-LABEL: insert_v2i64_nxv16i64_lo0: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-NEXT: str z0, [x1] ; CHECK-NEXT: ret %sv = load <2 x i64>, ptr %psv %v = call @llvm.vector.insert.v2i64.nxv16i64( poison, <2 x i64> %sv, i64 0) @@ -240,12 +231,11 @@ define void @insert_v2i64_nxv16i64_lo2(ptr %psv, ptr %out) uwtable { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: str q0, [sp, #16] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [sp] -; CHECK-NEXT: st1d { z0.d }, p0, [x1, #1, mul vl] -; CHECK-NEXT: st1d { z1.d }, p0, [x1] +; CHECK-NEXT: ldr z0, [sp, #1, mul vl] +; CHECK-NEXT: ldr z1, [sp] +; CHECK-NEXT: str z0, [x1, #1, mul vl] +; CHECK-NEXT: str z1, [x1] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -331,11 +321,10 @@ define @insert_nxv8f16_nxv2f16( %vec, @insert_fixed_v2i64_nxv2i64( %vec, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: str q1, [sp, #16] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: ldr z0, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -388,9 +376,9 @@ define @insert_fixed_v4i64_nxv2i64( %vec, p ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] -; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: st1d { z1.d }, p0, [sp] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: ldr z0, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -457,10 +445,10 @@ define @insert_nxv6i32_nxv2i32( %sv0, @mla_i8_multiuse( %a, %a, %b store %prod, ptr %p diff --git a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll index 6c62913e9a8e4..523fdea6b2231 100644 --- a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll @@ -6,8 +6,7 @@ define @ld1b_lower_bound(ptr %a) { ; CHECK-LABEL: ld1b_lower_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #-8, mul vl] +; CHECK-NEXT: ldr z0, [x0, #-8, mul vl] ; CHECK-NEXT: ret %base = getelementptr , ptr %a, i64 -8 %load = load , ptr %base @@ -17,8 +16,7 @@ define @ld1b_lower_bound(ptr %a) { define @ld1b_inbound(ptr %a) { ; CHECK-LABEL: ld1b_inbound: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ldr z0, [x0, #2, mul vl] ; CHECK-NEXT: ret %base = getelementptr , ptr %a, i64 2 %load = load , ptr %base @@ -28,8 +26,7 @@ define @ld1b_inbound(ptr %a) { define @ld1b_upper_bound(ptr %a) { ; CHECK-LABEL: ld1b_upper_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #7, mul vl] +; CHECK-NEXT: ldr z0, [x0, #7, mul vl] ; CHECK-NEXT: ret %base = getelementptr , ptr %a, i64 7 %load = load , ptr %base @@ -39,9 +36,7 @@ define @ld1b_upper_bound(ptr %a) { define @ld1b_out_of_upper_bound(ptr %a) { ; CHECK-LABEL: ld1b_out_of_upper_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: rdvl x8, #8 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ldr z0, [x0, #8, mul vl] ; CHECK-NEXT: ret %base = getelementptr , ptr %a, i64 8 %load = load , ptr %base @@ -51,9 +46,7 @@ define @ld1b_out_of_upper_bound(ptr %a) { define @ld1b_out_of_lower_bound(ptr %a) { ; CHECK-LABEL: ld1b_out_of_lower_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: rdvl x8, #-9 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ldr z0, [x0, #-9, mul vl] ; CHECK-NEXT: ret %base = getelementptr , ptr %a, i64 -9 %load = load , ptr %base @@ -65,8 +58,7 @@ define @ld1b_out_of_lower_bound(ptr %a) { define @ld1h_inbound(ptr %a) { ; CHECK-LABEL: ld1h_inbound: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, #-2, mul vl] +; CHECK-NEXT: ldr z0, [x0, #-2, mul vl] ; CHECK-NEXT: ret %base = getelementptr , ptr %a, i64 -2 %load = load , ptr %base @@ -78,8 +70,7 @@ define @ld1h_inbound(ptr %a) { define @ld1s_inbound(ptr %a) { ; CHECK-LABEL: ld1s_inbound: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, #4, mul vl] +; CHECK-NEXT: ldr z0, [x0, #4, mul vl] ; CHECK-NEXT: ret %base = getelementptr , ptr %a, i64 4 %load = load , ptr %base @@ -91,8 +82,7 @@ define @ld1s_inbound(ptr %a) { define @ld1d_inbound(ptr %a) { ; CHECK-LABEL: ld1d_inbound: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: ldr z0, [x0, #6, mul vl] ; CHECK-NEXT: ret %base = getelementptr , ptr %a, i64 6 %load = load , ptr %base @@ -115,9 +105,8 @@ define void @load_nxv6f32(ptr %a) { ; CHECK-LABEL: load_nxv6f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ret %val = load volatile , ptr %a ret void @@ -127,9 +116,8 @@ define void @load_nxv12f16(ptr %a) { ; CHECK-LABEL: load_nxv12f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ptrue p1.h ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ret %val = load volatile , ptr %a ret void diff --git a/llvm/test/CodeGen/AArch64/sve-ld1r.ll b/llvm/test/CodeGen/AArch64/sve-ld1r.ll index 0a3f734661502..43391c16e7cce 100644 --- a/llvm/test/CodeGen/AArch64/sve-ld1r.ll +++ b/llvm/test/CodeGen/AArch64/sve-ld1r.ll @@ -1417,7 +1417,7 @@ define ptr @avoid_preindex_load(ptr %src, ptr %out) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1] ; CHECK-NEXT: add x0, x0, #1 -; CHECK-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-NEXT: str z0, [x1] ; CHECK-NEXT: ret %ptr = getelementptr inbounds i8, ptr %src, i64 1 %tmp = load i8, ptr %ptr, align 4 @@ -1434,9 +1434,8 @@ define ptr @avoid_preindex_load_dup(ptr %src, %pg, ptr %out) { ; CHECK-LABEL: avoid_preindex_load_dup: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1] -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x0, x0, #1 -; CHECK-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-NEXT: str z0, [x1] ; CHECK-NEXT: ret %ptr = getelementptr inbounds i8, ptr %src, i64 1 %tmp = load i8, ptr %ptr, align 4 @@ -1451,9 +1450,8 @@ define ptr @avoid_preindex_load_dup_passthru_zero(ptr %src, %p ; CHECK-LABEL: avoid_preindex_load_dup_passthru_zero: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1] -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x0, x0, #1 -; CHECK-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-NEXT: str z0, [x1] ; CHECK-NEXT: ret %ptr = getelementptr inbounds i8, ptr %src, i64 1 %tmp = load i8, ptr %ptr, align 4 @@ -1469,8 +1467,7 @@ define ptr @preindex_load_dup_passthru( %passthru, ptr %src, < ; CHECK: // %bb.0: ; CHECK-NEXT: ldrsb x8, [x0, #1]! ; CHECK-NEXT: mov z0.d, p0/m, x8 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-NEXT: str z0, [x1] ; CHECK-NEXT: ret %ptr = getelementptr inbounds i8, ptr %src, i64 1 %tmp = load i8, ptr %ptr, align 4 @@ -1486,9 +1483,8 @@ define ptr @preidx8sext64_instead_of_ld1r(ptr %src, ptr %out, ptr %dst) { ; CHECK-LABEL: preidx8sext64_instead_of_ld1r: ; CHECK: // %bb.0: ; CHECK-NEXT: ldrsb x8, [x0, #1]! -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-NEXT: str z0, [x1] ; CHECK-NEXT: str x8, [x2] ; CHECK-NEXT: ret %ptr = getelementptr inbounds i8, ptr %src, i64 1 diff --git a/llvm/test/CodeGen/AArch64/sve-llrint.ll b/llvm/test/CodeGen/AArch64/sve-llrint.ll index a881af1612016..16e0e0c4661b6 100644 --- a/llvm/test/CodeGen/AArch64/sve-llrint.ll +++ b/llvm/test/CodeGen/AArch64/sve-llrint.ll @@ -340,7 +340,8 @@ define @llrint_v32i64_v32f16( %x) { ; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 136 * VG +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG @@ -350,234 +351,231 @@ define @llrint_v32i64_v32f16( %x) { ; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG -; CHECK-NEXT: uunpkhi z5.s, z0.h ; CHECK-NEXT: uunpklo z4.s, z0.h +; CHECK-NEXT: uunpkhi z5.s, z0.h ; CHECK-NEXT: mov w9, #64511 // =0xfbff ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z6.s, z1.h -; CHECK-NEXT: mov z30.h, w9 -; CHECK-NEXT: uunpkhi z10.s, z1.h +; CHECK-NEXT: mov z26.h, w9 +; CHECK-NEXT: uunpkhi z25.s, z1.h ; CHECK-NEXT: mov w9, #31743 // =0x7bff -; CHECK-NEXT: mov z29.d, #0x8000000000000000 -; CHECK-NEXT: uunpklo z8.s, z2.h -; CHECK-NEXT: uunpkhi z13.s, z3.h -; CHECK-NEXT: uunpklo z18.s, z3.h -; CHECK-NEXT: uunpklo z7.d, z5.s +; CHECK-NEXT: mov z27.d, #0x8000000000000000 +; CHECK-NEXT: uunpklo z31.s, z2.h +; CHECK-NEXT: uunpkhi z12.s, z2.h +; CHECK-NEXT: mov z17.d, z3.d ; CHECK-NEXT: uunpklo z0.d, z4.s ; CHECK-NEXT: uunpkhi z4.d, z4.s +; CHECK-NEXT: uunpklo z7.d, z5.s ; CHECK-NEXT: uunpkhi z24.d, z5.s -; CHECK-NEXT: uunpklo z25.d, z6.s -; CHECK-NEXT: uunpkhi z26.d, z6.s -; CHECK-NEXT: uunpklo z27.d, z10.s -; CHECK-NEXT: uunpkhi z10.d, z10.s -; CHECK-NEXT: uunpklo z12.d, z8.s -; CHECK-NEXT: uunpkhi z16.d, z8.s -; CHECK-NEXT: movprfx z5, z7 -; CHECK-NEXT: frintx z5.h, p0/m, z7.h -; CHECK-NEXT: movprfx z1, z4 -; CHECK-NEXT: frintx z1.h, p0/m, z4.h +; CHECK-NEXT: uunpklo z28.d, z6.s +; CHECK-NEXT: uunpkhi z29.d, z6.s +; CHECK-NEXT: uunpklo z8.d, z25.s +; CHECK-NEXT: uunpkhi z9.d, z25.s +; CHECK-NEXT: uunpklo z16.s, z17.h +; CHECK-NEXT: uunpklo z11.d, z31.s +; CHECK-NEXT: uunpkhi z14.d, z31.s +; CHECK-NEXT: uunpkhi z17.s, z17.h +; CHECK-NEXT: movprfx z30, z4 +; CHECK-NEXT: frintx z30.h, p0/m, z4.h +; CHECK-NEXT: movprfx z4, z7 +; CHECK-NEXT: frintx z4.h, p0/m, z7.h ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: movprfx z6, z24 ; CHECK-NEXT: frintx z6.h, p0/m, z24.h -; CHECK-NEXT: movprfx z24, z25 -; CHECK-NEXT: frintx z24.h, p0/m, z25.h -; CHECK-NEXT: movprfx z25, z26 -; CHECK-NEXT: frintx z25.h, p0/m, z26.h -; CHECK-NEXT: movprfx z28, z27 -; CHECK-NEXT: frintx z28.h, p0/m, z27.h -; CHECK-NEXT: movprfx z8, z10 -; CHECK-NEXT: frintx z8.h, p0/m, z10.h -; CHECK-NEXT: mov z7.h, w9 -; CHECK-NEXT: mov z4.d, #0x7fffffffffffffff -; CHECK-NEXT: rdvl x9, #15 -; CHECK-NEXT: fcmge p3.h, p0/z, z5.h, z30.h -; CHECK-NEXT: movprfx z11, z5 -; CHECK-NEXT: fcvtzs z11.d, p0/m, z5.h -; CHECK-NEXT: fcmge p2.h, p0/z, z1.h, z30.h -; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z30.h -; CHECK-NEXT: fcmge p4.h, p0/z, z6.h, z30.h -; CHECK-NEXT: movprfx z9, z6 -; CHECK-NEXT: fcvtzs z9.d, p0/m, z6.h +; CHECK-NEXT: movprfx z7, z28 +; CHECK-NEXT: frintx z7.h, p0/m, z28.h +; CHECK-NEXT: movprfx z25, z29 +; CHECK-NEXT: frintx z25.h, p0/m, z29.h +; CHECK-NEXT: movprfx z3, z9 +; CHECK-NEXT: frintx z3.h, p0/m, z9.h +; CHECK-NEXT: mov z5.h, w9 +; CHECK-NEXT: movprfx z31, z11 +; CHECK-NEXT: frintx z31.h, p0/m, z11.h +; CHECK-NEXT: movprfx z9, z14 +; CHECK-NEXT: frintx z9.h, p0/m, z14.h +; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z26.h +; CHECK-NEXT: fcmge p4.h, p0/z, z4.h, z26.h +; CHECK-NEXT: movprfx z24, z0 +; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.h +; CHECK-NEXT: fcmge p2.h, p0/z, z30.h, z26.h +; CHECK-NEXT: movprfx z29, z4 +; CHECK-NEXT: fcvtzs z29.d, p0/m, z4.h +; CHECK-NEXT: fcmge p6.h, p0/z, z6.h, z26.h +; CHECK-NEXT: movprfx z28, z30 +; CHECK-NEXT: fcvtzs z28.d, p0/m, z30.h +; CHECK-NEXT: movprfx z10, z6 +; CHECK-NEXT: fcvtzs z10.d, p0/m, z6.h +; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: fcmge p3.h, p0/z, z7.h, z26.h +; CHECK-NEXT: movprfx z13, z7 +; CHECK-NEXT: fcvtzs z13.d, p0/m, z7.h ; CHECK-NEXT: movprfx z15, z25 ; CHECK-NEXT: fcvtzs z15.d, p0/m, z25.h -; CHECK-NEXT: movprfx z14, z24 -; CHECK-NEXT: fcvtzs z14.d, p0/m, z24.h -; CHECK-NEXT: movprfx z26, z0 -; CHECK-NEXT: fcvtzs z26.d, p0/m, z0.h -; CHECK-NEXT: movprfx z19, z28 -; CHECK-NEXT: fcvtzs z19.d, p0/m, z28.h -; CHECK-NEXT: movprfx z31, z1 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z1.h -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p6.b, p0/z, p2.b -; CHECK-NEXT: fcmge p2.h, p0/z, z25.h, z30.h -; CHECK-NEXT: sel z27.d, p3, z29.d, z11.d -; CHECK-NEXT: uunpkhi z11.s, z2.h ; CHECK-NEXT: not p5.b, p0/z, p1.b -; CHECK-NEXT: fcmge p1.h, p0/z, z24.h, z30.h -; CHECK-NEXT: not p3.b, p0/z, p4.b -; CHECK-NEXT: fcmge p4.h, p0/z, z28.h, z30.h -; CHECK-NEXT: mov z26.d, p5/m, z29.d -; CHECK-NEXT: mov z31.d, p6/m, z29.d -; CHECK-NEXT: sel z2.d, p3, z29.d, z9.d -; CHECK-NEXT: movprfx z9, z12 -; CHECK-NEXT: frintx z9.h, p0/m, z12.h -; CHECK-NEXT: uunpkhi z12.d, z13.s -; CHECK-NEXT: uunpklo z17.d, z11.s -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: sel z3.d, p2, z29.d, z15.d -; CHECK-NEXT: uunpklo z15.d, z13.s -; CHECK-NEXT: fcmge p2.h, p0/z, z8.h, z30.h -; CHECK-NEXT: sel z10.d, p1, z29.d, z14.d -; CHECK-NEXT: movprfx z14, z16 -; CHECK-NEXT: frintx z14.h, p0/m, z16.h -; CHECK-NEXT: uunpkhi z16.d, z18.s -; CHECK-NEXT: movprfx z13, z17 -; CHECK-NEXT: frintx z13.h, p0/m, z17.h -; CHECK-NEXT: movprfx z20, z12 -; CHECK-NEXT: frintx z20.h, p0/m, z12.h -; CHECK-NEXT: fcmge p3.h, p0/z, z9.h, z30.h -; CHECK-NEXT: uunpkhi z17.d, z11.s -; CHECK-NEXT: uunpklo z18.d, z18.s -; CHECK-NEXT: movprfx z12, z8 -; CHECK-NEXT: fcvtzs z12.d, p0/m, z8.h -; CHECK-NEXT: movprfx z21, z15 -; CHECK-NEXT: frintx z21.h, p0/m, z15.h -; CHECK-NEXT: not p1.b, p0/z, p4.b -; CHECK-NEXT: movprfx z15, z9 -; CHECK-NEXT: fcvtzs z15.d, p0/m, z9.h -; CHECK-NEXT: frintx z16.h, p0/m, z16.h -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: movprfx z22, z14 -; CHECK-NEXT: fcvtzs z22.d, p0/m, z14.h -; CHECK-NEXT: fcmge p4.h, p0/z, z13.h, z30.h -; CHECK-NEXT: fcmge p5.h, p0/z, z20.h, z30.h -; CHECK-NEXT: sel z11.d, p1, z29.d, z19.d +; CHECK-NEXT: movprfx z18, z3 +; CHECK-NEXT: fcvtzs z18.d, p0/m, z3.h +; CHECK-NEXT: movprfx z20, z31 +; CHECK-NEXT: fcvtzs z20.d, p0/m, z31.h +; CHECK-NEXT: not p4.b, p0/z, p4.b +; CHECK-NEXT: movprfx z21, z9 +; CHECK-NEXT: fcvtzs z21.d, p0/m, z9.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z30.h, z5.h +; CHECK-NEXT: sel z0.d, p5, z27.d, z24.d +; CHECK-NEXT: not p7.b, p0/z, p2.b +; CHECK-NEXT: fcmgt p2.h, p0/z, z4.h, z5.h +; CHECK-NEXT: mov z29.d, p4/m, z27.d +; CHECK-NEXT: fcmge p4.h, p0/z, z25.h, z26.h +; CHECK-NEXT: not p5.b, p0/z, p6.b ; CHECK-NEXT: not p3.b, p0/z, p3.b +; CHECK-NEXT: fcmge p6.h, p0/z, z9.h, z26.h +; CHECK-NEXT: fcmgt p9.h, p0/z, z6.h, z5.h +; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sel z0.d, p7, z27.d, z28.d +; CHECK-NEXT: movprfx z28, z8 +; CHECK-NEXT: frintx z28.h, p0/m, z8.h +; CHECK-NEXT: sel z8.d, p5, z27.d, z10.d +; CHECK-NEXT: uunpklo z10.d, z12.s +; CHECK-NEXT: uunpkhi z12.d, z12.s +; CHECK-NEXT: not p5.b, p0/z, p4.b +; CHECK-NEXT: sel z11.d, p3, z27.d, z13.d +; CHECK-NEXT: uunpklo z13.d, z16.s +; CHECK-NEXT: fcmge p3.h, p0/z, z3.h, z26.h +; CHECK-NEXT: not p6.b, p0/z, p6.b +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: sel z24.d, p5, z27.d, z15.d +; CHECK-NEXT: uunpkhi z15.d, z16.s +; CHECK-NEXT: movprfx z14, z28 +; CHECK-NEXT: fcvtzs z14.d, p0/m, z28.h +; CHECK-NEXT: frintx z10.h, p0/m, z10.h +; CHECK-NEXT: uunpklo z16.d, z17.s +; CHECK-NEXT: frintx z12.h, p0/m, z12.h +; CHECK-NEXT: uunpkhi z17.d, z17.s +; CHECK-NEXT: movprfx z19, z13 +; CHECK-NEXT: frintx z19.h, p0/m, z13.h +; CHECK-NEXT: fcmge p4.h, p0/z, z28.h, z26.h +; CHECK-NEXT: fcmge p5.h, p0/z, z31.h, z26.h +; CHECK-NEXT: not p3.b, p0/z, p3.b +; CHECK-NEXT: frintx z15.h, p0/m, z15.h +; CHECK-NEXT: fcmge p7.h, p0/z, z10.h, z26.h +; CHECK-NEXT: frintx z16.h, p0/m, z16.h +; CHECK-NEXT: fcmge p8.h, p0/z, z12.h, z26.h ; CHECK-NEXT: frintx z17.h, p0/m, z17.h -; CHECK-NEXT: frintx z18.h, p0/m, z18.h -; CHECK-NEXT: movprfx z19, z20 -; CHECK-NEXT: fcvtzs z19.d, p0/m, z20.h -; CHECK-NEXT: mov z12.d, p2/m, z29.d -; CHECK-NEXT: fcmge p2.h, p0/z, z21.h, z30.h -; CHECK-NEXT: fcmge p1.h, p0/z, z14.h, z30.h -; CHECK-NEXT: mov z15.d, p3/m, z29.d -; CHECK-NEXT: movprfx z23, z21 -; CHECK-NEXT: fcvtzs z23.d, p0/m, z21.h -; CHECK-NEXT: not p3.b, p0/z, p4.b -; CHECK-NEXT: fcmge p4.h, p0/z, z16.h, z30.h -; CHECK-NEXT: fcmgt p8.h, p0/z, z21.h, z7.h +; CHECK-NEXT: movprfx z23, z19 +; CHECK-NEXT: fcvtzs z23.d, p0/m, z19.h +; CHECK-NEXT: not p4.b, p0/z, p4.b ; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: fcmge p6.h, p0/z, z17.h, z30.h -; CHECK-NEXT: fcmge p7.h, p0/z, z18.h, z30.h -; CHECK-NEXT: movprfx z30, z16 -; CHECK-NEXT: fcvtzs z30.d, p0/m, z16.h -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmuo p9.h, p0/z, z21.h, z21.h -; CHECK-NEXT: mov z19.d, p5/m, z29.d -; CHECK-NEXT: fcmgt p5.h, p0/z, z20.h, z7.h -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: sel z13.d, p3, z27.d, z18.d +; CHECK-NEXT: fcmge p3.h, p0/z, z19.h, z26.h +; CHECK-NEXT: movprfx z0, z15 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z15.h +; CHECK-NEXT: sel z22.d, p4, z27.d, z14.d +; CHECK-NEXT: sel z18.d, p6, z27.d, z21.d +; CHECK-NEXT: movprfx z21, z12 +; CHECK-NEXT: fcvtzs z21.d, p0/m, z12.h +; CHECK-NEXT: movprfx z1, z16 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z16.h +; CHECK-NEXT: sel z14.d, p5, z27.d, z20.d +; CHECK-NEXT: fcmge p4.h, p0/z, z15.h, z26.h +; CHECK-NEXT: movprfx z20, z10 +; CHECK-NEXT: fcvtzs z20.d, p0/m, z10.h +; CHECK-NEXT: movprfx z2, z17 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z17.h +; CHECK-NEXT: not p5.b, p0/z, p7.b +; CHECK-NEXT: fcmge p6.h, p0/z, z16.h, z26.h +; CHECK-NEXT: not p7.b, p0/z, p8.b +; CHECK-NEXT: fcmge p8.h, p0/z, z17.h, z26.h +; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff +; CHECK-NEXT: not p3.b, p0/z, p3.b ; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z23.d, p2/m, z29.d -; CHECK-NEXT: fcmuo p2.h, p0/z, z20.h, z20.h -; CHECK-NEXT: movprfx z20, z18 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z18.h -; CHECK-NEXT: movprfx z21, z13 -; CHECK-NEXT: fcvtzs z21.d, p0/m, z13.h -; CHECK-NEXT: mov z22.d, p1/m, z29.d -; CHECK-NEXT: not p1.b, p0/z, p7.b -; CHECK-NEXT: mov z30.d, p4/m, z29.d -; CHECK-NEXT: fcmgt p4.h, p0/z, z18.h, z7.h -; CHECK-NEXT: mov z19.d, p5/m, z4.d -; CHECK-NEXT: fcmuo p7.h, p0/z, z18.h, z18.h -; CHECK-NEXT: movprfx z18, z17 -; CHECK-NEXT: fcvtzs z18.d, p0/m, z17.h -; CHECK-NEXT: fcmgt p5.h, p0/z, z16.h, z7.h -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: mov z23.d, p8/m, z4.d -; CHECK-NEXT: mov z20.d, p1/m, z29.d -; CHECK-NEXT: mov z21.d, p3/m, z29.d -; CHECK-NEXT: fcmuo p3.h, p0/z, z16.h, z16.h -; CHECK-NEXT: mov z19.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p2.h, p0/z, z17.h, z7.h -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: sel z29.d, p6, z29.d, z18.d -; CHECK-NEXT: mov z23.d, p9/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p6.h, p0/z, z14.h, z7.h -; CHECK-NEXT: mov z30.d, p5/m, z4.d -; CHECK-NEXT: sel z16.d, p4, z4.d, z20.d -; CHECK-NEXT: fcmuo p4.h, p0/z, z17.h, z17.h -; CHECK-NEXT: st1b { z19.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #14 -; CHECK-NEXT: fcmgt p5.h, p0/z, z1.h, z7.h -; CHECK-NEXT: st1b { z23.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #13 -; CHECK-NEXT: mov z29.d, p2/m, z4.d -; CHECK-NEXT: mov z30.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p3.h, p0/z, z13.h, z7.h -; CHECK-NEXT: mov z16.d, p7/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p2.h, p0/z, z9.h, z7.h -; CHECK-NEXT: fcmuo p7.h, p0/z, z14.h, z14.h +; CHECK-NEXT: mov z20.d, p5/m, z27.d +; CHECK-NEXT: mov z21.d, p7/m, z27.d +; CHECK-NEXT: not p5.b, p0/z, p6.b +; CHECK-NEXT: mov z23.d, p3/m, z27.d +; CHECK-NEXT: fcmgt p3.h, p0/z, z17.h, z5.h +; CHECK-NEXT: not p6.b, p0/z, p8.b +; CHECK-NEXT: mov z0.d, p4/m, z27.d +; CHECK-NEXT: fcmgt p4.h, p0/z, z16.h, z5.h +; CHECK-NEXT: mov z1.d, p5/m, z27.d +; CHECK-NEXT: fcmuo p5.h, p0/z, z16.h, z16.h +; CHECK-NEXT: mov z29.d, p2/m, z26.d +; CHECK-NEXT: mov z2.d, p6/m, z27.d +; CHECK-NEXT: ldr z27, [sp] // 16-byte Folded Reload +; CHECK-NEXT: fcmgt p6.h, p0/z, z7.h, z5.h +; CHECK-NEXT: fcmgt p2.h, p0/z, z12.h, z5.h +; CHECK-NEXT: fcmuo p8.h, p0/z, z17.h, z17.h +; CHECK-NEXT: fcmgt p7.h, p0/z, z28.h, z5.h +; CHECK-NEXT: mov z1.d, p4/m, z26.d +; CHECK-NEXT: fcmuo p4.h, p0/z, z15.h, z15.h +; CHECK-NEXT: mov z8.d, p9/m, z26.d +; CHECK-NEXT: mov z27.d, p1/m, z26.d +; CHECK-NEXT: fcmgt p1.h, p0/z, z15.h, z5.h +; CHECK-NEXT: mov z2.d, p3/m, z26.d +; CHECK-NEXT: fcmgt p3.h, p0/z, z19.h, z5.h +; CHECK-NEXT: mov z11.d, p6/m, z26.d +; CHECK-NEXT: fcmuo p6.h, p0/z, z19.h, z19.h +; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p5.h, p0/z, z9.h, z5.h +; CHECK-NEXT: sel z15.d, p2, z26.d, z21.d +; CHECK-NEXT: fcmuo p2.h, p0/z, z12.h, z12.h +; CHECK-NEXT: mov z2.d, p8/m, #0 // =0x0 +; CHECK-NEXT: sel z16.d, p7, z26.d, z22.d +; CHECK-NEXT: mov z0.d, p1/m, z26.d +; CHECK-NEXT: fcmgt p1.h, p0/z, z10.h, z5.h +; CHECK-NEXT: str z1, [x8, #14, mul vl] +; CHECK-NEXT: sel z17.d, p3, z26.d, z23.d +; CHECK-NEXT: fcmuo p3.h, p0/z, z10.h, z10.h +; CHECK-NEXT: str z2, [x8, #15, mul vl] +; CHECK-NEXT: sel z2.d, p5, z26.d, z18.d +; CHECK-NEXT: fcmuo p5.h, p0/z, z9.h, z9.h +; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p4.h, p0/z, z3.h, z5.h +; CHECK-NEXT: mov z15.d, p2/m, #0 // =0x0 +; CHECK-NEXT: sel z1.d, p1, z26.d, z20.d +; CHECK-NEXT: fcmgt p1.h, p0/z, z31.h, z5.h +; CHECK-NEXT: mov z17.d, p6/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p2.h, p0/z, z31.h, z31.h +; CHECK-NEXT: str z0, [x8, #13, mul vl] +; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.h, p0/z, z25.h, z25.h +; CHECK-NEXT: str z17, [x8, #12, mul vl] +; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p3.h, p0/z, z25.h, z5.h +; CHECK-NEXT: str z15, [x8, #11, mul vl] +; CHECK-NEXT: sel z0.d, p1, z26.d, z14.d +; CHECK-NEXT: fcmuo p1.h, p0/z, z3.h, z3.h +; CHECK-NEXT: sel z3.d, p4, z26.d, z13.d +; CHECK-NEXT: fcmuo p4.h, p0/z, z28.h, z28.h +; CHECK-NEXT: str z1, [x8, #10, mul vl] +; CHECK-NEXT: sel z1.d, p3, z26.d, z24.d +; CHECK-NEXT: fcmuo p3.h, p0/z, z7.h, z7.h +; CHECK-NEXT: ldr z7, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z2, [x8, #9, mul vl] +; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 +; CHECK-NEXT: mov z3.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p1.h, p0/z, z6.h, z6.h +; CHECK-NEXT: mov z16.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.h, p0/z, z4.h, z4.h +; CHECK-NEXT: fcmgt p2.h, p0/z, z7.h, z5.h +; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.h, p0/z, z30.h, z30.h +; CHECK-NEXT: str z0, [x8, #8, mul vl] +; CHECK-NEXT: fcmuo p0.h, p0/z, z7.h, z7.h +; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 +; CHECK-NEXT: str z3, [x8, #7, mul vl] +; CHECK-NEXT: ldr z0, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z16, [x8, #6, mul vl] +; CHECK-NEXT: mov z8.d, p1/m, #0 // =0x0 +; CHECK-NEXT: str z1, [x8, #5, mul vl] ; CHECK-NEXT: mov z29.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.h, p0/z, z13.h, z13.h -; CHECK-NEXT: st1b { z30.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #12 -; CHECK-NEXT: sel z30.d, p5, z4.d, z31.d -; CHECK-NEXT: st1b { z16.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #11 -; CHECK-NEXT: sel z31.d, p3, z4.d, z21.d -; CHECK-NEXT: st1b { z29.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #10 -; CHECK-NEXT: fcmgt p5.h, p0/z, z24.h, z7.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z28.h, z7.h -; CHECK-NEXT: sel z13.d, p2, z4.d, z15.d -; CHECK-NEXT: fcmuo p2.h, p0/z, z9.h, z9.h -; CHECK-NEXT: sel z29.d, p6, z4.d, z22.d -; CHECK-NEXT: mov z31.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p4.h, p0/z, z8.h, z7.h -; CHECK-NEXT: fcmgt p6.h, p0/z, z5.h, z7.h -; CHECK-NEXT: sel z9.d, p5, z4.d, z10.d -; CHECK-NEXT: fcmgt p5.h, p0/z, z6.h, z7.h -; CHECK-NEXT: st1b { z31.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #9 -; CHECK-NEXT: mov z29.d, p7/m, #0 // =0x0 -; CHECK-NEXT: sel z10.d, p3, z4.d, z11.d -; CHECK-NEXT: fcmgt p3.h, p0/z, z25.h, z7.h -; CHECK-NEXT: mov z13.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p7.h, p0/z, z8.h, z8.h -; CHECK-NEXT: fcmuo p2.h, p0/z, z28.h, z28.h -; CHECK-NEXT: sel z28.d, p4, z4.d, z12.d -; CHECK-NEXT: st1b { z29.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #8 -; CHECK-NEXT: fcmuo p4.h, p0/z, z25.h, z25.h -; CHECK-NEXT: st1b { z13.b }, p1, [x8, x9] -; CHECK-NEXT: fcmuo p1.h, p0/z, z24.h, z24.h -; CHECK-NEXT: mov z2.d, p5/m, z4.d -; CHECK-NEXT: mov z3.d, p3/m, z4.d -; CHECK-NEXT: fcmgt p3.h, p0/z, z0.h, z7.h -; CHECK-NEXT: mov z28.d, p7/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p7.h, p0/z, z6.h, z6.h -; CHECK-NEXT: mov z10.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p2.h, p0/z, z5.h, z5.h -; CHECK-NEXT: sel z5.d, p6, z4.d, z27.d -; CHECK-NEXT: mov z3.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.h, p0/z, z1.h, z1.h -; CHECK-NEXT: mov z9.d, p1/m, #0 // =0x0 -; CHECK-NEXT: st1d { z28.d }, p0, [x8, #7, mul vl] -; CHECK-NEXT: fcmuo p1.h, p0/z, z0.h, z0.h -; CHECK-NEXT: sel z0.d, p3, z4.d, z26.d -; CHECK-NEXT: st1d { z10.d }, p0, [x8, #6, mul vl] -; CHECK-NEXT: mov z2.d, p7/m, #0 // =0x0 -; CHECK-NEXT: st1d { z3.d }, p0, [x8, #5, mul vl] -; CHECK-NEXT: mov z5.d, p2/m, #0 // =0x0 -; CHECK-NEXT: st1d { z9.d }, p0, [x8, #4, mul vl] -; CHECK-NEXT: mov z30.d, p4/m, #0 // =0x0 -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 -; CHECK-NEXT: st1d { z2.d }, p0, [x8, #3, mul vl] -; CHECK-NEXT: st1d { z5.d }, p0, [x8, #2, mul vl] -; CHECK-NEXT: st1d { z30.d }, p0, [x8, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [x8] +; CHECK-NEXT: mov z27.d, p5/m, #0 // =0x0 +; CHECK-NEXT: str z11, [x8, #4, mul vl] +; CHECK-NEXT: str z8, [x8, #3, mul vl] +; CHECK-NEXT: mov z0.d, p2/m, z26.d +; CHECK-NEXT: str z29, [x8, #2, mul vl] +; CHECK-NEXT: str z27, [x8, #1, mul vl] +; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 +; CHECK-NEXT: str z0, [x8] +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -939,7 +937,8 @@ define @llrint_v32i64_v32f32( %x) { ; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 136 * VG +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG @@ -950,224 +949,224 @@ define @llrint_v32i64_v32f32( %x) { ; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; CHECK-NEXT: uunpklo z24.d, z0.s -; CHECK-NEXT: uunpkhi z25.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w9, #-553648128 // =0xdf000000 ; CHECK-NEXT: uunpklo z26.d, z1.s -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uunpkhi z27.d, z1.s -; CHECK-NEXT: mov z31.s, w9 +; CHECK-NEXT: uunpkhi z25.d, z0.s +; CHECK-NEXT: uunpkhi z28.d, z1.s +; CHECK-NEXT: mov z29.s, w9 ; CHECK-NEXT: mov w9, #1593835519 // =0x5effffff -; CHECK-NEXT: uunpklo z28.d, z2.s -; CHECK-NEXT: mov z8.d, #0x8000000000000000 -; CHECK-NEXT: uunpklo z30.d, z3.s -; CHECK-NEXT: uunpklo z13.d, z4.s +; CHECK-NEXT: mov z17.d, z5.d +; CHECK-NEXT: mov z27.d, #0x8000000000000000 +; CHECK-NEXT: uunpkhi z30.d, z2.s +; CHECK-NEXT: uunpklo z8.d, z3.s ; CHECK-NEXT: movprfx z0, z24 ; CHECK-NEXT: frintx z0.s, p0/m, z24.s -; CHECK-NEXT: movprfx z1, z25 -; CHECK-NEXT: frintx z1.s, p0/m, z25.s -; CHECK-NEXT: uunpkhi z15.d, z4.s +; CHECK-NEXT: uunpkhi z9.d, z3.s +; CHECK-NEXT: uunpkhi z14.d, z4.s ; CHECK-NEXT: movprfx z24, z26 ; CHECK-NEXT: frintx z24.s, p0/m, z26.s -; CHECK-NEXT: uunpkhi z26.d, z2.s -; CHECK-NEXT: movprfx z25, z27 -; CHECK-NEXT: frintx z25.s, p0/m, z27.s -; CHECK-NEXT: movprfx z27, z28 -; CHECK-NEXT: frintx z27.s, p0/m, z28.s -; CHECK-NEXT: uunpklo z16.d, z5.s -; CHECK-NEXT: uunpkhi z17.d, z7.s -; CHECK-NEXT: frintx z30.s, p0/m, z30.s -; CHECK-NEXT: uunpklo z18.d, z7.s -; CHECK-NEXT: uunpklo z21.d, z6.s -; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z31.s -; CHECK-NEXT: movprfx z9, z0 -; CHECK-NEXT: fcvtzs z9.d, p0/m, z0.s +; CHECK-NEXT: movprfx z1, z25 +; CHECK-NEXT: frintx z1.s, p0/m, z25.s +; CHECK-NEXT: movprfx z5, z28 +; CHECK-NEXT: frintx z5.s, p0/m, z28.s +; CHECK-NEXT: uunpklo z26.d, z2.s +; CHECK-NEXT: uunpklo z16.d, z17.s +; CHECK-NEXT: mov z25.s, w9 +; CHECK-NEXT: movprfx z28, z30 +; CHECK-NEXT: frintx z28.s, p0/m, z30.s +; CHECK-NEXT: movprfx z30, z8 +; CHECK-NEXT: frintx z30.s, p0/m, z8.s +; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z29.s +; CHECK-NEXT: movprfx z31, z0 +; CHECK-NEXT: fcvtzs z31.d, p0/m, z0.s +; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, z29.s +; CHECK-NEXT: fcmge p3.s, p0/z, z24.s, z29.s +; CHECK-NEXT: fcmge p5.s, p0/z, z5.s, z29.s +; CHECK-NEXT: frintx z26.s, p0/m, z26.s ; CHECK-NEXT: movprfx z10, z1 ; CHECK-NEXT: fcvtzs z10.d, p0/m, z1.s -; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, z31.s -; CHECK-NEXT: fcmge p3.s, p0/z, z24.s, z31.s ; CHECK-NEXT: movprfx z11, z24 ; CHECK-NEXT: fcvtzs z11.d, p0/m, z24.s -; CHECK-NEXT: movprfx z29, z26 -; CHECK-NEXT: frintx z29.s, p0/m, z26.s -; CHECK-NEXT: fcmge p4.s, p0/z, z25.s, z31.s -; CHECK-NEXT: fcmge p5.s, p0/z, z27.s, z31.s -; CHECK-NEXT: movprfx z12, z27 -; CHECK-NEXT: fcvtzs z12.d, p0/m, z27.s -; CHECK-NEXT: movprfx z19, z30 -; CHECK-NEXT: fcvtzs z19.d, p0/m, z30.s -; CHECK-NEXT: movprfx z7, z16 -; CHECK-NEXT: frintx z7.s, p0/m, z16.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: frintx z17.s, p0/m, z17.s -; CHECK-NEXT: uunpkhi z16.d, z5.s +; CHECK-NEXT: movprfx z12, z5 +; CHECK-NEXT: fcvtzs z12.d, p0/m, z5.s +; CHECK-NEXT: movprfx z15, z28 +; CHECK-NEXT: fcvtzs z15.d, p0/m, z28.s +; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: not p4.b, p0/z, p1.b +; CHECK-NEXT: fcmgt p1.s, p0/z, z1.s, z25.s +; CHECK-NEXT: fcmgt p9.s, p0/z, z5.s, z25.s ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: frintx z18.s, p0/m, z18.s -; CHECK-NEXT: mov z28.s, w9 -; CHECK-NEXT: not p6.b, p0/z, p3.b -; CHECK-NEXT: sel z26.d, p1, z8.d, z9.d -; CHECK-NEXT: movprfx z14, z29 -; CHECK-NEXT: fcvtzs z14.d, p0/m, z29.s -; CHECK-NEXT: sel z9.d, p2, z8.d, z10.d -; CHECK-NEXT: uunpkhi z10.d, z3.s -; CHECK-NEXT: rdvl x9, #15 -; CHECK-NEXT: sel z3.d, p6, z8.d, z11.d -; CHECK-NEXT: movprfx z11, z25 -; CHECK-NEXT: fcvtzs z11.d, p0/m, z25.s -; CHECK-NEXT: fcmge p3.s, p0/z, z29.s, z31.s -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: fcmge p1.s, p0/z, z30.s, z31.s -; CHECK-NEXT: movprfx z23, z18 -; CHECK-NEXT: fcvtzs z23.d, p0/m, z18.s -; CHECK-NEXT: not p2.b, p0/z, p5.b -; CHECK-NEXT: fcmge p5.s, p0/z, z17.s, z31.s -; CHECK-NEXT: frintx z16.s, p0/m, z16.s -; CHECK-NEXT: frintx z10.s, p0/m, z10.s -; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff -; CHECK-NEXT: fcmgt p8.s, p0/z, z18.s, z28.s -; CHECK-NEXT: sel z4.d, p4, z8.d, z11.d -; CHECK-NEXT: movprfx z11, z13 -; CHECK-NEXT: frintx z11.s, p0/m, z13.s +; CHECK-NEXT: sel z0.d, p4, z27.d, z31.d +; CHECK-NEXT: fcmge p4.s, p0/z, z26.s, z29.s ; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: sel z13.d, p2, z8.d, z12.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmge p4.s, p0/z, z7.s, z31.s -; CHECK-NEXT: sel z12.d, p3, z8.d, z14.d -; CHECK-NEXT: movprfx z14, z15 -; CHECK-NEXT: frintx z14.s, p0/m, z15.s -; CHECK-NEXT: uunpkhi z15.d, z6.s +; CHECK-NEXT: not p5.b, p0/z, p5.b +; CHECK-NEXT: movprfx z13, z26 +; CHECK-NEXT: fcvtzs z13.d, p0/m, z26.s +; CHECK-NEXT: sel z31.d, p2, z27.d, z10.d +; CHECK-NEXT: uunpklo z10.d, z4.s +; CHECK-NEXT: sel z8.d, p3, z27.d, z11.d +; CHECK-NEXT: fcmge p3.s, p0/z, z28.s, z29.s +; CHECK-NEXT: sel z11.d, p5, z27.d, z12.d +; CHECK-NEXT: movprfx z4, z9 +; CHECK-NEXT: frintx z4.s, p0/m, z9.s +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: not p5.b, p0/z, p4.b +; CHECK-NEXT: fcmge p4.s, p0/z, z30.s, z29.s +; CHECK-NEXT: fcmgt p2.s, p0/z, z24.s, z25.s +; CHECK-NEXT: sel z12.d, p5, z27.d, z13.d +; CHECK-NEXT: uunpkhi z13.d, z17.s +; CHECK-NEXT: movprfx z9, z10 +; CHECK-NEXT: frintx z9.s, p0/m, z10.s +; CHECK-NEXT: movprfx z10, z14 +; CHECK-NEXT: frintx z10.s, p0/m, z14.s +; CHECK-NEXT: uunpkhi z17.d, z6.s +; CHECK-NEXT: not p3.b, p0/z, p3.b +; CHECK-NEXT: uunpklo z14.d, z6.s +; CHECK-NEXT: movprfx z6, z16 +; CHECK-NEXT: frintx z6.s, p0/m, z16.s +; CHECK-NEXT: uunpklo z16.d, z7.s +; CHECK-NEXT: uunpkhi z7.d, z7.s +; CHECK-NEXT: sel z3.d, p3, z27.d, z15.d +; CHECK-NEXT: fcmge p3.s, p0/z, z4.s, z29.s +; CHECK-NEXT: frintx z13.s, p0/m, z13.s +; CHECK-NEXT: movprfx z15, z30 +; CHECK-NEXT: fcvtzs z15.d, p0/m, z30.s +; CHECK-NEXT: fcmge p5.s, p0/z, z9.s, z29.s +; CHECK-NEXT: fcmge p6.s, p0/z, z10.s, z29.s +; CHECK-NEXT: frintx z17.s, p0/m, z17.s +; CHECK-NEXT: movprfx z18, z4 +; CHECK-NEXT: fcvtzs z18.d, p0/m, z4.s ; CHECK-NEXT: movprfx z20, z10 ; CHECK-NEXT: fcvtzs z20.d, p0/m, z10.s -; CHECK-NEXT: fcmge p2.s, p0/z, z10.s, z31.s -; CHECK-NEXT: sel z5.d, p1, z8.d, z19.d -; CHECK-NEXT: movprfx z19, z11 -; CHECK-NEXT: fcvtzs z19.d, p0/m, z11.s -; CHECK-NEXT: fcmge p3.s, p0/z, z11.s, z31.s -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: fcmge p6.s, p0/z, z16.s, z31.s -; CHECK-NEXT: fcmuo p9.s, p0/z, z18.s, z18.s -; CHECK-NEXT: movprfx z22, z15 -; CHECK-NEXT: frintx z22.s, p0/m, z15.s -; CHECK-NEXT: fcmge p1.s, p0/z, z14.s, z31.s -; CHECK-NEXT: not p2.b, p0/z, p2.b +; CHECK-NEXT: frintx z16.s, p0/m, z16.s +; CHECK-NEXT: not p4.b, p0/z, p4.b +; CHECK-NEXT: movprfx z19, z14 +; CHECK-NEXT: frintx z19.s, p0/m, z14.s +; CHECK-NEXT: movprfx z14, z9 +; CHECK-NEXT: fcvtzs z14.d, p0/m, z9.s +; CHECK-NEXT: fcmge p7.s, p0/z, z6.s, z29.s +; CHECK-NEXT: fcmge p8.s, p0/z, z13.s, z29.s +; CHECK-NEXT: movprfx z21, z7 +; CHECK-NEXT: frintx z21.s, p0/m, z7.s ; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: sel z6.d, p2, z8.d, z20.d -; CHECK-NEXT: movprfx z20, z21 -; CHECK-NEXT: frintx z20.s, p0/m, z21.s -; CHECK-NEXT: fcmge p2.s, p0/z, z18.s, z31.s -; CHECK-NEXT: sel z15.d, p3, z8.d, z19.d -; CHECK-NEXT: movprfx z19, z17 -; CHECK-NEXT: fcvtzs z19.d, p0/m, z17.s -; CHECK-NEXT: not p3.b, p0/z, p4.b -; CHECK-NEXT: fcmge p4.s, p0/z, z22.s, z31.s -; CHECK-NEXT: movprfx z21, z14 -; CHECK-NEXT: fcvtzs z21.d, p0/m, z14.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: movprfx z18, z7 -; CHECK-NEXT: fcvtzs z18.d, p0/m, z7.s ; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: fcmge p7.s, p0/z, z20.s, z31.s -; CHECK-NEXT: movprfx z31, z22 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z22.s -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z19.d, p5/m, z8.d -; CHECK-NEXT: fcmgt p5.s, p0/z, z17.s, z28.s +; CHECK-NEXT: mov z15.d, p4/m, z27.d +; CHECK-NEXT: fcmge p4.s, p0/z, z17.s, z29.s +; CHECK-NEXT: not p5.b, p0/z, p5.b +; CHECK-NEXT: sel z7.d, p3, z27.d, z18.d +; CHECK-NEXT: movprfx z0, z17 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z17.s +; CHECK-NEXT: sel z18.d, p6, z27.d, z20.d +; CHECK-NEXT: movprfx z20, z6 +; CHECK-NEXT: fcvtzs z20.d, p0/m, z6.s +; CHECK-NEXT: fcmge p6.s, p0/z, z16.s, z29.s +; CHECK-NEXT: fcmge p3.s, p0/z, z19.s, z29.s +; CHECK-NEXT: mov z14.d, p5/m, z27.d +; CHECK-NEXT: not p5.b, p0/z, p7.b +; CHECK-NEXT: not p7.b, p0/z, p8.b +; CHECK-NEXT: fcmge p8.s, p0/z, z21.s, z29.s +; CHECK-NEXT: movprfx z1, z16 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z16.s +; CHECK-NEXT: movprfx z22, z13 +; CHECK-NEXT: fcvtzs z22.d, p0/m, z13.s +; CHECK-NEXT: movprfx z23, z19 +; CHECK-NEXT: fcvtzs z23.d, p0/m, z19.s ; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z23.d, p2/m, z8.d -; CHECK-NEXT: fcmuo p2.s, p0/z, z17.s, z17.s -; CHECK-NEXT: movprfx z17, z20 -; CHECK-NEXT: fcvtzs z17.d, p0/m, z20.s -; CHECK-NEXT: mov z21.d, p1/m, z8.d -; CHECK-NEXT: mov z18.d, p3/m, z8.d -; CHECK-NEXT: not p1.b, p0/z, p7.b -; CHECK-NEXT: mov z31.d, p4/m, z8.d -; CHECK-NEXT: fcmgt p4.s, p0/z, z20.s, z28.s -; CHECK-NEXT: mov z19.d, p5/m, z2.d -; CHECK-NEXT: fcmuo p7.s, p0/z, z20.s, z20.s -; CHECK-NEXT: movprfx z20, z16 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z16.s -; CHECK-NEXT: fcmgt p5.s, p0/z, z22.s, z28.s -; CHECK-NEXT: mov z23.d, p8/m, z2.d -; CHECK-NEXT: fcmuo p3.s, p0/z, z22.s, z22.s -; CHECK-NEXT: mov z17.d, p1/m, z8.d -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: mov z19.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p2.s, p0/z, z16.s, z28.s -; CHECK-NEXT: sel z8.d, p6, z8.d, z20.d -; CHECK-NEXT: mov z23.d, p9/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p6.s, p0/z, z14.s, z28.s -; CHECK-NEXT: mov z31.d, p5/m, z2.d -; CHECK-NEXT: mov z17.d, p4/m, z2.d -; CHECK-NEXT: fcmuo p4.s, p0/z, z16.s, z16.s -; CHECK-NEXT: st1b { z19.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #14 -; CHECK-NEXT: fcmgt p5.s, p0/z, z1.s, z28.s -; CHECK-NEXT: st1b { z23.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #13 -; CHECK-NEXT: mov z8.d, p2/m, z2.d -; CHECK-NEXT: mov z31.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p3.s, p0/z, z7.s, z28.s -; CHECK-NEXT: mov z17.d, p7/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p2.s, p0/z, z11.s, z28.s -; CHECK-NEXT: fcmuo p7.s, p0/z, z14.s, z14.s -; CHECK-NEXT: mov z8.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.s, p0/z, z7.s, z7.s -; CHECK-NEXT: sel z7.d, p5, z2.d, z9.d -; CHECK-NEXT: st1b { z31.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #12 -; CHECK-NEXT: fcmgt p5.s, p0/z, z27.s, z28.s -; CHECK-NEXT: st1b { z17.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #11 -; CHECK-NEXT: sel z31.d, p3, z2.d, z18.d -; CHECK-NEXT: st1b { z8.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #10 -; CHECK-NEXT: fcmgt p3.s, p0/z, z30.s, z28.s -; CHECK-NEXT: sel z9.d, p2, z2.d, z15.d -; CHECK-NEXT: fcmuo p2.s, p0/z, z11.s, z11.s -; CHECK-NEXT: sel z8.d, p6, z2.d, z21.d +; CHECK-NEXT: movprfx z2, z21 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z21.s +; CHECK-NEXT: mov z29.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z20.d, p5/m, z27.d +; CHECK-NEXT: not p5.b, p0/z, p6.b +; CHECK-NEXT: mov z0.d, p4/m, z27.d +; CHECK-NEXT: fcmgt p4.s, p0/z, z16.s, z25.s +; CHECK-NEXT: not p3.b, p0/z, p3.b +; CHECK-NEXT: not p6.b, p0/z, p8.b +; CHECK-NEXT: mov z1.d, p5/m, z27.d +; CHECK-NEXT: mov z22.d, p7/m, z27.d +; CHECK-NEXT: mov z23.d, p3/m, z27.d +; CHECK-NEXT: fcmgt p3.s, p0/z, z21.s, z25.s +; CHECK-NEXT: fcmuo p5.s, p0/z, z16.s, z16.s +; CHECK-NEXT: mov z2.d, p6/m, z27.d +; CHECK-NEXT: sel z27.d, p1, z29.d, z31.d +; CHECK-NEXT: fcmgt p1.s, p0/z, z17.s, z25.s +; CHECK-NEXT: mov z1.d, p4/m, z29.d +; CHECK-NEXT: fcmgt p6.s, p0/z, z26.s, z25.s +; CHECK-NEXT: fcmgt p7.s, p0/z, z30.s, z25.s +; CHECK-NEXT: sel z31.d, p2, z29.d, z8.d +; CHECK-NEXT: fcmgt p2.s, p0/z, z13.s, z25.s +; CHECK-NEXT: fcmuo p8.s, p0/z, z21.s, z21.s +; CHECK-NEXT: mov z2.d, p3/m, z29.d +; CHECK-NEXT: fcmuo p4.s, p0/z, z17.s, z17.s +; CHECK-NEXT: fcmgt p3.s, p0/z, z19.s, z25.s +; CHECK-NEXT: mov z0.d, p1/m, z29.d +; CHECK-NEXT: fcmgt p1.s, p0/z, z6.s, z25.s +; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: sel z8.d, p9, z29.d, z11.d +; CHECK-NEXT: sel z11.d, p6, z29.d, z12.d +; CHECK-NEXT: sel z12.d, p7, z29.d, z15.d +; CHECK-NEXT: fcmgt p5.s, p0/z, z10.s, z25.s +; CHECK-NEXT: sel z15.d, p2, z29.d, z22.d +; CHECK-NEXT: fcmuo p2.s, p0/z, z13.s, z13.s +; CHECK-NEXT: str z1, [x8, #14, mul vl] +; CHECK-NEXT: mov z2.d, p8/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 +; CHECK-NEXT: sel z1.d, p1, z29.d, z20.d +; CHECK-NEXT: fcmgt p1.s, p0/z, z9.s, z25.s +; CHECK-NEXT: fcmuo p6.s, p0/z, z19.s, z19.s +; CHECK-NEXT: sel z16.d, p3, z29.d, z23.d +; CHECK-NEXT: fcmuo p3.s, p0/z, z6.s, z6.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z4.s, z25.s +; CHECK-NEXT: str z2, [x8, #15, mul vl] +; CHECK-NEXT: sel z2.d, p5, z29.d, z18.d +; CHECK-NEXT: fcmuo p5.s, p0/z, z10.s, z10.s +; CHECK-NEXT: str z0, [x8, #13, mul vl] +; CHECK-NEXT: mov z15.d, p2/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p2.s, p0/z, z9.s, z9.s +; CHECK-NEXT: sel z0.d, p1, z29.d, z14.d +; CHECK-NEXT: mov z16.d, p6/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p1.s, p0/z, z4.s, z4.s +; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p3.s, p0/z, z28.s, z25.s +; CHECK-NEXT: sel z4.d, p4, z29.d, z7.d +; CHECK-NEXT: str z15, [x8, #11, mul vl] +; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.s, p0/z, z28.s, z28.s +; CHECK-NEXT: str z16, [x8, #12, mul vl] +; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.s, p0/z, z30.s, z30.s +; CHECK-NEXT: str z1, [x8, #10, mul vl] +; CHECK-NEXT: mov z4.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p1.s, p0/z, z5.s, z5.s +; CHECK-NEXT: sel z1.d, p3, z29.d, z3.d +; CHECK-NEXT: ldr z3, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z2, [x8, #9, mul vl] +; CHECK-NEXT: str z0, [x8, #8, mul vl] +; CHECK-NEXT: fcmuo p3.s, p0/z, z26.s, z26.s +; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z4, [x8, #7, mul vl] +; CHECK-NEXT: mov z12.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p2.s, p0/z, z3.s, z25.s +; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.s, p0/z, z24.s, z24.s +; CHECK-NEXT: mov z8.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.s, p0/z, z0.s, z0.s +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: str z12, [x8, #6, mul vl] +; CHECK-NEXT: str z1, [x8, #5, mul vl] +; CHECK-NEXT: fcmuo p0.s, p0/z, z3.s, z3.s +; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 +; CHECK-NEXT: str z8, [x8, #3, mul vl] ; CHECK-NEXT: mov z31.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p4.s, p0/z, z10.s, z28.s -; CHECK-NEXT: fcmgt p6.s, p0/z, z24.s, z28.s -; CHECK-NEXT: sel z11.d, p5, z2.d, z13.d -; CHECK-NEXT: fcmgt p5.s, p0/z, z25.s, z28.s -; CHECK-NEXT: mov z8.d, p7/m, #0 // =0x0 -; CHECK-NEXT: mov z5.d, p3/m, z2.d -; CHECK-NEXT: fcmgt p3.s, p0/z, z29.s, z28.s -; CHECK-NEXT: st1b { z31.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #9 -; CHECK-NEXT: mov z9.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p7.s, p0/z, z10.s, z10.s -; CHECK-NEXT: fcmuo p2.s, p0/z, z30.s, z30.s -; CHECK-NEXT: mov z6.d, p4/m, z2.d -; CHECK-NEXT: st1b { z8.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #8 -; CHECK-NEXT: fcmuo p4.s, p0/z, z29.s, z29.s -; CHECK-NEXT: st1b { z9.b }, p1, [x8, x9] -; CHECK-NEXT: fcmuo p1.s, p0/z, z27.s, z27.s -; CHECK-NEXT: sel z27.d, p3, z2.d, z12.d -; CHECK-NEXT: fcmgt p3.s, p0/z, z0.s, z28.s -; CHECK-NEXT: mov z4.d, p5/m, z2.d -; CHECK-NEXT: mov z3.d, p6/m, z2.d -; CHECK-NEXT: mov z6.d, p7/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p7.s, p0/z, z25.s, z25.s -; CHECK-NEXT: mov z5.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p2.s, p0/z, z24.s, z24.s -; CHECK-NEXT: mov z27.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.s, p0/z, z1.s, z1.s -; CHECK-NEXT: mov z11.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p1.s, p0/z, z0.s, z0.s -; CHECK-NEXT: st1d { z6.d }, p0, [x8, #7, mul vl] -; CHECK-NEXT: sel z0.d, p3, z2.d, z26.d -; CHECK-NEXT: st1d { z5.d }, p0, [x8, #6, mul vl] -; CHECK-NEXT: mov z4.d, p7/m, #0 // =0x0 -; CHECK-NEXT: st1d { z27.d }, p0, [x8, #5, mul vl] -; CHECK-NEXT: mov z3.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z7.d, p4/m, #0 // =0x0 -; CHECK-NEXT: st1d { z11.d }, p0, [x8, #4, mul vl] -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 -; CHECK-NEXT: st1d { z4.d }, p0, [x8, #3, mul vl] -; CHECK-NEXT: st1d { z3.d }, p0, [x8, #2, mul vl] -; CHECK-NEXT: st1d { z7.d }, p0, [x8, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [x8] +; CHECK-NEXT: mov z0.d, p2/m, z29.d +; CHECK-NEXT: str z11, [x8, #4, mul vl] +; CHECK-NEXT: mov z27.d, p5/m, #0 // =0x0 +; CHECK-NEXT: str z31, [x8, #2, mul vl] +; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 +; CHECK-NEXT: str z27, [x8, #1, mul vl] +; CHECK-NEXT: str z0, [x8] +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1490,24 +1489,31 @@ define @llrint_v32f64( %x) { ; CHECK-LABEL: llrint_v32f64: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-12 +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z18, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z17, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z16, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z15, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z14, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z13, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z12, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z11, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z10, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 96 * VG +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG @@ -1517,244 +1523,245 @@ define @llrint_v32f64( %x) { ; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: rdvl x9, #8 -; CHECK-NEXT: rdvl x10, #9 +; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: rdvl x11, #10 -; CHECK-NEXT: mov x12, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x9] -; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0, x10] -; CHECK-NEXT: mov z2.d, x12 -; CHECK-NEXT: rdvl x14, #13 -; CHECK-NEXT: rdvl x13, #12 -; CHECK-NEXT: rdvl x12, #11 -; CHECK-NEXT: ld1b { z6.b }, p1/z, [x0, x14] -; CHECK-NEXT: ld1b { z7.b }, p1/z, [x0, x13] -; CHECK-NEXT: mov z3.d, #0x8000000000000000 -; CHECK-NEXT: movprfx z24, z0 -; CHECK-NEXT: frintx z24.d, p0/m, z0.d -; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x11] -; CHECK-NEXT: movprfx z5, z1 -; CHECK-NEXT: frintx z5.d, p0/m, z1.d -; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0, x12] -; CHECK-NEXT: mov x15, #4890909195324358655 // =0x43dfffffffffffff -; CHECK-NEXT: rdvl x16, #15 -; CHECK-NEXT: movprfx z30, z6 -; CHECK-NEXT: frintx z30.d, p0/m, z6.d -; CHECK-NEXT: movprfx z28, z7 -; CHECK-NEXT: frintx z28.d, p0/m, z7.d -; CHECK-NEXT: ld1b { z8.b }, p1/z, [x0, x16] -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: frintx z4.d, p0/m, z0.d -; CHECK-NEXT: mov z0.d, #0x7fffffffffffffff -; CHECK-NEXT: ld1d { z18.d }, p0/z, [x0] -; CHECK-NEXT: fcmge p3.d, p0/z, z5.d, z2.d -; CHECK-NEXT: fcmge p2.d, p0/z, z24.d, z2.d -; CHECK-NEXT: movprfx z6, z5 -; CHECK-NEXT: fcvtzs z6.d, p0/m, z5.d -; CHECK-NEXT: movprfx z27, z1 -; CHECK-NEXT: frintx z27.d, p0/m, z1.d +; CHECK-NEXT: ldr z2, [x0, #2, mul vl] +; CHECK-NEXT: mov x9, #-4332462841530417152 // =0xc3e0000000000000 +; CHECK-NEXT: ldr z24, [x0, #6, mul vl] +; CHECK-NEXT: ldr z1, [x0, #1, mul vl] +; CHECK-NEXT: mov z7.d, x9 +; CHECK-NEXT: mov z26.d, #0x8000000000000000 +; CHECK-NEXT: ldr z3, [x0, #3, mul vl] +; CHECK-NEXT: frintx z0.d, p0/m, z0.d +; CHECK-NEXT: movprfx z30, z2 +; CHECK-NEXT: frintx z30.d, p0/m, z2.d +; CHECK-NEXT: ldr z6, [x0, #5, mul vl] ; CHECK-NEXT: movprfx z25, z24 -; CHECK-NEXT: fcvtzs z25.d, p0/m, z24.d -; CHECK-NEXT: mov z1.d, x15 -; CHECK-NEXT: rdvl x15, #14 -; CHECK-NEXT: movprfx z9, z28 -; CHECK-NEXT: fcvtzs z9.d, p0/m, z28.d -; CHECK-NEXT: movprfx z13, z8 -; CHECK-NEXT: frintx z13.d, p0/m, z8.d -; CHECK-NEXT: fcmge p4.d, p0/z, z4.d, z2.d -; CHECK-NEXT: movprfx z7, z4 -; CHECK-NEXT: fcvtzs z7.d, p0/m, z4.d -; CHECK-NEXT: ld1d { z15.d }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: fcmgt p5.d, p0/z, z24.d, z1.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z5.d, z1.d -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmge p7.d, p0/z, z27.d, z2.d -; CHECK-NEXT: movprfx z26, z27 -; CHECK-NEXT: fcvtzs z26.d, p0/m, z27.d -; CHECK-NEXT: sel z29.d, p3, z3.d, z6.d -; CHECK-NEXT: ld1b { z6.b }, p1/z, [x0, x15] -; CHECK-NEXT: fcmge p3.d, p0/z, z28.d, z2.d +; CHECK-NEXT: frintx z25.d, p0/m, z24.d +; CHECK-NEXT: movprfx z12, z1 +; CHECK-NEXT: frintx z12.d, p0/m, z1.d +; CHECK-NEXT: ldr z5, [x0, #4, mul vl] +; CHECK-NEXT: frintx z3.d, p0/m, z3.d +; CHECK-NEXT: mov x9, #4890909195324358655 // =0x43dfffffffffffff +; CHECK-NEXT: frintx z6.d, p0/m, z6.d +; CHECK-NEXT: mov z4.d, x9 +; CHECK-NEXT: fcmge p3.d, p0/z, z0.d, z7.d +; CHECK-NEXT: movprfx z24, z0 +; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.d +; CHECK-NEXT: fcmge p5.d, p0/z, z30.d, z7.d +; CHECK-NEXT: movprfx z28, z30 +; CHECK-NEXT: fcvtzs z28.d, p0/m, z30.d +; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: frintx z5.d, p0/m, z5.d +; CHECK-NEXT: fcmge p4.d, p0/z, z12.d, z7.d +; CHECK-NEXT: ldr z8, [x0, #7, mul vl] +; CHECK-NEXT: ldr z9, [x0, #15, mul vl] +; CHECK-NEXT: movprfx z27, z12 +; CHECK-NEXT: fcvtzs z27.d, p0/m, z12.d +; CHECK-NEXT: fcmge p6.d, p0/z, z3.d, z7.d +; CHECK-NEXT: fcmge p9.d, p0/z, z6.d, z7.d +; CHECK-NEXT: not p7.b, p0/z, p3.b +; CHECK-NEXT: movprfx z31, z3 +; CHECK-NEXT: fcvtzs z31.d, p0/m, z3.d +; CHECK-NEXT: movprfx z15, z6 +; CHECK-NEXT: fcvtzs z15.d, p0/m, z6.d +; CHECK-NEXT: not p5.b, p0/z, p5.b +; CHECK-NEXT: fcmge p8.d, p0/z, z5.d, z7.d +; CHECK-NEXT: movprfx z13, z5 +; CHECK-NEXT: fcvtzs z13.d, p0/m, z5.d +; CHECK-NEXT: sel z0.d, p7, z26.d, z24.d ; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z25.d, p2/m, z3.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z4.d, z1.d -; CHECK-NEXT: movprfx z16, z13 -; CHECK-NEXT: fcvtzs z16.d, p0/m, z13.d -; CHECK-NEXT: ld1d { z17.d }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: ld1d { z14.d }, p0/z, [x0, #3, mul vl] -; CHECK-NEXT: sel z31.d, p4, z3.d, z7.d -; CHECK-NEXT: movprfx z11, z6 -; CHECK-NEXT: frintx z11.d, p0/m, z6.d -; CHECK-NEXT: not p7.b, p0/z, p7.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: sel z6.d, p5, z0.d, z25.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z27.d, z1.d -; CHECK-NEXT: sel z7.d, p6, z0.d, z29.d -; CHECK-NEXT: mov z26.d, p7/m, z3.d -; CHECK-NEXT: fcmge p5.d, p0/z, z13.d, z2.d -; CHECK-NEXT: sel z25.d, p2, z0.d, z31.d -; CHECK-NEXT: fcmge p2.d, p0/z, z30.d, z2.d -; CHECK-NEXT: sel z29.d, p3, z3.d, z9.d -; CHECK-NEXT: fcmge p3.d, p0/z, z11.d, z2.d -; CHECK-NEXT: movprfx z31, z30 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z30.d -; CHECK-NEXT: movprfx z9, z11 -; CHECK-NEXT: fcvtzs z9.d, p0/m, z11.d -; CHECK-NEXT: mov z26.d, p4/m, z0.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z28.d, z1.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z30.d, z1.d -; CHECK-NEXT: not p7.b, p0/z, p5.b -; CHECK-NEXT: fcmuo p5.d, p0/z, z27.d, z27.d -; CHECK-NEXT: fcmgt p8.d, p0/z, z13.d, z1.d -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: movprfx z27, z18 -; CHECK-NEXT: frintx z27.d, p0/m, z18.d -; CHECK-NEXT: ld1d { z8.d }, p0/z, [x0, #7, mul vl] -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z16.d, p7/m, z3.d -; CHECK-NEXT: fcmuo p7.d, p0/z, z13.d, z13.d -; CHECK-NEXT: mov z31.d, p2/m, z3.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z11.d, z1.d -; CHECK-NEXT: mov z29.d, p4/m, z0.d -; CHECK-NEXT: mov z9.d, p3/m, z3.d -; CHECK-NEXT: fcmuo p3.d, p0/z, z28.d, z28.d -; CHECK-NEXT: fcmuo p4.d, p0/z, z30.d, z30.d -; CHECK-NEXT: movprfx z28, z17 -; CHECK-NEXT: frintx z28.d, p0/m, z17.d -; CHECK-NEXT: movprfx z30, z15 -; CHECK-NEXT: frintx z30.d, p0/m, z15.d -; CHECK-NEXT: ld1d { z13.d }, p0/z, [x0, #4, mul vl] -; CHECK-NEXT: mov z31.d, p6/m, z0.d -; CHECK-NEXT: fcmuo p6.d, p0/z, z11.d, z11.d -; CHECK-NEXT: sel z11.d, p8, z0.d, z16.d -; CHECK-NEXT: mov z9.d, p2/m, z0.d -; CHECK-NEXT: fcmuo p2.d, p0/z, z24.d, z24.d -; CHECK-NEXT: movprfx z24, z14 -; CHECK-NEXT: frintx z24.d, p0/m, z14.d -; CHECK-NEXT: fcmge p8.d, p0/z, z27.d, z2.d -; CHECK-NEXT: ld1d { z10.d }, p0/z, [x0, #6, mul vl] -; CHECK-NEXT: ld1d { z12.d }, p0/z, [x0, #5, mul vl] -; CHECK-NEXT: mov z26.d, p5/m, #0 // =0x0 -; CHECK-NEXT: mov z29.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmge p5.d, p0/z, z28.d, z2.d -; CHECK-NEXT: movprfx z14, z27 -; CHECK-NEXT: fcvtzs z14.d, p0/m, z27.d -; CHECK-NEXT: fcmge p3.d, p0/z, z30.d, z2.d -; CHECK-NEXT: frintx z13.d, p0/m, z13.d -; CHECK-NEXT: mov z31.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmge p4.d, p0/z, z24.d, z2.d -; CHECK-NEXT: mov z9.d, p6/m, #0 // =0x0 -; CHECK-NEXT: movprfx z15, z28 -; CHECK-NEXT: fcvtzs z15.d, p0/m, z28.d -; CHECK-NEXT: not p6.b, p0/z, p8.b -; CHECK-NEXT: movprfx z16, z30 -; CHECK-NEXT: fcvtzs z16.d, p0/m, z30.d -; CHECK-NEXT: frintx z12.d, p0/m, z12.d +; CHECK-NEXT: movprfx z17, z25 +; CHECK-NEXT: fcvtzs z17.d, p0/m, z25.d +; CHECK-NEXT: not p3.b, p0/z, p6.b +; CHECK-NEXT: fcmge p6.d, p0/z, z25.d, z7.d +; CHECK-NEXT: movprfx z22, z9 +; CHECK-NEXT: frintx z22.d, p0/m, z9.d +; CHECK-NEXT: sel z29.d, p4, z26.d, z27.d +; CHECK-NEXT: movprfx z27, z8 +; CHECK-NEXT: frintx z27.d, p0/m, z8.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z12.d, z4.d +; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sel z0.d, p5, z26.d, z28.d +; CHECK-NEXT: not p4.b, p0/z, p8.b +; CHECK-NEXT: ldr z10, [x0, #8, mul vl] +; CHECK-NEXT: not p5.b, p0/z, p9.b +; CHECK-NEXT: sel z24.d, p3, z26.d, z31.d +; CHECK-NEXT: not p3.b, p0/z, p6.b +; CHECK-NEXT: movprfx z2, z22 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z22.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z30.d, z4.d +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: fcmge p7.d, p0/z, z27.d, z7.d +; CHECK-NEXT: sel z31.d, p5, z26.d, z15.d +; CHECK-NEXT: ldr z11, [x0, #9, mul vl] +; CHECK-NEXT: movprfx z28, z10 +; CHECK-NEXT: frintx z28.d, p0/m, z10.d +; CHECK-NEXT: ldr z10, [x0, #10, mul vl] +; CHECK-NEXT: ldr z18, [x0, #11, mul vl] +; CHECK-NEXT: ldr z16, [x0, #13, mul vl] +; CHECK-NEXT: ldr z14, [x0, #14, mul vl] +; CHECK-NEXT: ldr z19, [x0, #12, mul vl] +; CHECK-NEXT: mov z17.d, p3/m, z26.d +; CHECK-NEXT: fcmgt p9.d, p0/z, z3.d, z4.d +; CHECK-NEXT: movprfx z8, z11 +; CHECK-NEXT: frintx z8.d, p0/m, z11.d +; CHECK-NEXT: sel z11.d, p4, z26.d, z13.d ; CHECK-NEXT: frintx z10.d, p0/m, z10.d -; CHECK-NEXT: movprfx z17, z24 -; CHECK-NEXT: fcvtzs z17.d, p0/m, z24.d -; CHECK-NEXT: movprfx z18, z8 -; CHECK-NEXT: frintx z18.d, p0/m, z8.d +; CHECK-NEXT: movprfx z13, z18 +; CHECK-NEXT: frintx z13.d, p0/m, z18.d +; CHECK-NEXT: fcmge p5.d, p0/z, z28.d, z7.d +; CHECK-NEXT: movprfx z18, z27 +; CHECK-NEXT: fcvtzs z18.d, p0/m, z27.d +; CHECK-NEXT: frintx z16.d, p0/m, z16.d +; CHECK-NEXT: movprfx z15, z19 +; CHECK-NEXT: frintx z15.d, p0/m, z19.d +; CHECK-NEXT: movprfx z19, z28 +; CHECK-NEXT: fcvtzs z19.d, p0/m, z28.d +; CHECK-NEXT: movprfx z21, z14 +; CHECK-NEXT: frintx z21.d, p0/m, z14.d +; CHECK-NEXT: not p4.b, p0/z, p7.b +; CHECK-NEXT: fcmge p6.d, p0/z, z8.d, z7.d +; CHECK-NEXT: movprfx z20, z8 +; CHECK-NEXT: fcvtzs z20.d, p0/m, z8.d +; CHECK-NEXT: fcmge p7.d, p0/z, z10.d, z7.d +; CHECK-NEXT: fcmge p8.d, p0/z, z13.d, z7.d ; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: sel z8.d, p6, z3.d, z14.d -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: fcmge p6.d, p0/z, z13.d, z2.d -; CHECK-NEXT: mov z11.d, p7/m, #0 // =0x0 -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: sel z14.d, p5, z3.d, z15.d -; CHECK-NEXT: fcmuo p7.d, p0/z, z5.d, z5.d -; CHECK-NEXT: sel z15.d, p3, z3.d, z16.d -; CHECK-NEXT: movprfx z16, z13 -; CHECK-NEXT: fcvtzs z16.d, p0/m, z13.d -; CHECK-NEXT: fcmge p5.d, p0/z, z12.d, z2.d -; CHECK-NEXT: fcmge p3.d, p0/z, z10.d, z2.d -; CHECK-NEXT: sel z5.d, p4, z3.d, z17.d -; CHECK-NEXT: fcmge p4.d, p0/z, z18.d, z2.d +; CHECK-NEXT: sel z9.d, p4, z26.d, z18.d +; CHECK-NEXT: fcmge p4.d, p0/z, z16.d, z7.d +; CHECK-NEXT: fcmge p3.d, p0/z, z15.d, z7.d +; CHECK-NEXT: movprfx z0, z16 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z16.d +; CHECK-NEXT: sel z14.d, p5, z26.d, z19.d +; CHECK-NEXT: movprfx z19, z10 +; CHECK-NEXT: fcvtzs z19.d, p0/m, z10.d +; CHECK-NEXT: movprfx z1, z21 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z21.d ; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: movprfx z2, z12 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z12.d -; CHECK-NEXT: movprfx z17, z10 -; CHECK-NEXT: fcvtzs z17.d, p0/m, z10.d -; CHECK-NEXT: st1b { z11.b }, p1, [x8, x16] -; CHECK-NEXT: movprfx z11, z18 -; CHECK-NEXT: fcvtzs z11.d, p0/m, z18.d -; CHECK-NEXT: mov z6.d, p2/m, #0 // =0x0 -; CHECK-NEXT: st1b { z9.b }, p1, [x8, x15] -; CHECK-NEXT: sel z9.d, p6, z3.d, z16.d -; CHECK-NEXT: fcmuo p6.d, p0/z, z4.d, z4.d -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: fcmgt p2.d, p0/z, z18.d, z1.d -; CHECK-NEXT: mov z7.d, p7/m, #0 // =0x0 -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: st1b { z31.b }, p1, [x8, x14] -; CHECK-NEXT: fcmgt p7.d, p0/z, z24.d, z1.d +; CHECK-NEXT: movprfx z23, z15 +; CHECK-NEXT: fcvtzs z23.d, p0/m, z15.d +; CHECK-NEXT: not p5.b, p0/z, p7.b +; CHECK-NEXT: sel z18.d, p6, z26.d, z20.d +; CHECK-NEXT: fcmge p6.d, p0/z, z21.d, z7.d +; CHECK-NEXT: not p7.b, p0/z, p8.b +; CHECK-NEXT: fcmge p8.d, p0/z, z22.d, z7.d +; CHECK-NEXT: movprfx z20, z13 +; CHECK-NEXT: fcvtzs z20.d, p0/m, z13.d ; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z2.d, p5/m, z3.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z28.d, z1.d -; CHECK-NEXT: sel z4.d, p3, z3.d, z17.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z13.d, z1.d -; CHECK-NEXT: mov z25.d, p6/m, #0 // =0x0 -; CHECK-NEXT: sel z3.d, p4, z3.d, z11.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z10.d, z1.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z12.d, z1.d -; CHECK-NEXT: st1b { z29.b }, p1, [x8, x13] -; CHECK-NEXT: st1b { z26.b }, p1, [x8, x12] -; CHECK-NEXT: sel z26.d, p5, z0.d, z14.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z30.d, z1.d -; CHECK-NEXT: sel z29.d, p3, z0.d, z9.d -; CHECK-NEXT: fcmuo p3.d, p0/z, z18.d, z18.d -; CHECK-NEXT: mov z3.d, p2/m, z0.d -; CHECK-NEXT: st1b { z25.b }, p1, [x8, x11] -; CHECK-NEXT: fcmuo p2.d, p0/z, z10.d, z10.d -; CHECK-NEXT: mov z4.d, p4/m, z0.d -; CHECK-NEXT: fcmuo p4.d, p0/z, z12.d, z12.d -; CHECK-NEXT: st1b { z7.b }, p1, [x8, x10] -; CHECK-NEXT: mov z2.d, p6/m, z0.d -; CHECK-NEXT: st1b { z6.b }, p1, [x8, x9] -; CHECK-NEXT: fcmuo p1.d, p0/z, z13.d, z13.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z27.d, z1.d -; CHECK-NEXT: mov z3.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p3.d, p0/z, z24.d, z24.d -; CHECK-NEXT: sel z1.d, p7, z0.d, z5.d -; CHECK-NEXT: mov z4.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p2.d, p0/z, z30.d, z30.d -; CHECK-NEXT: sel z5.d, p5, z0.d, z15.d -; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.d, p0/z, z28.d, z28.d -; CHECK-NEXT: mov z29.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p1.d, p0/z, z27.d, z27.d -; CHECK-NEXT: sel z0.d, p6, z0.d, z8.d +; CHECK-NEXT: mov z7.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z19.d, p5/m, z26.d +; CHECK-NEXT: not p3.b, p0/z, p3.b +; CHECK-NEXT: mov z0.d, p4/m, z26.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z21.d, z4.d +; CHECK-NEXT: not p5.b, p0/z, p6.b +; CHECK-NEXT: mov z23.d, p3/m, z26.d +; CHECK-NEXT: fcmgt p3.d, p0/z, z22.d, z4.d +; CHECK-NEXT: not p6.b, p0/z, p8.b +; CHECK-NEXT: mov z20.d, p7/m, z26.d +; CHECK-NEXT: fcmuo p8.d, p0/z, z22.d, z22.d +; CHECK-NEXT: mov z1.d, p5/m, z26.d +; CHECK-NEXT: fcmuo p5.d, p0/z, z21.d, z21.d +; CHECK-NEXT: fcmgt p7.d, p0/z, z25.d, z4.d +; CHECK-NEXT: mov z2.d, p6/m, z26.d +; CHECK-NEXT: sel z26.d, p1, z7.d, z29.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z16.d, z4.d +; CHECK-NEXT: ldr z29, [sp] // 16-byte Folded Reload +; CHECK-NEXT: fcmgt p6.d, p0/z, z5.d, z4.d +; CHECK-NEXT: mov z24.d, p9/m, z7.d +; CHECK-NEXT: mov z1.d, p4/m, z7.d +; CHECK-NEXT: fcmuo p4.d, p0/z, z16.d, z16.d +; CHECK-NEXT: mov z2.d, p3/m, z7.d +; CHECK-NEXT: fcmgt p3.d, p0/z, z15.d, z4.d +; CHECK-NEXT: mov z17.d, p7/m, z7.d +; CHECK-NEXT: mov z29.d, p2/m, z7.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z13.d, z4.d +; CHECK-NEXT: mov z0.d, p1/m, z7.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z10.d, z4.d +; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: mov z11.d, p6/m, z7.d +; CHECK-NEXT: fcmuo p6.d, p0/z, z15.d, z15.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z8.d, z4.d +; CHECK-NEXT: mov z2.d, p8/m, #0 // =0x0 +; CHECK-NEXT: sel z16.d, p3, z7.d, z23.d +; CHECK-NEXT: fcmuo p3.d, p0/z, z10.d, z10.d +; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 +; CHECK-NEXT: sel z15.d, p2, z7.d, z20.d +; CHECK-NEXT: fcmuo p2.d, p0/z, z13.d, z13.d +; CHECK-NEXT: str z1, [x8, #14, mul vl] +; CHECK-NEXT: sel z1.d, p1, z7.d, z19.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z28.d, z4.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z27.d, z4.d +; CHECK-NEXT: str z2, [x8, #15, mul vl] +; CHECK-NEXT: sel z2.d, p5, z7.d, z18.d +; CHECK-NEXT: mov z16.d, p6/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.d, p0/z, z8.d, z8.d +; CHECK-NEXT: str z0, [x8, #13, mul vl] +; CHECK-NEXT: mov z15.d, p2/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p2.d, p0/z, z28.d, z28.d ; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: st1d { z3.d }, p0, [x8, #7, mul vl] -; CHECK-NEXT: mov z5.d, p2/m, #0 // =0x0 -; CHECK-NEXT: st1d { z4.d }, p0, [x8, #6, mul vl] -; CHECK-NEXT: mov z26.d, p4/m, #0 // =0x0 -; CHECK-NEXT: st1d { z2.d }, p0, [x8, #5, mul vl] -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 -; CHECK-NEXT: st1d { z29.d }, p0, [x8, #4, mul vl] -; CHECK-NEXT: st1d { z1.d }, p0, [x8, #3, mul vl] -; CHECK-NEXT: st1d { z5.d }, p0, [x8, #2, mul vl] -; CHECK-NEXT: st1d { z26.d }, p0, [x8, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [x8] -; CHECK-NEXT: ldr z18, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z17, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z16, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z15, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z14, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z12, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: fcmgt p3.d, p0/z, z6.d, z4.d +; CHECK-NEXT: sel z0.d, p1, z7.d, z14.d +; CHECK-NEXT: fcmuo p1.d, p0/z, z27.d, z27.d +; CHECK-NEXT: sel z27.d, p4, z7.d, z9.d +; CHECK-NEXT: str z16, [x8, #12, mul vl] +; CHECK-NEXT: fcmuo p4.d, p0/z, z25.d, z25.d +; CHECK-NEXT: str z15, [x8, #11, mul vl] +; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.d, p0/z, z6.d, z6.d +; CHECK-NEXT: str z1, [x8, #10, mul vl] +; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 +; CHECK-NEXT: sel z1.d, p3, z7.d, z31.d +; CHECK-NEXT: fcmuo p3.d, p0/z, z5.d, z5.d +; CHECK-NEXT: ldr z5, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z27.d, p1/m, #0 // =0x0 +; CHECK-NEXT: str z2, [x8, #9, mul vl] +; CHECK-NEXT: fcmuo p1.d, p0/z, z3.d, z3.d +; CHECK-NEXT: str z0, [x8, #8, mul vl] +; CHECK-NEXT: mov z17.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.d, p0/z, z30.d, z30.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z5.d, z4.d +; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.d, p0/z, z12.d, z12.d +; CHECK-NEXT: str z27, [x8, #7, mul vl] +; CHECK-NEXT: fcmuo p0.d, p0/z, z5.d, z5.d +; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 +; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z24.d, p1/m, #0 // =0x0 +; CHECK-NEXT: str z17, [x8, #6, mul vl] +; CHECK-NEXT: mov z29.d, p4/m, #0 // =0x0 +; CHECK-NEXT: str z1, [x8, #5, mul vl] +; CHECK-NEXT: mov z26.d, p5/m, #0 // =0x0 +; CHECK-NEXT: str z11, [x8, #4, mul vl] +; CHECK-NEXT: mov z0.d, p2/m, z7.d +; CHECK-NEXT: str z24, [x8, #3, mul vl] +; CHECK-NEXT: str z29, [x8, #2, mul vl] +; CHECK-NEXT: str z26, [x8, #1, mul vl] +; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 +; CHECK-NEXT: str z0, [x8] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #12 +; CHECK-NEXT: addvl sp, sp, #17 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %a = call @llvm.llrint.nxv32i64.nxv16f64( %x) diff --git a/llvm/test/CodeGen/AArch64/sve-load-store-strict-align.ll b/llvm/test/CodeGen/AArch64/sve-load-store-strict-align.ll index c5b0651ab01d4..03b08ff437599 100644 --- a/llvm/test/CodeGen/AArch64/sve-load-store-strict-align.ll +++ b/llvm/test/CodeGen/AArch64/sve-load-store-strict-align.ll @@ -1,8 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefix CHECK-NO-STRICT-ALIGN %s ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+strict-align < %s | FileCheck %s define void @nxv16i8(ptr %ldptr, ptr %stptr) { +; CHECK-NO-STRICT-ALIGN-LABEL: nxv16i8: +; CHECK-NO-STRICT-ALIGN: // %bb.0: +; CHECK-NO-STRICT-ALIGN-NEXT: ldr z0, [x0] +; CHECK-NO-STRICT-ALIGN-NEXT: str z0, [x1] +; CHECK-NO-STRICT-ALIGN-NEXT: ret +; ; CHECK-LABEL: nxv16i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b @@ -15,6 +21,12 @@ define void @nxv16i8(ptr %ldptr, ptr %stptr) { } define void @nxv8i16(ptr %ldptr, ptr %stptr) { +; CHECK-NO-STRICT-ALIGN-LABEL: nxv8i16: +; CHECK-NO-STRICT-ALIGN: // %bb.0: +; CHECK-NO-STRICT-ALIGN-NEXT: ldr z0, [x0] +; CHECK-NO-STRICT-ALIGN-NEXT: str z0, [x1] +; CHECK-NO-STRICT-ALIGN-NEXT: ret +; ; CHECK-LABEL: nxv8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h @@ -27,6 +39,12 @@ define void @nxv8i16(ptr %ldptr, ptr %stptr) { } define void @nxv4i32(ptr %ldptr, ptr %stptr) { +; CHECK-NO-STRICT-ALIGN-LABEL: nxv4i32: +; CHECK-NO-STRICT-ALIGN: // %bb.0: +; CHECK-NO-STRICT-ALIGN-NEXT: ldr z0, [x0] +; CHECK-NO-STRICT-ALIGN-NEXT: str z0, [x1] +; CHECK-NO-STRICT-ALIGN-NEXT: ret +; ; CHECK-LABEL: nxv4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s @@ -39,6 +57,12 @@ define void @nxv4i32(ptr %ldptr, ptr %stptr) { } define void @nxv2i64(ptr %ldptr, ptr %stptr) { +; CHECK-NO-STRICT-ALIGN-LABEL: nxv2i64: +; CHECK-NO-STRICT-ALIGN: // %bb.0: +; CHECK-NO-STRICT-ALIGN-NEXT: ldr z0, [x0] +; CHECK-NO-STRICT-ALIGN-NEXT: str z0, [x1] +; CHECK-NO-STRICT-ALIGN-NEXT: ret +; ; CHECK-LABEL: nxv2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d @@ -51,6 +75,12 @@ define void @nxv2i64(ptr %ldptr, ptr %stptr) { } define void @nxv16i1(ptr %ldptr, ptr %stptr) { +; CHECK-NO-STRICT-ALIGN-LABEL: nxv16i1: +; CHECK-NO-STRICT-ALIGN: // %bb.0: +; CHECK-NO-STRICT-ALIGN-NEXT: ldr p0, [x0] +; CHECK-NO-STRICT-ALIGN-NEXT: str p0, [x1] +; CHECK-NO-STRICT-ALIGN-NEXT: ret +; ; CHECK-LABEL: nxv16i1: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr p0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-lrint.ll b/llvm/test/CodeGen/AArch64/sve-lrint.ll index 2a1432d881e57..908ba2392a437 100644 --- a/llvm/test/CodeGen/AArch64/sve-lrint.ll +++ b/llvm/test/CodeGen/AArch64/sve-lrint.ll @@ -341,7 +341,8 @@ define @lrint_v32f16( %x) { ; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 136 * VG +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG @@ -351,234 +352,231 @@ define @lrint_v32f16( %x) { ; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG -; CHECK-NEXT: uunpkhi z5.s, z0.h ; CHECK-NEXT: uunpklo z4.s, z0.h +; CHECK-NEXT: uunpkhi z5.s, z0.h ; CHECK-NEXT: mov w9, #64511 // =0xfbff ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z6.s, z1.h -; CHECK-NEXT: mov z30.h, w9 -; CHECK-NEXT: uunpkhi z10.s, z1.h +; CHECK-NEXT: mov z26.h, w9 +; CHECK-NEXT: uunpkhi z25.s, z1.h ; CHECK-NEXT: mov w9, #31743 // =0x7bff -; CHECK-NEXT: mov z29.d, #0x8000000000000000 -; CHECK-NEXT: uunpklo z8.s, z2.h -; CHECK-NEXT: uunpkhi z13.s, z3.h -; CHECK-NEXT: uunpklo z18.s, z3.h -; CHECK-NEXT: uunpklo z7.d, z5.s +; CHECK-NEXT: mov z27.d, #0x8000000000000000 +; CHECK-NEXT: uunpklo z31.s, z2.h +; CHECK-NEXT: uunpkhi z12.s, z2.h +; CHECK-NEXT: mov z17.d, z3.d ; CHECK-NEXT: uunpklo z0.d, z4.s ; CHECK-NEXT: uunpkhi z4.d, z4.s +; CHECK-NEXT: uunpklo z7.d, z5.s ; CHECK-NEXT: uunpkhi z24.d, z5.s -; CHECK-NEXT: uunpklo z25.d, z6.s -; CHECK-NEXT: uunpkhi z26.d, z6.s -; CHECK-NEXT: uunpklo z27.d, z10.s -; CHECK-NEXT: uunpkhi z10.d, z10.s -; CHECK-NEXT: uunpklo z12.d, z8.s -; CHECK-NEXT: uunpkhi z16.d, z8.s -; CHECK-NEXT: movprfx z5, z7 -; CHECK-NEXT: frintx z5.h, p0/m, z7.h -; CHECK-NEXT: movprfx z1, z4 -; CHECK-NEXT: frintx z1.h, p0/m, z4.h +; CHECK-NEXT: uunpklo z28.d, z6.s +; CHECK-NEXT: uunpkhi z29.d, z6.s +; CHECK-NEXT: uunpklo z8.d, z25.s +; CHECK-NEXT: uunpkhi z9.d, z25.s +; CHECK-NEXT: uunpklo z16.s, z17.h +; CHECK-NEXT: uunpklo z11.d, z31.s +; CHECK-NEXT: uunpkhi z14.d, z31.s +; CHECK-NEXT: uunpkhi z17.s, z17.h +; CHECK-NEXT: movprfx z30, z4 +; CHECK-NEXT: frintx z30.h, p0/m, z4.h +; CHECK-NEXT: movprfx z4, z7 +; CHECK-NEXT: frintx z4.h, p0/m, z7.h ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: movprfx z6, z24 ; CHECK-NEXT: frintx z6.h, p0/m, z24.h -; CHECK-NEXT: movprfx z24, z25 -; CHECK-NEXT: frintx z24.h, p0/m, z25.h -; CHECK-NEXT: movprfx z25, z26 -; CHECK-NEXT: frintx z25.h, p0/m, z26.h -; CHECK-NEXT: movprfx z28, z27 -; CHECK-NEXT: frintx z28.h, p0/m, z27.h -; CHECK-NEXT: movprfx z8, z10 -; CHECK-NEXT: frintx z8.h, p0/m, z10.h -; CHECK-NEXT: mov z7.h, w9 -; CHECK-NEXT: mov z4.d, #0x7fffffffffffffff -; CHECK-NEXT: rdvl x9, #15 -; CHECK-NEXT: fcmge p3.h, p0/z, z5.h, z30.h -; CHECK-NEXT: movprfx z11, z5 -; CHECK-NEXT: fcvtzs z11.d, p0/m, z5.h -; CHECK-NEXT: fcmge p2.h, p0/z, z1.h, z30.h -; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z30.h -; CHECK-NEXT: fcmge p4.h, p0/z, z6.h, z30.h -; CHECK-NEXT: movprfx z9, z6 -; CHECK-NEXT: fcvtzs z9.d, p0/m, z6.h +; CHECK-NEXT: movprfx z7, z28 +; CHECK-NEXT: frintx z7.h, p0/m, z28.h +; CHECK-NEXT: movprfx z25, z29 +; CHECK-NEXT: frintx z25.h, p0/m, z29.h +; CHECK-NEXT: movprfx z3, z9 +; CHECK-NEXT: frintx z3.h, p0/m, z9.h +; CHECK-NEXT: mov z5.h, w9 +; CHECK-NEXT: movprfx z31, z11 +; CHECK-NEXT: frintx z31.h, p0/m, z11.h +; CHECK-NEXT: movprfx z9, z14 +; CHECK-NEXT: frintx z9.h, p0/m, z14.h +; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z26.h +; CHECK-NEXT: fcmge p4.h, p0/z, z4.h, z26.h +; CHECK-NEXT: movprfx z24, z0 +; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.h +; CHECK-NEXT: fcmge p2.h, p0/z, z30.h, z26.h +; CHECK-NEXT: movprfx z29, z4 +; CHECK-NEXT: fcvtzs z29.d, p0/m, z4.h +; CHECK-NEXT: fcmge p6.h, p0/z, z6.h, z26.h +; CHECK-NEXT: movprfx z28, z30 +; CHECK-NEXT: fcvtzs z28.d, p0/m, z30.h +; CHECK-NEXT: movprfx z10, z6 +; CHECK-NEXT: fcvtzs z10.d, p0/m, z6.h +; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: fcmge p3.h, p0/z, z7.h, z26.h +; CHECK-NEXT: movprfx z13, z7 +; CHECK-NEXT: fcvtzs z13.d, p0/m, z7.h ; CHECK-NEXT: movprfx z15, z25 ; CHECK-NEXT: fcvtzs z15.d, p0/m, z25.h -; CHECK-NEXT: movprfx z14, z24 -; CHECK-NEXT: fcvtzs z14.d, p0/m, z24.h -; CHECK-NEXT: movprfx z26, z0 -; CHECK-NEXT: fcvtzs z26.d, p0/m, z0.h -; CHECK-NEXT: movprfx z19, z28 -; CHECK-NEXT: fcvtzs z19.d, p0/m, z28.h -; CHECK-NEXT: movprfx z31, z1 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z1.h -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p6.b, p0/z, p2.b -; CHECK-NEXT: fcmge p2.h, p0/z, z25.h, z30.h -; CHECK-NEXT: sel z27.d, p3, z29.d, z11.d -; CHECK-NEXT: uunpkhi z11.s, z2.h ; CHECK-NEXT: not p5.b, p0/z, p1.b -; CHECK-NEXT: fcmge p1.h, p0/z, z24.h, z30.h -; CHECK-NEXT: not p3.b, p0/z, p4.b -; CHECK-NEXT: fcmge p4.h, p0/z, z28.h, z30.h -; CHECK-NEXT: mov z26.d, p5/m, z29.d -; CHECK-NEXT: mov z31.d, p6/m, z29.d -; CHECK-NEXT: sel z2.d, p3, z29.d, z9.d -; CHECK-NEXT: movprfx z9, z12 -; CHECK-NEXT: frintx z9.h, p0/m, z12.h -; CHECK-NEXT: uunpkhi z12.d, z13.s -; CHECK-NEXT: uunpklo z17.d, z11.s -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: sel z3.d, p2, z29.d, z15.d -; CHECK-NEXT: uunpklo z15.d, z13.s -; CHECK-NEXT: fcmge p2.h, p0/z, z8.h, z30.h -; CHECK-NEXT: sel z10.d, p1, z29.d, z14.d -; CHECK-NEXT: movprfx z14, z16 -; CHECK-NEXT: frintx z14.h, p0/m, z16.h -; CHECK-NEXT: uunpkhi z16.d, z18.s -; CHECK-NEXT: movprfx z13, z17 -; CHECK-NEXT: frintx z13.h, p0/m, z17.h -; CHECK-NEXT: movprfx z20, z12 -; CHECK-NEXT: frintx z20.h, p0/m, z12.h -; CHECK-NEXT: fcmge p3.h, p0/z, z9.h, z30.h -; CHECK-NEXT: uunpkhi z17.d, z11.s -; CHECK-NEXT: uunpklo z18.d, z18.s -; CHECK-NEXT: movprfx z12, z8 -; CHECK-NEXT: fcvtzs z12.d, p0/m, z8.h -; CHECK-NEXT: movprfx z21, z15 -; CHECK-NEXT: frintx z21.h, p0/m, z15.h -; CHECK-NEXT: not p1.b, p0/z, p4.b -; CHECK-NEXT: movprfx z15, z9 -; CHECK-NEXT: fcvtzs z15.d, p0/m, z9.h -; CHECK-NEXT: frintx z16.h, p0/m, z16.h -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: movprfx z22, z14 -; CHECK-NEXT: fcvtzs z22.d, p0/m, z14.h -; CHECK-NEXT: fcmge p4.h, p0/z, z13.h, z30.h -; CHECK-NEXT: fcmge p5.h, p0/z, z20.h, z30.h -; CHECK-NEXT: sel z11.d, p1, z29.d, z19.d +; CHECK-NEXT: movprfx z18, z3 +; CHECK-NEXT: fcvtzs z18.d, p0/m, z3.h +; CHECK-NEXT: movprfx z20, z31 +; CHECK-NEXT: fcvtzs z20.d, p0/m, z31.h +; CHECK-NEXT: not p4.b, p0/z, p4.b +; CHECK-NEXT: movprfx z21, z9 +; CHECK-NEXT: fcvtzs z21.d, p0/m, z9.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z30.h, z5.h +; CHECK-NEXT: sel z0.d, p5, z27.d, z24.d +; CHECK-NEXT: not p7.b, p0/z, p2.b +; CHECK-NEXT: fcmgt p2.h, p0/z, z4.h, z5.h +; CHECK-NEXT: mov z29.d, p4/m, z27.d +; CHECK-NEXT: fcmge p4.h, p0/z, z25.h, z26.h +; CHECK-NEXT: not p5.b, p0/z, p6.b ; CHECK-NEXT: not p3.b, p0/z, p3.b +; CHECK-NEXT: fcmge p6.h, p0/z, z9.h, z26.h +; CHECK-NEXT: fcmgt p9.h, p0/z, z6.h, z5.h +; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sel z0.d, p7, z27.d, z28.d +; CHECK-NEXT: movprfx z28, z8 +; CHECK-NEXT: frintx z28.h, p0/m, z8.h +; CHECK-NEXT: sel z8.d, p5, z27.d, z10.d +; CHECK-NEXT: uunpklo z10.d, z12.s +; CHECK-NEXT: uunpkhi z12.d, z12.s +; CHECK-NEXT: not p5.b, p0/z, p4.b +; CHECK-NEXT: sel z11.d, p3, z27.d, z13.d +; CHECK-NEXT: uunpklo z13.d, z16.s +; CHECK-NEXT: fcmge p3.h, p0/z, z3.h, z26.h +; CHECK-NEXT: not p6.b, p0/z, p6.b +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: sel z24.d, p5, z27.d, z15.d +; CHECK-NEXT: uunpkhi z15.d, z16.s +; CHECK-NEXT: movprfx z14, z28 +; CHECK-NEXT: fcvtzs z14.d, p0/m, z28.h +; CHECK-NEXT: frintx z10.h, p0/m, z10.h +; CHECK-NEXT: uunpklo z16.d, z17.s +; CHECK-NEXT: frintx z12.h, p0/m, z12.h +; CHECK-NEXT: uunpkhi z17.d, z17.s +; CHECK-NEXT: movprfx z19, z13 +; CHECK-NEXT: frintx z19.h, p0/m, z13.h +; CHECK-NEXT: fcmge p4.h, p0/z, z28.h, z26.h +; CHECK-NEXT: fcmge p5.h, p0/z, z31.h, z26.h +; CHECK-NEXT: not p3.b, p0/z, p3.b +; CHECK-NEXT: frintx z15.h, p0/m, z15.h +; CHECK-NEXT: fcmge p7.h, p0/z, z10.h, z26.h +; CHECK-NEXT: frintx z16.h, p0/m, z16.h +; CHECK-NEXT: fcmge p8.h, p0/z, z12.h, z26.h ; CHECK-NEXT: frintx z17.h, p0/m, z17.h -; CHECK-NEXT: frintx z18.h, p0/m, z18.h -; CHECK-NEXT: movprfx z19, z20 -; CHECK-NEXT: fcvtzs z19.d, p0/m, z20.h -; CHECK-NEXT: mov z12.d, p2/m, z29.d -; CHECK-NEXT: fcmge p2.h, p0/z, z21.h, z30.h -; CHECK-NEXT: fcmge p1.h, p0/z, z14.h, z30.h -; CHECK-NEXT: mov z15.d, p3/m, z29.d -; CHECK-NEXT: movprfx z23, z21 -; CHECK-NEXT: fcvtzs z23.d, p0/m, z21.h -; CHECK-NEXT: not p3.b, p0/z, p4.b -; CHECK-NEXT: fcmge p4.h, p0/z, z16.h, z30.h -; CHECK-NEXT: fcmgt p8.h, p0/z, z21.h, z7.h +; CHECK-NEXT: movprfx z23, z19 +; CHECK-NEXT: fcvtzs z23.d, p0/m, z19.h +; CHECK-NEXT: not p4.b, p0/z, p4.b ; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: fcmge p6.h, p0/z, z17.h, z30.h -; CHECK-NEXT: fcmge p7.h, p0/z, z18.h, z30.h -; CHECK-NEXT: movprfx z30, z16 -; CHECK-NEXT: fcvtzs z30.d, p0/m, z16.h -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmuo p9.h, p0/z, z21.h, z21.h -; CHECK-NEXT: mov z19.d, p5/m, z29.d -; CHECK-NEXT: fcmgt p5.h, p0/z, z20.h, z7.h -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: sel z13.d, p3, z27.d, z18.d +; CHECK-NEXT: fcmge p3.h, p0/z, z19.h, z26.h +; CHECK-NEXT: movprfx z0, z15 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z15.h +; CHECK-NEXT: sel z22.d, p4, z27.d, z14.d +; CHECK-NEXT: sel z18.d, p6, z27.d, z21.d +; CHECK-NEXT: movprfx z21, z12 +; CHECK-NEXT: fcvtzs z21.d, p0/m, z12.h +; CHECK-NEXT: movprfx z1, z16 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z16.h +; CHECK-NEXT: sel z14.d, p5, z27.d, z20.d +; CHECK-NEXT: fcmge p4.h, p0/z, z15.h, z26.h +; CHECK-NEXT: movprfx z20, z10 +; CHECK-NEXT: fcvtzs z20.d, p0/m, z10.h +; CHECK-NEXT: movprfx z2, z17 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z17.h +; CHECK-NEXT: not p5.b, p0/z, p7.b +; CHECK-NEXT: fcmge p6.h, p0/z, z16.h, z26.h +; CHECK-NEXT: not p7.b, p0/z, p8.b +; CHECK-NEXT: fcmge p8.h, p0/z, z17.h, z26.h +; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff +; CHECK-NEXT: not p3.b, p0/z, p3.b ; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z23.d, p2/m, z29.d -; CHECK-NEXT: fcmuo p2.h, p0/z, z20.h, z20.h -; CHECK-NEXT: movprfx z20, z18 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z18.h -; CHECK-NEXT: movprfx z21, z13 -; CHECK-NEXT: fcvtzs z21.d, p0/m, z13.h -; CHECK-NEXT: mov z22.d, p1/m, z29.d -; CHECK-NEXT: not p1.b, p0/z, p7.b -; CHECK-NEXT: mov z30.d, p4/m, z29.d -; CHECK-NEXT: fcmgt p4.h, p0/z, z18.h, z7.h -; CHECK-NEXT: mov z19.d, p5/m, z4.d -; CHECK-NEXT: fcmuo p7.h, p0/z, z18.h, z18.h -; CHECK-NEXT: movprfx z18, z17 -; CHECK-NEXT: fcvtzs z18.d, p0/m, z17.h -; CHECK-NEXT: fcmgt p5.h, p0/z, z16.h, z7.h -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: mov z23.d, p8/m, z4.d -; CHECK-NEXT: mov z20.d, p1/m, z29.d -; CHECK-NEXT: mov z21.d, p3/m, z29.d -; CHECK-NEXT: fcmuo p3.h, p0/z, z16.h, z16.h -; CHECK-NEXT: mov z19.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p2.h, p0/z, z17.h, z7.h -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: sel z29.d, p6, z29.d, z18.d -; CHECK-NEXT: mov z23.d, p9/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p6.h, p0/z, z14.h, z7.h -; CHECK-NEXT: mov z30.d, p5/m, z4.d -; CHECK-NEXT: sel z16.d, p4, z4.d, z20.d -; CHECK-NEXT: fcmuo p4.h, p0/z, z17.h, z17.h -; CHECK-NEXT: st1b { z19.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #14 -; CHECK-NEXT: fcmgt p5.h, p0/z, z1.h, z7.h -; CHECK-NEXT: st1b { z23.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #13 -; CHECK-NEXT: mov z29.d, p2/m, z4.d -; CHECK-NEXT: mov z30.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p3.h, p0/z, z13.h, z7.h -; CHECK-NEXT: mov z16.d, p7/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p2.h, p0/z, z9.h, z7.h -; CHECK-NEXT: fcmuo p7.h, p0/z, z14.h, z14.h +; CHECK-NEXT: mov z20.d, p5/m, z27.d +; CHECK-NEXT: mov z21.d, p7/m, z27.d +; CHECK-NEXT: not p5.b, p0/z, p6.b +; CHECK-NEXT: mov z23.d, p3/m, z27.d +; CHECK-NEXT: fcmgt p3.h, p0/z, z17.h, z5.h +; CHECK-NEXT: not p6.b, p0/z, p8.b +; CHECK-NEXT: mov z0.d, p4/m, z27.d +; CHECK-NEXT: fcmgt p4.h, p0/z, z16.h, z5.h +; CHECK-NEXT: mov z1.d, p5/m, z27.d +; CHECK-NEXT: fcmuo p5.h, p0/z, z16.h, z16.h +; CHECK-NEXT: mov z29.d, p2/m, z26.d +; CHECK-NEXT: mov z2.d, p6/m, z27.d +; CHECK-NEXT: ldr z27, [sp] // 16-byte Folded Reload +; CHECK-NEXT: fcmgt p6.h, p0/z, z7.h, z5.h +; CHECK-NEXT: fcmgt p2.h, p0/z, z12.h, z5.h +; CHECK-NEXT: fcmuo p8.h, p0/z, z17.h, z17.h +; CHECK-NEXT: fcmgt p7.h, p0/z, z28.h, z5.h +; CHECK-NEXT: mov z1.d, p4/m, z26.d +; CHECK-NEXT: fcmuo p4.h, p0/z, z15.h, z15.h +; CHECK-NEXT: mov z8.d, p9/m, z26.d +; CHECK-NEXT: mov z27.d, p1/m, z26.d +; CHECK-NEXT: fcmgt p1.h, p0/z, z15.h, z5.h +; CHECK-NEXT: mov z2.d, p3/m, z26.d +; CHECK-NEXT: fcmgt p3.h, p0/z, z19.h, z5.h +; CHECK-NEXT: mov z11.d, p6/m, z26.d +; CHECK-NEXT: fcmuo p6.h, p0/z, z19.h, z19.h +; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p5.h, p0/z, z9.h, z5.h +; CHECK-NEXT: sel z15.d, p2, z26.d, z21.d +; CHECK-NEXT: fcmuo p2.h, p0/z, z12.h, z12.h +; CHECK-NEXT: mov z2.d, p8/m, #0 // =0x0 +; CHECK-NEXT: sel z16.d, p7, z26.d, z22.d +; CHECK-NEXT: mov z0.d, p1/m, z26.d +; CHECK-NEXT: fcmgt p1.h, p0/z, z10.h, z5.h +; CHECK-NEXT: str z1, [x8, #14, mul vl] +; CHECK-NEXT: sel z17.d, p3, z26.d, z23.d +; CHECK-NEXT: fcmuo p3.h, p0/z, z10.h, z10.h +; CHECK-NEXT: str z2, [x8, #15, mul vl] +; CHECK-NEXT: sel z2.d, p5, z26.d, z18.d +; CHECK-NEXT: fcmuo p5.h, p0/z, z9.h, z9.h +; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p4.h, p0/z, z3.h, z5.h +; CHECK-NEXT: mov z15.d, p2/m, #0 // =0x0 +; CHECK-NEXT: sel z1.d, p1, z26.d, z20.d +; CHECK-NEXT: fcmgt p1.h, p0/z, z31.h, z5.h +; CHECK-NEXT: mov z17.d, p6/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p2.h, p0/z, z31.h, z31.h +; CHECK-NEXT: str z0, [x8, #13, mul vl] +; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.h, p0/z, z25.h, z25.h +; CHECK-NEXT: str z17, [x8, #12, mul vl] +; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p3.h, p0/z, z25.h, z5.h +; CHECK-NEXT: str z15, [x8, #11, mul vl] +; CHECK-NEXT: sel z0.d, p1, z26.d, z14.d +; CHECK-NEXT: fcmuo p1.h, p0/z, z3.h, z3.h +; CHECK-NEXT: sel z3.d, p4, z26.d, z13.d +; CHECK-NEXT: fcmuo p4.h, p0/z, z28.h, z28.h +; CHECK-NEXT: str z1, [x8, #10, mul vl] +; CHECK-NEXT: sel z1.d, p3, z26.d, z24.d +; CHECK-NEXT: fcmuo p3.h, p0/z, z7.h, z7.h +; CHECK-NEXT: ldr z7, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z2, [x8, #9, mul vl] +; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 +; CHECK-NEXT: mov z3.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p1.h, p0/z, z6.h, z6.h +; CHECK-NEXT: mov z16.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.h, p0/z, z4.h, z4.h +; CHECK-NEXT: fcmgt p2.h, p0/z, z7.h, z5.h +; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.h, p0/z, z30.h, z30.h +; CHECK-NEXT: str z0, [x8, #8, mul vl] +; CHECK-NEXT: fcmuo p0.h, p0/z, z7.h, z7.h +; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 +; CHECK-NEXT: str z3, [x8, #7, mul vl] +; CHECK-NEXT: ldr z0, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z16, [x8, #6, mul vl] +; CHECK-NEXT: mov z8.d, p1/m, #0 // =0x0 +; CHECK-NEXT: str z1, [x8, #5, mul vl] ; CHECK-NEXT: mov z29.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.h, p0/z, z13.h, z13.h -; CHECK-NEXT: st1b { z30.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #12 -; CHECK-NEXT: sel z30.d, p5, z4.d, z31.d -; CHECK-NEXT: st1b { z16.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #11 -; CHECK-NEXT: sel z31.d, p3, z4.d, z21.d -; CHECK-NEXT: st1b { z29.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #10 -; CHECK-NEXT: fcmgt p5.h, p0/z, z24.h, z7.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z28.h, z7.h -; CHECK-NEXT: sel z13.d, p2, z4.d, z15.d -; CHECK-NEXT: fcmuo p2.h, p0/z, z9.h, z9.h -; CHECK-NEXT: sel z29.d, p6, z4.d, z22.d -; CHECK-NEXT: mov z31.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p4.h, p0/z, z8.h, z7.h -; CHECK-NEXT: fcmgt p6.h, p0/z, z5.h, z7.h -; CHECK-NEXT: sel z9.d, p5, z4.d, z10.d -; CHECK-NEXT: fcmgt p5.h, p0/z, z6.h, z7.h -; CHECK-NEXT: st1b { z31.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #9 -; CHECK-NEXT: mov z29.d, p7/m, #0 // =0x0 -; CHECK-NEXT: sel z10.d, p3, z4.d, z11.d -; CHECK-NEXT: fcmgt p3.h, p0/z, z25.h, z7.h -; CHECK-NEXT: mov z13.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p7.h, p0/z, z8.h, z8.h -; CHECK-NEXT: fcmuo p2.h, p0/z, z28.h, z28.h -; CHECK-NEXT: sel z28.d, p4, z4.d, z12.d -; CHECK-NEXT: st1b { z29.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #8 -; CHECK-NEXT: fcmuo p4.h, p0/z, z25.h, z25.h -; CHECK-NEXT: st1b { z13.b }, p1, [x8, x9] -; CHECK-NEXT: fcmuo p1.h, p0/z, z24.h, z24.h -; CHECK-NEXT: mov z2.d, p5/m, z4.d -; CHECK-NEXT: mov z3.d, p3/m, z4.d -; CHECK-NEXT: fcmgt p3.h, p0/z, z0.h, z7.h -; CHECK-NEXT: mov z28.d, p7/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p7.h, p0/z, z6.h, z6.h -; CHECK-NEXT: mov z10.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p2.h, p0/z, z5.h, z5.h -; CHECK-NEXT: sel z5.d, p6, z4.d, z27.d -; CHECK-NEXT: mov z3.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.h, p0/z, z1.h, z1.h -; CHECK-NEXT: mov z9.d, p1/m, #0 // =0x0 -; CHECK-NEXT: st1d { z28.d }, p0, [x8, #7, mul vl] -; CHECK-NEXT: fcmuo p1.h, p0/z, z0.h, z0.h -; CHECK-NEXT: sel z0.d, p3, z4.d, z26.d -; CHECK-NEXT: st1d { z10.d }, p0, [x8, #6, mul vl] -; CHECK-NEXT: mov z2.d, p7/m, #0 // =0x0 -; CHECK-NEXT: st1d { z3.d }, p0, [x8, #5, mul vl] -; CHECK-NEXT: mov z5.d, p2/m, #0 // =0x0 -; CHECK-NEXT: st1d { z9.d }, p0, [x8, #4, mul vl] -; CHECK-NEXT: mov z30.d, p4/m, #0 // =0x0 -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 -; CHECK-NEXT: st1d { z2.d }, p0, [x8, #3, mul vl] -; CHECK-NEXT: st1d { z5.d }, p0, [x8, #2, mul vl] -; CHECK-NEXT: st1d { z30.d }, p0, [x8, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [x8] +; CHECK-NEXT: mov z27.d, p5/m, #0 // =0x0 +; CHECK-NEXT: str z11, [x8, #4, mul vl] +; CHECK-NEXT: str z8, [x8, #3, mul vl] +; CHECK-NEXT: mov z0.d, p2/m, z26.d +; CHECK-NEXT: str z29, [x8, #2, mul vl] +; CHECK-NEXT: str z27, [x8, #1, mul vl] +; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 +; CHECK-NEXT: str z0, [x8] +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -940,7 +938,8 @@ define @lrint_v32f32( %x) { ; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 136 * VG +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG @@ -951,224 +950,224 @@ define @lrint_v32f32( %x) { ; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; CHECK-NEXT: uunpklo z24.d, z0.s -; CHECK-NEXT: uunpkhi z25.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w9, #-553648128 // =0xdf000000 ; CHECK-NEXT: uunpklo z26.d, z1.s -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uunpkhi z27.d, z1.s -; CHECK-NEXT: mov z31.s, w9 +; CHECK-NEXT: uunpkhi z25.d, z0.s +; CHECK-NEXT: uunpkhi z28.d, z1.s +; CHECK-NEXT: mov z29.s, w9 ; CHECK-NEXT: mov w9, #1593835519 // =0x5effffff -; CHECK-NEXT: uunpklo z28.d, z2.s -; CHECK-NEXT: mov z8.d, #0x8000000000000000 -; CHECK-NEXT: uunpklo z30.d, z3.s -; CHECK-NEXT: uunpklo z13.d, z4.s +; CHECK-NEXT: mov z17.d, z5.d +; CHECK-NEXT: mov z27.d, #0x8000000000000000 +; CHECK-NEXT: uunpkhi z30.d, z2.s +; CHECK-NEXT: uunpklo z8.d, z3.s ; CHECK-NEXT: movprfx z0, z24 ; CHECK-NEXT: frintx z0.s, p0/m, z24.s -; CHECK-NEXT: movprfx z1, z25 -; CHECK-NEXT: frintx z1.s, p0/m, z25.s -; CHECK-NEXT: uunpkhi z15.d, z4.s +; CHECK-NEXT: uunpkhi z9.d, z3.s +; CHECK-NEXT: uunpkhi z14.d, z4.s ; CHECK-NEXT: movprfx z24, z26 ; CHECK-NEXT: frintx z24.s, p0/m, z26.s -; CHECK-NEXT: uunpkhi z26.d, z2.s -; CHECK-NEXT: movprfx z25, z27 -; CHECK-NEXT: frintx z25.s, p0/m, z27.s -; CHECK-NEXT: movprfx z27, z28 -; CHECK-NEXT: frintx z27.s, p0/m, z28.s -; CHECK-NEXT: uunpklo z16.d, z5.s -; CHECK-NEXT: uunpkhi z17.d, z7.s -; CHECK-NEXT: frintx z30.s, p0/m, z30.s -; CHECK-NEXT: uunpklo z18.d, z7.s -; CHECK-NEXT: uunpklo z21.d, z6.s -; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z31.s -; CHECK-NEXT: movprfx z9, z0 -; CHECK-NEXT: fcvtzs z9.d, p0/m, z0.s +; CHECK-NEXT: movprfx z1, z25 +; CHECK-NEXT: frintx z1.s, p0/m, z25.s +; CHECK-NEXT: movprfx z5, z28 +; CHECK-NEXT: frintx z5.s, p0/m, z28.s +; CHECK-NEXT: uunpklo z26.d, z2.s +; CHECK-NEXT: uunpklo z16.d, z17.s +; CHECK-NEXT: mov z25.s, w9 +; CHECK-NEXT: movprfx z28, z30 +; CHECK-NEXT: frintx z28.s, p0/m, z30.s +; CHECK-NEXT: movprfx z30, z8 +; CHECK-NEXT: frintx z30.s, p0/m, z8.s +; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z29.s +; CHECK-NEXT: movprfx z31, z0 +; CHECK-NEXT: fcvtzs z31.d, p0/m, z0.s +; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, z29.s +; CHECK-NEXT: fcmge p3.s, p0/z, z24.s, z29.s +; CHECK-NEXT: fcmge p5.s, p0/z, z5.s, z29.s +; CHECK-NEXT: frintx z26.s, p0/m, z26.s ; CHECK-NEXT: movprfx z10, z1 ; CHECK-NEXT: fcvtzs z10.d, p0/m, z1.s -; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, z31.s -; CHECK-NEXT: fcmge p3.s, p0/z, z24.s, z31.s ; CHECK-NEXT: movprfx z11, z24 ; CHECK-NEXT: fcvtzs z11.d, p0/m, z24.s -; CHECK-NEXT: movprfx z29, z26 -; CHECK-NEXT: frintx z29.s, p0/m, z26.s -; CHECK-NEXT: fcmge p4.s, p0/z, z25.s, z31.s -; CHECK-NEXT: fcmge p5.s, p0/z, z27.s, z31.s -; CHECK-NEXT: movprfx z12, z27 -; CHECK-NEXT: fcvtzs z12.d, p0/m, z27.s -; CHECK-NEXT: movprfx z19, z30 -; CHECK-NEXT: fcvtzs z19.d, p0/m, z30.s -; CHECK-NEXT: movprfx z7, z16 -; CHECK-NEXT: frintx z7.s, p0/m, z16.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: frintx z17.s, p0/m, z17.s -; CHECK-NEXT: uunpkhi z16.d, z5.s +; CHECK-NEXT: movprfx z12, z5 +; CHECK-NEXT: fcvtzs z12.d, p0/m, z5.s +; CHECK-NEXT: movprfx z15, z28 +; CHECK-NEXT: fcvtzs z15.d, p0/m, z28.s +; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: not p4.b, p0/z, p1.b +; CHECK-NEXT: fcmgt p1.s, p0/z, z1.s, z25.s +; CHECK-NEXT: fcmgt p9.s, p0/z, z5.s, z25.s ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: frintx z18.s, p0/m, z18.s -; CHECK-NEXT: mov z28.s, w9 -; CHECK-NEXT: not p6.b, p0/z, p3.b -; CHECK-NEXT: sel z26.d, p1, z8.d, z9.d -; CHECK-NEXT: movprfx z14, z29 -; CHECK-NEXT: fcvtzs z14.d, p0/m, z29.s -; CHECK-NEXT: sel z9.d, p2, z8.d, z10.d -; CHECK-NEXT: uunpkhi z10.d, z3.s -; CHECK-NEXT: rdvl x9, #15 -; CHECK-NEXT: sel z3.d, p6, z8.d, z11.d -; CHECK-NEXT: movprfx z11, z25 -; CHECK-NEXT: fcvtzs z11.d, p0/m, z25.s -; CHECK-NEXT: fcmge p3.s, p0/z, z29.s, z31.s -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: fcmge p1.s, p0/z, z30.s, z31.s -; CHECK-NEXT: movprfx z23, z18 -; CHECK-NEXT: fcvtzs z23.d, p0/m, z18.s -; CHECK-NEXT: not p2.b, p0/z, p5.b -; CHECK-NEXT: fcmge p5.s, p0/z, z17.s, z31.s -; CHECK-NEXT: frintx z16.s, p0/m, z16.s -; CHECK-NEXT: frintx z10.s, p0/m, z10.s -; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff -; CHECK-NEXT: fcmgt p8.s, p0/z, z18.s, z28.s -; CHECK-NEXT: sel z4.d, p4, z8.d, z11.d -; CHECK-NEXT: movprfx z11, z13 -; CHECK-NEXT: frintx z11.s, p0/m, z13.s +; CHECK-NEXT: sel z0.d, p4, z27.d, z31.d +; CHECK-NEXT: fcmge p4.s, p0/z, z26.s, z29.s ; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: sel z13.d, p2, z8.d, z12.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmge p4.s, p0/z, z7.s, z31.s -; CHECK-NEXT: sel z12.d, p3, z8.d, z14.d -; CHECK-NEXT: movprfx z14, z15 -; CHECK-NEXT: frintx z14.s, p0/m, z15.s -; CHECK-NEXT: uunpkhi z15.d, z6.s +; CHECK-NEXT: not p5.b, p0/z, p5.b +; CHECK-NEXT: movprfx z13, z26 +; CHECK-NEXT: fcvtzs z13.d, p0/m, z26.s +; CHECK-NEXT: sel z31.d, p2, z27.d, z10.d +; CHECK-NEXT: uunpklo z10.d, z4.s +; CHECK-NEXT: sel z8.d, p3, z27.d, z11.d +; CHECK-NEXT: fcmge p3.s, p0/z, z28.s, z29.s +; CHECK-NEXT: sel z11.d, p5, z27.d, z12.d +; CHECK-NEXT: movprfx z4, z9 +; CHECK-NEXT: frintx z4.s, p0/m, z9.s +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: not p5.b, p0/z, p4.b +; CHECK-NEXT: fcmge p4.s, p0/z, z30.s, z29.s +; CHECK-NEXT: fcmgt p2.s, p0/z, z24.s, z25.s +; CHECK-NEXT: sel z12.d, p5, z27.d, z13.d +; CHECK-NEXT: uunpkhi z13.d, z17.s +; CHECK-NEXT: movprfx z9, z10 +; CHECK-NEXT: frintx z9.s, p0/m, z10.s +; CHECK-NEXT: movprfx z10, z14 +; CHECK-NEXT: frintx z10.s, p0/m, z14.s +; CHECK-NEXT: uunpkhi z17.d, z6.s +; CHECK-NEXT: not p3.b, p0/z, p3.b +; CHECK-NEXT: uunpklo z14.d, z6.s +; CHECK-NEXT: movprfx z6, z16 +; CHECK-NEXT: frintx z6.s, p0/m, z16.s +; CHECK-NEXT: uunpklo z16.d, z7.s +; CHECK-NEXT: uunpkhi z7.d, z7.s +; CHECK-NEXT: sel z3.d, p3, z27.d, z15.d +; CHECK-NEXT: fcmge p3.s, p0/z, z4.s, z29.s +; CHECK-NEXT: frintx z13.s, p0/m, z13.s +; CHECK-NEXT: movprfx z15, z30 +; CHECK-NEXT: fcvtzs z15.d, p0/m, z30.s +; CHECK-NEXT: fcmge p5.s, p0/z, z9.s, z29.s +; CHECK-NEXT: fcmge p6.s, p0/z, z10.s, z29.s +; CHECK-NEXT: frintx z17.s, p0/m, z17.s +; CHECK-NEXT: movprfx z18, z4 +; CHECK-NEXT: fcvtzs z18.d, p0/m, z4.s ; CHECK-NEXT: movprfx z20, z10 ; CHECK-NEXT: fcvtzs z20.d, p0/m, z10.s -; CHECK-NEXT: fcmge p2.s, p0/z, z10.s, z31.s -; CHECK-NEXT: sel z5.d, p1, z8.d, z19.d -; CHECK-NEXT: movprfx z19, z11 -; CHECK-NEXT: fcvtzs z19.d, p0/m, z11.s -; CHECK-NEXT: fcmge p3.s, p0/z, z11.s, z31.s -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: fcmge p6.s, p0/z, z16.s, z31.s -; CHECK-NEXT: fcmuo p9.s, p0/z, z18.s, z18.s -; CHECK-NEXT: movprfx z22, z15 -; CHECK-NEXT: frintx z22.s, p0/m, z15.s -; CHECK-NEXT: fcmge p1.s, p0/z, z14.s, z31.s -; CHECK-NEXT: not p2.b, p0/z, p2.b +; CHECK-NEXT: frintx z16.s, p0/m, z16.s +; CHECK-NEXT: not p4.b, p0/z, p4.b +; CHECK-NEXT: movprfx z19, z14 +; CHECK-NEXT: frintx z19.s, p0/m, z14.s +; CHECK-NEXT: movprfx z14, z9 +; CHECK-NEXT: fcvtzs z14.d, p0/m, z9.s +; CHECK-NEXT: fcmge p7.s, p0/z, z6.s, z29.s +; CHECK-NEXT: fcmge p8.s, p0/z, z13.s, z29.s +; CHECK-NEXT: movprfx z21, z7 +; CHECK-NEXT: frintx z21.s, p0/m, z7.s ; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: sel z6.d, p2, z8.d, z20.d -; CHECK-NEXT: movprfx z20, z21 -; CHECK-NEXT: frintx z20.s, p0/m, z21.s -; CHECK-NEXT: fcmge p2.s, p0/z, z18.s, z31.s -; CHECK-NEXT: sel z15.d, p3, z8.d, z19.d -; CHECK-NEXT: movprfx z19, z17 -; CHECK-NEXT: fcvtzs z19.d, p0/m, z17.s -; CHECK-NEXT: not p3.b, p0/z, p4.b -; CHECK-NEXT: fcmge p4.s, p0/z, z22.s, z31.s -; CHECK-NEXT: movprfx z21, z14 -; CHECK-NEXT: fcvtzs z21.d, p0/m, z14.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: movprfx z18, z7 -; CHECK-NEXT: fcvtzs z18.d, p0/m, z7.s ; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: fcmge p7.s, p0/z, z20.s, z31.s -; CHECK-NEXT: movprfx z31, z22 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z22.s -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z19.d, p5/m, z8.d -; CHECK-NEXT: fcmgt p5.s, p0/z, z17.s, z28.s +; CHECK-NEXT: mov z15.d, p4/m, z27.d +; CHECK-NEXT: fcmge p4.s, p0/z, z17.s, z29.s +; CHECK-NEXT: not p5.b, p0/z, p5.b +; CHECK-NEXT: sel z7.d, p3, z27.d, z18.d +; CHECK-NEXT: movprfx z0, z17 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z17.s +; CHECK-NEXT: sel z18.d, p6, z27.d, z20.d +; CHECK-NEXT: movprfx z20, z6 +; CHECK-NEXT: fcvtzs z20.d, p0/m, z6.s +; CHECK-NEXT: fcmge p6.s, p0/z, z16.s, z29.s +; CHECK-NEXT: fcmge p3.s, p0/z, z19.s, z29.s +; CHECK-NEXT: mov z14.d, p5/m, z27.d +; CHECK-NEXT: not p5.b, p0/z, p7.b +; CHECK-NEXT: not p7.b, p0/z, p8.b +; CHECK-NEXT: fcmge p8.s, p0/z, z21.s, z29.s +; CHECK-NEXT: movprfx z1, z16 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z16.s +; CHECK-NEXT: movprfx z22, z13 +; CHECK-NEXT: fcvtzs z22.d, p0/m, z13.s +; CHECK-NEXT: movprfx z23, z19 +; CHECK-NEXT: fcvtzs z23.d, p0/m, z19.s ; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z23.d, p2/m, z8.d -; CHECK-NEXT: fcmuo p2.s, p0/z, z17.s, z17.s -; CHECK-NEXT: movprfx z17, z20 -; CHECK-NEXT: fcvtzs z17.d, p0/m, z20.s -; CHECK-NEXT: mov z21.d, p1/m, z8.d -; CHECK-NEXT: mov z18.d, p3/m, z8.d -; CHECK-NEXT: not p1.b, p0/z, p7.b -; CHECK-NEXT: mov z31.d, p4/m, z8.d -; CHECK-NEXT: fcmgt p4.s, p0/z, z20.s, z28.s -; CHECK-NEXT: mov z19.d, p5/m, z2.d -; CHECK-NEXT: fcmuo p7.s, p0/z, z20.s, z20.s -; CHECK-NEXT: movprfx z20, z16 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z16.s -; CHECK-NEXT: fcmgt p5.s, p0/z, z22.s, z28.s -; CHECK-NEXT: mov z23.d, p8/m, z2.d -; CHECK-NEXT: fcmuo p3.s, p0/z, z22.s, z22.s -; CHECK-NEXT: mov z17.d, p1/m, z8.d -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: mov z19.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p2.s, p0/z, z16.s, z28.s -; CHECK-NEXT: sel z8.d, p6, z8.d, z20.d -; CHECK-NEXT: mov z23.d, p9/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p6.s, p0/z, z14.s, z28.s -; CHECK-NEXT: mov z31.d, p5/m, z2.d -; CHECK-NEXT: mov z17.d, p4/m, z2.d -; CHECK-NEXT: fcmuo p4.s, p0/z, z16.s, z16.s -; CHECK-NEXT: st1b { z19.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #14 -; CHECK-NEXT: fcmgt p5.s, p0/z, z1.s, z28.s -; CHECK-NEXT: st1b { z23.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #13 -; CHECK-NEXT: mov z8.d, p2/m, z2.d -; CHECK-NEXT: mov z31.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p3.s, p0/z, z7.s, z28.s -; CHECK-NEXT: mov z17.d, p7/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p2.s, p0/z, z11.s, z28.s -; CHECK-NEXT: fcmuo p7.s, p0/z, z14.s, z14.s -; CHECK-NEXT: mov z8.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.s, p0/z, z7.s, z7.s -; CHECK-NEXT: sel z7.d, p5, z2.d, z9.d -; CHECK-NEXT: st1b { z31.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #12 -; CHECK-NEXT: fcmgt p5.s, p0/z, z27.s, z28.s -; CHECK-NEXT: st1b { z17.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #11 -; CHECK-NEXT: sel z31.d, p3, z2.d, z18.d -; CHECK-NEXT: st1b { z8.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #10 -; CHECK-NEXT: fcmgt p3.s, p0/z, z30.s, z28.s -; CHECK-NEXT: sel z9.d, p2, z2.d, z15.d -; CHECK-NEXT: fcmuo p2.s, p0/z, z11.s, z11.s -; CHECK-NEXT: sel z8.d, p6, z2.d, z21.d +; CHECK-NEXT: movprfx z2, z21 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z21.s +; CHECK-NEXT: mov z29.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z20.d, p5/m, z27.d +; CHECK-NEXT: not p5.b, p0/z, p6.b +; CHECK-NEXT: mov z0.d, p4/m, z27.d +; CHECK-NEXT: fcmgt p4.s, p0/z, z16.s, z25.s +; CHECK-NEXT: not p3.b, p0/z, p3.b +; CHECK-NEXT: not p6.b, p0/z, p8.b +; CHECK-NEXT: mov z1.d, p5/m, z27.d +; CHECK-NEXT: mov z22.d, p7/m, z27.d +; CHECK-NEXT: mov z23.d, p3/m, z27.d +; CHECK-NEXT: fcmgt p3.s, p0/z, z21.s, z25.s +; CHECK-NEXT: fcmuo p5.s, p0/z, z16.s, z16.s +; CHECK-NEXT: mov z2.d, p6/m, z27.d +; CHECK-NEXT: sel z27.d, p1, z29.d, z31.d +; CHECK-NEXT: fcmgt p1.s, p0/z, z17.s, z25.s +; CHECK-NEXT: mov z1.d, p4/m, z29.d +; CHECK-NEXT: fcmgt p6.s, p0/z, z26.s, z25.s +; CHECK-NEXT: fcmgt p7.s, p0/z, z30.s, z25.s +; CHECK-NEXT: sel z31.d, p2, z29.d, z8.d +; CHECK-NEXT: fcmgt p2.s, p0/z, z13.s, z25.s +; CHECK-NEXT: fcmuo p8.s, p0/z, z21.s, z21.s +; CHECK-NEXT: mov z2.d, p3/m, z29.d +; CHECK-NEXT: fcmuo p4.s, p0/z, z17.s, z17.s +; CHECK-NEXT: fcmgt p3.s, p0/z, z19.s, z25.s +; CHECK-NEXT: mov z0.d, p1/m, z29.d +; CHECK-NEXT: fcmgt p1.s, p0/z, z6.s, z25.s +; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: sel z8.d, p9, z29.d, z11.d +; CHECK-NEXT: sel z11.d, p6, z29.d, z12.d +; CHECK-NEXT: sel z12.d, p7, z29.d, z15.d +; CHECK-NEXT: fcmgt p5.s, p0/z, z10.s, z25.s +; CHECK-NEXT: sel z15.d, p2, z29.d, z22.d +; CHECK-NEXT: fcmuo p2.s, p0/z, z13.s, z13.s +; CHECK-NEXT: str z1, [x8, #14, mul vl] +; CHECK-NEXT: mov z2.d, p8/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 +; CHECK-NEXT: sel z1.d, p1, z29.d, z20.d +; CHECK-NEXT: fcmgt p1.s, p0/z, z9.s, z25.s +; CHECK-NEXT: fcmuo p6.s, p0/z, z19.s, z19.s +; CHECK-NEXT: sel z16.d, p3, z29.d, z23.d +; CHECK-NEXT: fcmuo p3.s, p0/z, z6.s, z6.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z4.s, z25.s +; CHECK-NEXT: str z2, [x8, #15, mul vl] +; CHECK-NEXT: sel z2.d, p5, z29.d, z18.d +; CHECK-NEXT: fcmuo p5.s, p0/z, z10.s, z10.s +; CHECK-NEXT: str z0, [x8, #13, mul vl] +; CHECK-NEXT: mov z15.d, p2/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p2.s, p0/z, z9.s, z9.s +; CHECK-NEXT: sel z0.d, p1, z29.d, z14.d +; CHECK-NEXT: mov z16.d, p6/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p1.s, p0/z, z4.s, z4.s +; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p3.s, p0/z, z28.s, z25.s +; CHECK-NEXT: sel z4.d, p4, z29.d, z7.d +; CHECK-NEXT: str z15, [x8, #11, mul vl] +; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.s, p0/z, z28.s, z28.s +; CHECK-NEXT: str z16, [x8, #12, mul vl] +; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.s, p0/z, z30.s, z30.s +; CHECK-NEXT: str z1, [x8, #10, mul vl] +; CHECK-NEXT: mov z4.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p1.s, p0/z, z5.s, z5.s +; CHECK-NEXT: sel z1.d, p3, z29.d, z3.d +; CHECK-NEXT: ldr z3, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z2, [x8, #9, mul vl] +; CHECK-NEXT: str z0, [x8, #8, mul vl] +; CHECK-NEXT: fcmuo p3.s, p0/z, z26.s, z26.s +; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z4, [x8, #7, mul vl] +; CHECK-NEXT: mov z12.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p2.s, p0/z, z3.s, z25.s +; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.s, p0/z, z24.s, z24.s +; CHECK-NEXT: mov z8.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.s, p0/z, z0.s, z0.s +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: str z12, [x8, #6, mul vl] +; CHECK-NEXT: str z1, [x8, #5, mul vl] +; CHECK-NEXT: fcmuo p0.s, p0/z, z3.s, z3.s +; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 +; CHECK-NEXT: str z8, [x8, #3, mul vl] ; CHECK-NEXT: mov z31.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p4.s, p0/z, z10.s, z28.s -; CHECK-NEXT: fcmgt p6.s, p0/z, z24.s, z28.s -; CHECK-NEXT: sel z11.d, p5, z2.d, z13.d -; CHECK-NEXT: fcmgt p5.s, p0/z, z25.s, z28.s -; CHECK-NEXT: mov z8.d, p7/m, #0 // =0x0 -; CHECK-NEXT: mov z5.d, p3/m, z2.d -; CHECK-NEXT: fcmgt p3.s, p0/z, z29.s, z28.s -; CHECK-NEXT: st1b { z31.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #9 -; CHECK-NEXT: mov z9.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p7.s, p0/z, z10.s, z10.s -; CHECK-NEXT: fcmuo p2.s, p0/z, z30.s, z30.s -; CHECK-NEXT: mov z6.d, p4/m, z2.d -; CHECK-NEXT: st1b { z8.b }, p1, [x8, x9] -; CHECK-NEXT: rdvl x9, #8 -; CHECK-NEXT: fcmuo p4.s, p0/z, z29.s, z29.s -; CHECK-NEXT: st1b { z9.b }, p1, [x8, x9] -; CHECK-NEXT: fcmuo p1.s, p0/z, z27.s, z27.s -; CHECK-NEXT: sel z27.d, p3, z2.d, z12.d -; CHECK-NEXT: fcmgt p3.s, p0/z, z0.s, z28.s -; CHECK-NEXT: mov z4.d, p5/m, z2.d -; CHECK-NEXT: mov z3.d, p6/m, z2.d -; CHECK-NEXT: mov z6.d, p7/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p7.s, p0/z, z25.s, z25.s -; CHECK-NEXT: mov z5.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p2.s, p0/z, z24.s, z24.s -; CHECK-NEXT: mov z27.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.s, p0/z, z1.s, z1.s -; CHECK-NEXT: mov z11.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p1.s, p0/z, z0.s, z0.s -; CHECK-NEXT: st1d { z6.d }, p0, [x8, #7, mul vl] -; CHECK-NEXT: sel z0.d, p3, z2.d, z26.d -; CHECK-NEXT: st1d { z5.d }, p0, [x8, #6, mul vl] -; CHECK-NEXT: mov z4.d, p7/m, #0 // =0x0 -; CHECK-NEXT: st1d { z27.d }, p0, [x8, #5, mul vl] -; CHECK-NEXT: mov z3.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z7.d, p4/m, #0 // =0x0 -; CHECK-NEXT: st1d { z11.d }, p0, [x8, #4, mul vl] -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 -; CHECK-NEXT: st1d { z4.d }, p0, [x8, #3, mul vl] -; CHECK-NEXT: st1d { z3.d }, p0, [x8, #2, mul vl] -; CHECK-NEXT: st1d { z7.d }, p0, [x8, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [x8] +; CHECK-NEXT: mov z0.d, p2/m, z29.d +; CHECK-NEXT: str z11, [x8, #4, mul vl] +; CHECK-NEXT: mov z27.d, p5/m, #0 // =0x0 +; CHECK-NEXT: str z31, [x8, #2, mul vl] +; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 +; CHECK-NEXT: str z27, [x8, #1, mul vl] +; CHECK-NEXT: str z0, [x8] +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1491,24 +1490,31 @@ define @lrint_v32f64( %x) { ; CHECK-LABEL: lrint_v32f64: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-12 +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z18, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z17, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z16, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z15, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z14, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z13, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z12, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z11, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z10, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 96 * VG +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG @@ -1518,244 +1524,245 @@ define @lrint_v32f64( %x) { ; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: rdvl x9, #8 -; CHECK-NEXT: rdvl x10, #9 +; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: rdvl x11, #10 -; CHECK-NEXT: mov x12, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x9] -; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0, x10] -; CHECK-NEXT: mov z2.d, x12 -; CHECK-NEXT: rdvl x14, #13 -; CHECK-NEXT: rdvl x13, #12 -; CHECK-NEXT: rdvl x12, #11 -; CHECK-NEXT: ld1b { z6.b }, p1/z, [x0, x14] -; CHECK-NEXT: ld1b { z7.b }, p1/z, [x0, x13] -; CHECK-NEXT: mov z3.d, #0x8000000000000000 -; CHECK-NEXT: movprfx z24, z0 -; CHECK-NEXT: frintx z24.d, p0/m, z0.d -; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x11] -; CHECK-NEXT: movprfx z5, z1 -; CHECK-NEXT: frintx z5.d, p0/m, z1.d -; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0, x12] -; CHECK-NEXT: mov x15, #4890909195324358655 // =0x43dfffffffffffff -; CHECK-NEXT: rdvl x16, #15 -; CHECK-NEXT: movprfx z30, z6 -; CHECK-NEXT: frintx z30.d, p0/m, z6.d -; CHECK-NEXT: movprfx z28, z7 -; CHECK-NEXT: frintx z28.d, p0/m, z7.d -; CHECK-NEXT: ld1b { z8.b }, p1/z, [x0, x16] -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: frintx z4.d, p0/m, z0.d -; CHECK-NEXT: mov z0.d, #0x7fffffffffffffff -; CHECK-NEXT: ld1d { z18.d }, p0/z, [x0] -; CHECK-NEXT: fcmge p3.d, p0/z, z5.d, z2.d -; CHECK-NEXT: fcmge p2.d, p0/z, z24.d, z2.d -; CHECK-NEXT: movprfx z6, z5 -; CHECK-NEXT: fcvtzs z6.d, p0/m, z5.d -; CHECK-NEXT: movprfx z27, z1 -; CHECK-NEXT: frintx z27.d, p0/m, z1.d +; CHECK-NEXT: ldr z2, [x0, #2, mul vl] +; CHECK-NEXT: mov x9, #-4332462841530417152 // =0xc3e0000000000000 +; CHECK-NEXT: ldr z24, [x0, #6, mul vl] +; CHECK-NEXT: ldr z1, [x0, #1, mul vl] +; CHECK-NEXT: mov z7.d, x9 +; CHECK-NEXT: mov z26.d, #0x8000000000000000 +; CHECK-NEXT: ldr z3, [x0, #3, mul vl] +; CHECK-NEXT: frintx z0.d, p0/m, z0.d +; CHECK-NEXT: movprfx z30, z2 +; CHECK-NEXT: frintx z30.d, p0/m, z2.d +; CHECK-NEXT: ldr z6, [x0, #5, mul vl] ; CHECK-NEXT: movprfx z25, z24 -; CHECK-NEXT: fcvtzs z25.d, p0/m, z24.d -; CHECK-NEXT: mov z1.d, x15 -; CHECK-NEXT: rdvl x15, #14 -; CHECK-NEXT: movprfx z9, z28 -; CHECK-NEXT: fcvtzs z9.d, p0/m, z28.d -; CHECK-NEXT: movprfx z13, z8 -; CHECK-NEXT: frintx z13.d, p0/m, z8.d -; CHECK-NEXT: fcmge p4.d, p0/z, z4.d, z2.d -; CHECK-NEXT: movprfx z7, z4 -; CHECK-NEXT: fcvtzs z7.d, p0/m, z4.d -; CHECK-NEXT: ld1d { z15.d }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: fcmgt p5.d, p0/z, z24.d, z1.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z5.d, z1.d -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmge p7.d, p0/z, z27.d, z2.d -; CHECK-NEXT: movprfx z26, z27 -; CHECK-NEXT: fcvtzs z26.d, p0/m, z27.d -; CHECK-NEXT: sel z29.d, p3, z3.d, z6.d -; CHECK-NEXT: ld1b { z6.b }, p1/z, [x0, x15] -; CHECK-NEXT: fcmge p3.d, p0/z, z28.d, z2.d +; CHECK-NEXT: frintx z25.d, p0/m, z24.d +; CHECK-NEXT: movprfx z12, z1 +; CHECK-NEXT: frintx z12.d, p0/m, z1.d +; CHECK-NEXT: ldr z5, [x0, #4, mul vl] +; CHECK-NEXT: frintx z3.d, p0/m, z3.d +; CHECK-NEXT: mov x9, #4890909195324358655 // =0x43dfffffffffffff +; CHECK-NEXT: frintx z6.d, p0/m, z6.d +; CHECK-NEXT: mov z4.d, x9 +; CHECK-NEXT: fcmge p3.d, p0/z, z0.d, z7.d +; CHECK-NEXT: movprfx z24, z0 +; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.d +; CHECK-NEXT: fcmge p5.d, p0/z, z30.d, z7.d +; CHECK-NEXT: movprfx z28, z30 +; CHECK-NEXT: fcvtzs z28.d, p0/m, z30.d +; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: frintx z5.d, p0/m, z5.d +; CHECK-NEXT: fcmge p4.d, p0/z, z12.d, z7.d +; CHECK-NEXT: ldr z8, [x0, #7, mul vl] +; CHECK-NEXT: ldr z9, [x0, #15, mul vl] +; CHECK-NEXT: movprfx z27, z12 +; CHECK-NEXT: fcvtzs z27.d, p0/m, z12.d +; CHECK-NEXT: fcmge p6.d, p0/z, z3.d, z7.d +; CHECK-NEXT: fcmge p9.d, p0/z, z6.d, z7.d +; CHECK-NEXT: not p7.b, p0/z, p3.b +; CHECK-NEXT: movprfx z31, z3 +; CHECK-NEXT: fcvtzs z31.d, p0/m, z3.d +; CHECK-NEXT: movprfx z15, z6 +; CHECK-NEXT: fcvtzs z15.d, p0/m, z6.d +; CHECK-NEXT: not p5.b, p0/z, p5.b +; CHECK-NEXT: fcmge p8.d, p0/z, z5.d, z7.d +; CHECK-NEXT: movprfx z13, z5 +; CHECK-NEXT: fcvtzs z13.d, p0/m, z5.d +; CHECK-NEXT: sel z0.d, p7, z26.d, z24.d ; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z25.d, p2/m, z3.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z4.d, z1.d -; CHECK-NEXT: movprfx z16, z13 -; CHECK-NEXT: fcvtzs z16.d, p0/m, z13.d -; CHECK-NEXT: ld1d { z17.d }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: ld1d { z14.d }, p0/z, [x0, #3, mul vl] -; CHECK-NEXT: sel z31.d, p4, z3.d, z7.d -; CHECK-NEXT: movprfx z11, z6 -; CHECK-NEXT: frintx z11.d, p0/m, z6.d -; CHECK-NEXT: not p7.b, p0/z, p7.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: sel z6.d, p5, z0.d, z25.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z27.d, z1.d -; CHECK-NEXT: sel z7.d, p6, z0.d, z29.d -; CHECK-NEXT: mov z26.d, p7/m, z3.d -; CHECK-NEXT: fcmge p5.d, p0/z, z13.d, z2.d -; CHECK-NEXT: sel z25.d, p2, z0.d, z31.d -; CHECK-NEXT: fcmge p2.d, p0/z, z30.d, z2.d -; CHECK-NEXT: sel z29.d, p3, z3.d, z9.d -; CHECK-NEXT: fcmge p3.d, p0/z, z11.d, z2.d -; CHECK-NEXT: movprfx z31, z30 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z30.d -; CHECK-NEXT: movprfx z9, z11 -; CHECK-NEXT: fcvtzs z9.d, p0/m, z11.d -; CHECK-NEXT: mov z26.d, p4/m, z0.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z28.d, z1.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z30.d, z1.d -; CHECK-NEXT: not p7.b, p0/z, p5.b -; CHECK-NEXT: fcmuo p5.d, p0/z, z27.d, z27.d -; CHECK-NEXT: fcmgt p8.d, p0/z, z13.d, z1.d -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: movprfx z27, z18 -; CHECK-NEXT: frintx z27.d, p0/m, z18.d -; CHECK-NEXT: ld1d { z8.d }, p0/z, [x0, #7, mul vl] -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z16.d, p7/m, z3.d -; CHECK-NEXT: fcmuo p7.d, p0/z, z13.d, z13.d -; CHECK-NEXT: mov z31.d, p2/m, z3.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z11.d, z1.d -; CHECK-NEXT: mov z29.d, p4/m, z0.d -; CHECK-NEXT: mov z9.d, p3/m, z3.d -; CHECK-NEXT: fcmuo p3.d, p0/z, z28.d, z28.d -; CHECK-NEXT: fcmuo p4.d, p0/z, z30.d, z30.d -; CHECK-NEXT: movprfx z28, z17 -; CHECK-NEXT: frintx z28.d, p0/m, z17.d -; CHECK-NEXT: movprfx z30, z15 -; CHECK-NEXT: frintx z30.d, p0/m, z15.d -; CHECK-NEXT: ld1d { z13.d }, p0/z, [x0, #4, mul vl] -; CHECK-NEXT: mov z31.d, p6/m, z0.d -; CHECK-NEXT: fcmuo p6.d, p0/z, z11.d, z11.d -; CHECK-NEXT: sel z11.d, p8, z0.d, z16.d -; CHECK-NEXT: mov z9.d, p2/m, z0.d -; CHECK-NEXT: fcmuo p2.d, p0/z, z24.d, z24.d -; CHECK-NEXT: movprfx z24, z14 -; CHECK-NEXT: frintx z24.d, p0/m, z14.d -; CHECK-NEXT: fcmge p8.d, p0/z, z27.d, z2.d -; CHECK-NEXT: ld1d { z10.d }, p0/z, [x0, #6, mul vl] -; CHECK-NEXT: ld1d { z12.d }, p0/z, [x0, #5, mul vl] -; CHECK-NEXT: mov z26.d, p5/m, #0 // =0x0 -; CHECK-NEXT: mov z29.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmge p5.d, p0/z, z28.d, z2.d -; CHECK-NEXT: movprfx z14, z27 -; CHECK-NEXT: fcvtzs z14.d, p0/m, z27.d -; CHECK-NEXT: fcmge p3.d, p0/z, z30.d, z2.d -; CHECK-NEXT: frintx z13.d, p0/m, z13.d -; CHECK-NEXT: mov z31.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmge p4.d, p0/z, z24.d, z2.d -; CHECK-NEXT: mov z9.d, p6/m, #0 // =0x0 -; CHECK-NEXT: movprfx z15, z28 -; CHECK-NEXT: fcvtzs z15.d, p0/m, z28.d -; CHECK-NEXT: not p6.b, p0/z, p8.b -; CHECK-NEXT: movprfx z16, z30 -; CHECK-NEXT: fcvtzs z16.d, p0/m, z30.d -; CHECK-NEXT: frintx z12.d, p0/m, z12.d +; CHECK-NEXT: movprfx z17, z25 +; CHECK-NEXT: fcvtzs z17.d, p0/m, z25.d +; CHECK-NEXT: not p3.b, p0/z, p6.b +; CHECK-NEXT: fcmge p6.d, p0/z, z25.d, z7.d +; CHECK-NEXT: movprfx z22, z9 +; CHECK-NEXT: frintx z22.d, p0/m, z9.d +; CHECK-NEXT: sel z29.d, p4, z26.d, z27.d +; CHECK-NEXT: movprfx z27, z8 +; CHECK-NEXT: frintx z27.d, p0/m, z8.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z12.d, z4.d +; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sel z0.d, p5, z26.d, z28.d +; CHECK-NEXT: not p4.b, p0/z, p8.b +; CHECK-NEXT: ldr z10, [x0, #8, mul vl] +; CHECK-NEXT: not p5.b, p0/z, p9.b +; CHECK-NEXT: sel z24.d, p3, z26.d, z31.d +; CHECK-NEXT: not p3.b, p0/z, p6.b +; CHECK-NEXT: movprfx z2, z22 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z22.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z30.d, z4.d +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: fcmge p7.d, p0/z, z27.d, z7.d +; CHECK-NEXT: sel z31.d, p5, z26.d, z15.d +; CHECK-NEXT: ldr z11, [x0, #9, mul vl] +; CHECK-NEXT: movprfx z28, z10 +; CHECK-NEXT: frintx z28.d, p0/m, z10.d +; CHECK-NEXT: ldr z10, [x0, #10, mul vl] +; CHECK-NEXT: ldr z18, [x0, #11, mul vl] +; CHECK-NEXT: ldr z16, [x0, #13, mul vl] +; CHECK-NEXT: ldr z14, [x0, #14, mul vl] +; CHECK-NEXT: ldr z19, [x0, #12, mul vl] +; CHECK-NEXT: mov z17.d, p3/m, z26.d +; CHECK-NEXT: fcmgt p9.d, p0/z, z3.d, z4.d +; CHECK-NEXT: movprfx z8, z11 +; CHECK-NEXT: frintx z8.d, p0/m, z11.d +; CHECK-NEXT: sel z11.d, p4, z26.d, z13.d ; CHECK-NEXT: frintx z10.d, p0/m, z10.d -; CHECK-NEXT: movprfx z17, z24 -; CHECK-NEXT: fcvtzs z17.d, p0/m, z24.d -; CHECK-NEXT: movprfx z18, z8 -; CHECK-NEXT: frintx z18.d, p0/m, z8.d +; CHECK-NEXT: movprfx z13, z18 +; CHECK-NEXT: frintx z13.d, p0/m, z18.d +; CHECK-NEXT: fcmge p5.d, p0/z, z28.d, z7.d +; CHECK-NEXT: movprfx z18, z27 +; CHECK-NEXT: fcvtzs z18.d, p0/m, z27.d +; CHECK-NEXT: frintx z16.d, p0/m, z16.d +; CHECK-NEXT: movprfx z15, z19 +; CHECK-NEXT: frintx z15.d, p0/m, z19.d +; CHECK-NEXT: movprfx z19, z28 +; CHECK-NEXT: fcvtzs z19.d, p0/m, z28.d +; CHECK-NEXT: movprfx z21, z14 +; CHECK-NEXT: frintx z21.d, p0/m, z14.d +; CHECK-NEXT: not p4.b, p0/z, p7.b +; CHECK-NEXT: fcmge p6.d, p0/z, z8.d, z7.d +; CHECK-NEXT: movprfx z20, z8 +; CHECK-NEXT: fcvtzs z20.d, p0/m, z8.d +; CHECK-NEXT: fcmge p7.d, p0/z, z10.d, z7.d +; CHECK-NEXT: fcmge p8.d, p0/z, z13.d, z7.d ; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: sel z8.d, p6, z3.d, z14.d -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: fcmge p6.d, p0/z, z13.d, z2.d -; CHECK-NEXT: mov z11.d, p7/m, #0 // =0x0 -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: sel z14.d, p5, z3.d, z15.d -; CHECK-NEXT: fcmuo p7.d, p0/z, z5.d, z5.d -; CHECK-NEXT: sel z15.d, p3, z3.d, z16.d -; CHECK-NEXT: movprfx z16, z13 -; CHECK-NEXT: fcvtzs z16.d, p0/m, z13.d -; CHECK-NEXT: fcmge p5.d, p0/z, z12.d, z2.d -; CHECK-NEXT: fcmge p3.d, p0/z, z10.d, z2.d -; CHECK-NEXT: sel z5.d, p4, z3.d, z17.d -; CHECK-NEXT: fcmge p4.d, p0/z, z18.d, z2.d +; CHECK-NEXT: sel z9.d, p4, z26.d, z18.d +; CHECK-NEXT: fcmge p4.d, p0/z, z16.d, z7.d +; CHECK-NEXT: fcmge p3.d, p0/z, z15.d, z7.d +; CHECK-NEXT: movprfx z0, z16 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z16.d +; CHECK-NEXT: sel z14.d, p5, z26.d, z19.d +; CHECK-NEXT: movprfx z19, z10 +; CHECK-NEXT: fcvtzs z19.d, p0/m, z10.d +; CHECK-NEXT: movprfx z1, z21 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z21.d ; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: movprfx z2, z12 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z12.d -; CHECK-NEXT: movprfx z17, z10 -; CHECK-NEXT: fcvtzs z17.d, p0/m, z10.d -; CHECK-NEXT: st1b { z11.b }, p1, [x8, x16] -; CHECK-NEXT: movprfx z11, z18 -; CHECK-NEXT: fcvtzs z11.d, p0/m, z18.d -; CHECK-NEXT: mov z6.d, p2/m, #0 // =0x0 -; CHECK-NEXT: st1b { z9.b }, p1, [x8, x15] -; CHECK-NEXT: sel z9.d, p6, z3.d, z16.d -; CHECK-NEXT: fcmuo p6.d, p0/z, z4.d, z4.d -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: fcmgt p2.d, p0/z, z18.d, z1.d -; CHECK-NEXT: mov z7.d, p7/m, #0 // =0x0 -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: st1b { z31.b }, p1, [x8, x14] -; CHECK-NEXT: fcmgt p7.d, p0/z, z24.d, z1.d +; CHECK-NEXT: movprfx z23, z15 +; CHECK-NEXT: fcvtzs z23.d, p0/m, z15.d +; CHECK-NEXT: not p5.b, p0/z, p7.b +; CHECK-NEXT: sel z18.d, p6, z26.d, z20.d +; CHECK-NEXT: fcmge p6.d, p0/z, z21.d, z7.d +; CHECK-NEXT: not p7.b, p0/z, p8.b +; CHECK-NEXT: fcmge p8.d, p0/z, z22.d, z7.d +; CHECK-NEXT: movprfx z20, z13 +; CHECK-NEXT: fcvtzs z20.d, p0/m, z13.d ; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z2.d, p5/m, z3.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z28.d, z1.d -; CHECK-NEXT: sel z4.d, p3, z3.d, z17.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z13.d, z1.d -; CHECK-NEXT: mov z25.d, p6/m, #0 // =0x0 -; CHECK-NEXT: sel z3.d, p4, z3.d, z11.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z10.d, z1.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z12.d, z1.d -; CHECK-NEXT: st1b { z29.b }, p1, [x8, x13] -; CHECK-NEXT: st1b { z26.b }, p1, [x8, x12] -; CHECK-NEXT: sel z26.d, p5, z0.d, z14.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z30.d, z1.d -; CHECK-NEXT: sel z29.d, p3, z0.d, z9.d -; CHECK-NEXT: fcmuo p3.d, p0/z, z18.d, z18.d -; CHECK-NEXT: mov z3.d, p2/m, z0.d -; CHECK-NEXT: st1b { z25.b }, p1, [x8, x11] -; CHECK-NEXT: fcmuo p2.d, p0/z, z10.d, z10.d -; CHECK-NEXT: mov z4.d, p4/m, z0.d -; CHECK-NEXT: fcmuo p4.d, p0/z, z12.d, z12.d -; CHECK-NEXT: st1b { z7.b }, p1, [x8, x10] -; CHECK-NEXT: mov z2.d, p6/m, z0.d -; CHECK-NEXT: st1b { z6.b }, p1, [x8, x9] -; CHECK-NEXT: fcmuo p1.d, p0/z, z13.d, z13.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z27.d, z1.d -; CHECK-NEXT: mov z3.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p3.d, p0/z, z24.d, z24.d -; CHECK-NEXT: sel z1.d, p7, z0.d, z5.d -; CHECK-NEXT: mov z4.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p2.d, p0/z, z30.d, z30.d -; CHECK-NEXT: sel z5.d, p5, z0.d, z15.d -; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.d, p0/z, z28.d, z28.d -; CHECK-NEXT: mov z29.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p1.d, p0/z, z27.d, z27.d -; CHECK-NEXT: sel z0.d, p6, z0.d, z8.d +; CHECK-NEXT: mov z7.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z19.d, p5/m, z26.d +; CHECK-NEXT: not p3.b, p0/z, p3.b +; CHECK-NEXT: mov z0.d, p4/m, z26.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z21.d, z4.d +; CHECK-NEXT: not p5.b, p0/z, p6.b +; CHECK-NEXT: mov z23.d, p3/m, z26.d +; CHECK-NEXT: fcmgt p3.d, p0/z, z22.d, z4.d +; CHECK-NEXT: not p6.b, p0/z, p8.b +; CHECK-NEXT: mov z20.d, p7/m, z26.d +; CHECK-NEXT: fcmuo p8.d, p0/z, z22.d, z22.d +; CHECK-NEXT: mov z1.d, p5/m, z26.d +; CHECK-NEXT: fcmuo p5.d, p0/z, z21.d, z21.d +; CHECK-NEXT: fcmgt p7.d, p0/z, z25.d, z4.d +; CHECK-NEXT: mov z2.d, p6/m, z26.d +; CHECK-NEXT: sel z26.d, p1, z7.d, z29.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z16.d, z4.d +; CHECK-NEXT: ldr z29, [sp] // 16-byte Folded Reload +; CHECK-NEXT: fcmgt p6.d, p0/z, z5.d, z4.d +; CHECK-NEXT: mov z24.d, p9/m, z7.d +; CHECK-NEXT: mov z1.d, p4/m, z7.d +; CHECK-NEXT: fcmuo p4.d, p0/z, z16.d, z16.d +; CHECK-NEXT: mov z2.d, p3/m, z7.d +; CHECK-NEXT: fcmgt p3.d, p0/z, z15.d, z4.d +; CHECK-NEXT: mov z17.d, p7/m, z7.d +; CHECK-NEXT: mov z29.d, p2/m, z7.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z13.d, z4.d +; CHECK-NEXT: mov z0.d, p1/m, z7.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z10.d, z4.d +; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: mov z11.d, p6/m, z7.d +; CHECK-NEXT: fcmuo p6.d, p0/z, z15.d, z15.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z8.d, z4.d +; CHECK-NEXT: mov z2.d, p8/m, #0 // =0x0 +; CHECK-NEXT: sel z16.d, p3, z7.d, z23.d +; CHECK-NEXT: fcmuo p3.d, p0/z, z10.d, z10.d +; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 +; CHECK-NEXT: sel z15.d, p2, z7.d, z20.d +; CHECK-NEXT: fcmuo p2.d, p0/z, z13.d, z13.d +; CHECK-NEXT: str z1, [x8, #14, mul vl] +; CHECK-NEXT: sel z1.d, p1, z7.d, z19.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z28.d, z4.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z27.d, z4.d +; CHECK-NEXT: str z2, [x8, #15, mul vl] +; CHECK-NEXT: sel z2.d, p5, z7.d, z18.d +; CHECK-NEXT: mov z16.d, p6/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.d, p0/z, z8.d, z8.d +; CHECK-NEXT: str z0, [x8, #13, mul vl] +; CHECK-NEXT: mov z15.d, p2/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p2.d, p0/z, z28.d, z28.d ; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: st1d { z3.d }, p0, [x8, #7, mul vl] -; CHECK-NEXT: mov z5.d, p2/m, #0 // =0x0 -; CHECK-NEXT: st1d { z4.d }, p0, [x8, #6, mul vl] -; CHECK-NEXT: mov z26.d, p4/m, #0 // =0x0 -; CHECK-NEXT: st1d { z2.d }, p0, [x8, #5, mul vl] -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 -; CHECK-NEXT: st1d { z29.d }, p0, [x8, #4, mul vl] -; CHECK-NEXT: st1d { z1.d }, p0, [x8, #3, mul vl] -; CHECK-NEXT: st1d { z5.d }, p0, [x8, #2, mul vl] -; CHECK-NEXT: st1d { z26.d }, p0, [x8, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [x8] -; CHECK-NEXT: ldr z18, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z17, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z16, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z15, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z14, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z12, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: fcmgt p3.d, p0/z, z6.d, z4.d +; CHECK-NEXT: sel z0.d, p1, z7.d, z14.d +; CHECK-NEXT: fcmuo p1.d, p0/z, z27.d, z27.d +; CHECK-NEXT: sel z27.d, p4, z7.d, z9.d +; CHECK-NEXT: str z16, [x8, #12, mul vl] +; CHECK-NEXT: fcmuo p4.d, p0/z, z25.d, z25.d +; CHECK-NEXT: str z15, [x8, #11, mul vl] +; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.d, p0/z, z6.d, z6.d +; CHECK-NEXT: str z1, [x8, #10, mul vl] +; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 +; CHECK-NEXT: sel z1.d, p3, z7.d, z31.d +; CHECK-NEXT: fcmuo p3.d, p0/z, z5.d, z5.d +; CHECK-NEXT: ldr z5, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z27.d, p1/m, #0 // =0x0 +; CHECK-NEXT: str z2, [x8, #9, mul vl] +; CHECK-NEXT: fcmuo p1.d, p0/z, z3.d, z3.d +; CHECK-NEXT: str z0, [x8, #8, mul vl] +; CHECK-NEXT: mov z17.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.d, p0/z, z30.d, z30.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z5.d, z4.d +; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.d, p0/z, z12.d, z12.d +; CHECK-NEXT: str z27, [x8, #7, mul vl] +; CHECK-NEXT: fcmuo p0.d, p0/z, z5.d, z5.d +; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 +; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z24.d, p1/m, #0 // =0x0 +; CHECK-NEXT: str z17, [x8, #6, mul vl] +; CHECK-NEXT: mov z29.d, p4/m, #0 // =0x0 +; CHECK-NEXT: str z1, [x8, #5, mul vl] +; CHECK-NEXT: mov z26.d, p5/m, #0 // =0x0 +; CHECK-NEXT: str z11, [x8, #4, mul vl] +; CHECK-NEXT: mov z0.d, p2/m, z7.d +; CHECK-NEXT: str z24, [x8, #3, mul vl] +; CHECK-NEXT: str z29, [x8, #2, mul vl] +; CHECK-NEXT: str z26, [x8, #1, mul vl] +; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 +; CHECK-NEXT: str z0, [x8] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #12 +; CHECK-NEXT: addvl sp, sp, #17 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %a = call @llvm.lrint.nxv32iXLen.nxv16f64( %x) diff --git a/llvm/test/CodeGen/AArch64/sve-lsrchain.ll b/llvm/test/CodeGen/AArch64/sve-lsrchain.ll index 1931cfc2ef51d..78f93f1ecbb26 100644 --- a/llvm/test/CodeGen/AArch64/sve-lsrchain.ll +++ b/llvm/test/CodeGen/AArch64/sve-lsrchain.ll @@ -42,7 +42,7 @@ define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float ; CHECK-NEXT: // Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ld1b { z4.b }, p1/z, [x16, x14] -; CHECK-NEXT: ld1h { z5.h }, p0/z, [x16] +; CHECK-NEXT: ldr z5, [x16] ; CHECK-NEXT: add x17, x16, x15 ; CHECK-NEXT: add x18, x16, x14 ; CHECK-NEXT: add x3, x17, #8 @@ -54,37 +54,37 @@ define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float ; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z2.h ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17, x13, lsl #1] ; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z3.h -; CHECK-NEXT: ld1h { z5.h }, p0/z, [x16, #1, mul vl] -; CHECK-NEXT: st1h { z4.h }, p0, [x16] -; CHECK-NEXT: ld1h { z4.h }, p0/z, [x18, #1, mul vl] +; CHECK-NEXT: ldr z5, [x16, #1, mul vl] +; CHECK-NEXT: str z4, [x16] +; CHECK-NEXT: ldr z4, [x18, #1, mul vl] ; CHECK-NEXT: fmad z4.h, p0/m, z0.h, z5.h -; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17, #1, mul vl] +; CHECK-NEXT: ldr z5, [x17, #1, mul vl] ; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z1.h -; CHECK-NEXT: ld1h { z5.h }, p0/z, [x3, #1, mul vl] +; CHECK-NEXT: ldr z5, [x3, #1, mul vl] ; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z2.h -; CHECK-NEXT: ld1h { z5.h }, p0/z, [x4, #1, mul vl] +; CHECK-NEXT: ldr z5, [x4, #1, mul vl] ; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z3.h -; CHECK-NEXT: ld1h { z5.h }, p0/z, [x16, #2, mul vl] -; CHECK-NEXT: st1h { z4.h }, p0, [x16, #1, mul vl] -; CHECK-NEXT: ld1h { z4.h }, p0/z, [x18, #2, mul vl] +; CHECK-NEXT: ldr z5, [x16, #2, mul vl] +; CHECK-NEXT: str z4, [x16, #1, mul vl] +; CHECK-NEXT: ldr z4, [x18, #2, mul vl] ; CHECK-NEXT: fmad z4.h, p0/m, z0.h, z5.h -; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17, #2, mul vl] +; CHECK-NEXT: ldr z5, [x17, #2, mul vl] ; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z1.h -; CHECK-NEXT: ld1h { z5.h }, p0/z, [x3, #2, mul vl] +; CHECK-NEXT: ldr z5, [x3, #2, mul vl] ; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z2.h -; CHECK-NEXT: ld1h { z5.h }, p0/z, [x4, #2, mul vl] +; CHECK-NEXT: ldr z5, [x4, #2, mul vl] ; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z3.h -; CHECK-NEXT: ld1h { z5.h }, p0/z, [x16, #3, mul vl] -; CHECK-NEXT: st1h { z4.h }, p0, [x16, #2, mul vl] -; CHECK-NEXT: ld1h { z4.h }, p0/z, [x18, #3, mul vl] +; CHECK-NEXT: ldr z5, [x16, #3, mul vl] +; CHECK-NEXT: str z4, [x16, #2, mul vl] +; CHECK-NEXT: ldr z4, [x18, #3, mul vl] ; CHECK-NEXT: fmad z4.h, p0/m, z0.h, z5.h -; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17, #3, mul vl] +; CHECK-NEXT: ldr z5, [x17, #3, mul vl] ; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z1.h -; CHECK-NEXT: ld1h { z5.h }, p0/z, [x3, #3, mul vl] +; CHECK-NEXT: ldr z5, [x3, #3, mul vl] ; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z2.h -; CHECK-NEXT: ld1h { z5.h }, p0/z, [x4, #3, mul vl] +; CHECK-NEXT: ldr z5, [x4, #3, mul vl] ; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z3.h -; CHECK-NEXT: st1h { z4.h }, p0, [x16, #3, mul vl] +; CHECK-NEXT: str z4, [x16, #3, mul vl] ; CHECK-NEXT: addvl x16, x16, #4 ; CHECK-NEXT: cmp x16, x11 ; CHECK-NEXT: b.lo .LBB0_4 diff --git a/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll index 09ee1e7d3b12e..622040eeb2f33 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll @@ -89,19 +89,18 @@ define void @masked_scatter_nxv8f32( %data, ptr %base, %data, ptr %base, %offsets, %mask) #0 { ; CHECK-LABEL: masked_scatter_nxv32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p2.s -; CHECK-NEXT: ld1w { z24.s }, p2/z, [x1, #7, mul vl] -; CHECK-NEXT: ld1w { z25.s }, p2/z, [x1, #6, mul vl] -; CHECK-NEXT: ld1w { z26.s }, p2/z, [x1, #5, mul vl] -; CHECK-NEXT: ld1w { z27.s }, p2/z, [x1, #4, mul vl] -; CHECK-NEXT: ld1w { z28.s }, p2/z, [x1, #3, mul vl] -; CHECK-NEXT: ld1w { z29.s }, p2/z, [x1, #2, mul vl] -; CHECK-NEXT: ld1w { z30.s }, p2/z, [x1, #1, mul vl] -; CHECK-NEXT: ld1w { z31.s }, p2/z, [x1] ; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: ldr z30, [x1, #1, mul vl] +; CHECK-NEXT: ldr z31, [x1] ; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: ldr z28, [x1, #3, mul vl] +; CHECK-NEXT: ldr z29, [x1, #2, mul vl] ; CHECK-NEXT: punpklo p3.h, p2.b +; CHECK-NEXT: ldr z24, [x1, #7, mul vl] +; CHECK-NEXT: ldr z25, [x1, #6, mul vl] ; CHECK-NEXT: punpkhi p2.h, p2.b +; CHECK-NEXT: ldr z26, [x1, #5, mul vl] +; CHECK-NEXT: ldr z27, [x1, #4, mul vl] ; CHECK-NEXT: st1w { z0.s }, p3, [x0, z31.s, sxtw #2] ; CHECK-NEXT: st1w { z1.s }, p2, [x0, z30.s, sxtw #2] ; CHECK-NEXT: punpklo p2.h, p0.b diff --git a/llvm/test/CodeGen/AArch64/sve-min-max-pred.ll b/llvm/test/CodeGen/AArch64/sve-min-max-pred.ll index cd0206fdcbf8e..076928ae7451f 100644 --- a/llvm/test/CodeGen/AArch64/sve-min-max-pred.ll +++ b/llvm/test/CodeGen/AArch64/sve-min-max-pred.ll @@ -179,7 +179,7 @@ define @umin_select_i64_multiuse( %pg, @llvm.umin.nxv2i64( %a, %b) store %sel, ptr %p diff --git a/llvm/test/CodeGen/AArch64/sve-pr92779.ll b/llvm/test/CodeGen/AArch64/sve-pr92779.ll index e25794817add0..480f41eb0f81b 100644 --- a/llvm/test/CodeGen/AArch64/sve-pr92779.ll +++ b/llvm/test/CodeGen/AArch64/sve-pr92779.ll @@ -14,8 +14,7 @@ define void @main(ptr %0) { ; CHECK-NEXT: smov x9, v1.s[1] ; CHECK-NEXT: mov z0.d, p0/m, x8 ; CHECK-NEXT: mov z0.d, p0/m, x9 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ret "entry": %1 = bitcast zeroinitializer to diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll index adbdee0eb0847..bbc94f568dd0a 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll @@ -2954,9 +2954,9 @@ define @mul_use_nxv4i32_x( %x, %n, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll index 6607f9c3b368e..66dece82a0ac5 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll @@ -1832,11 +1832,11 @@ define @mul_nxv4i32_multiuse_x( %x, %n, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-reassocadd.ll b/llvm/test/CodeGen/AArch64/sve-reassocadd.ll index f54098b29a272..58697e6c2ec71 100644 --- a/llvm/test/CodeGen/AArch64/sve-reassocadd.ll +++ b/llvm/test/CodeGen/AArch64/sve-reassocadd.ll @@ -22,9 +22,8 @@ entry: define @i8_4s_1v(ptr %b) { ; CHECK-LABEL: i8_4s_1v: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: add x8, x0, #4 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, #1, mul vl] +; CHECK-NEXT: ldr z0, [x8, #1, mul vl] ; CHECK-NEXT: ret entry: %add.ptr = getelementptr inbounds i8, ptr %b, i64 4 @@ -56,9 +55,8 @@ entry: define @i16_8s_1v(ptr %b) { ; CHECK-LABEL: i16_8s_1v: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, x0, #8 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, #1, mul vl] +; CHECK-NEXT: ldr z0, [x8, #1, mul vl] ; CHECK-NEXT: ret entry: %add.ptr = getelementptr inbounds i8, ptr %b, i64 8 @@ -90,9 +88,8 @@ entry: define @i16_8s_2v(ptr %b) { ; CHECK-LABEL: i16_8s_2v: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, x0, #8 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, #2, mul vl] +; CHECK-NEXT: ldr z0, [x8, #2, mul vl] ; CHECK-NEXT: ret entry: %add.ptr = getelementptr inbounds i8, ptr %b, i64 8 @@ -124,9 +121,8 @@ entry: define @i32_16s_2v(ptr %b) { ; CHECK-LABEL: i32_16s_2v: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, x0, #16 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, #1, mul vl] +; CHECK-NEXT: ldr z0, [x8, #1, mul vl] ; CHECK-NEXT: ret entry: %add.ptr = getelementptr inbounds i8, ptr %b, i64 16 @@ -158,9 +154,8 @@ entry: define @i64_32s_2v(ptr %b) { ; CHECK-LABEL: i64_32s_2v: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, x0, #32 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, #1, mul vl] +; CHECK-NEXT: ldr z0, [x8, #1, mul vl] ; CHECK-NEXT: ret entry: %add.ptr = getelementptr inbounds i8, ptr %b, i64 32 @@ -193,9 +188,8 @@ entry: define @i8_4s_m2v(ptr %b) { ; CHECK-LABEL: i8_4s_m2v: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: add x8, x0, #4 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, #-2, mul vl] +; CHECK-NEXT: ldr z0, [x8, #-2, mul vl] ; CHECK-NEXT: ret entry: %add.ptr = getelementptr inbounds i8, ptr %b, i64 4 @@ -227,9 +221,8 @@ entry: define @i16_8s_m2v(ptr %b) { ; CHECK-LABEL: i16_8s_m2v: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, x0, #8 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, #-2, mul vl] +; CHECK-NEXT: ldr z0, [x8, #-2, mul vl] ; CHECK-NEXT: ret entry: %add.ptr = getelementptr inbounds i8, ptr %b, i64 8 @@ -261,9 +254,8 @@ entry: define @i32_16s_m2v(ptr %b) { ; CHECK-LABEL: i32_16s_m2v: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, x0, #16 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, #-2, mul vl] +; CHECK-NEXT: ldr z0, [x8, #-2, mul vl] ; CHECK-NEXT: ret entry: %add.ptr = getelementptr inbounds i8, ptr %b, i64 16 @@ -295,9 +287,8 @@ entry: define @i64_32s_m2v(ptr %b) { ; CHECK-LABEL: i64_32s_m2v: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, x0, #32 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, #-2, mul vl] +; CHECK-NEXT: ldr z0, [x8, #-2, mul vl] ; CHECK-NEXT: ret entry: %add.ptr = getelementptr inbounds i8, ptr %b, i64 32 diff --git a/llvm/test/CodeGen/AArch64/sve-redundant-store.ll b/llvm/test/CodeGen/AArch64/sve-redundant-store.ll index 508fe5d5a58a5..8f458fbc93fec 100644 --- a/llvm/test/CodeGen/AArch64/sve-redundant-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-redundant-store.ll @@ -11,8 +11,7 @@ define void @redundant_store(ptr nocapture %p, %v) { ; CHECK-LABEL: redundant_store: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ret store i32 1, ptr %p, align 4 store %v, ptr %p, align 16 @@ -22,8 +21,7 @@ define void @redundant_store(ptr nocapture %p, %v) { define void @two_scalable_same_size(ptr writeonly %ptr, %a, %b) { ; CHECK-LABEL: two_scalable_same_size: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: st1w { z1.s }, p0, [x0] +; CHECK-NEXT: str z1, [x0] ; CHECK-NEXT: ret entry: store %a, ptr %ptr @@ -36,8 +34,7 @@ define void @keep_scalable_store(ptr writeonly %ptr, ptr %a, ; CHECK-LABEL: keep_scalable_store: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldp q2, q1, [x1] -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: ret entry: @@ -50,11 +47,9 @@ entry: define void @two_scalable_keep_stores(ptr writeonly %ptr, %a, %b) { ; CHECK-LABEL: two_scalable_keep_stores: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ptrue p1.s -; CHECK-NEXT: st1d { z2.d }, p0, [x0, #1, mul vl] -; CHECK-NEXT: st1d { z1.d }, p0, [x0] -; CHECK-NEXT: st1w { z0.s }, p1, [x0] +; CHECK-NEXT: str z2, [x0, #1, mul vl] +; CHECK-NEXT: str z1, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ret entry: store %b, ptr %ptr @@ -65,9 +60,8 @@ entry: define void @two_scalable_remove_store(ptr writeonly %ptr, %a, %b) { ; CHECK-LABEL: two_scalable_remove_store: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z2.d }, p0, [x0, #1, mul vl] -; CHECK-NEXT: st1d { z1.d }, p0, [x0] +; CHECK-NEXT: str z2, [x0, #1, mul vl] +; CHECK-NEXT: str z1, [x0] ; CHECK-NEXT: ret entry: store %a, ptr %ptr diff --git a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll index 76190eba870de..9a4231a57c61f 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll @@ -23,12 +23,11 @@ define i8 @split_extract_32i8_idx( %a, i32 %idx) { ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: rdvl x8, #2 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w9, w0 +; CHECK-NEXT: str z1, [sp, #1, mul vl] ; CHECK-NEXT: sub x8, x8, #1 +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: st1b { z1.b }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1b { z0.b }, p0, [sp] ; CHECK-NEXT: csel x8, x9, x8, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldrb w0, [x9, x8] @@ -47,12 +46,11 @@ define i16 @split_extract_16i16_idx( %a, i32 %idx) { ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w9, w0 +; CHECK-NEXT: str z1, [sp, #1, mul vl] ; CHECK-NEXT: sub x8, x8, #1 +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: csel x8, x9, x8, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldrh w0, [x9, x8, lsl #1] @@ -71,12 +69,11 @@ define i32 @split_extract_8i32_idx( %a, i32 %idx) { ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: cnth x8 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w9, w0 +; CHECK-NEXT: str z1, [sp, #1, mul vl] ; CHECK-NEXT: sub x8, x8, #1 +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: csel x8, x9, x8, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldr w0, [x9, x8, lsl #2] @@ -95,16 +92,15 @@ define i64 @split_extract_8i64_idx( %a, i32 %idx) { ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: cnth x8 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w9, w0 +; CHECK-NEXT: str z3, [sp, #3, mul vl] ; CHECK-NEXT: sub x8, x8, #1 +; CHECK-NEXT: str z2, [sp, #2, mul vl] ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: st1d { z3.d }, p0, [sp, #3, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: csel x8, x9, x8, lo ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: ldr x0, [x9, x8, lsl #3] ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -141,12 +137,11 @@ define i16 @split_extract_16i16( %a) { ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w9, #128 // =0x80 +; CHECK-NEXT: str z1, [sp, #1, mul vl] ; CHECK-NEXT: sub x8, x8, #1 +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldrh w0, [x9, x8, lsl #1] @@ -166,16 +161,15 @@ define i32 @split_extract_16i32( %a) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: rdvl x8, #1 ; CHECK-NEXT: mov w9, #34464 // =0x86a0 -; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: str z3, [sp, #3, mul vl] ; CHECK-NEXT: movk w9, #1, lsl #16 ; CHECK-NEXT: sub x8, x8, #1 +; CHECK-NEXT: str z2, [sp, #2, mul vl] ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: st1w { z3.s }, p0, [sp, #3, mul vl] -; CHECK-NEXT: st1w { z2.s }, p0, [sp, #2, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: ldr w0, [x9, x8, lsl #2] ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -192,12 +186,11 @@ define i64 @split_extract_4i64( %a) { ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: cntw x8 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w9, #10 // =0xa +; CHECK-NEXT: str z1, [sp, #1, mul vl] ; CHECK-NEXT: sub x8, x8, #1 +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: cmp x8, #10 -; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldr x0, [x9, x8, lsl #3] diff --git a/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll index 75366384cb750..d7ed42d717937 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll @@ -24,16 +24,15 @@ define @split_insert_32i8_idx( %a, i8 %elt, ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: rdvl x8, #2 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str z1, [sp, #1, mul vl] ; CHECK-NEXT: sub x8, x8, #1 +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: cmp x1, x8 -; CHECK-NEXT: st1b { z1.b }, p0, [sp, #1, mul vl] ; CHECK-NEXT: csel x8, x1, x8, lo -; CHECK-NEXT: st1b { z0.b }, p0, [sp] ; CHECK-NEXT: strb w0, [x9, x8] -; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp] -; CHECK-NEXT: ld1b { z1.b }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -49,16 +48,15 @@ define @split_insert_8f32_idx( %a, floa ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: cnth x8 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str z1, [sp, #1, mul vl] ; CHECK-NEXT: sub x8, x8, #1 +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: cmp x0, x8 -; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: csel x8, x0, x8, lo -; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: str s2, [x9, x8, lsl #2] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -74,20 +72,19 @@ define @split_insert_8i64_idx( %a, i64 %elt ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: cnth x8 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str z3, [sp, #3, mul vl] ; CHECK-NEXT: sub x8, x8, #1 +; CHECK-NEXT: str z2, [sp, #2, mul vl] ; CHECK-NEXT: cmp x1, x8 -; CHECK-NEXT: st1d { z3.d }, p0, [sp, #3, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] ; CHECK-NEXT: csel x8, x1, x8, lo -; CHECK-NEXT: st1d { z2.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: str x0, [x9, x8, lsl #3] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [sp, #1, mul vl] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [sp, #2, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p0/z, [sp, #3, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z2, [sp, #2, mul vl] +; CHECK-NEXT: ldr z3, [sp, #3, mul vl] ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -136,21 +133,20 @@ define @split_insert_32i16( %a, i16 %elt) ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: rdvl x8, #2 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w9, #128 // =0x80 +; CHECK-NEXT: str z3, [sp, #3, mul vl] ; CHECK-NEXT: sub x8, x8, #1 +; CHECK-NEXT: str z2, [sp, #2, mul vl] ; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: st1h { z3.h }, p0, [sp, #3, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: st1h { z2.h }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: strh w0, [x9, x8, lsl #1] -; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [sp, #1, mul vl] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [sp, #2, mul vl] -; CHECK-NEXT: ld1h { z3.h }, p0/z, [sp, #3, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z2, [sp, #2, mul vl] +; CHECK-NEXT: ldr z3, [sp, #3, mul vl] ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -167,17 +163,16 @@ define @split_insert_8i32( %a, i32 %elt) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: cnth x8 ; CHECK-NEXT: mov w9, #16960 // =0x4240 -; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: str z1, [sp, #1, mul vl] ; CHECK-NEXT: movk w9, #15, lsl #16 ; CHECK-NEXT: sub x8, x8, #1 +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: str w0, [x9, x8, lsl #2] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-split-load.ll b/llvm/test/CodeGen/AArch64/sve-split-load.ll index 065f49433c9aa..e1dd66c9d249a 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-load.ll @@ -16,9 +16,8 @@ define @load_promote_4i16(ptr %a) { define @load_split_i16(ptr %a) { ; CHECK-LABEL: load_split_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x0, #1, mul vl] ; CHECK-NEXT: ret %load = load , ptr %a ret %load @@ -27,10 +26,9 @@ define @load_split_i16(ptr %a) { define @load_split_24i16(ptr %a) { ; CHECK-LABEL: load_split_24i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x0, #1, mul vl] +; CHECK-NEXT: ldr z2, [x0, #2, mul vl] ; CHECK-NEXT: ret %load = load , ptr %a ret %load @@ -39,11 +37,10 @@ define @load_split_24i16(ptr %a) { define @load_split_32i16(ptr %a) { ; CHECK-LABEL: load_split_32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0, #3, mul vl] +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x0, #1, mul vl] +; CHECK-NEXT: ldr z2, [x0, #2, mul vl] +; CHECK-NEXT: ldr z3, [x0, #3, mul vl] ; CHECK-NEXT: ret %load = load , ptr %a ret %load @@ -52,15 +49,14 @@ define @load_split_32i16(ptr %a) { define @load_split_16i64(ptr %a) { ; CHECK-LABEL: load_split_16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #3, mul vl] -; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #4, mul vl] -; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0, #5, mul vl] -; CHECK-NEXT: ld1d { z6.d }, p0/z, [x0, #6, mul vl] -; CHECK-NEXT: ld1d { z7.d }, p0/z, [x0, #7, mul vl] +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x0, #1, mul vl] +; CHECK-NEXT: ldr z2, [x0, #2, mul vl] +; CHECK-NEXT: ldr z3, [x0, #3, mul vl] +; CHECK-NEXT: ldr z4, [x0, #4, mul vl] +; CHECK-NEXT: ldr z5, [x0, #5, mul vl] +; CHECK-NEXT: ldr z6, [x0, #6, mul vl] +; CHECK-NEXT: ldr z7, [x0, #7, mul vl] ; CHECK-NEXT: ret %load = load , ptr %a ret %load diff --git a/llvm/test/CodeGen/AArch64/sve-split-store.ll b/llvm/test/CodeGen/AArch64/sve-split-store.ll index affa9a18ac182..b1419b3f679cf 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-store.ll @@ -16,9 +16,8 @@ define void @store_promote_4i8( %data, ptr %a) { define void @store_split_i16( %data, ptr %a) { ; CHECK-LABEL: store_split_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: st1h { z1.h }, p0, [x0, #1, mul vl] -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: str z1, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ret store %data, ptr %a ret void @@ -27,11 +26,10 @@ define void @store_split_i16( %data, ptr %a) { define void @store_split_16i32( %data, ptr %a) { ; CHECK-LABEL: store_split_16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: st1w { z3.s }, p0, [x0, #3, mul vl] -; CHECK-NEXT: st1w { z2.s }, p0, [x0, #2, mul vl] -; CHECK-NEXT: st1w { z1.s }, p0, [x0, #1, mul vl] -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: str z3, [x0, #3, mul vl] +; CHECK-NEXT: str z2, [x0, #2, mul vl] +; CHECK-NEXT: str z1, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ret store %data, ptr %a ret void @@ -40,15 +38,14 @@ define void @store_split_16i32( %data, ptr %a) { define void @store_split_16i64( %data, ptr %a) { ; CHECK-LABEL: store_split_16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z7.d }, p0, [x0, #7, mul vl] -; CHECK-NEXT: st1d { z6.d }, p0, [x0, #6, mul vl] -; CHECK-NEXT: st1d { z5.d }, p0, [x0, #5, mul vl] -; CHECK-NEXT: st1d { z4.d }, p0, [x0, #4, mul vl] -; CHECK-NEXT: st1d { z3.d }, p0, [x0, #3, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [x0, #2, mul vl] -; CHECK-NEXT: st1d { z1.d }, p0, [x0, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: str z7, [x0, #7, mul vl] +; CHECK-NEXT: str z6, [x0, #6, mul vl] +; CHECK-NEXT: str z5, [x0, #5, mul vl] +; CHECK-NEXT: str z4, [x0, #4, mul vl] +; CHECK-NEXT: str z3, [x0, #3, mul vl] +; CHECK-NEXT: str z2, [x0, #2, mul vl] +; CHECK-NEXT: str z1, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ret store %data, ptr %a ret void diff --git a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll index a0bfc7034a386..71b883f0ef7ec 100644 --- a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll @@ -6,8 +6,7 @@ define void @st1b_lower_bound( %data, ptr %a) { ; CHECK-LABEL: st1b_lower_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0, #-8, mul vl] +; CHECK-NEXT: str z0, [x0, #-8, mul vl] ; CHECK-NEXT: ret %base = getelementptr , ptr %a, i64 -8 store %data, ptr %base @@ -17,8 +16,7 @@ define void @st1b_lower_bound( %data, ptr %a) { define void @st1b_inbound( %data, ptr %a) { ; CHECK-LABEL: st1b_inbound: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [x0, #1, mul vl] ; CHECK-NEXT: ret %base = getelementptr , ptr %a, i64 1 store %data, ptr %base @@ -28,8 +26,7 @@ define void @st1b_inbound( %data, ptr %a) { define void @st1b_upper_bound( %data, ptr %a) { ; CHECK-LABEL: st1b_upper_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0, #7, mul vl] +; CHECK-NEXT: str z0, [x0, #7, mul vl] ; CHECK-NEXT: ret %base = getelementptr , ptr %a, i64 7 store %data, ptr %base @@ -39,9 +36,7 @@ define void @st1b_upper_bound( %data, ptr %a) { define void @st1b_out_of_upper_bound( %data, ptr %a) { ; CHECK-LABEL: st1b_out_of_upper_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: rdvl x8, #8 -; CHECK-NEXT: st1b { z0.b }, p0, [x0, x8] +; CHECK-NEXT: str z0, [x0, #8, mul vl] ; CHECK-NEXT: ret %base = getelementptr , ptr %a, i64 8 store %data, ptr %base @@ -51,9 +46,7 @@ define void @st1b_out_of_upper_bound( %data, ptr %a) { define void @st1b_out_of_lower_bound( %data, ptr %a) { ; CHECK-LABEL: st1b_out_of_lower_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: rdvl x8, #-9 -; CHECK-NEXT: st1b { z0.b }, p0, [x0, x8] +; CHECK-NEXT: str z0, [x0, #-9, mul vl] ; CHECK-NEXT: ret %base = getelementptr , ptr %a, i64 -9 store %data, ptr %base @@ -65,8 +58,7 @@ define void @st1b_out_of_lower_bound( %data, ptr %a) { define void @st1h_inbound( %data, ptr %a) { ; CHECK-LABEL: st1h_inbound: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0, #-6, mul vl] +; CHECK-NEXT: str z0, [x0, #-6, mul vl] ; CHECK-NEXT: ret %base = getelementptr , ptr %a, i64 -6 store %data, ptr %base @@ -78,8 +70,7 @@ define void @st1h_inbound( %data, ptr %a) { define void @st1w_inbound( %data, ptr %a) { ; CHECK-LABEL: st1w_inbound: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0, #2, mul vl] +; CHECK-NEXT: str z0, [x0, #2, mul vl] ; CHECK-NEXT: ret %base = getelementptr , ptr %a, i64 2 store %data, ptr %base @@ -91,8 +82,7 @@ define void @st1w_inbound( %data, ptr %a) { define void @st1d_inbound( %data, ptr %a) { ; CHECK-LABEL: st1d_inbound: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0, #5, mul vl] +; CHECK-NEXT: str z0, [x0, #5, mul vl] ; CHECK-NEXT: ret %base = getelementptr , ptr %a, i64 5 store %data, ptr %base @@ -131,9 +121,8 @@ define void @store_nxv6f32(ptr %out) { ; CHECK: // %bb.0: ; CHECK-NEXT: fmov z0.s, #1.00000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: st1w { z0.d }, p0, [x0, #2, mul vl] -; CHECK-NEXT: st1w { z0.s }, p1, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ret store splat(float 1.0), ptr %out ret void @@ -144,9 +133,8 @@ define void @store_nxv12f16(ptr %out) { ; CHECK: // %bb.0: ; CHECK-NEXT: fmov z0.h, #1.00000000 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ptrue p1.h ; CHECK-NEXT: st1h { z0.s }, p0, [x0, #2, mul vl] -; CHECK-NEXT: st1h { z0.h }, p1, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ret store splat(half 1.0), ptr %out ret void diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll index b4efbecb7f8bb..c5cf4593cc86d 100644 --- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll +++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll @@ -20,14 +20,13 @@ define i32 @csr_d8_allocnxv4i32i32f64(double %d) "aarch64_pstate_sm_compatible" ; CHECK-NEXT: .cfi_offset w29, -8 ; CHECK-NEXT: .cfi_offset b8, -16 ; CHECK-NEXT: mov z1.s, #0 // =0x0 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: str wzr, [sp, #12] ; CHECK-NEXT: str d0, [sp] -; CHECK-NEXT: st1w { z1.s }, p0, [x8] +; CHECK-NEXT: str z1, [x8] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldr x29, [sp, #8] // 8-byte Folded Reload @@ -65,7 +64,6 @@ define i32 @csr_d8_allocnxv4i32i32f64_fp(double %d) "aarch64_pstate_sm_compatibl ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_offset b8, -32 ; CHECK-NEXT: mov z1.s, #0 // =0x0 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: addvl x8, sp, #1 ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP @@ -73,7 +71,7 @@ define i32 @csr_d8_allocnxv4i32i32f64_fp(double %d) "aarch64_pstate_sm_compatibl ; CHECK-NEXT: sub x8, x29, #16 ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: str d0, [sp, #8] -; CHECK-NEXT: st1w { z1.s }, p0, [x8, #-1, mul vl] +; CHECK-NEXT: str z1, [x8, #-1, mul vl] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #16] // 16-byte Folded Reload @@ -117,14 +115,13 @@ define i32 @csr_d8_allocnxv4i32i32f64_dynamicrealign(double %d) "aarch64_pstate_ ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_offset b8, -32 ; CHECK-NEXT: mov z1.s, #0 // =0x0 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sub x8, x29, #16 ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: str wzr, [sp] ; CHECK-NEXT: stur d0, [x29, #-8] -; CHECK-NEXT: st1w { z1.s }, p0, [x8, #-1, mul vl] +; CHECK-NEXT: str z1, [x8, #-1, mul vl] ; CHECK-NEXT: sub sp, x29, #16 ; CHECK-NEXT: ldp x29, x30, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldr d8, [sp], #32 // 8-byte Folded Reload @@ -179,15 +176,14 @@ define i32 @csr_d8_allocnxv4i32i32f64_vla(double %d, i32 %i) "aarch64_pstate_sm_ ; CHECK-NEXT: sub x8, x10, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: mov z1.s, #0 // =0x0 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: str wzr, [x8] ; CHECK-NEXT: sub x8, x29, #8 ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: str wzr, [x9] -; CHECK-NEXT: st1w { z1.s }, p0, [x8, #-1, mul vl] ; CHECK-NEXT: str d0, [x19, #8] +; CHECK-NEXT: str z1, [x8, #-1, mul vl] ; CHECK-NEXT: sub sp, x29, #8 ; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload ; CHECK-NEXT: ldr x19, [sp, #24] // 8-byte Folded Reload @@ -227,14 +223,13 @@ define i32 @csr_d8_allocnxv4i32i32f64_stackargsi32f64(double %d0, double %d1, do ; CHECK-NEXT: .cfi_offset w29, -8 ; CHECK-NEXT: .cfi_offset b8, -16 ; CHECK-NEXT: mov z1.s, #0 // =0x0 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: str wzr, [sp, #12] ; CHECK-NEXT: str d0, [sp] -; CHECK-NEXT: st1w { z1.s }, p0, [x8] +; CHECK-NEXT: str z1, [x8] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldr x29, [sp, #8] // 8-byte Folded Reload @@ -272,12 +267,11 @@ define i32 @svecc_z8_allocnxv4i32i32f64_fp(double %d, %v) "aa ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: str wzr, [sp, #12] -; CHECK-NEXT: st1w { z1.s }, p0, [x29, #-2, mul vl] +; CHECK-NEXT: str z1, [x29, #-2, mul vl] ; CHECK-NEXT: str d0, [sp], #16 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload @@ -317,12 +311,11 @@ define i32 @svecc_z8_allocnxv4i32i32f64_stackargsi32_fp(double %d, i32 %i0, i32 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: str wzr, [sp, #12] -; CHECK-NEXT: st1w { z1.s }, p0, [x29, #-2, mul vl] +; CHECK-NEXT: str z1, [x29, #-2, mul vl] ; CHECK-NEXT: str d0, [sp], #16 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload @@ -602,3 +595,5 @@ entry: ret i32 %x } declare void @other() +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK-FRAMELAYOUT: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/sve-unaligned-load-store-strict-align.ll b/llvm/test/CodeGen/AArch64/sve-unaligned-load-store-strict-align.ll index 27637800f751f..981ccdbf589a4 100644 --- a/llvm/test/CodeGen/AArch64/sve-unaligned-load-store-strict-align.ll +++ b/llvm/test/CodeGen/AArch64/sve-unaligned-load-store-strict-align.ll @@ -21,9 +21,8 @@ define void @unaligned_nxv16i1(ptr %ldptr, ptr %stptr) { define void @unaligned_nxv2i64(ptr %ldptr, ptr %stptr) { ; CHECK-LABEL: unaligned_nxv2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: str z0, [x1] ; CHECK-NEXT: ret %l3 = load , ptr %ldptr, align 4 store %l3, ptr %stptr, align 4 diff --git a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll index 8c198ee518873..8a504cd739211 100644 --- a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll +++ b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll @@ -115,14 +115,14 @@ define @test_compress_large( %vec, %add_op1){ ; CHECK-LABEL: neg_trunc_lsr_add_op1_not_splat: ; CHECK: // %bb.0: +; CHECK-NEXT: ldr z1, [x0] ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] ; CHECK-NEXT: add z0.h, z1.h, z0.h ; CHECK-NEXT: lsr z0.h, z0.h, #6 ; CHECK-NEXT: st1b { z0.h }, p0, [x1, x2] @@ -226,8 +226,8 @@ define void @neg_trunc_lsr_add_op1_not_splat(ptr %ptr, ptr %dst, i64 %index, %lshr_op1){ ; CHECK-LABEL: neg_trunc_lsr_op1_not_splat: ; CHECK: // %bb.0: +; CHECK-NEXT: ldr z1, [x0] ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] ; CHECK-NEXT: add z1.h, z1.h, #32 // =0x20 ; CHECK-NEXT: lsrr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: st1b { z0.h }, p0, [x1, x2] @@ -244,8 +244,8 @@ define void @neg_trunc_lsr_op1_not_splat(ptr %ptr, ptr %dst, i64 %index, @neg_urshr_4( %x, ptr %p) { ; CHECK-LABEL: neg_urshr_4: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add z1.d, z1.d, #32 // =0x20 ; CHECK-NEXT: lsr z0.d, z1.d, #6 -; CHECK-NEXT: st1d { z1.d }, p0, [x0] +; CHECK-NEXT: str z1, [x0] ; CHECK-NEXT: ret %add = add nuw nsw %x, splat (i64 32) %sh = lshr %add, splat (i64 6) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll index b19b5d871459a..58ba7603a702d 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll @@ -11,13 +11,12 @@ define { , , , , , , , , , , , , , , , , , , , , , , , , This Inner Loop Header: Depth=1 -; COMMON-NEXT: ld1b { z0.b }, p0/z, [x0] -; COMMON-NEXT: ld1b { z1.b }, p0/z, [x0, #1, mul vl] -; COMMON-NEXT: ld1b { z2.b }, p0/z, [x0, #2, mul vl] -; COMMON-NEXT: ld1b { z3.b }, p0/z, [x0, #3, mul vl] +; COMMON-NEXT: ldr z0, [x0] +; COMMON-NEXT: ldr z1, [x0, #1, mul vl] +; COMMON-NEXT: ldr z2, [x0, #2, mul vl] +; COMMON-NEXT: ldr z3, [x0, #3, mul vl] ; COMMON-NEXT: addvl x0, x0, #5 ; COMMON-NEXT: umax z0.b, p0/m, z0.b, z1.b ; COMMON-NEXT: movprfx z1, z2 @@ -67,10 +67,10 @@ define void @many_mulvl1_addressing(ptr %src_rows, ptr %dst_rows, i64 %stride, i ; COMMON-NEXT: .LBB1_1: // %for.body ; COMMON-NEXT: // =>This Inner Loop Header: Depth=1 ; COMMON-NEXT: add x8, x0, x2 -; COMMON-NEXT: ld1b { z0.b }, p0/z, [x0] +; COMMON-NEXT: ldr z0, [x0] ; COMMON-NEXT: ld1b { z1.b }, p0/z, [x0, x2] -; COMMON-NEXT: ld1b { z2.b }, p0/z, [x0, #1, mul vl] -; COMMON-NEXT: ld1b { z3.b }, p0/z, [x8, #1, mul vl] +; COMMON-NEXT: ldr z2, [x0, #1, mul vl] +; COMMON-NEXT: ldr z3, [x8, #1, mul vl] ; COMMON-NEXT: subs x3, x3, #1 ; COMMON-NEXT: addvl x0, x0, #2 ; COMMON-NEXT: add z0.b, z0.b, z1.b @@ -119,15 +119,14 @@ for.exit: define void @fixed_iv_scalable_offset(ptr %src, ptr %dst, i64 %count) #0 { ; COMMON-LABEL: fixed_iv_scalable_offset: ; COMMON: // %bb.0: // %entry -; COMMON-NEXT: ptrue p0.s ; COMMON-NEXT: .LBB2_1: // %for.body ; COMMON-NEXT: // =>This Inner Loop Header: Depth=1 -; COMMON-NEXT: ld1w { z0.s }, p0/z, [x0] -; COMMON-NEXT: ld1w { z1.s }, p0/z, [x0, #4, mul vl] +; COMMON-NEXT: ldr z0, [x0] +; COMMON-NEXT: ldr z1, [x0, #4, mul vl] ; COMMON-NEXT: subs x2, x2, #4 ; COMMON-NEXT: add x0, x0, #16 ; COMMON-NEXT: add z0.s, z0.s, z1.s -; COMMON-NEXT: st1w { z0.s }, p0, [x1] +; COMMON-NEXT: str z0, [x1] ; COMMON-NEXT: add x1, x1, #16 ; COMMON-NEXT: b.ne .LBB2_1 ; COMMON-NEXT: // %bb.2: // %for.exit @@ -162,14 +161,14 @@ define void @mixed_offsets_scalable_then_fixed(ptr %src, ptr %dst, i64 %count) # ; BASE-NEXT: mov x9, #8 // =0x8 ; BASE-NEXT: .LBB3_1: // %for.body ; BASE-NEXT: // =>This Inner Loop Header: Depth=1 -; BASE-NEXT: ld1w { z0.s }, p0/z, [x8, #-4, mul vl] -; BASE-NEXT: ld1w { z1.s }, p0/z, [x8] +; BASE-NEXT: ldr z0, [x8, #-4, mul vl] +; BASE-NEXT: ldr z1, [x8] ; BASE-NEXT: decw x2 ; BASE-NEXT: ld1w { z2.s }, p0/z, [x8, x9, lsl #2] ; BASE-NEXT: addvl x8, x8, #1 ; BASE-NEXT: add z0.s, z0.s, z1.s ; BASE-NEXT: add z0.s, z0.s, z2.s -; BASE-NEXT: st1w { z0.s }, p0, [x1] +; BASE-NEXT: str z0, [x1] ; BASE-NEXT: addvl x1, x1, #1 ; BASE-NEXT: cbnz x2, .LBB3_1 ; BASE-NEXT: // %bb.2: // %for.exit @@ -182,14 +181,14 @@ define void @mixed_offsets_scalable_then_fixed(ptr %src, ptr %dst, i64 %count) # ; PREINDEX-NEXT: mov x9, #8 // =0x8 ; PREINDEX-NEXT: .LBB3_1: // %for.body ; PREINDEX-NEXT: // =>This Inner Loop Header: Depth=1 -; PREINDEX-NEXT: ld1w { z0.s }, p0/z, [x8, #-4, mul vl] -; PREINDEX-NEXT: ld1w { z1.s }, p0/z, [x8] +; PREINDEX-NEXT: ldr z0, [x8, #-4, mul vl] +; PREINDEX-NEXT: ldr z1, [x8] ; PREINDEX-NEXT: decw x2 ; PREINDEX-NEXT: ld1w { z2.s }, p0/z, [x8, x9, lsl #2] ; PREINDEX-NEXT: addvl x8, x8, #1 ; PREINDEX-NEXT: add z0.s, z0.s, z1.s ; PREINDEX-NEXT: add z0.s, z0.s, z2.s -; PREINDEX-NEXT: st1w { z0.s }, p0, [x1] +; PREINDEX-NEXT: str z0, [x1] ; PREINDEX-NEXT: addvl x1, x1, #1 ; PREINDEX-NEXT: cbnz x2, .LBB3_1 ; PREINDEX-NEXT: // %bb.2: // %for.exit @@ -203,8 +202,8 @@ define void @mixed_offsets_scalable_then_fixed(ptr %src, ptr %dst, i64 %count) # ; POSTINDEX-NEXT: mov x10, #8 // =0x8 ; POSTINDEX-NEXT: .LBB3_1: // %for.body ; POSTINDEX-NEXT: // =>This Inner Loop Header: Depth=1 -; POSTINDEX-NEXT: ld1w { z0.s }, p0/z, [x9, #-4, mul vl] -; POSTINDEX-NEXT: ld1w { z1.s }, p0/z, [x9] +; POSTINDEX-NEXT: ldr z0, [x9, #-4, mul vl] +; POSTINDEX-NEXT: ldr z1, [x9] ; POSTINDEX-NEXT: ld1w { z2.s }, p0/z, [x9, x10, lsl #2] ; POSTINDEX-NEXT: addvl x9, x9, #1 ; POSTINDEX-NEXT: add z0.s, z0.s, z1.s @@ -299,19 +298,16 @@ for.exit: define void @three_access_wide_gap(ptr %src, ptr %dst, i64 %count) #0 { ; BASE-LABEL: three_access_wide_gap: ; BASE: // %bb.0: // %entry -; BASE-NEXT: ptrue p0.s -; BASE-NEXT: rdvl x8, #8 -; BASE-NEXT: ptrue p1.b ; BASE-NEXT: .LBB5_1: // %for.body ; BASE-NEXT: // =>This Inner Loop Header: Depth=1 -; BASE-NEXT: ld1w { z0.s }, p0/z, [x0] -; BASE-NEXT: ld1w { z1.s }, p0/z, [x0, #4, mul vl] +; BASE-NEXT: ldr z0, [x0] +; BASE-NEXT: ldr z1, [x0, #4, mul vl] ; BASE-NEXT: decw x2 -; BASE-NEXT: ld1b { z2.b }, p1/z, [x0, x8] +; BASE-NEXT: ldr z2, [x0, #8, mul vl] ; BASE-NEXT: addvl x0, x0, #1 ; BASE-NEXT: add z0.s, z0.s, z1.s ; BASE-NEXT: add z0.s, z0.s, z2.s -; BASE-NEXT: st1w { z0.s }, p0, [x1] +; BASE-NEXT: str z0, [x1] ; BASE-NEXT: addvl x1, x1, #1 ; BASE-NEXT: cbnz x2, .LBB5_1 ; BASE-NEXT: // %bb.2: // %for.exit @@ -319,19 +315,16 @@ define void @three_access_wide_gap(ptr %src, ptr %dst, i64 %count) #0 { ; ; PREINDEX-LABEL: three_access_wide_gap: ; PREINDEX: // %bb.0: // %entry -; PREINDEX-NEXT: ptrue p0.s -; PREINDEX-NEXT: rdvl x8, #8 -; PREINDEX-NEXT: ptrue p1.b ; PREINDEX-NEXT: .LBB5_1: // %for.body ; PREINDEX-NEXT: // =>This Inner Loop Header: Depth=1 -; PREINDEX-NEXT: ld1w { z0.s }, p0/z, [x0] -; PREINDEX-NEXT: ld1w { z1.s }, p0/z, [x0, #4, mul vl] +; PREINDEX-NEXT: ldr z0, [x0] +; PREINDEX-NEXT: ldr z1, [x0, #4, mul vl] ; PREINDEX-NEXT: decw x2 -; PREINDEX-NEXT: ld1b { z2.b }, p1/z, [x0, x8] +; PREINDEX-NEXT: ldr z2, [x0, #8, mul vl] ; PREINDEX-NEXT: addvl x0, x0, #1 ; PREINDEX-NEXT: add z0.s, z0.s, z1.s ; PREINDEX-NEXT: add z0.s, z0.s, z2.s -; PREINDEX-NEXT: st1w { z0.s }, p0, [x1] +; PREINDEX-NEXT: str z0, [x1] ; PREINDEX-NEXT: addvl x1, x1, #1 ; PREINDEX-NEXT: cbnz x2, .LBB5_1 ; PREINDEX-NEXT: // %bb.2: // %for.exit @@ -341,13 +334,11 @@ define void @three_access_wide_gap(ptr %src, ptr %dst, i64 %count) #0 { ; POSTINDEX: // %bb.0: // %entry ; POSTINDEX-NEXT: ptrue p0.s ; POSTINDEX-NEXT: mov x8, xzr -; POSTINDEX-NEXT: rdvl x9, #8 -; POSTINDEX-NEXT: ptrue p1.b ; POSTINDEX-NEXT: .LBB5_1: // %for.body ; POSTINDEX-NEXT: // =>This Inner Loop Header: Depth=1 -; POSTINDEX-NEXT: ld1w { z0.s }, p0/z, [x0] -; POSTINDEX-NEXT: ld1w { z1.s }, p0/z, [x0, #4, mul vl] -; POSTINDEX-NEXT: ld1b { z2.b }, p1/z, [x0, x9] +; POSTINDEX-NEXT: ldr z0, [x0] +; POSTINDEX-NEXT: ldr z1, [x0, #4, mul vl] +; POSTINDEX-NEXT: ldr z2, [x0, #8, mul vl] ; POSTINDEX-NEXT: addvl x0, x0, #1 ; POSTINDEX-NEXT: add z0.s, z0.s, z1.s ; POSTINDEX-NEXT: add z0.s, z0.s, z2.s