Skip to content

[AArch64] Combine concat(binop, binop) into binop(concat, concat) #89911

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 5 additions & 7 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18647,14 +18647,12 @@ static SDValue performConcatVectorsCombine(SDNode *N,
if (DCI.isBeforeLegalizeOps())
return SDValue();

// Optimise concat_vectors of two [us]avgceils or [us]avgfloors with a 128-bit
// destination size, combine into an avg of two concats of the source
// vectors. eg: concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c),
// concat(b, d))
// Optimise concat_vectors of two identical binops with a 128-bit destination
// size, combine into a binop of two concats of the source vectors. eg:
// concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
(N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS ||
N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS) &&
N0->hasOneUse() && N1->hasOneUse()) {
DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() &&
N1->hasOneUse()) {
SDValue N00 = N0->getOperand(0);
SDValue N01 = N0->getOperand(1);
SDValue N10 = N1->getOperand(0);
Expand Down
81 changes: 50 additions & 31 deletions llvm/test/CodeGen/AArch64/concatbinop.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,13 @@
define <8 x i16> @concat_add(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; CHECK-LABEL: concat_add:
; CHECK: // %bb.0:
; CHECK-NEXT: add v2.4h, v2.4h, v3.4h
; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v1.d[1], v3.d[0]
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My guess is that this isn't faster than the old sequence because of the dependency. The other case below is clearly an improvement, so I guess overall this change is beneficial?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that is the idea. If we change 2 x binop + concat -> binop + 2 x concat, then sometimes the 'concat' is cheaper than the 'binop', so it's a little cheaper overall. Even when it isn't, the change can lead to other simplifications and shouldn't be any worse.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, cheers, LGTM too

; CHECK-NEXT: ret
%x = add <4 x i16> %a, %b
%y = add <4 x i16> %c, %d
Expand All @@ -33,13 +37,9 @@ define <8 x i16> @concat_addtunc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
define <8 x i16> @concat_addtunc2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; CHECK-LABEL: concat_addtunc2:
; CHECK: // %bb.0:
; CHECK-NEXT: xtn v1.4h, v1.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: xtn v2.4h, v2.4s
; CHECK-NEXT: xtn v3.4h, v3.4s
; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
; CHECK-NEXT: add v1.4h, v2.4h, v3.4h
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: uzp1 v1.8h, v1.8h, v3.8h
; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%at = trunc <4 x i32> %a to <4 x i16>
%bt = trunc <4 x i32> %b to <4 x i16>
Expand All @@ -54,9 +54,13 @@ define <8 x i16> @concat_addtunc2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
define <8 x i16> @concat_sub(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; CHECK-LABEL: concat_sub:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v2.4h, v2.4h, v3.4h
; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v1.d[1], v3.d[0]
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%x = sub <4 x i16> %a, %b
%y = sub <4 x i16> %c, %d
Expand All @@ -67,9 +71,13 @@ define <8 x i16> @concat_sub(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16>
define <8 x i16> @concat_mul(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; CHECK-LABEL: concat_mul:
; CHECK: // %bb.0:
; CHECK-NEXT: mul v2.4h, v2.4h, v3.4h
; CHECK-NEXT: mul v0.4h, v0.4h, v1.4h
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v1.d[1], v3.d[0]
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%x = mul <4 x i16> %a, %b
%y = mul <4 x i16> %c, %d
Expand All @@ -80,9 +88,13 @@ define <8 x i16> @concat_mul(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16>
define <8 x i16> @concat_xor(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; CHECK-LABEL: concat_xor:
; CHECK: // %bb.0:
; CHECK-NEXT: eor v2.8b, v2.8b, v3.8b
; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v1.d[1], v3.d[0]
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%x = xor <4 x i16> %a, %b
%y = xor <4 x i16> %c, %d
Expand All @@ -93,9 +105,13 @@ define <8 x i16> @concat_xor(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16>
define <8 x half> @concat_fadd(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) {
; CHECK-LABEL: concat_fadd:
; CHECK: // %bb.0:
; CHECK-NEXT: fadd v2.4h, v2.4h, v3.4h
; CHECK-NEXT: fadd v0.4h, v0.4h, v1.4h
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v1.d[1], v3.d[0]
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%x = fadd <4 x half> %a, %b
%y = fadd <4 x half> %c, %d
Expand All @@ -106,9 +122,13 @@ define <8 x half> @concat_fadd(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x
define <8 x half> @concat_fmul(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) {
; CHECK-LABEL: concat_fmul:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul v2.4h, v2.4h, v3.4h
; CHECK-NEXT: fmul v0.4h, v0.4h, v1.4h
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v1.d[1], v3.d[0]
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: fmul v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%x = fmul <4 x half> %a, %b
%y = fmul <4 x half> %c, %d
Expand All @@ -119,9 +139,13 @@ define <8 x half> @concat_fmul(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x
define <8 x half> @concat_min(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) {
; CHECK-LABEL: concat_min:
; CHECK: // %bb.0:
; CHECK-NEXT: fminnm v2.4h, v2.4h, v3.4h
; CHECK-NEXT: fminnm v0.4h, v0.4h, v1.4h
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v1.d[1], v3.d[0]
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: fminnm v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%x = call <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)
%y = call <4 x half> @llvm.minnum.v4f16(<4 x half> %c, <4 x half> %d)
Expand All @@ -146,21 +170,16 @@ define <16 x i8> @signOf_neon(ptr nocapture noundef readonly %a, ptr nocapture n
; CHECK-LABEL: signOf_neon:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldp q1, q2, [x0]
; CHECK-NEXT: movi v0.8b, #1
; CHECK-NEXT: movi v0.16b, #1
; CHECK-NEXT: ldp q3, q4, [x1]
; CHECK-NEXT: cmhi v5.8h, v1.8h, v3.8h
; CHECK-NEXT: cmhi v6.8h, v2.8h, v4.8h
; CHECK-NEXT: cmhi v1.8h, v3.8h, v1.8h
; CHECK-NEXT: cmhi v2.8h, v4.8h, v2.8h
; CHECK-NEXT: xtn v3.8b, v5.8h
; CHECK-NEXT: xtn v4.8b, v6.8h
; CHECK-NEXT: xtn v1.8b, v1.8h
; CHECK-NEXT: xtn v2.8b, v2.8h
; CHECK-NEXT: and v3.8b, v3.8b, v0.8b
; CHECK-NEXT: and v4.8b, v4.8b, v0.8b
; CHECK-NEXT: orr v0.8b, v3.8b, v1.8b
; CHECK-NEXT: orr v1.8b, v4.8b, v2.8b
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: uzp1 v3.16b, v5.16b, v6.16b
; CHECK-NEXT: uzp1 v1.16b, v1.16b, v2.16b
; CHECK-NEXT: and v0.16b, v3.16b, v0.16b
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
entry:
%0 = load <8 x i16>, ptr %a, align 2
Expand Down
14 changes: 8 additions & 6 deletions llvm/test/CodeGen/AArch64/vecreduce-add.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2825,10 +2825,11 @@ entry:
define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v2i16_v2i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi d2, #0x00ffff0000ffff
; CHECK-SD-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-SD-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: movi v2.2d, #0x00ffff0000ffff
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT: uaddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
Expand Down Expand Up @@ -3578,10 +3579,11 @@ entry:
define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v2i8_v2i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi d2, #0x0000ff000000ff
; CHECK-SD-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-SD-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: movi v2.2d, #0x0000ff000000ff
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT: uaddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
Expand Down
Loading