diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 5b553b1136b34..38b5e0114903c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3425,19 +3425,30 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, if (isMUBUF(MIb) || isMTBUF(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(MIb) && !isSMRD(MIb); + if (isFLAT(MIb)) + return isFLATScratch(MIb); + + return !isSMRD(MIb); } if (isSMRD(MIa)) { if (isSMRD(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb); + if (isFLAT(MIb)) + return isFLATScratch(MIb); + + return !isMUBUF(MIb) && !isMTBUF(MIb); } if (isFLAT(MIa)) { - if (isFLAT(MIb)) + if (isFLAT(MIb)) { + if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) || + (isFLATGlobal(MIa) && isFLATScratch(MIb))) + return true; + return checkInstOffsetsDoNotOverlap(MIa, MIb); + } return false; } diff --git a/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll b/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll index 49e434e2dd30c..29c82db6f8204 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll @@ -5,15 +5,14 @@ define amdgpu_gfx void @example(<4 x i32> inreg %rsrc, ptr addrspace(5) %src, i3 ; CHECK-LABEL: example: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_add_nc_u32_e32 v3, 4, v0 +; CHECK-NEXT: s_clause 0x1 ; CHECK-NEXT: scratch_load_b32 v2, v0, off -; CHECK-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; CHECK-NEXT: scratch_load_b32 v3, v3, off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_b32 v2, v1, s[4:7], 0 offen -; CHECK-NEXT: scratch_load_b32 v0, v0, off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen offset:4 +; CHECK-NEXT: buffer_store_b64 v[2:3], v1, s[4:7], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] -; + %x0 = load i32, ptr addrspace(5) %src call void @llvm.amdgcn.raw.buffer.store.i32(i32 %x0, <4 x i32> %rsrc, i32 %dst, i32 0, i32 0) %src1 = getelementptr i8, ptr addrspace(5) %src, i32 4