From 90dee58c159cf8c6a5426f78d1ef914b91442265 Mon Sep 17 00:00:00 2001
From: Qiu Chaofan <qiucofan@cn.ibm.com>
Date: Thu, 28 Sep 2023 17:33:01 +0800
Subject: [PATCH 1/2] [PowerPC] Mask constant operands in ValueBit tracking

In IR or C code, left/right shift amount larger than value size is
undefined behavior. But in practise, backend lowering for
srl_parts/sra_parts/shl_parts produces add/sub of shift amounts, thus
constant shift amounts might be negative or larger than value size. And
the lowering depends on behavior in ISA.

PowerPC ISA says, the lowest 7 bits (6 bits if in 32-bit instruction)
will be taken, and if the highest among them is 1, result will be zero,
otherwise the low 6 bits (or 5 on 32-bit) are used as shift amount.

This patch emulates the behavior and avoids array overflow in bit
permutation's value bits calculator.
---
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp |  36 ++++--
 llvm/test/CodeGen/PowerPC/pr59074.ll        | 132 ++++++++++++++++++++
 2 files changed, 155 insertions(+), 13 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/pr59074.ll
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 6a3710407bc4f..19427c6322373 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -1632,7 +1632,7 @@ class BitPermutationSelector {
     default: break;
     case ISD::ROTL:
       if (isa<ConstantSDNode>(V.getOperand(1))) {
-        unsigned RotAmt = V.getConstantOperandVal(1);
+        unsigned RotAmt = V.getConstantOperandVal(1) & (NumBits - 1);
 
         const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second;
 
@@ -1645,15 +1645,20 @@ class BitPermutationSelector {
     case ISD::SHL:
     case PPCISD::SHL:
       if (isa<ConstantSDNode>(V.getOperand(1))) {
-        unsigned ShiftAmt = V.getConstantOperandVal(1);
+        // sld takes 7 bits, slw takes 6.
+        unsigned ShiftAmt = V.getConstantOperandVal(1) & ((NumBits << 1) - 1);
 
         const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second;
 
-        for (unsigned i = ShiftAmt; i < NumBits; ++i)
-          Bits[i] = LHSBits[i - ShiftAmt];
-
-        for (unsigned i = 0; i < ShiftAmt; ++i)
-          Bits[i] = ValueBit(ValueBit::ConstZero);
+        if (ShiftAmt >= NumBits) {
+          for (unsigned i = 0; i < NumBits; ++i)
+            Bits[i] = ValueBit(ValueBit::ConstZero);
+        } else {
+          for (unsigned i = ShiftAmt; i < NumBits; ++i)
+            Bits[i] = LHSBits[i - ShiftAmt];
+          for (unsigned i = 0; i < ShiftAmt; ++i)
+            Bits[i] = ValueBit(ValueBit::ConstZero);
+        }
 
         return std::make_pair(Interesting = true, &Bits);
       }
@@ -1661,15 +1666,20 @@ class BitPermutationSelector {
     case ISD::SRL:
     case PPCISD::SRL:
       if (isa<ConstantSDNode>(V.getOperand(1))) {
-        unsigned ShiftAmt = V.getConstantOperandVal(1);
+        // srd takes lowest 7 bits, srw takes 6.
+        unsigned ShiftAmt = V.getConstantOperandVal(1) & ((NumBits << 1) - 1);
 
         const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second;
 
-        for (unsigned i = 0; i < NumBits - ShiftAmt; ++i)
-          Bits[i] = LHSBits[i + ShiftAmt];
-
-        for (unsigned i = NumBits - ShiftAmt; i < NumBits; ++i)
-          Bits[i] = ValueBit(ValueBit::ConstZero);
+        if (ShiftAmt >= NumBits) {
+          for (unsigned i = 0; i < NumBits; ++i)
+            Bits[i] = ValueBit(ValueBit::ConstZero);
+        } else {
+          for (unsigned i = 0; i < NumBits - ShiftAmt; ++i)
+            Bits[i] = LHSBits[i + ShiftAmt];
+          for (unsigned i = NumBits - ShiftAmt; i < NumBits; ++i)
+            Bits[i] = ValueBit(ValueBit::ConstZero);
+        }
 
         return std::make_pair(Interesting = true, &Bits);
       }
diff --git a/llvm/test/CodeGen/PowerPC/pr59074.ll b/llvm/test/CodeGen/PowerPC/pr59074.ll
new file mode 100644
index 0000000000000..3e328c6ad9f0b
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/pr59074.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s --check-prefix=LE64
+; RUN: llc -mtriple=powerpcle-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s --check-prefix=LE32
+; RUN: llc -mtriple=powerpc64-ibm-aix -mcpu=pwr7 < %s | FileCheck %s --check-prefix=BE64
+; RUN: llc -mtriple=powerpc-ibm-aix -mcpu=pwr7 < %s | FileCheck %s --check-prefix=BE32
+
+; To verify this doesn't crash due to array out of bound.
+define void @pr59074(ptr %0) {
+; LE64-LABEL: pr59074:
+; LE64:       # %bb.0: # %entry
+; LE64-NEXT:    lwz 6, 0(3)
+; LE64-NEXT:    li 7, 12
+; LE64-NEXT:    ld 4, 16(3)
+; LE64-NEXT:    ld 5, 24(3)
+; LE64-NEXT:    addi 6, 6, -12
+; LE64-NEXT:    std 4, 16(3)
+; LE64-NEXT:    std 5, 24(3)
+; LE64-NEXT:    srd 6, 7, 6
+; LE64-NEXT:    li 7, 0
+; LE64-NEXT:    std 7, 8(3)
+; LE64-NEXT:    std 6, 0(3)
+; LE64-NEXT:    blr
+;
+; LE32-LABEL: pr59074:
+; LE32:       # %bb.0: # %entry
+; LE32-NEXT:    stwu 1, -80(1)
+; LE32-NEXT:    .cfi_def_cfa_offset 80
+; LE32-NEXT:    lwz 4, 0(3)
+; LE32-NEXT:    xxlxor 0, 0, 0
+; LE32-NEXT:    li 5, 4
+; LE32-NEXT:    addi 6, 1, 16
+; LE32-NEXT:    li 7, 0
+; LE32-NEXT:    li 8, 12
+; LE32-NEXT:    xxswapd 0, 0
+; LE32-NEXT:    addi 4, 4, -12
+; LE32-NEXT:    rlwinm 9, 4, 29, 28, 31
+; LE32-NEXT:    stxvd2x 0, 6, 5
+; LE32-NEXT:    stw 7, 44(1)
+; LE32-NEXT:    stw 7, 40(1)
+; LE32-NEXT:    stw 7, 36(1)
+; LE32-NEXT:    stw 8, 16(1)
+; LE32-NEXT:    lwzux 5, 9, 6
+; LE32-NEXT:    li 6, 7
+; LE32-NEXT:    lwz 7, 8(9)
+; LE32-NEXT:    nand 6, 4, 6
+; LE32-NEXT:    lwz 8, 4(9)
+; LE32-NEXT:    clrlwi 4, 4, 29
+; LE32-NEXT:    lwz 9, 12(9)
+; LE32-NEXT:    clrlwi 6, 6, 27
+; LE32-NEXT:    subfic 11, 4, 32
+; LE32-NEXT:    srw 5, 5, 4
+; LE32-NEXT:    slwi 10, 7, 1
+; LE32-NEXT:    srw 7, 7, 4
+; LE32-NEXT:    slw 6, 10, 6
+; LE32-NEXT:    srw 10, 8, 4
+; LE32-NEXT:    slw 8, 8, 11
+; LE32-NEXT:    slw 11, 9, 11
+; LE32-NEXT:    srw 4, 9, 4
+; LE32-NEXT:    or 5, 8, 5
+; LE32-NEXT:    or 7, 11, 7
+; LE32-NEXT:    or 6, 10, 6
+; LE32-NEXT:    stw 4, 12(3)
+; LE32-NEXT:    stw 7, 8(3)
+; LE32-NEXT:    stw 5, 0(3)
+; LE32-NEXT:    stw 6, 4(3)
+; LE32-NEXT:    addi 1, 1, 80
+; LE32-NEXT:    blr
+;
+; BE64-LABEL: pr59074:
+; BE64:       # %bb.0: # %entry
+; BE64-NEXT:    lwz 6, 12(3)
+; BE64-NEXT:    li 7, 12
+; BE64-NEXT:    ld 4, 24(3)
+; BE64-NEXT:    ld 5, 16(3)
+; BE64-NEXT:    addi 6, 6, -12
+; BE64-NEXT:    std 4, 24(3)
+; BE64-NEXT:    std 5, 16(3)
+; BE64-NEXT:    srd 6, 7, 6
+; BE64-NEXT:    li 7, 0
+; BE64-NEXT:    std 7, 0(3)
+; BE64-NEXT:    std 6, 8(3)
+; BE64-NEXT:    blr
+;
+; BE32-LABEL: pr59074:
+; BE32:       # %bb.0: # %entry
+; BE32-NEXT:    lwz 4, 12(3)
+; BE32-NEXT:    xxlxor 0, 0, 0
+; BE32-NEXT:    addi 5, 1, -64
+; BE32-NEXT:    li 6, 12
+; BE32-NEXT:    li 7, 0
+; BE32-NEXT:    addi 8, 1, -48
+; BE32-NEXT:    li 10, 7
+; BE32-NEXT:    stxvw4x 0, 0, 5
+; BE32-NEXT:    addi 4, 4, -12
+; BE32-NEXT:    stw 6, -36(1)
+; BE32-NEXT:    stw 7, -40(1)
+; BE32-NEXT:    stw 7, -44(1)
+; BE32-NEXT:    rlwinm 9, 4, 29, 28, 31
+; BE32-NEXT:    stw 7, -48(1)
+; BE32-NEXT:    sub 5, 8, 9
+; BE32-NEXT:    nand 6, 4, 10
+; BE32-NEXT:    clrlwi 4, 4, 29
+; BE32-NEXT:    clrlwi 6, 6, 27
+; BE32-NEXT:    lwz 7, 4(5)
+; BE32-NEXT:    lwz 8, 8(5)
+; BE32-NEXT:    lwz 9, 0(5)
+; BE32-NEXT:    lwz 5, 12(5)
+; BE32-NEXT:    slwi 10, 7, 1
+; BE32-NEXT:    srw 11, 8, 4
+; BE32-NEXT:    srw 7, 7, 4
+; BE32-NEXT:    srw 5, 5, 4
+; BE32-NEXT:    slw 6, 10, 6
+; BE32-NEXT:    subfic 10, 4, 32
+; BE32-NEXT:    srw 4, 9, 4
+; BE32-NEXT:    slw 8, 8, 10
+; BE32-NEXT:    slw 10, 9, 10
+; BE32-NEXT:    or 6, 11, 6
+; BE32-NEXT:    or 7, 10, 7
+; BE32-NEXT:    or 5, 8, 5
+; BE32-NEXT:    stw 4, 0(3)
+; BE32-NEXT:    stw 6, 8(3)
+; BE32-NEXT:    stw 5, 12(3)
+; BE32-NEXT:    stw 7, 4(3)
+; BE32-NEXT:    blr
+entry:
+  %v1 = load <2 x i128>, <2 x i128>* %0
+  %v2 = insertelement <2 x i128> %v1, i128 12, i32 0
+  %v3 = sub <2 x i128> %v1, %v2
+  %v4 = lshr <2 x i128> %v2, %v3
+  store <2 x i128> %v4, <2 x i128>* %0
+  ret void
+}

From 6759869abed47f6c488cf17e16ce85d984c58a27 Mon Sep 17 00:00:00 2001
From: Qiu Chaofan <qiucofan@cn.ibm.com>
Date: Tue, 6 Feb 2024 11:10:42 +0800
Subject: [PATCH 2/2] Add assert

---
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 19427c6322373..9e5f0b36616d1 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -1632,6 +1632,7 @@ class BitPermutationSelector {
     default: break;
     case ISD::ROTL:
       if (isa<ConstantSDNode>(V.getOperand(1))) {
+        assert(isPowerOf2_32(NumBits) && "rotl bits should be power of 2!");
         unsigned RotAmt = V.getConstantOperandVal(1) & (NumBits - 1);
 
         const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second;