From dfd292784f37e684c9224a99d98d5c8a61020cd1 Mon Sep 17 00:00:00 2001
From: Aart Bik <ajcbik@google.com>
Date: Tue, 19 Sep 2023 13:06:06 -0700
Subject: [PATCH] [mlir][sparse][gpu] free all buffers allocated for spGEMM

Yup, a bit of an oversight ;-)

The spGEMM rewriting now deallocates the device buffers of the A, B,
and C matrix components, as well as the two workspace buffers, once
the result has been copied back to the host.
---
 .../Transforms/SparseGPUCodegen.cpp           | 15 +++++++++--
 .../SparseTensor/GPU/gpu_spgemm_lib.mlir      | 26 ++++++++++++++-----
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
index efdd3347558b4..91b346c8a9b4c 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -795,10 +795,10 @@ rewriteSpGEMM(PatternRewriter &rewriter, linalg::GenericOp op, bool enableRT,
   Value rowC = e1.getResult(0);
   token = e1.getAsyncToken();
   auto e2 = genAllocBuffer(rewriter, loc, cTp.getCrdType(), zero, token);
-  Value colC = e2.getResult(0);
+  Value colC = e2.getResult(0); // no free needed
   token = e2.getAsyncToken();
   auto e3 = genAllocBuffer(rewriter, loc, dnCType, zero, token);
-  Value valC = e3.getResult(0);
+  Value valC = e3.getResult(0); // no free needed
   token = e3.getAsyncToken();
   Operation *spGenC =
       genSpMat(rewriter, loc, spmatHandleTp, tokenTp, token, szm, szn, zero,
@@ -881,6 +881,17 @@ rewriteSpGEMM(PatternRewriter &rewriter, linalg::GenericOp op, bool enableRT,
   token = genCopyMemRef(rewriter, loc, rowH, rowC, token);
   token = genCopyMemRef(rewriter, loc, colH, colC, token);
   token = genCopyMemRef(rewriter, loc, valH, valC, token);
+  token = genDeallocMemRef(rewriter, loc, rowA, token);
+  token = genDeallocMemRef(rewriter, loc, colA, token);
+  token = genDeallocMemRef(rewriter, loc, valA, token);
+  token = genDeallocMemRef(rewriter, loc, rowB, token);
+  token = genDeallocMemRef(rewriter, loc, colB, token);
+  token = genDeallocMemRef(rewriter, loc, valB, token);
+  token = genDeallocMemRef(rewriter, loc, rowC, token);
+  token = genDeallocMemRef(rewriter, loc, colC, token);
+  token = genDeallocMemRef(rewriter, loc, valC, token);
+  token = genDeallocMemRef(rewriter, loc, buffer1, token);
+  token = genDeallocMemRef(rewriter, loc, buffer2, token);
   tokens.push_back(token);
   genBlockingWait(rewriter, loc, tokens);
   tokens.clear();
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir
index 7b4c48dc34105..1bb51f4fcf518 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir
@@ -5,7 +5,7 @@
 
 // CHECK-LABEL: func.func @matmulCSR(
 // CHECK-SAME: %[[VAL_0:.*0]]: tensor<8x8xf32, #{{.*}}>,
-// CHECK-SAME: %[[VAL_1:.*1]]: tensor<8x8xf32, #{{.*}}>
+// CHECK-SAME: %[[VAL_1:.*1]]: tensor<8x8xf32, #{{.*}}>) -> tensor<8x8xf32, #{{.*}}> {
 // CHECK: %[[VAL_2:.*]] = arith.constant 8 : index
 // CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
 // CHECK: %[[VAL_4:.*]] = arith.constant 9 : index
@@ -72,12 +72,24 @@
 // CHECK: %[[VAL_88:.*]] = gpu.memcpy async {{\[}}%[[VAL_87]]] %[[VAL_81]], %[[VAL_49]] : memref<?xindex>, memref<?xindex>
 // CHECK: %[[VAL_89:.*]] = gpu.memcpy async {{\[}}%[[VAL_88]]] %[[VAL_82]], %[[VAL_75]] : memref<?xindex>, memref<?xindex>
 // CHECK: %[[VAL_90:.*]] = gpu.memcpy async {{\[}}%[[VAL_89]]] %[[VAL_83]], %[[VAL_77]] : memref<?xf32>, memref<?xf32>
-// CHECK: gpu.wait {{\[}}%[[VAL_90]]]
-// CHECK: %[[VAL_91:.*]] = bufferization.to_tensor %[[VAL_83]] : memref<?xf32>
-// CHECK: %[[VAL_92:.*]] = bufferization.to_tensor %[[VAL_81]] : memref<?xindex>
-// CHECK: %[[VAL_93:.*]] = bufferization.to_tensor %[[VAL_82]] : memref<?xindex>
-// CHECK: %[[VAL_94:.*]] = sparse_tensor.pack %[[VAL_91]], %[[VAL_92]], %[[VAL_93]] : tensor<?xf32>, tensor<?xindex>, tensor<?xindex> to tensor<8x8xf32, #{{.*}}>
-// CHECK: return %[[VAL_94]] : tensor<8x8xf32, #{{.*}}>
+// CHECK: %[[VAL_91:.*]] = gpu.dealloc async {{.*}} : memref<?xindex>
+// CHECK: %[[VAL_92:.*]] = gpu.dealloc async {{.*}} : memref<?xindex>
+// CHECK: %[[VAL_93:.*]] = gpu.dealloc async {{.*}} : memref<?xf32>
+// CHECK: %[[VAL_94:.*]] = gpu.dealloc async {{.*}} : memref<?xindex>
+// CHECK: %[[VAL_95:.*]] = gpu.dealloc async {{.*}} : memref<?xindex>
+// CHECK: %[[VAL_96:.*]] = gpu.dealloc async {{.*}} : memref<?xf32>
+// CHECK: %[[VAL_97:.*]] = gpu.dealloc async {{.*}} : memref<?xindex>
+// CHECK: %[[VAL_98:.*]] = gpu.dealloc async {{.*}} : memref<?xindex>
+// CHECK: %[[VAL_99:.*]] = gpu.dealloc async {{.*}} : memref<?xf32>
+// CHECK: %[[VAL_a0:.*]] = gpu.dealloc async {{.*}} : memref<?xi8>
+// CHECK: %[[VAL_a1:.*]] = gpu.dealloc async {{.*}} : memref<?xi8>
+// CHECK: gpu.wait [%[[VAL_a1]]]
+// CHECK: %[[VAL_a2:.*]] = bufferization.to_tensor %[[VAL_83]] : memref<?xf32>
+// CHECK: %[[VAL_a3:.*]] = bufferization.to_tensor %[[VAL_81]] : memref<?xindex>
+// CHECK: %[[VAL_a4:.*]] = bufferization.to_tensor %[[VAL_82]] : memref<?xindex>
+// CHECK: %[[VAL_a5:.*]] = sparse_tensor.pack %[[VAL_a2]], %[[VAL_a3]], %[[VAL_a4]] : tensor<?xf32>, tensor<?xindex>, tensor<?xindex> to tensor<8x8xf32, #{{.*}}>
+// CHECK: return %[[VAL_a5]] : tensor<8x8xf32, #{{.*}}>
+// CHECK: }
 func.func @matmulCSR(%A: tensor<8x8xf32, #CSR>,
                      %B: tensor<8x8xf32, #CSR>) -> tensor<8x8xf32, #CSR> {
   %init = bufferization.alloc_tensor() : tensor<8x8xf32, #CSR>