From dfd292784f37e684c9224a99d98d5c8a61020cd1 Mon Sep 17 00:00:00 2001
From: Aart Bik <ajcbik@google.com>
Date: Tue, 19 Sep 2023 13:06:06 -0700
Subject: [PATCH] [mlir][sparse][gpu] free all buffers allocated for spGEMM

Yup, a bit of an oversight ;-)

The spGEMM rewriting now deallocates the device buffers of the A, B,
and C matrix components, as well as the two workspace buffers, once
the result has been copied back to the host.
---
 .../Transforms/SparseGPUCodegen.cpp           | 15 +++++++++--
 .../SparseTensor/GPU/gpu_spgemm_lib.mlir      | 26 ++++++++++++++-----
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
index efdd3347558b4..91b346c8a9b4c 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -795,10 +795,10 @@ rewriteSpGEMM(PatternRewriter &rewriter, linalg::GenericOp op, bool enableRT,
   Value rowC = e1.getResult(0);
   token = e1.getAsyncToken();
   auto e2 = genAllocBuffer(rewriter, loc, cTp.getCrdType(), zero, token);
-  Value colC = e2.getResult(0);
+  Value colC = e2.getResult(0); // no free needed
   token = e2.getAsyncToken();
   auto e3 = genAllocBuffer(rewriter, loc, dnCType, zero, token);
-  Value valC = e3.getResult(0);
+  Value valC = e3.getResult(0); // no free needed
   token = e3.getAsyncToken();
   Operation *spGenC =
       genSpMat(rewriter, loc, spmatHandleTp, tokenTp, token, szm, szn, zero,
@@ -881,6 +881,17 @@ rewriteSpGEMM(PatternRewriter &rewriter, linalg::GenericOp op, bool enableRT,
   token = genCopyMemRef(rewriter, loc, rowH, rowC, token);
   token = genCopyMemRef(rewriter, loc, colH, colC, token);
   token = genCopyMemRef(rewriter, loc, valH, valC, token);
+  token = genDeallocMemRef(rewriter, loc, rowA, token);
+  token = genDeallocMemRef(rewriter, loc, colA, token);
+  token = genDeallocMemRef(rewriter, loc, valA, token);
+  token = genDeallocMemRef(rewriter, loc, rowB, token);
+  token = genDeallocMemRef(rewriter, loc, colB, token);
+  token = genDeallocMemRef(rewriter, loc, valB, token);
+  token = genDeallocMemRef(rewriter, loc, rowC, token);
+  token = genDeallocMemRef(rewriter, loc, colC, token);
+  token = genDeallocMemRef(rewriter, loc, valC, token);
+  token = genDeallocMemRef(rewriter, loc, buffer1, token);
+  token = genDeallocMemRef(rewriter, loc, buffer2, token);
   tokens.push_back(token);
   genBlockingWait(rewriter, loc, tokens);
   tokens.clear();
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir
index 7b4c48dc34105..1bb51f4fcf518 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir
@@ -5,7 +5,7 @@
 
 // CHECK-LABEL: func.func @matmulCSR(
 // CHECK-SAME: %[[VAL_0:.*0]]: tensor<8x8xf32, #{{.*}}>,
-// CHECK-SAME: %[[VAL_1:.*1]]: tensor<8x8xf32, #{{.*}}>
+// CHECK-SAME: %[[VAL_1:.*1]]: tensor<8x8xf32, #{{.*}}>) -> tensor<8x8xf32, #{{.*}}> {
 // CHECK: %[[VAL_2:.*]] = arith.constant 8 : index
 // CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
 // CHECK: %[[VAL_4:.*]] = arith.constant 9 : index
@@ -72,12 +72,24 @@
 // CHECK: %[[VAL_88:.*]] = gpu.memcpy async {{\[}}%[[VAL_87]]] %[[VAL_81]], %[[VAL_49]] : memref<?xindex>, memref<?xindex>
 // CHECK: %[[VAL_89:.*]] = gpu.memcpy async {{\[}}%[[VAL_88]]] %[[VAL_82]], %[[VAL_75]] : memref<?xindex>, memref<?xindex>
 // CHECK: %[[VAL_90:.*]] = gpu.memcpy async {{\[}}%[[VAL_89]]] %[[VAL_83]], %[[VAL_77]] : memref<?xf32>, memref<?xf32>
-// CHECK: gpu.wait {{\[}}%[[VAL_90]]]
-// CHECK: %[[VAL_91:.*]] = bufferization.to_tensor %[[VAL_83]] : memref<?xf32>
-// CHECK: %[[VAL_92:.*]] = bufferization.to_tensor %[[VAL_81]] : memref<?xindex>
-// CHECK: %[[VAL_93:.*]] = bufferization.to_tensor %[[VAL_82]] : memref<?xindex>
-// CHECK: %[[VAL_94:.*]] = sparse_tensor.pack %[[VAL_91]], %[[VAL_92]], %[[VAL_93]] : tensor<?xf32>, tensor<?xindex>, tensor<?xindex> to tensor<8x8xf32, #{{.*}}>
-// CHECK: return %[[VAL_94]] : tensor<8x8xf32, #{{.*}}>
+// CHECK: %[[VAL_91:.*]] = gpu.dealloc async {{.*}} : memref<?xindex>
+// CHECK: %[[VAL_92:.*]] = gpu.dealloc async {{.*}} : memref<?xindex>
+// CHECK: %[[VAL_93:.*]] = gpu.dealloc async {{.*}} : memref<?xf32>
+// CHECK: %[[VAL_94:.*]] = gpu.dealloc async {{.*}} : memref<?xindex>
+// CHECK: %[[VAL_95:.*]] = gpu.dealloc async {{.*}} : memref<?xindex>
+// CHECK: %[[VAL_96:.*]] = gpu.dealloc async {{.*}} : memref<?xf32>
+// CHECK: %[[VAL_97:.*]] = gpu.dealloc async {{.*}} : memref<?xindex>
+// CHECK: %[[VAL_98:.*]] = gpu.dealloc async {{.*}} : memref<?xindex>
+// CHECK: %[[VAL_99:.*]] = gpu.dealloc async {{.*}} : memref<?xf32>
+// CHECK: %[[VAL_a0:.*]] = gpu.dealloc async {{.*}} : memref<?xi8>
+// CHECK: %[[VAL_a1:.*]] = gpu.dealloc async {{.*}} : memref<?xi8>
+// CHECK: gpu.wait [%[[VAL_a1]]]
+// CHECK: %[[VAL_a2:.*]] = bufferization.to_tensor %[[VAL_83]] : memref<?xf32>
+// CHECK: %[[VAL_a3:.*]] = bufferization.to_tensor %[[VAL_81]] : memref<?xindex>
+// CHECK: %[[VAL_a4:.*]] = bufferization.to_tensor %[[VAL_82]] : memref<?xindex>
+// CHECK: %[[VAL_a5:.*]] = sparse_tensor.pack %[[VAL_a2]], %[[VAL_a3]], %[[VAL_a4]] : tensor<?xf32>, tensor<?xindex>, tensor<?xindex> to tensor<8x8xf32, #{{.*}}>
+// CHECK: return %[[VAL_a5]] : tensor<8x8xf32, #{{.*}}>
+// CHECK: }
 func.func @matmulCSR(%A: tensor<8x8xf32, #CSR>,
                      %B: tensor<8x8xf32, #CSR>) -> tensor<8x8xf32, #CSR> {
   %init = bufferization.alloc_tensor() : tensor<8x8xf32, #CSR>