Revert "[mlir][test][gpu] Migrate CUDA tests to the TargetAttr compilation workflow (#65768) #65848

Merged: 1 commit merged into llvm:main on Sep 9, 2023

Conversation

fabianmcg (Contributor)

This reverts commit d21b672.

fabianmcg requested review from a team as code owners on September 9, 2023 at 11:14
fabianmcg merged commit 2c596ea into llvm:main on Sep 9, 2023
llvmbot (Member) commented on Sep 9, 2023

@llvm/pr-subscribers-mlir

Changes

This reverts commit d21b672.

Patch is 27.61 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/65848.diff

25 Files Affected:

  • (modified) mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h (+11-1)
  • (modified) mlir/include/mlir/Conversion/Passes.td (+1)
  • (modified) mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp (+10-1)
  • (modified) mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp (+1-1)
  • (modified) mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir (+2-1)
  • (modified) mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir (+3-2)
  • (modified) mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir (+3-1)
  • (modified) mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir (+9-3)
  • (modified) mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir (+3-1)
  • (modified) mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir (+3-1)
  • (modified) mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir (+3-1)
  • (modified) mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir (+6-2)
  • (modified) mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir (+3-1)
  • (modified) mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir (+3-1)
  • (modified) mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir (+3-1)
  • (modified) mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir (+3-1)
  • (modified) mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir (+3-1)
  • (modified) mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir (+3-1)
  • (modified) mlir/test/Integration/GPU/CUDA/async.mlir (+2-2)
  • (modified) mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir (+4-1)
  • (modified) mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir (+3-1)
  • (modified) mlir/test/Integration/GPU/CUDA/printf.mlir (+2-1)
  • (modified) mlir/test/Integration/GPU/CUDA/shuffle.mlir (+3-1)
  • (modified) mlir/test/Integration/GPU/CUDA/two-modules.mlir (+3-1)
  • (modified) mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp (+42-34)
diff --git a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
index e0f4c71051e506a..46f29c6dd8b9267 100644
--- a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
+++ b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
@@ -16,7 +16,9 @@ namespace mlir {
 class LLVMTypeConverter;
 class ConversionTarget;
 class RewritePatternSet;
-class Pass;
+
+template <typename OpT>
+class OperationPass;
 
 namespace gpu {
 class GPUModuleOp;
@@ -43,6 +45,14 @@ void populateGpuSubgroupReduceOpLoweringPattern(LLVMTypeConverter &converter,
 /// Collect a set of patterns to convert WMMA ops from GPU dialect to NVVM.
 void populateGpuWMMAToNVVMConversionPatterns(LLVMTypeConverter &converter,
                                              RewritePatternSet &patterns);
+
+/// Creates a pass that lowers GPU dialect operations to NVVM counterparts. The
+/// index bitwidth used for the lowering of the device side index computations
+/// is configurable.
+std::unique_ptr<OperationPass<gpu::GPUModuleOp>> createLowerGpuOpsToNVVMOpsPass(
+    unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
+    bool hasRedux = false);
+
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_GPUTONVVM_GPUTONVVMPASS_H_
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 3218760931b8cb0..ed37abf85275bf3 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -486,6 +486,7 @@ def LowerHostCodeToLLVMPass : Pass<"lower-host-to-llvm", "ModuleOp"> {
 
 def ConvertGpuOpsToNVVMOps : Pass<"convert-gpu-to-nvvm", "gpu::GPUModuleOp"> {
   let summary = "Generate NVVM operations for gpu operations";
+  let constructor = "mlir::createLowerGpuOpsToNVVMOpsPass()";
   let dependentDialects = [
     "cf::ControlFlowDialect",
     "memref::MemRefDialect",
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 764b6a779b98c7a..06469dc82b3fc5d 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -210,7 +210,11 @@ struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
 /// code.
 struct LowerGpuOpsToNVVMOpsPass
     : public impl::ConvertGpuOpsToNVVMOpsBase<LowerGpuOpsToNVVMOpsPass> {
-  using Base::Base;
+  LowerGpuOpsToNVVMOpsPass() = default;
+  LowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth, bool hasRedux = false) {
+    this->indexBitwidth = indexBitwidth;
+    this->hasRedux = hasRedux;
+  }
 
   void runOnOperation() override {
     gpu::GPUModuleOp m = getOperation();
@@ -374,3 +378,8 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
                                    "__nv_tanh");
  populateOpPatterns<math::TanOp>(converter, patterns, "__nv_tanf", "__nv_tan");
 }
+
+std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
+mlir::createLowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth, bool hasRedux) {
+  return std::make_unique<LowerGpuOpsToNVVMOpsPass>(indexBitwidth, hasRedux);
+}
diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
index 24c4c4c43a93dea..a7fd5a25e68314e 100644
--- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
@@ -52,7 +52,7 @@ void mlir::sparse_tensor::buildSparseCompiler(
     pm.addPass(createSparseGPUCodegenPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createStripDebugInfoPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
-    pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps());
+    pm.addNestedPass<gpu::GPUModuleOp>(createLowerGpuOpsToNVVMOpsPass());
   }
 
   // TODO(springerm): Add sparse support to the BufferDeallocation pass and add
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
index 0cb06b7bf1d2001..2c1ae3ee840d0f8 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
@@ -1,5 +1,6 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm -debug-only=serialize-to-isa \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{dump-ptx}))' \
 // RUN: 2>&1 | FileCheck %s
 
 // CHECK: Generated by LLVM NVPTX Back-End
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
index 80972f244ec02d7..8eb90fd3ca9946e 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
@@ -2,9 +2,10 @@
 // NOTE: this test requires gpu-sm80
 //
 // RUN: mlir-opt \
-// RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse)" \
+// RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse,gpu.module(gpu-to-cubin{chip=sm_80 features=+ptx71}))" \
 // RUN: %s \
-// RUN: | mlir-opt --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71" \
+// RUN: | mlir-opt --convert-vector-to-scf --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
+// RUN:            --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
index 8c991493a2b0174..8571c5ca5f3dc26 100644
--- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
+++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
@@ -1,7 +1,9 @@
 // RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" -canonicalize |\
 // RUN: mlir-opt -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if |\
 // RUN: mlir-opt -lower-affine -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm \
-// RUN:  -convert-arith-to-llvm -test-lower-to-nvvm | \
+// RUN:  -convert-arith-to-llvm -gpu-kernel-outlining |\
+// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
+// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_cuda_runtime \
 // RUN:   -shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
index f26c18c4ae3dd28..c671c1843862f94 100644
--- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
+++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
@@ -2,7 +2,9 @@
 // everything on the same thread.
 // RUN: mlir-opt %s -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
 // RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN:  -test-lower-to-nvvm | \
+// RUN:  -gpu-kernel-outlining |\
+// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
+// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_cuda_runtime \
 // RUN:   -shared-libs=%mlir_c_runner_utils \
@@ -13,7 +15,9 @@
 // RUN: mlir-opt %s  -test-vector-warp-distribute="hoist-uniform distribute-transfer-write" \
 // RUN:   -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
 // RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN:  -test-lower-to-nvvm | \
+// RUN:  -gpu-kernel-outlining |\
+// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
+// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_cuda_runtime \
 // RUN:   -shared-libs=%mlir_c_runner_utils \
@@ -23,7 +27,9 @@
 // RUN: mlir-opt %s  -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" \
 // RUN:   -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
 // RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN:  -test-lower-to-nvvm | \
+// RUN:  -gpu-kernel-outlining |\
+// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
+// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_cuda_runtime \
 // RUN:   -shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
index 591bf1b4fd18231..535ba52d66f0062 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{chip=sm_70}))' \
+// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
index 51bd23f817b33f1..c4ca46521eeb4c7 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
@@ -3,7 +3,9 @@
 // Similar to the wmma-matmul-f32 but but with the memref bare pointer lowering convention.
 // This test also uses gpu.memcpy operations (instead of gpu.host_register).
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70" \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm{use-bare-ptr-memref-call-conv=1},gpu-to-cubin{chip=sm_70}))' \
+// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm="use-bare-pointers-for-host=1 use-bare-pointers-for-kernels=1" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --entry-point-result=void \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
index 0307b3d504be9f6..ae410dce281b17a 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{chip=sm_70}))' \
+// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
index b131b8682ddee06..f4324a14a36b64a 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
@@ -8,7 +10,9 @@
 
 // Same as above but with the memref bare pointer lowering convention.
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="kernel-bare-ptr-calling-convention=1" \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm{use-bare-ptr-memref-call-conv=1},gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm="use-bare-pointers-for-kernels=1" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
index 155423db7e05049..0a8d38f14527914 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
index e5047b6efa3bf25..bcd785d35291c66 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
index 163e9fdba60c1a9..aa4b0e8820479dc 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
index 381db2639c371f3..2e7d046c3921411 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
index 23c6c117e67f36b..32cfa27c8988a2c 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
index 3c5a100b5b90d57..30767b9495b6f20 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/async.mlir b/mlir/test/Integration/GPU/CUDA/async.mlir
index d2a5127a34c3bdd..e6dd91ace974322 100644
--- a/mlir/test/Integration/GPU/CUDA/async.mlir
+++ b/mlir/test/Integration/GPU/CUDA/async.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)' \
-// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm \
 // RUN: | mlir-opt -async-to-async-runtime -async-runtime-ref-counting \
 // RUN: | ...
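
For reference, a minimal sketch of how the restored createLowerGpuOpsToNVVMOpsPass entry point is consumed from C++, mirroring the SparseTensorPipelines.cpp hunk above. The pipeline function name and the explicit indexBitwidth/hasRedux arguments below are illustrative assumptions, not part of this patch:

// Illustrative sketch only: nests the restored GPU-to-NVVM lowering under each
// gpu.module, as the SparseTensorPipelines.cpp change above does. The function
// name and the argument values are hypothetical, not taken from this PR.
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Pass/PassManager.h"

static void buildExampleGpuToNVVMPipeline(mlir::OpPassManager &pm) {
  // Run the GPU-to-NVVM conversion on every gpu.module, requesting 32-bit
  // device-side index computations and no NVVM redux ops.
  pm.addNestedPass<mlir::gpu::GPUModuleOp>(
      mlir::createLowerGpuOpsToNVVMOpsPass(/*indexBitwidth=*/32,
                                           /*hasRedux=*/false));
}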
