Skip to content

Revert "[mlir][test][gpu] Migrate CUDA tests to the TargetAttr compilation workflow (#65768)" #65848

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ namespace mlir {
class LLVMTypeConverter;
class ConversionTarget;
class RewritePatternSet;
class Pass;

template <typename OpT>
class OperationPass;

namespace gpu {
class GPUModuleOp;
Expand All @@ -43,6 +45,14 @@ void populateGpuSubgroupReduceOpLoweringPattern(LLVMTypeConverter &converter,
/// Collect a set of patterns to convert WMMA ops from GPU dialect to NVVM.
void populateGpuWMMAToNVVMConversionPatterns(LLVMTypeConverter &converter,
RewritePatternSet &patterns);

/// Creates a pass that lowers GPU dialect operations to their NVVM
/// counterparts. The returned pass is anchored on `gpu::GPUModuleOp`.
///
/// \param indexBitwidth Bitwidth used when lowering device-side index
///        computations. Defaults to `kDeriveIndexBitwidthFromDataLayout`.
/// \param hasRedux Value forwarded to the pass's `hasRedux` setting.
///        Defaults to false.
///        NOTE(review): presumably enables redux-based lowering of
///        reductions -- confirm against the pass option documentation.
std::unique_ptr<OperationPass<gpu::GPUModuleOp>> createLowerGpuOpsToNVVMOpsPass(
unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
bool hasRedux = false);

} // namespace mlir

#endif // MLIR_CONVERSION_GPUTONVVM_GPUTONVVMPASS_H_
1 change: 1 addition & 0 deletions mlir/include/mlir/Conversion/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,7 @@ def LowerHostCodeToLLVMPass : Pass<"lower-host-to-llvm", "ModuleOp"> {

def ConvertGpuOpsToNVVMOps : Pass<"convert-gpu-to-nvvm", "gpu::GPUModuleOp"> {
let summary = "Generate NVVM operations for gpu operations";
let constructor = "mlir::createLowerGpuOpsToNVVMOpsPass()";
let dependentDialects = [
"cf::ControlFlowDialect",
"memref::MemRefDialect",
Expand Down
11 changes: 10 additions & 1 deletion mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,11 @@ struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
/// code.
struct LowerGpuOpsToNVVMOpsPass
: public impl::ConvertGpuOpsToNVVMOpsBase<LowerGpuOpsToNVVMOpsPass> {
using Base::Base;
LowerGpuOpsToNVVMOpsPass() = default;
LowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth, bool hasRedux = false) {
this->indexBitwidth = indexBitwidth;
this->hasRedux = hasRedux;
}

void runOnOperation() override {
gpu::GPUModuleOp m = getOperation();
Expand Down Expand Up @@ -374,3 +378,8 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
"__nv_tanh");
populateOpPatterns<math::TanOp>(converter, patterns, "__nv_tanf", "__nv_tan");
}

std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth, bool hasRedux) {
return std::make_unique<LowerGpuOpsToNVVMOpsPass>(indexBitwidth, hasRedux);
}
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ void mlir::sparse_tensor::buildSparseCompiler(
pm.addPass(createSparseGPUCodegenPass());
pm.addNestedPass<gpu::GPUModuleOp>(createStripDebugInfoPass());
pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps());
pm.addNestedPass<gpu::GPUModuleOp>(createLowerGpuOpsToNVVMOpsPass());
}

// TODO(springerm): Add sparse support to the BufferDeallocation pass and add
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -test-lower-to-nvvm -debug-only=serialize-to-isa \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{dump-ptx}))' \
// RUN: 2>&1 | FileCheck %s

// CHECK: Generated by LLVM NVPTX Back-End
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
// NOTE: this test requires gpu-sm80
//
// RUN: mlir-opt \
// RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse)" \
// RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse,gpu.module(gpu-to-cubin{chip=sm_80 features=+ptx71}))" \
// RUN: %s \
// RUN: | mlir-opt --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71" \
// RUN: | mlir-opt --convert-vector-to-scf --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
// RUN: --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_c_runner_utils \
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
// RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" -canonicalize |\
// RUN: mlir-opt -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if |\
// RUN: mlir-opt -lower-affine -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm \
// RUN: -convert-arith-to-llvm -test-lower-to-nvvm | \
// RUN: -convert-arith-to-llvm -gpu-kernel-outlining |\
// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
// RUN: mlir-cpu-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_cuda_runtime \
// RUN: -shared-libs=%mlir_c_runner_utils \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
// everything on the same thread.
// RUN: mlir-opt %s -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
// RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
// RUN: -test-lower-to-nvvm | \
// RUN: -gpu-kernel-outlining |\
// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
// RUN: mlir-cpu-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_cuda_runtime \
// RUN: -shared-libs=%mlir_c_runner_utils \
Expand All @@ -13,7 +15,9 @@
// RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write" \
// RUN: -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
// RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
// RUN: -test-lower-to-nvvm | \
// RUN: -gpu-kernel-outlining |\
// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
// RUN: mlir-cpu-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_cuda_runtime \
// RUN: -shared-libs=%mlir_c_runner_utils \
Expand All @@ -23,7 +27,9 @@
// RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" \
// RUN: -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
// RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
// RUN: -test-lower-to-nvvm | \
// RUN: -gpu-kernel-outlining |\
// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
// RUN: mlir-cpu-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_cuda_runtime \
// RUN: -shared-libs=%mlir_c_runner_utils \
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{chip=sm_70}))' \
// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
// Similar to the wmma-matmul-f32 but with the memref bare pointer lowering convention.
// This test also uses gpu.memcpy operations (instead of gpu.host_register).
// RUN: mlir-opt %s \
// RUN: | mlir-opt -test-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70" \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm{use-bare-ptr-memref-call-conv=1},gpu-to-cubin{chip=sm_70}))' \
// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm="use-bare-pointers-for-host=1 use-bare-pointers-for-kernels=1" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --entry-point-result=void \
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{chip=sm_70}))' \
// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
Expand Down
8 changes: 6 additions & 2 deletions mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
Expand All @@ -8,7 +10,9 @@

// Same as above but with the memref bare pointer lowering convention.
// RUN: mlir-opt %s \
// RUN: | mlir-opt -test-lower-to-nvvm="kernel-bare-ptr-calling-convention=1" \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm{use-bare-ptr-memref-call-conv=1},gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm="use-bare-pointers-for-kernels=1" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
Expand Down
4 changes: 3 additions & 1 deletion mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
Expand Down
4 changes: 3 additions & 1 deletion mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
Expand Down
4 changes: 3 additions & 1 deletion mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
Expand Down
4 changes: 3 additions & 1 deletion mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
Expand Down
4 changes: 3 additions & 1 deletion mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
Expand Down
4 changes: 3 additions & 1 deletion mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
Expand Down
4 changes: 2 additions & 2 deletions mlir/test/Integration/GPU/CUDA/async.mlir
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)' \
// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm \
// RUN: | mlir-opt -async-to-async-runtime -async-runtime-ref-counting \
// RUN: | mlir-opt -convert-async-to-llvm -convert-func-to-llvm \
// RUN: | mlir-cpu-runner \
Expand Down
5 changes: 4 additions & 1 deletion mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
Expand Down
4 changes: 3 additions & 1 deletion mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
Expand Down
3 changes: 2 additions & 1 deletion mlir/test/Integration/GPU/CUDA/printf.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
Expand Down
4 changes: 3 additions & 1 deletion mlir/test/Integration/GPU/CUDA/shuffle.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
Expand Down
4 changes: 3 additions & 1 deletion mlir/test/Integration/GPU/CUDA/two-modules.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
Expand Down
Loading