[mlir][sparse][gpu] re-enable all GPU libgen tests #72185

Merged
merged 1 commit on Nov 14, 2023
mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h (3 changes: 1 addition & 2 deletions)
@@ -144,8 +144,7 @@ struct SparseCompilerOptions

   /// Projects out the options for `createSparsificationPass`.
   SparsificationOptions sparsificationOptions() const {
-    return SparsificationOptions(parallelization, enableGPULibgen,
-                                 enableRuntimeLibrary);
+    return SparsificationOptions(parallelization, enableRuntimeLibrary);
   }

   /// Projects out the options for `createConvertVectorToLLVMPass`.
mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h (15 changes: 6 additions & 9 deletions)
@@ -74,15 +74,11 @@ std::unique_ptr<Pass> createPreSparsificationRewritePass();

 /// Options for the Sparsification pass.
 struct SparsificationOptions {
-  SparsificationOptions(SparseParallelizationStrategy p, bool gpuLibgen,
-                        bool enableRT)
-      : parallelizationStrategy(p), enableGPULibgen(gpuLibgen),
-        enableRuntimeLibrary(enableRT) {}
+  SparsificationOptions(SparseParallelizationStrategy p, bool enableRT)
+      : parallelizationStrategy(p), enableRuntimeLibrary(enableRT) {}
   SparsificationOptions()
-      : SparsificationOptions(SparseParallelizationStrategy::kNone, false,
-                              true) {}
+      : SparsificationOptions(SparseParallelizationStrategy::kNone, true) {}
   SparseParallelizationStrategy parallelizationStrategy;
-  bool enableGPULibgen;
   bool enableRuntimeLibrary;
 };

@@ -196,7 +192,8 @@ void populateSparseGPULibgenPatterns(RewritePatternSet &patterns,
                                      bool enableRT);

 std::unique_ptr<Pass> createSparseGPUCodegenPass();
-std::unique_ptr<Pass> createSparseGPUCodegenPass(unsigned numThreads);
+std::unique_ptr<Pass> createSparseGPUCodegenPass(unsigned numThreads,
+                                                 bool enableRT);

 //===----------------------------------------------------------------------===//
 // The SparseStorageSpecifierToLLVM pass.
@@ -225,7 +222,7 @@ std::unique_ptr<Pass> createSparsificationAndBufferizationPass(
     const SparsificationOptions &sparsificationOptions,
     bool createSparseDeallocs, bool enableRuntimeLibrary,
     bool enableBufferInitialization, unsigned vectorLength,
-    bool enableVLAVectorization, bool enableSIMDIndex32);
+    bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen);

 //===----------------------------------------------------------------------===//
 // Registration.
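Note (an illustration, not part of this diff): with the declarations above, GPU libgen is now requested through the two-argument createSparseGPUCodegenPass overload rather than through a SparsificationOptions flag. A minimal C++ sketch of a hypothetical caller, assuming a module-level PassManager named pm and the libgen-before-sparsification ordering used by the mini-pipeline later in this patch:

#include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"

// Sketch: zero threads selects direct library calls (e.g. cuSPARSE);
// the libgen rewriting must run before sparsification.
void addSparseGpuLibgen(mlir::PassManager &pm) {
  pm.addPass(mlir::createSparseGPUCodegenPass(/*numThreads=*/0,
                                              /*enableRT=*/true));
  // SparsificationOptions no longer carries a GPU flag.
  mlir::SparsificationOptions options(
      mlir::SparseParallelizationStrategy::kNone, /*enableRT=*/true);
  pm.addPass(mlir::createSparsificationPass(options));
}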
mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td (12 changes: 6 additions & 6 deletions)
@@ -105,7 +105,6 @@ def SparsificationPass : Pass<"sparsification", "ModuleOp"> {
     "affine::AffineDialect",
     "arith::ArithDialect",
     "bufferization::BufferizationDialect",
-    "gpu::GPUDialect",
     "LLVM::LLVMDialect",
     "linalg::LinalgDialect",
     "memref::MemRefDialect",
@@ -131,9 +130,6 @@ def SparsificationPass : Pass<"sparsification", "ModuleOp"> {
                clEnumValN(mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop,
                           "any-storage-any-loop",
                           "Enable sparse parallelization for any storage and loop."))}]>,
-    Option<"enableGPULibgen", "enable-gpu-libgen", "bool",
-           "false",
-           "Enable GPU acceleration by means of direct library calls (like cuSPARSE)">,
     Option<"enableRuntimeLibrary", "enable-runtime-library", "bool",
            "true", "Enable runtime library for manipulating sparse tensors">,
   ];
@@ -368,7 +364,9 @@ def SparseVectorization : Pass<"sparse-vectorization", "ModuleOp"> {
 def SparseGPUCodegen : Pass<"sparse-gpu-codegen", "ModuleOp"> {
   let summary = "Generates GPU code during sparsification";
   let description = [{
-    Enables the sparsifier to use GPU acceleration.
+    Enables the sparsifier to use GPU acceleration. When the number of GPU
+    threads is set to zero, the pass tries to enable GPU acceleration by
+    means of direct library calls (like cuSPARSE).
   }];
   let constructor = "mlir::createSparseGPUCodegenPass()";
   let dependentDialects = [
@@ -381,7 +379,9 @@ def SparseGPUCodegen : Pass<"sparse-gpu-codegen", "ModuleOp"> {
     "sparse_tensor::SparseTensorDialect",
   ];
   let options = [
-    Option<"numThreads", "num_threads", "int32_t", "1024", "Sets the number of GPU threads">,
+    Option<"numThreads", "num-threads", "int32_t", "1024", "Sets the number of GPU threads">,
+    Option<"enableRuntimeLibrary", "enable-runtime-library", "bool",
+           "true", "Enable runtime library for manipulating sparse tensors">,
   ];
 }

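As a usage sketch (my addition, assuming the sparse-tensor passes are registered): the renamed num-threads option and the new enable-runtime-library option can also be reached through a textual pipeline string, which is what the updated RUN lines further below do from the command line.

#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassRegistry.h"
#include "mlir/Support/LogicalResult.h"

// Sketch: parse the textual form of the options defined above;
// num-threads=0 routes the pass to the library-call (libgen) path.
mlir::LogicalResult addViaTextualPipeline(mlir::OpPassManager &pm) {
  return mlir::parsePassPipeline(
      "sparse-gpu-codegen{num-threads=0 enable-runtime-library=true}", pm);
}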
@@ -31,18 +31,25 @@

 void mlir::sparse_tensor::buildSparseCompiler(
     OpPassManager &pm, const SparseCompilerOptions &options) {
+  // Rewrite named linalg ops into generic ops.
   pm.addNestedPass<func::FuncOp>(createLinalgGeneralizationPass());
+
+  // Sparsification and bufferization mini-pipeline.
   pm.addPass(createSparsificationAndBufferizationPass(
       getBufferizationOptionsForSparsification(
           options.testBufferizationAnalysisOnly),
       options.sparsificationOptions(), options.createSparseDeallocs,
       options.enableRuntimeLibrary, options.enableBufferInitialization,
       options.vectorLength,
       /*enableVLAVectorization=*/options.armSVE,
-      /*enableSIMDIndex32=*/options.force32BitVectorIndices));
+      /*enableSIMDIndex32=*/options.force32BitVectorIndices,
+      options.enableGPULibgen));
+
+  // Bail-early for test setup.
   if (options.testBufferizationAnalysisOnly)
     return;
+
+  // Storage specifier lowering and bufferization wrap-up.
   pm.addPass(createStorageSpecifierToLLVMPass());
   pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
   pm.addNestedPass<func::FuncOp>(
@@ -72,8 +79,10 @@ void mlir::sparse_tensor::buildSparseCompiler(
   pm.addNestedPass<func::FuncOp>(createConvertMathToLLVMPass());
   pm.addPass(createConvertMathToLibmPass());
   pm.addPass(createConvertComplexToLibmPass());
+
+  // Repeat convert-vector-to-llvm.
   pm.addPass(createConvertVectorToLLVMPass(options.lowerVectorToLLVMOptions()));

   pm.addPass(createConvertComplexToLLVMPass());
   pm.addPass(createConvertVectorToLLVMPass(options.lowerVectorToLLVMOptions()));
   pm.addPass(createConvertFuncToLLVMPass());
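For context, a minimal driver sketch (an assumption on my part, not code from this patch) showing how the forwarded enableGPULibgen field reaches the mini-pipeline configured above:

#include "mlir/Dialect/SparseTensor/Pipelines/Passes.h"
#include "mlir/Pass/PassManager.h"

// Sketch: enable GPU libgen end-to-end through the sparse compiler pipeline.
void buildWithGpuLibgen(mlir::OpPassManager &pm) {
  mlir::sparse_tensor::SparseCompilerOptions options;
  options.enableGPULibgen = true;      // forwarded into the mini-pipeline
  options.enableRuntimeLibrary = true;
  mlir::sparse_tensor::buildSparseCompiler(pm, options);
}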
mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp (23 changes: 13 additions & 10 deletions)
@@ -82,19 +82,15 @@ struct SparsificationPass
   SparsificationPass(const SparsificationPass &pass) = default;
   SparsificationPass(const SparsificationOptions &options) {
     parallelization = options.parallelizationStrategy;
-    enableGPULibgen = options.enableGPULibgen;
     enableRuntimeLibrary = options.enableRuntimeLibrary;
   }

   void runOnOperation() override {
     auto *ctx = &getContext();
     // Translate strategy flags to strategy options.
-    SparsificationOptions options(parallelization, enableGPULibgen,
-                                  enableRuntimeLibrary);
-    // Apply GPU libgen (if requested), sparsification, and cleanup rewriting.
+    SparsificationOptions options(parallelization, enableRuntimeLibrary);
+    // Apply sparsification and cleanup rewriting.
     RewritePatternSet patterns(ctx);
-    if (enableGPULibgen)
-      populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary);
     populateSparsificationPatterns(patterns, options);
     scf::ForOp::getCanonicalizationPatterns(patterns, ctx);
     (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
@@ -323,12 +319,18 @@ struct SparseGPUCodegenPass
     : public impl::SparseGPUCodegenBase<SparseGPUCodegenPass> {
   SparseGPUCodegenPass() = default;
   SparseGPUCodegenPass(const SparseGPUCodegenPass &pass) = default;
-  SparseGPUCodegenPass(unsigned nT) { numThreads = nT; }
+  SparseGPUCodegenPass(unsigned nT, bool enableRT) {
+    numThreads = nT;
+    enableRuntimeLibrary = enableRT;
+  }

   void runOnOperation() override {
     auto *ctx = &getContext();
     RewritePatternSet patterns(ctx);
-    populateSparseGPUCodegenPatterns(patterns, numThreads);
+    if (numThreads == 0)
+      populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary);
+    else
+      populateSparseGPUCodegenPatterns(patterns, numThreads);
     (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
   }
 };
@@ -457,8 +459,9 @@ std::unique_ptr<Pass> mlir::createSparseGPUCodegenPass() {
   return std::make_unique<SparseGPUCodegenPass>();
 }

-std::unique_ptr<Pass> mlir::createSparseGPUCodegenPass(unsigned numThreads) {
-  return std::make_unique<SparseGPUCodegenPass>(numThreads);
+std::unique_ptr<Pass> mlir::createSparseGPUCodegenPass(unsigned numThreads,
+                                                       bool enableRT) {
+  return std::make_unique<SparseGPUCodegenPass>(numThreads, enableRT);
 }

 std::unique_ptr<Pass> mlir::createStorageSpecifierToLLVMPass() {
@@ -65,15 +65,16 @@ class SparsificationAndBufferizationPass
       const SparsificationOptions &sparsificationOptions,
       bool createSparseDeallocs, bool enableRuntimeLibrary,
       bool enableBufferInitialization, unsigned vectorLength,
-      bool enableVLAVectorization, bool enableSIMDIndex32)
+      bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen)
       : bufferizationOptions(bufferizationOptions),
         sparsificationOptions(sparsificationOptions),
         createSparseDeallocs(createSparseDeallocs),
         enableRuntimeLibrary(enableRuntimeLibrary),
         enableBufferInitialization(enableBufferInitialization),
         vectorLength(vectorLength),
         enableVLAVectorization(enableVLAVectorization),
-        enableSIMDIndex32(enableSIMDIndex32) {}
+        enableSIMDIndex32(enableSIMDIndex32), enableGPULibgen(enableGPULibgen) {
+  }

   /// Bufferize all dense ops. This assumes that no further analysis is needed
   /// and that all required buffer copies were already inserted by
@@ -139,6 +140,8 @@ class SparsificationAndBufferizationPass
     // of `bufferization.alloc_tensor` ops.
     {
       OpPassManager pm("builtin.module");
+      if (enableGPULibgen)
+        pm.addPass(createSparseGPUCodegenPass(0, enableRuntimeLibrary));
       pm.addPass(createSparseReinterpretMapPass(ReinterpretMapScope::kAll));
       pm.addPass(createSparsificationPass(sparsificationOptions));
       pm.addNestedPass<func::FuncOp>(createStageSparseOperationsPass());
@@ -177,6 +180,7 @@ class SparsificationAndBufferizationPass
   unsigned vectorLength;
   bool enableVLAVectorization;
   bool enableSIMDIndex32;
+  bool enableGPULibgen;
 };

 } // namespace sparse_tensor
@@ -210,18 +214,19 @@ std::unique_ptr<mlir::Pass> mlir::createSparsificationAndBufferizationPass() {
       /*enableBufferInitialization=*/false,
       /*vectorLength=*/0,
       /*enableVLAVectorization=*/false,
-      /*enableSIMDIndex32=*/false);
+      /*enableSIMDIndex32=*/false,
+      /*enableGPULibgen=*/false);
 }

 std::unique_ptr<mlir::Pass> mlir::createSparsificationAndBufferizationPass(
     const bufferization::OneShotBufferizationOptions &bufferizationOptions,
     const SparsificationOptions &sparsificationOptions,
     bool createSparseDeallocs, bool enableRuntimeLibrary,
     bool enableBufferInitialization, unsigned vectorLength,
-    bool enableVLAVectorization, bool enableSIMDIndex32) {
+    bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen) {
   return std::make_unique<
       mlir::sparse_tensor::SparsificationAndBufferizationPass>(
       bufferizationOptions, sparsificationOptions, createSparseDeallocs,
       enableRuntimeLibrary, enableBufferInitialization, vectorLength,
-      enableVLAVectorization, enableSIMDIndex32);
+      enableVLAVectorization, enableSIMDIndex32, enableGPULibgen);
 }
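A direct-construction sketch for the extended factory (hypothetical caller; bufOptions and sparseOptions are assumed to be supplied elsewhere, and the remaining flag values are illustrative):

#include <memory>
#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
#include "mlir/Dialect/SparseTensor/Transforms/Passes.h"

// Sketch: the new trailing parameter turns GPU libgen on inside the
// sparsification-and-bufferization mini-pipeline.
std::unique_ptr<mlir::Pass> makeSparsifyAndBufferize(
    const mlir::bufferization::OneShotBufferizationOptions &bufOptions,
    const mlir::SparsificationOptions &sparseOptions) {
  return mlir::createSparsificationAndBufferizationPass(
      bufOptions, sparseOptions,
      /*createSparseDeallocs=*/false, /*enableRuntimeLibrary=*/true,
      /*enableBufferInitialization=*/false, /*vectorLength=*/0,
      /*enableVLAVectorization=*/false, /*enableSIMDIndex32=*/false,
      /*enableGPULibgen=*/true);
}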
mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir (3 changes: 1 addition & 2 deletions)
@@ -1,5 +1,4 @@
-// RUN: mlir-opt %s --linalg-generalize-named-ops \
-// RUN:   --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s

 #CSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed) }>

@@ -1,5 +1,4 @@
-// RUN: mlir-opt %s --linalg-generalize-named-ops \
-// RUN:   --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s

 // CHECK-LABEL: func.func @matmul(
 // CHECK-SAME:    %[[VAL_0:.*0]]: tensor<?x?xf16>,
mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir (3 changes: 1 addition & 2 deletions)
@@ -1,5 +1,4 @@
-// RUN: mlir-opt %s --linalg-generalize-named-ops \
-// RUN:   --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s

 #SortedCOO = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton)
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --sparse-gpu-codegen="num-threads=0" | FileCheck %s

 #trait_sampled_dense_dense = {
   indexing_maps = [
mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir (2 changes: 1 addition & 1 deletion)
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --sparse-gpu-codegen="num-threads=0" | FileCheck %s

 #BSR = #sparse_tensor.encoding<{
   map = (i, j) -> (
mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir (3 changes: 1 addition & 2 deletions)
@@ -1,5 +1,4 @@
-// RUN: mlir-opt %s --linalg-generalize-named-ops \
-// RUN:   --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s

 #CSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed) }>

@@ -85,32 +85,30 @@ module {
   // A kernel that computes a BSR sampled dense matrix matrix multiplication
   // using a "spy" function and in-place update of the sampling sparse matrix.
   //
-  // TODO: re-enable the following test.
-  //
-  // func.func @SDDMM_block(%args: tensor<?x?xf32, #BSR>,
-  //                        %arga: tensor<?x?xf32>,
-  //                        %argb: tensor<?x?xf32>) -> tensor<?x?xf32, #BSR> {
-  //   %result = linalg.generic #trait_SDDMM
-  //     ins(%arga, %argb: tensor<?x?xf32>, tensor<?x?xf32>)
-  //     outs(%args: tensor<?x?xf32, #BSR>) {
-  //       ^bb(%a: f32, %b: f32, %s: f32):
-  //         %f0 = arith.constant 0.0 : f32
-  //         %u = sparse_tensor.unary %s : f32 to f32
-  //           present={
-  //             ^bb0(%p: f32):
-  //               %mul = arith.mulf %a, %b : f32
-  //               sparse_tensor.yield %mul : f32
-  //           }
-  //           absent={}
-  //         %r = sparse_tensor.reduce %s, %u, %f0 : f32 {
-  //           ^bb0(%p: f32, %q: f32):
-  //             %add = arith.addf %p, %q : f32
-  //             sparse_tensor.yield %add : f32
-  //         }
-  //         linalg.yield %r : f32
-  //   } -> tensor<?x?xf32, #BSR>
-  //   return %result : tensor<?x?xf32, #BSR>
-  // }
+  func.func @SDDMM_block(%args: tensor<?x?xf32, #BSR>,
+                         %arga: tensor<?x?xf32>,
+                         %argb: tensor<?x?xf32>) -> tensor<?x?xf32, #BSR> {
+    %result = linalg.generic #trait_SDDMM
+      ins(%arga, %argb: tensor<?x?xf32>, tensor<?x?xf32>)
+      outs(%args: tensor<?x?xf32, #BSR>) {
+        ^bb(%a: f32, %b: f32, %s: f32):
+          %f0 = arith.constant 0.0 : f32
+          %u = sparse_tensor.unary %s : f32 to f32
+            present={
+              ^bb0(%p: f32):
+                %mul = arith.mulf %a, %b : f32
+                sparse_tensor.yield %mul : f32
+            }
+            absent={}
+          %r = sparse_tensor.reduce %s, %u, %f0 : f32 {
+            ^bb0(%p: f32, %q: f32):
+              %add = arith.addf %p, %q : f32
+              sparse_tensor.yield %add : f32
+          }
+          linalg.yield %r : f32
+    } -> tensor<?x?xf32, #BSR>
+    return %result : tensor<?x?xf32, #BSR>
+  }

   func.func private @getTensorFilename(index) -> (!Filename)

@@ -153,15 +151,15 @@ module {
     //
     %fileName = call @getTensorFilename(%c0) : (index) -> (!Filename)
     %m_csr = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #CSR>
-    // %m_bsr = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #BSR>
+    %m_bsr = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #BSR>

     // Call the kernel.
     %0 = call @SDDMM(%m_csr, %a, %b)
       : (tensor<?x?xf32, #CSR>,
          tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #CSR>
-    // %1 = call @SDDMM_block(%m_bsr, %a, %b)
-    //   : (tensor<?x?xf32, #BSR>,
-    //      tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #BSR>
+    %1 = call @SDDMM_block(%m_bsr, %a, %b)
+      : (tensor<?x?xf32, #BSR>,
+         tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #BSR>

     //
     // Print the result for verification. Note that the "spy" determines what
@@ -170,18 +168,18 @@ module {
     // in the original zero positions).
     //
    // CHECK: ( 5, 10, 24, 19, 53, 42, 55, 56 )
-    // C_HECK-NEXT: ( 5, 10, 8, 19, 24, 24, 40, 53, 42, 55, 56, 64 )
+    // CHECK-NEXT: ( 5, 10, 8, 19, 24, 24, 40, 53, 42, 55, 56, 64 )
     //
     %v0 = sparse_tensor.values %0 : tensor<?x?xf32, #CSR> to memref<?xf32>
     %vv0 = vector.transfer_read %v0[%c0], %d0 : memref<?xf32>, vector<8xf32>
     vector.print %vv0 : vector<8xf32>
-    // %v1 = sparse_tensor.values %1 : tensor<?x?xf32, #BSR> to memref<?xf32>
-    // %vv1 = vector.transfer_read %v1[%c0], %d0 : memref<?xf32>, vector<12xf32>
-    // vector.print %vv1 : vector<12xf32>
+    %v1 = sparse_tensor.values %1 : tensor<?x?xf32, #BSR> to memref<?xf32>
+    %vv1 = vector.transfer_read %v1[%c0], %d0 : memref<?xf32>, vector<12xf32>
+    vector.print %vv1 : vector<12xf32>

     // Release the resources.
     bufferization.dealloc_tensor %0 : tensor<?x?xf32, #CSR>
-    // bufferization.dealloc_tensor %1 : tensor<?x?xf32, #BSR>
+    bufferization.dealloc_tensor %1 : tensor<?x?xf32, #BSR>

     llvm.call @mgpuDestroySparseEnv() : () -> ()
     return