The following two func ops bufferize differently when run through OneShotBufferize:
```mlir
// RUN: mlir-opt test.mlir --pass-pipeline="builtin.module(one-shot-bufferize{test-analysis-only=true print-conflicts=true}, canonicalize, cse, canonicalize)"
#map = affine_map<(d0) -> (d0 mod 256)>
module {
  func.func @slice_bufferize_inplace(%2: tensor<2xf32>) -> tensor<2xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %3 = scf.forall (%arg0) in (1) shared_outs(%arg2 = %2) -> (tensor<2xf32>) {
      %extracted_slice = tensor.extract_slice %arg2[%arg0] [2] [1] : tensor<2xf32> to tensor<2xf32>
      %fill = linalg.fill ins(%cst : f32) outs(%extracted_slice : tensor<2xf32>) -> tensor<2xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %fill into %arg2[%arg0] [2] [1] : tensor<2xf32> into tensor<2xf32>
      }
    } {mapping = [#gpu.thread<linear_dim_0>]}
    return %3 : tensor<2xf32>
  }
  func.func @no_slice_bufferize_outplace(%2: tensor<2xf32>) -> tensor<2xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %3 = scf.forall (%arg0) in (1) shared_outs(%arg2 = %2) -> (tensor<2xf32>) {
      %fill = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<2xf32>) -> tensor<2xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %fill into %arg2[%arg0] [2] [1] : tensor<2xf32> into tensor<2xf32>
      }
    } {mapping = [#gpu.thread<linear_dim_0>]}
    return %3 : tensor<2xf32>
  }
}
```
The result of the bufferization analysis for these func ops (using the same `mlir-opt` invocation as in the RUN line above) is the following:
```mlir
module {
  func.func @slice_bufferize_inplace(%arg0: tensor<2xf32>) -> tensor<2xf32> attributes {"W_1[NOT-WRITABLE: bbArg 0]"} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = scf.forall (%arg1) in (1) shared_outs(%arg2 = %arg0) -> (tensor<2xf32>) {
      %extracted_slice = tensor.extract_slice %arg2[%arg1] [2] [1] {__inplace_operands_attr__ = ["true", "none"]} : tensor<2xf32> to tensor<2xf32>
      %1 = linalg.fill {__inplace_operands_attr__ = ["none", "true"]} ins(%cst : f32) outs(%extracted_slice : tensor<2xf32>) -> tensor<2xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %1 into %arg2[%arg1] [2] [1] {__inplace_operands_attr__ = ["true", "true", "none"]} : tensor<2xf32> into tensor<2xf32>
      }
    } {__inplace_operands_attr__ = ["false"], mapping = [#gpu.thread<linear_dim_0>]}
    return {__inplace_operands_attr__ = ["true"]} %0 : tensor<2xf32>
  }
  func.func @no_slice_bufferize_outplace(%arg0: tensor<2xf32>) -> tensor<2xf32> attributes {"W_0[NOT-WRITABLE: bbArg 0]"} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = scf.forall (%arg1) in (1) shared_outs(%arg2 = %arg0) -> (tensor<2xf32>) {
      %1 = linalg.fill {"C_0[CONFL-WRITE: 1]", __inplace_operands_attr__ = ["none", "false"]} ins(%cst : f32) outs(%arg2 : tensor<2xf32>) -> tensor<2xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %1 into %arg2[%arg1] [2] [1] {"C_0[READ: 1]", __inplace_operands_attr__ = ["true", "true", "none"]} : tensor<2xf32> into tensor<2xf32>
      }
    } {"C_0[DEF: bbArg 1]", __inplace_operands_attr__ = ["false"], mapping = [#gpu.thread<linear_dim_0>]}
    return {__inplace_operands_attr__ = ["true"]} %0 : tensor<2xf32>
  }
}
```
The key difference is that the linalg.fill bufferizes in place when its init operand is produced by a tensor.extract_slice, but out of place when the shared_outs block argument is passed directly as the init. (In the print-conflicts output, the C_0 annotations mark the pieces of the detected read-after-write conflict: the definition of the value on the scf.forall bbArg, the conflicting write by the fill, and the read by the parallel_insert_slice.) Since the extract_slice is a full slice, the two IRs are equivalent and should bufferize the same way. This looks like a bug somewhere in the bufferization analysis.
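
For completeness, a workaround (not a fix) is to rewrite the second form into the first before bufferizing, i.e. route the shared_outs argument through a full-tensor extract_slice so the fill sees an extract_slice producer. The sketch below is hypothetical (the function name `@no_slice_workaround` is made up for illustration); the expectation that it bufferizes in place simply follows from the analysis result of `@slice_bufferize_inplace` above:

```mlir
// Hypothetical workaround sketch: give the init of linalg.fill a full-tensor
// extract_slice producer so the analysis bufferizes the fill in place.
func.func @no_slice_workaround(%t: tensor<2xf32>) -> tensor<2xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %0 = scf.forall (%i) in (1) shared_outs(%out = %t) -> (tensor<2xf32>) {
    // Full slice (size 2, stride 1 on a tensor<2xf32>): semantically the
    // identity, matching the shape of @slice_bufferize_inplace above.
    %slice = tensor.extract_slice %out[%i] [2] [1] : tensor<2xf32> to tensor<2xf32>
    %fill = linalg.fill ins(%cst : f32) outs(%slice : tensor<2xf32>) -> tensor<2xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %fill into %out[%i] [2] [1] : tensor<2xf32> into tensor<2xf32>
    }
  } {mapping = [#gpu.thread<linear_dim_0>]}
  return %0 : tensor<2xf32>
}
```

Since the slice covers the whole tensor, the rewrite does not change the program's semantics, which is exactly why the analysis discrepancy between the two original functions is surprising.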