
[mlir] Op needs producer extract_slice op to bufferize in place #112435

Closed
@Max191

Description

The following two func ops have different results when running OneShotBufferize:

// RUN:  mlir-opt test.mlir --pass-pipeline="builtin.module(one-shot-bufferize{test-analysis-only=true print-conflicts=true}, canonicalize, cse, canonicalize)"
#map = affine_map<(d0) -> (d0 mod 256)>
module {
  func.func @slice_bufferize_inplace(%2: tensor<2xf32>) -> tensor<2xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %3 = scf.forall (%arg0) in (1) shared_outs(%arg2 = %2) -> (tensor<2xf32>) {
      %extracted_slice = tensor.extract_slice %arg2[%arg0] [2] [1] : tensor<2xf32> to tensor<2xf32>
      %fill = linalg.fill ins(%cst : f32) outs(%extracted_slice : tensor<2xf32>) -> tensor<2xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %fill into %arg2[%arg0] [2] [1] : tensor<2xf32> into tensor<2xf32>
      }
    } {mapping = [#gpu.thread<linear_dim_0>]}
    return %3 : tensor<2xf32>
  }

  func.func @no_slice_bufferize_outplace(%2: tensor<2xf32>) -> tensor<2xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %3 = scf.forall (%arg0) in (1) shared_outs(%arg2 = %2) -> (tensor<2xf32>) {
      %fill = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<2xf32>) -> tensor<2xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %fill into %arg2[%arg0] [2] [1] : tensor<2xf32> into tensor<2xf32>
      }
    } {mapping = [#gpu.thread<linear_dim_0>]}
    return %3 : tensor<2xf32>
  }
}

The result of bufferization analysis for these func ops is the following:

run mlir-opt test.mlir --pass-pipeline="builtin.module(one-shot-bufferize{test-analysis-only=true print-conflicts=true}, canonicalize, cse, canonicalize)"

module {
  func.func @slice_bufferize_inplace(%arg0: tensor<2xf32>) -> tensor<2xf32> attributes {"W_1[NOT-WRITABLE: bbArg 0]"} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = scf.forall (%arg1) in (1) shared_outs(%arg2 = %arg0) -> (tensor<2xf32>) {
      %extracted_slice = tensor.extract_slice %arg2[%arg1] [2] [1] {__inplace_operands_attr__ = ["true", "none"]} : tensor<2xf32> to tensor<2xf32>
      %1 = linalg.fill {__inplace_operands_attr__ = ["none", "true"]} ins(%cst : f32) outs(%extracted_slice : tensor<2xf32>) -> tensor<2xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %1 into %arg2[%arg1] [2] [1] {__inplace_operands_attr__ = ["true", "true", "none"]} : tensor<2xf32> into tensor<2xf32>
      }
    } {__inplace_operands_attr__ = ["false"], mapping = [#gpu.thread<linear_dim_0>]}
    return {__inplace_operands_attr__ = ["true"]} %0 : tensor<2xf32>
  }
  func.func @no_slice_bufferize_outplace(%arg0: tensor<2xf32>) -> tensor<2xf32> attributes {"W_0[NOT-WRITABLE: bbArg 0]"} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = scf.forall (%arg1) in (1) shared_outs(%arg2 = %arg0) -> (tensor<2xf32>) {
      %1 = linalg.fill {"C_0[CONFL-WRITE: 1]", __inplace_operands_attr__ = ["none", "false"]} ins(%cst : f32) outs(%arg2 : tensor<2xf32>) -> tensor<2xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %1 into %arg2[%arg1] [2] [1] {"C_0[READ: 1]", __inplace_operands_attr__ = ["true", "true", "none"]} : tensor<2xf32> into tensor<2xf32>
      }
    } {"C_0[DEF: bbArg 1]", __inplace_operands_attr__ = ["false"], mapping = [#gpu.thread<linear_dim_0>]}
    return {__inplace_operands_attr__ = ["true"]} %0 : tensor<2xf32>
  }
}

The key difference is that the linalg.fill op bufferizes in place when its init operand is produced by a tensor.extract_slice of the shared_outs block argument, but out of place when the block argument is used directly as the init operand. The extract_slice here is a full slice (offset 0, size 2, stride 1 on a tensor<2xf32>), so the two IRs are equivalent and should bufferize the same way. This looks like a bug somewhere in the bufferization analysis. A hand-written sketch of the equivalence follows.
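
For illustration, this is what @slice_bufferize_inplace looks like after folding the full-tensor extract_slice into its source (the func name below is made up for this sketch; the IR is hand-derived, not compiler output). The resulting body is identical to @no_slice_bufferize_outplace, which is why one would expect the analysis to reach the same conclusion for both:

// Hand-derived sketch: @slice_bufferize_inplace with the full-tensor
// tensor.extract_slice folded away, so the fill uses the shared_outs
// bbArg directly. This matches the body of @no_slice_bufferize_outplace.
func.func @slice_bufferize_inplace_folded(%2: tensor<2xf32>) -> tensor<2xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %3 = scf.forall (%arg0) in (1) shared_outs(%arg2 = %2) -> (tensor<2xf32>) {
    %fill = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<2xf32>) -> tensor<2xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %fill into %arg2[%arg0] [2] [1] : tensor<2xf32> into tensor<2xf32>
    }
  } {mapping = [#gpu.thread<linear_dim_0>]}
  return %3 : tensor<2xf32>
}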
