
Conversation

@rolfmorel (Contributor) commented May 12, 2025

Fixes (see the sketch after this list):

  1. config.strictMode = GreedyRewriteStrictness::ExistingOps; -> config.setStrictness(GreedyRewriteStrictness::ExistingOps);
  2. vectorOp.getSource() -> vectorOp.getBase()
  3. ToMemrefOp -> ToBufferOp
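
A minimal sketch of the three migrations, for reference. The call sites below are hypothetical (applyMigrations, and vectorOp being a vector.transfer_read, are assumptions for illustration); the actual tpp-mlir code touches different functions.

#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

using namespace mlir;

// Hypothetical call sites illustrating the renames; not actual tpp-mlir code.
void applyMigrations(vector::TransferReadOp vectorOp, Operation *op) {
  GreedyRewriteConfig config;
  // 1. GreedyRewriteConfig fields are now set through fluent setters.
  //    Was: config.strictMode = GreedyRewriteStrictness::ExistingOps;
  config.setStrictness(GreedyRewriteStrictness::ExistingOps);

  // 2. The accessor for the source memref/tensor operand of vector
  //    transfer ops was renamed.
  //    Was: Value base = vectorOp.getSource();
  Value base = vectorOp.getBase();
  (void)base;

  // 3. bufferization::ToMemrefOp is now bufferization::ToBufferOp.
  //    Was: if (isa<bufferization::ToMemrefOp>(op)) ...
  if (isa<bufferization::ToBufferOp>(op)) {
    // handle the tensor-to-buffer cast as before
  }
}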

@rolfmorel (Contributor Author) commented May 12, 2025

It seems bumping LLVM breaks --propagate-pack-and-unpack. Here's test/Passes/pass-matmul-fuse.mlir:

// RUN: tpp-opt %s -pack-matmul="block-factors=32,32,32" -propagate-pack-and-unpack -canonicalize -tile-consumer-and-fuse-producers | FileCheck %s

#map = affine_map<(d0, d1) -> (d0, d1)>

func.func @matmul_and_relu(%arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>) -> tensor<128x128xf32> {
  %0 = linalg.matmul  ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
                     outs(%arg2: tensor<128x128xf32>)
    -> tensor<128x128xf32>
  %c0 = arith.constant 0.0 : f32
  %1 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel"]} outs(%0: tensor<128x128xf32>) {
    ^bb0(%out: f32):
      %2 = arith.maximumf %out, %c0 : f32
      linalg.yield %2 : f32
    } -> tensor<128x128xf32>
  return %1 : tensor<128x128xf32>
}

Running bin/tpp-opt REPO_PATH/test/Passes/pass-matmul-fuse.mlir -pack-matmul="block-factors=32,32,32" -propagate-pack-and-unpack -canonicalize yields

#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module {
  func.func @matmul_and_relu(%arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>) -> tensor<128x128xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty() : tensor<4x4x32x32xf32>
    %1 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%0 : tensor<4x4x32x32xf32>) {
    ^bb0(%out: f32):
      %2 = arith.maximumf %out, %cst : f32
      linalg.yield %2 : f32
    } -> tensor<4x4x32x32xf32>
    %unpack = linalg.unpack %1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %arg2 : tensor<4x4x32x32xf32> -> tensor<128x128xf32>
    return %unpack : tensor<128x128xf32>
  }
}

So the dependency chain with the matmul was broken.

bin/tpp-opt REPO_PATH/test/Passes/pass-matmul-fuse.mlir -pack-matmul="block-factors=32,32,32" -canonicalize still has the matmul:

#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
module {
  func.func @matmul_and_relu(%arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>) -> tensor<128x128xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty() : tensor<4x4x32x32xf32>
    %pack = linalg.pack %arg0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %0 : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
    %1 = tensor.empty() : tensor<4x4x32x32xf32>
    %pack_0 = linalg.pack %arg1 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %1 : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
    %2 = tensor.empty() : tensor<4x4x32x32xf32>
    %pack_1 = linalg.pack %arg2 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %2 : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
    %3 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_0 : tensor<4x4x32x32xf32>, tensor<4x4x32x32xf32>) outs(%pack_1 : tensor<4x4x32x32xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %5 = arith.mulf %in, %in_2 : f32
      %6 = arith.addf %out, %5 : f32
      linalg.yield %6 : f32
    } -> tensor<4x4x32x32xf32>
    %unpack = linalg.unpack %3 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %arg2 : tensor<4x4x32x32xf32> -> tensor<128x128xf32>
    %4 = linalg.generic {indexing_maps = [#map3], iterator_types = ["parallel", "parallel"]} outs(%unpack : tensor<128x128xf32>) {
    ^bb0(%out: f32):
      %5 = arith.maximumf %out, %cst : f32
      linalg.yield %5 : f32
    } -> tensor<128x128xf32>
    return %4 : tensor<128x128xf32>
  }
}

@adam-smnk, @rengolin, any clue what it might be?

@rengolin (Contributor)

What's the output after -pack-matmul="block-factors=32,32,32" -propagate-pack-and-unpack, before -canonicalize?

@rengolin (Contributor)

My guess is that pack is being treated as non-modifying (from args, to empty), so the pass may think the matmul acts on empty tensors and is therefore itself equivalent to an empty.

@rolfmorel (Contributor Author)

> What's the output after -pack-matmul="block-factors=32,32,32" -propagate-pack-and-unpack, before -canonicalize?

$ build/bin/tpp-opt test/Passes/pass-matmul-fuse.mlir -pack-matmul="block-factors=32,32,32" -propagate-pack-and-unpack:

#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module {
  func.func @matmul_and_relu(%arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>) -> tensor<128x128xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty() : tensor<4x4x32x32xf32>
    %pack = linalg.pack %arg0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %0 : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
    %1 = tensor.empty() : tensor<4x4x32x32xf32>
    %pack_0 = linalg.pack %arg1 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %1 : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
    %2 = tensor.empty() : tensor<4x4x32x32xf32>
    %pack_1 = linalg.pack %arg2 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %2 : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
    %3 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_0 : tensor<4x4x32x32xf32>, tensor<4x4x32x32xf32>) outs(%pack_1 : tensor<4x4x32x32xf32>) {
    ^bb0(%in: f32, %in_3: f32, %out: f32):
      %6 = arith.mulf %in, %in_3 : f32
      %7 = arith.addf %out, %6 : f32
      linalg.yield %7 : f32
    } -> tensor<4x4x32x32xf32>
    %unpack = linalg.unpack %3 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %arg2 : tensor<4x4x32x32xf32> -> tensor<128x128xf32>
    %4 = tensor.empty() : tensor<4x4x32x32xf32>
    %5 = linalg.generic {indexing_maps = [#map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%4 : tensor<4x4x32x32xf32>) {
    ^bb0(%out: f32):
      %6 = arith.maximumf %out, %cst : f32
      linalg.yield %6 : f32
    } -> tensor<4x4x32x32xf32>
    %unpack_2 = linalg.unpack %5 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %unpack : tensor<4x4x32x32xf32> -> tensor<128x128xf32>
    return %unpack_2 : tensor<128x128xf32>
  }
}

> My guess is that pack is being treated as non-modifying (from args, to empty), so the pass may think the matmul acts on empty tensors and is therefore itself equivalent to an empty.

It is indeed the case that the use of the (unpacked) matmul result is replaced with a new tensor.empty.

@adam-smnk (Contributor)

The issue occurs after the PushDownUnPackOpThroughGenericOp pattern is applied.
Validated that this upstream PR broke the propagation: llvm/llvm-project#138332
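
For context, a simplified sketch of what that pattern is supposed to do (a schematic only; it ignores outer-dim permutations and the indexing-map adjustments the real pattern performs). The unpack should sink below the generic while the packed tensor stays as the generic's init:

// Before: the unpack result feeds the generic's outs.
%u = linalg.unpack %packed ... into %dest
%r = linalg.generic ... outs(%u : ...) { ... }

// After: the generic computes in the packed layout directly on %packed,
// followed by a single unpack. The regression instead substitutes a fresh
// tensor.empty for %packed, severing the data dependency.
%rp = linalg.generic ... outs(%packed : ...) { ... }
%r = linalg.unpack %rp ... into %dest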

@rengolin (Contributor)

> My guess is that pack is being treated as non-modifying (from args, to empty), so the pass may think the matmul acts on empty tensors and is therefore itself equivalent to an empty.

> It is indeed the case that the use of the (unpacked) matmul result is replaced with a new tensor.empty.

Oh, but it's even worse. The propagation isn't getting rid of the unpack in the middle, and for some reason it creates an empty for the second operation. It should have gotten rid of the unpack and replaced its uses with %3.

I think this is broken on our side.

@adam-smnk (Contributor)

Smaller reproducer - input:

#map = affine_map<(d0, d1) -> (d0, d1)>
func.func @broken_propagation(
    %arg0: tensor<4x8x32x32xf32>, %arg1: tensor<128x256xf32>)
    -> tensor<128x256xf32> {
  %e = tensor.empty() : tensor<128x256xf32>
  %unpack = linalg.unpack %arg0
    inner_dims_pos = [0, 1] inner_tiles = [32, 32]
    into %e : tensor<4x8x32x32xf32> -> tensor<128x256xf32>
  %0 = linalg.generic {indexing_maps = [#map, #map],
    iterator_types = ["parallel", "parallel"]}
    ins(%arg1 : tensor<128x256xf32>)
    outs(%unpack : tensor<128x256xf32>) {
  ^bb0(%in: f32, %out: f32):
    %mul = arith.mulf %in, %out : f32
    linalg.yield %mul : f32
  } -> tensor<128x256xf32>
  return %0 : tensor<128x256xf32>
}

Output after tpp-opt -propagate-pack-and-unpack -canonicalize -cse:

#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module {
  func.func @broken_propagation(%arg0: tensor<4x8x32x32xf32>, %arg1: tensor<128x256xf32>) -> tensor<128x256xf32> {
    %0 = tensor.empty() : tensor<128x256xf32>
    %1 = tensor.empty() : tensor<4x8x32x32xf32>
    %pack = linalg.pack %arg1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %1 : tensor<128x256xf32> -> tensor<4x8x32x32xf32>
    %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%pack : tensor<4x8x32x32xf32>) outs(%1 : tensor<4x8x32x32xf32>) {
    ^bb0(%in: f32, %out: f32):
      %3 = arith.mulf %in, %out : f32
      linalg.yield %3 : f32
    } -> tensor<4x8x32x32xf32>
    %unpack = linalg.unpack %2 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %0 : tensor<4x8x32x32xf32> -> tensor<128x256xf32>
    return %unpack : tensor<128x256xf32>
  }
}

The outs operand, which should carry %arg0's data, is replaced by the uninitialized empty tensor %1.
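
For comparison, a sketch of the expected output, assuming the pattern had kept the init intact; it differs from the broken output above only in the generic's outs, which should be the already-packed %arg0 rather than the empty %1:

#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module {
  func.func @broken_propagation(%arg0: tensor<4x8x32x32xf32>, %arg1: tensor<128x256xf32>) -> tensor<128x256xf32> {
    %0 = tensor.empty() : tensor<128x256xf32>
    %1 = tensor.empty() : tensor<4x8x32x32xf32>
    %pack = linalg.pack %arg1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %1 : tensor<128x256xf32> -> tensor<4x8x32x32xf32>
    %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%pack : tensor<4x8x32x32xf32>) outs(%arg0 : tensor<4x8x32x32xf32>) {
    ^bb0(%in: f32, %out: f32):
      %3 = arith.mulf %in, %out : f32
      linalg.yield %3 : f32
    } -> tensor<4x8x32x32xf32>
    %unpack = linalg.unpack %2 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %0 : tensor<4x8x32x32xf32> -> tensor<128x256xf32>
    return %unpack : tensor<128x256xf32>
  }
}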

@adam-smnk (Contributor)

@rolfmorel Should be fine after llvm/llvm-project#140103; could you bump again?

@rolfmorel force-pushed the bump-llvm-elementwise-py branch from 271e025 to 5ea5717 on May 16, 2025 09:49
@rolfmorel (Contributor Author)

Awesome, great work on tracking down the issue and getting it fixed upstream, @adam-smnk!

This bump should now be good to go. Could someone please approve?

@rolfmorel requested review from adam-smnk and rengolin on May 16, 2025 09:58
@rolfmorel merged commit 6ed8495 into libxsmm:main on May 16, 2025 (13 checks passed)