-
Notifications
You must be signed in to change notification settings - Fork 34
Bump llvm - get updated Python bindings for linalg.elementwise #1045
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Bumping LLVM breaks the following test:

// RUN: tpp-opt %s -pack-matmul="block-factors=32,32,32" -propagate-pack-and-unpack -canonicalize -tile-consumer-and-fuse-producers | FileCheck %s
#map = affine_map<(d0, d1) -> (d0, d1)>
func.func @matmul_and_relu(%arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>) -> tensor<128x128xf32> {
%0 = linalg.matmul ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
outs(%arg2: tensor<128x128xf32>)
-> tensor<128x128xf32>
%c0 = arith.constant 0.0 : f32
%1 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel"]} outs(%0: tensor<128x128xf32>) {
^bb0(%out: f32):
%2 = arith.maximumf %out, %c0 : f32
linalg.yield %2 : f32
} -> tensor<128x128xf32>
return %1 : tensor<128x128xf32>
}

Running it produces:

#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module {
func.func @matmul_and_relu(%arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>) -> tensor<128x128xf32> {
%cst = arith.constant 0.000000e+00 : f32
%0 = tensor.empty() : tensor<4x4x32x32xf32>
%1 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%0 : tensor<4x4x32x32xf32>) {
^bb0(%out: f32):
%2 = arith.maximumf %out, %cst : f32
linalg.yield %2 : f32
} -> tensor<4x4x32x32xf32>
%unpack = linalg.unpack %1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %arg2 : tensor<4x4x32x32xf32> -> tensor<128x128xf32>
return %unpack : tensor<128x128xf32>
}
}

So the dependency chain with the matmul was broken.
#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @matmul_and_relu(%arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>) -> tensor<128x128xf32> {
%cst = arith.constant 0.000000e+00 : f32
%0 = tensor.empty() : tensor<4x4x32x32xf32>
%pack = linalg.pack %arg0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %0 : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
%1 = tensor.empty() : tensor<4x4x32x32xf32>
%pack_0 = linalg.pack %arg1 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %1 : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
%2 = tensor.empty() : tensor<4x4x32x32xf32>
%pack_1 = linalg.pack %arg2 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %2 : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
%3 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_0 : tensor<4x4x32x32xf32>, tensor<4x4x32x32xf32>) outs(%pack_1 : tensor<4x4x32x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%5 = arith.mulf %in, %in_2 : f32
%6 = arith.addf %out, %5 : f32
linalg.yield %6 : f32
} -> tensor<4x4x32x32xf32>
%unpack = linalg.unpack %3 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %arg2 : tensor<4x4x32x32xf32> -> tensor<128x128xf32>
%4 = linalg.generic {indexing_maps = [#map3], iterator_types = ["parallel", "parallel"]} outs(%unpack : tensor<128x128xf32>) {
^bb0(%out: f32):
%5 = arith.maximumf %out, %cst : f32
linalg.yield %5 : f32
} -> tensor<128x128xf32>
return %4 : tensor<128x128xf32>
}
}

@adam-smnk, @rengolin, any clue what it might be?
What's the output after |
My guess is that |
#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module {
func.func @matmul_and_relu(%arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>) -> tensor<128x128xf32> {
%cst = arith.constant 0.000000e+00 : f32
%0 = tensor.empty() : tensor<4x4x32x32xf32>
%pack = linalg.pack %arg0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %0 : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
%1 = tensor.empty() : tensor<4x4x32x32xf32>
%pack_0 = linalg.pack %arg1 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %1 : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
%2 = tensor.empty() : tensor<4x4x32x32xf32>
%pack_1 = linalg.pack %arg2 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %2 : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
%3 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_0 : tensor<4x4x32x32xf32>, tensor<4x4x32x32xf32>) outs(%pack_1 : tensor<4x4x32x32xf32>) {
^bb0(%in: f32, %in_3: f32, %out: f32):
%6 = arith.mulf %in, %in_3 : f32
%7 = arith.addf %out, %6 : f32
linalg.yield %7 : f32
} -> tensor<4x4x32x32xf32>
%unpack = linalg.unpack %3 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %arg2 : tensor<4x4x32x32xf32> -> tensor<128x128xf32>
%4 = tensor.empty() : tensor<4x4x32x32xf32>
%5 = linalg.generic {indexing_maps = [#map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%4 : tensor<4x4x32x32xf32>) {
^bb0(%out: f32):
%6 = arith.maximumf %out, %cst : f32
linalg.yield %6 : f32
} -> tensor<4x4x32x32xf32>
%unpack_2 = linalg.unpack %5 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %unpack : tensor<4x4x32x32xf32> -> tensor<128x128xf32>
return %unpack_2 : tensor<128x128xf32>
}
}
It is indeed the case that usage of the (unpacked) matmul result is substituted for a new |
The issue occurs after |
Oh, but it's even worse. The propagation isn't getting rid of the redundant unpack/pack pair. I think this is broken on our side.
Smaller reproducer — input:

#map = affine_map<(d0, d1) -> (d0, d1)>
func.func @broken_propagation(
%arg0: tensor<4x8x32x32xf32>, %arg1: tensor<128x256xf32>)
-> tensor<128x256xf32> {
%e = tensor.empty() : tensor<128x256xf32>
%unpack = linalg.unpack %arg0
inner_dims_pos = [0, 1] inner_tiles = [32, 32]
into %e : tensor<4x8x32x32xf32> -> tensor<128x256xf32>
%0 = linalg.generic {indexing_maps = [#map, #map],
iterator_types = ["parallel", "parallel"]}
ins(%arg1 : tensor<128x256xf32>)
outs(%unpack : tensor<128x256xf32>) {
^bb0(%in: f32, %out: f32):
%mul = arith.mulf %in, %out : f32
linalg.yield %mul : f32
} -> tensor<128x256xf32>
return %0 : tensor<128x256xf32>
}

Output after propagation:

#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module {
func.func @broken_propagation(%arg0: tensor<4x8x32x32xf32>, %arg1: tensor<128x256xf32>) -> tensor<128x256xf32> {
%0 = tensor.empty() : tensor<128x256xf32>
%1 = tensor.empty() : tensor<4x8x32x32xf32>
%pack = linalg.pack %arg1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %1 : tensor<128x256xf32> -> tensor<4x8x32x32xf32>
%2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%pack : tensor<4x8x32x32xf32>) outs(%1 : tensor<4x8x32x32xf32>) {
^bb0(%in: f32, %out: f32):
%3 = arith.mulf %in, %out : f32
linalg.yield %3 : f32
} -> tensor<4x8x32x32xf32>
%unpack = linalg.unpack %2 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %0 : tensor<4x8x32x32xf32> -> tensor<128x256xf32>
return %unpack : tensor<128x256xf32>
}
}

The outs — which should contain the unpacked data from %arg0 — is instead taken from a fresh tensor.empty, so the original values are dropped.
@rolfmorel Should be fine after: llvm/llvm-project#140103, could you bump again? |
Get fix for broken improvement on packing propagation
271e025
to
5ea5717
Compare
Awesome - great work on tracking down the issue and getting it fixed upstream, @adam-smnk ! This bump should now be good to go. Could someone please approve? |
Fixes:
- config.strictMode = GreedyRewriteStrictness::ExistingOps; changed to config.setStrictness(GreedyRewriteStrictness::ExistingOps);
- vectorOp.getSource(); changed to vectorOp.getBase(); (see "base Naming Across Vector Ops (NFC)", llvm/llvm-project#137859)
- ToMemrefOp renamed to ToBufferOp