-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[mlir][ArmSME] Add a tests showing liveness issues in the tile allocator #90447
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This test shows a few cases (not at all complete) where the current ArmSME tile allocator produces incorrect results. The plan is to resolve these issues with a future tile allocator that uses liveness information.
c0efec5
to
3298946
Compare
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-sme Author: Benjamin Maxwell (MacDue) ChangesThis test shows a few cases (not at all complete) where the current ArmSME tile allocator produces incorrect results. The plan is to resolve these issues with a future tile allocator that uses liveness information. Full diff: https://github.com/llvm/llvm-project/pull/90447.diff 1 Files Affected:
diff --git a/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir
new file mode 100644
index 00000000000000..e7ca77f08ecc3f
--- /dev/null
+++ b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir
@@ -0,0 +1,272 @@
+// RUN: mlir-opt %s -allocate-arm-sme-tiles -split-input-file -verify-diagnostics | FileCheck %s
+
+// This file tests some simple aspects of using liveness in the SME tile allocator.
+
+// Note: This is an XFAIL the new allocator is not yet upstream, and the current
+// allocator gives incorrect results for these tests.
+// XFAIL: *
+
+// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
+// CHECK-LIVE-RANGE-NEXT: @constant_with_multiple_users
+// CHECK-LIVE-RANGE: ^bb0:
+// CHECK-LIVE-RANGE: S arm_sme.zero
+// CHECK-LIVE-RANGE-NEXT: |S arm_sme.move_vector_to_tile_slice
+// CHECK-LIVE-RANGE-NEXT: || arm_sme.move_vector_to_tile_slice
+// CHECK-LIVE-RANGE-NEXT: |E test.some_use
+// CHECK-LIVE-RANGE-NEXT: E test.some_use
+
+// CHECK-LABEL: @constant_with_multiple_users(
+// CHECK-SAME: %[[VECTOR_A:.*]]: vector<[4]xf32>, %[[VECTOR_B:.*]]: vector<[4]xf32>
+func.func @constant_with_multiple_users(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %index: index) {
+ // CHECK-NEXT: %[[ZERO_TILE_0:.*]] = arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32>
+ // CHECK-NEXT: %[[ZERO_TILE_1:.*]] = arm_sme.zero {tile_id = 1 : i32} : vector<[4]x[4]xf32>
+ // CHECK-NEXT: %[[INSERT_TILE_1:.*]] = arm_sme.move_vector_to_tile_slice %[[VECTOR_A]], %[[ZERO_TILE_1]], %{{.*}} {tile_id = 1 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
+ // CHECK-NEXT: %[[INSERT_TILE_0:.*]] = arm_sme.move_vector_to_tile_slice %[[VECTOR_B]], %[[ZERO_TILE_0]], %{{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
+ %zero = arm_sme.zero : vector<[4]x[4]xf32>
+ %tile_a = arm_sme.move_vector_to_tile_slice %a, %zero, %index : vector<[4]xf32> into vector<[4]x[4]xf32>
+ %tile_b = arm_sme.move_vector_to_tile_slice %b, %zero, %index : vector<[4]xf32> into vector<[4]x[4]xf32>
+ "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> ()
+ "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> ()
+ return
+}
+
+// -----
+
+// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
+// CHECK-LIVE-RANGE-NEXT: @value_with_multiple_users
+// CHECK-LIVE-RANGE: ^bb0:
+// CHECK-LIVE-RANGE-NEXT: |S arm_sme.move_vector_to_tile_slice
+// CHECK-LIVE-RANGE-NEXT: || arm_sme.move_vector_to_tile_slice
+// CHECK-LIVE-RANGE-NEXT: |E test.some_use
+// CHECK-LIVE-RANGE-NEXT: E test.some_use
+
+func.func @value_with_multiple_users(%tile: vector<[4]x[4]xf32>, %a: vector<[4]xf32>, %b: vector<[4]xf32>, %index: index) {
+ // expected-error@below {{op failed to rectify tile operand with tile result (move required)}}
+ %tile_a = arm_sme.move_vector_to_tile_slice %a, %tile, %index : vector<[4]xf32> into vector<[4]x[4]xf32>
+ %tile_b = arm_sme.move_vector_to_tile_slice %b, %tile, %index : vector<[4]xf32> into vector<[4]x[4]xf32>
+ "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> ()
+ "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> ()
+ return
+}
+
+// -----
+
+// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
+// CHECK-LIVE-RANGE-NEXT: @reuse_tiles_after_initial_use
+// CHECK-LIVE-RANGE: ^bb0:
+// CHECK-LIVE-RANGE-NEXT: S arm_sme.get_tile
+// CHECK-LIVE-RANGE-NEXT: |S arm_sme.get_tile
+// CHECK-LIVE-RANGE-NEXT: ||S arm_sme.get_tile
+// CHECK-LIVE-RANGE-NEXT: |||S arm_sme.get_tile
+// CHECK-LIVE-RANGE-NEXT: |||| test.dummy
+// CHECK-LIVE-RANGE-NEXT: |||| test.dummy
+// CHECK-LIVE-RANGE-NEXT: |||| test.dummy
+// CHECK-LIVE-RANGE-NEXT: E||| test.some_use
+// CHECK-LIVE-RANGE-NEXT: E|| test.some_use
+// CHECK-LIVE-RANGE-NEXT: E| test.some_use
+// CHECK-LIVE-RANGE-NEXT: E test.some_use
+// CHECK-LIVE-RANGE-NEXT: S arm_sme.zero
+// CHECK-LIVE-RANGE-NEXT: |S arm_sme.zero
+// CHECK-LIVE-RANGE-NEXT: ||S arm_sme.zero
+// CHECK-LIVE-RANGE-NEXT: |||S arm_sme.zero
+// CHECK-LIVE-RANGE-NEXT: |||| test.dummy
+// CHECK-LIVE-RANGE-NEXT: |||| test.dummy
+// CHECK-LIVE-RANGE-NEXT: |||| test.dummy
+// CHECK-LIVE-RANGE-NEXT: E||| test.some_use
+// CHECK-LIVE-RANGE-NEXT: E|| test.some_use
+// CHECK-LIVE-RANGE-NEXT: E| test.some_use
+// CHECK-LIVE-RANGE-NEXT: E test.some_use
+
+// CHECK-LABEL: @reuse_tiles_after_initial_use
+func.func @reuse_tiles_after_initial_use() {
+ // CHECK: arm_sme.get_tile {tile_id = 0 : i32}
+ // CHECK: arm_sme.get_tile {tile_id = 1 : i32}
+ // CHECK: arm_sme.get_tile {tile_id = 2 : i32}
+ // CHECK: arm_sme.get_tile {tile_id = 3 : i32}
+ %tile_a = arm_sme.get_tile : vector<[4]x[4]xf32>
+ %tile_b = arm_sme.get_tile : vector<[4]x[4]xf32>
+ %tile_c = arm_sme.get_tile : vector<[4]x[4]xf32>
+ %tile_d = arm_sme.get_tile : vector<[4]x[4]xf32>
+ "test.dummy"(): () -> ()
+ "test.dummy"(): () -> ()
+ "test.dummy"(): () -> ()
+ "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> ()
+ "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> ()
+ "test.some_use"(%tile_c) : (vector<[4]x[4]xf32>) -> ()
+ "test.some_use"(%tile_d) : (vector<[4]x[4]xf32>) -> ()
+ // CHECK: arm_sme.zero {tile_id = 0 : i32}
+ // CHECK: arm_sme.zero {tile_id = 1 : i32}
+ // CHECK: arm_sme.zero {tile_id = 2 : i32}
+ // CHECK: arm_sme.zero {tile_id = 3 : i32}
+ %tile_1 = arm_sme.zero : vector<[4]x[4]xf32>
+ %tile_2 = arm_sme.zero : vector<[4]x[4]xf32>
+ %tile_3 = arm_sme.zero : vector<[4]x[4]xf32>
+ %tile_4 = arm_sme.zero : vector<[4]x[4]xf32>
+ "test.dummy"(): () -> ()
+ "test.dummy"(): () -> ()
+ "test.dummy"(): () -> ()
+ "test.some_use"(%tile_1) : (vector<[4]x[4]xf32>) -> ()
+ "test.some_use"(%tile_2) : (vector<[4]x[4]xf32>) -> ()
+ "test.some_use"(%tile_3) : (vector<[4]x[4]xf32>) -> ()
+ "test.some_use"(%tile_4) : (vector<[4]x[4]xf32>) -> ()
+ return
+}
+
+// -----
+
+// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
+// CHECK-LIVE-RANGE-NEXT: @non_overlapping_branches
+// CHECK-LIVE-RANGE: ^bb1:
+// CHECK-LIVE-RANGE-NEXT: S arm_sme.zero
+// CHECK-LIVE-RANGE-NEXT: | arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: E cf.br
+// CHECK-LIVE-RANGE-NEXT: ^bb2:
+// CHECK-LIVE-RANGE-NEXT: S arm_sme.get_tile
+// CHECK-LIVE-RANGE-NEXT: | arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: E cf.br
+
+// CHECK-LABEL: @non_overlapping_branches
+func.func @non_overlapping_branches(%cond: i1) {
+ // CHECK: arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32>
+ // CHECK: arm_sme.get_tile {tile_id = 0 : i32} : vector<[4]x[4]xf32>
+ %tile = scf.if %cond -> vector<[4]x[4]xf32> {
+ // ^bb1:
+ %zero = arm_sme.zero : vector<[4]x[4]xf32>
+ scf.yield %zero : vector<[4]x[4]xf32>
+ } else {
+ // ^bb2:
+ %undef = arm_sme.get_tile : vector<[4]x[4]xf32>
+ scf.yield %undef : vector<[4]x[4]xf32>
+ }
+ "test.some_use"(%tile) : (vector<[4]x[4]xf32>) -> ()
+ return
+}
+
+// -----
+
+// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
+// <deliberately omitted>
+
+// CHECK-LABEL: @constant_loop_init_with_multiple_users
+func.func @constant_loop_init_with_multiple_users(%a: vector<[4]xf32>, %b: vector<[4]xf32>) {
+ // CHECK: arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32>
+ // CHECK: arm_sme.zero {tile_id = 1 : i32} : vector<[4]x[4]xf32>
+ // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 1 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
+ // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c10 = arith.constant 10 : index
+ %init = arm_sme.zero : vector<[4]x[4]xf32>
+ %tile_a = scf.for %i = %c0 to %c10 step %c1 iter_args(%iter = %init) -> vector<[4]x[4]xf32> {
+ %new_tile = arm_sme.move_vector_to_tile_slice %a, %iter, %i : vector<[4]xf32> into vector<[4]x[4]xf32>
+ scf.yield %new_tile : vector<[4]x[4]xf32>
+ }
+ %tile_b = scf.for %i = %c0 to %c10 step %c1 iter_args(%iter = %init) -> vector<[4]x[4]xf32> {
+ %new_tile = arm_sme.move_vector_to_tile_slice %a, %iter, %i : vector<[4]xf32> into vector<[4]x[4]xf32>
+ scf.yield %new_tile : vector<[4]x[4]xf32>
+ }
+ "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> ()
+ "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> ()
+ return
+}
+
+// -----
+
+// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
+// CHECK-LIVE-RANGE-NEXT: @run_out_of_tiles_but_avoid_spill
+// CHECK-LIVE-RANGE: ^bb2:
+// CHECK-LIVE-RANGE-NEXT: |S arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: ||S arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: |||S arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: ||||S arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: EEEEE cf.br
+
+// Note in the live ranges (above) there is five tile values, but we only have four tiles.
+
+// CHECK-LABEL: @run_out_of_tiles_but_avoid_spill
+func.func @run_out_of_tiles_but_avoid_spill(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %c: vector<[4]xf32>, %d: vector<[4]xf32>) {
+ %init = arm_sme.zero : vector<[4]x[4]xf32>
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c10 = arith.constant 10 : index
+ // Live = %init
+ scf.for %i = %c0 to %c10 step %c1 {
+ // CHECK: arm_sme.zero {tile_id = 1 : i32}
+ // CHECK: arm_sme.zero {tile_id = 2 : i32}
+ // CHECK: arm_sme.zero {tile_id = 3 : i32}
+ // CHECK: arm_sme.zero {tile_id = 0 : i32}
+ %tile_a, %tile_b, %tile_c, %tile_d = scf.for %j = %c0 to %c10 step %c1
+ iter_args(%iter_a = %init, %iter_b = %init, %iter_c = %init, %iter_d = %init)
+ -> (vector<[4]x[4]xf32>, vector<[4]x[4]xf32> , vector<[4]x[4]xf32> , vector<[4]x[4]xf32>) {
+ // ^bb2:
+ // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 1 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
+ // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 2 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
+ // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 3 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
+ // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
+ %new_a = arm_sme.move_vector_to_tile_slice %a, %iter_a, %i : vector<[4]xf32> into vector<[4]x[4]xf32>
+ %new_b = arm_sme.move_vector_to_tile_slice %b, %iter_b, %i : vector<[4]xf32> into vector<[4]x[4]xf32>
+ %new_c = arm_sme.move_vector_to_tile_slice %c, %iter_c, %i : vector<[4]xf32> into vector<[4]x[4]xf32>
+ %new_d = arm_sme.move_vector_to_tile_slice %d, %iter_d, %i : vector<[4]xf32> into vector<[4]x[4]xf32>
+ scf.yield %new_a, %new_b, %new_c, %new_d : vector<[4]x[4]xf32>, vector<[4]x[4]xf32>, vector<[4]x[4]xf32>, vector<[4]x[4]xf32>
+ }
+ // Live = %init, %tile_a, %tile_b, %tile_c, %tile_d (out of tiles!)
+ // This should be resolved by duplicating the arm_sme.zero (from folding
+ // arm_sme.copy_tile operations inserted by the tile allocator).
+ "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> ()
+ "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> ()
+ "test.some_use"(%tile_c) : (vector<[4]x[4]xf32>) -> ()
+ "test.some_use"(%tile_d) : (vector<[4]x[4]xf32>) -> ()
+ }
+ return
+}
+
+// -----
+
+// We should be able to avoid spills like this, but logic handling this case is
+// not implemented yet. Note tile ID >= 16 means a spill/in-memory tile.
+
+// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
+// CHECK-LIVE-RANGE-NEXT: @avoidable_spill
+// CHECK-LIVE-RANGE: ^bb2:
+// CHECK-LIVE-RANGE-NEXT: || test.some_use
+// CHECK-LIVE-RANGE-NEXT: ||S arm_sme.move_vector_to_tile_slice
+// CHECK-LIVE-RANGE-NEXT: |||S arm_sme.move_vector_to_tile_slice
+// CHECK-LIVE-RANGE-NEXT: ||||S arm_sme.move_vector_to_tile_slice
+// CHECK-LIVE-RANGE-NEXT: |||||S arm_sme.move_vector_to_tile_slice
+// CHECK-LIVE-RANGE-NEXT: ||E||| test.some_use
+// CHECK-LIVE-RANGE-NEXT: || E|| test.some_use
+// CHECK-LIVE-RANGE-NEXT: || E| test.some_use
+// CHECK-LIVE-RANGE-NEXT: || E test.some_use
+// CHECK-LIVE-RANGE-NEXT: || arith.addi
+// CHECK-LIVE-RANGE-NEXT: EE cf.br
+
+// Note in the live ranges (above) there is two constant live-ins (first two ranges),
+// which gives six overlapping live ranges. The allocator currently will spill the
+// first constant (which results in a real spill at it's use), however, this could
+// be avoided by using the knowledge that at the first "test.some_use" there's
+// actually only two live ranges (so we can fix this be duplicating the constant).
+
+// CHECK-LABEL: @avoidable_spill
+func.func @avoidable_spill(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %c: vector<[4]xf32>, %d: vector<[4]xf32>) {
+ // CHECK: arm_sme.zero {tile_id = 16 : i32} : vector<[4]x[4]xf32>
+ %zero = arm_sme.zero : vector<[4]x[4]xf32>
+ %tile = arm_sme.get_tile : vector<[4]x[4]xf32>
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c10 = arith.constant 10 : index
+ scf.for %i = %c0 to %c10 step %c1 {
+ // So spilled here (unnecessarily).
+ // The arm_sme.zero op could be moved into the loop to avoid this.
+ "test.some_use"(%zero) : (vector<[4]x[4]xf32>) -> ()
+ %tile_a = arm_sme.move_vector_to_tile_slice %a, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32>
+ %tile_b = arm_sme.move_vector_to_tile_slice %b, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32>
+ %tile_c = arm_sme.move_vector_to_tile_slice %c, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32>
+ %tile_d = arm_sme.move_vector_to_tile_slice %d, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32>
+ // %zero is still live here (due the the backedge)
+ "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> ()
+ "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> ()
+ "test.some_use"(%tile_c) : (vector<[4]x[4]xf32>) -> ()
+ "test.some_use"(%tile_d) : (vector<[4]x[4]xf32>) -> ()
+ }
+ return
+}
|
81743e0
to
5b75874
Compare
5b75874
to
fde6523
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for preparing this - I find these tests very helpful 🙏🏻 Also thank you for addressing my comments.
I've left one more small suggestion, otherwise LGTM
This test shows a few cases (not at all complete) where the current ArmSME tile allocator produces incorrect results. The plan is to resolve these issues with a future tile allocator that uses liveness information.