From 32989461e7287d2d055eebba8ddc45868bca1c7d Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Fri, 26 Apr 2024 15:09:20 +0000 Subject: [PATCH 1/4] [mlir][ArmSME] Add tests showing liveness issues in the tile allocator This test shows a few cases (not at all complete) where the current ArmSME tile allocator produces incorrect results. The plan is to resolve these issues with a future tile allocator that uses liveness information. --- .../ArmSME/tile-allocation-liveness.mlir | 272 ++++++++++++++++++ 1 file changed, 272 insertions(+) create mode 100644 mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir diff --git a/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir new file mode 100644 index 0000000000000..e7ca77f08ecc3 --- /dev/null +++ b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir @@ -0,0 +1,272 @@ +// RUN: mlir-opt %s -allocate-arm-sme-tiles -split-input-file -verify-diagnostics | FileCheck %s + +// This file tests some simple aspects of using liveness in the SME tile allocator. + +// Note: This is an XFAIL the new allocator is not yet upstream, and the current +// allocator gives incorrect results for these tests. 
+// XFAIL: * + +// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: +// CHECK-LIVE-RANGE-NEXT: @constant_with_multiple_users +// CHECK-LIVE-RANGE: ^bb0: +// CHECK-LIVE-RANGE: S arm_sme.zero +// CHECK-LIVE-RANGE-NEXT: |S arm_sme.move_vector_to_tile_slice +// CHECK-LIVE-RANGE-NEXT: || arm_sme.move_vector_to_tile_slice +// CHECK-LIVE-RANGE-NEXT: |E test.some_use +// CHECK-LIVE-RANGE-NEXT: E test.some_use + +// CHECK-LABEL: @constant_with_multiple_users( +// CHECK-SAME: %[[VECTOR_A:.*]]: vector<[4]xf32>, %[[VECTOR_B:.*]]: vector<[4]xf32> +func.func @constant_with_multiple_users(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %index: index) { + // CHECK-NEXT: %[[ZERO_TILE_0:.*]] = arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32> + // CHECK-NEXT: %[[ZERO_TILE_1:.*]] = arm_sme.zero {tile_id = 1 : i32} : vector<[4]x[4]xf32> + // CHECK-NEXT: %[[INSERT_TILE_1:.*]] = arm_sme.move_vector_to_tile_slice %[[VECTOR_A]], %[[ZERO_TILE_1]], %{{.*}} {tile_id = 1 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> + // CHECK-NEXT: %[[INSERT_TILE_0:.*]] = arm_sme.move_vector_to_tile_slice %[[VECTOR_B]], %[[ZERO_TILE_0]], %{{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> + %zero = arm_sme.zero : vector<[4]x[4]xf32> + %tile_a = arm_sme.move_vector_to_tile_slice %a, %zero, %index : vector<[4]xf32> into vector<[4]x[4]xf32> + %tile_b = arm_sme.move_vector_to_tile_slice %b, %zero, %index : vector<[4]xf32> into vector<[4]x[4]xf32> + "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> () + "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> () + return +} + +// ----- + +// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: +// CHECK-LIVE-RANGE-NEXT: @value_with_multiple_users +// CHECK-LIVE-RANGE: ^bb0: +// CHECK-LIVE-RANGE-NEXT: |S arm_sme.move_vector_to_tile_slice +// CHECK-LIVE-RANGE-NEXT: || arm_sme.move_vector_to_tile_slice +// CHECK-LIVE-RANGE-NEXT: |E test.some_use +// CHECK-LIVE-RANGE-NEXT: E test.some_use + +func.func @value_with_multiple_users(%tile: 
vector<[4]x[4]xf32>, %a: vector<[4]xf32>, %b: vector<[4]xf32>, %index: index) { + // expected-error@below {{op failed to rectify tile operand with tile result (move required)}} + %tile_a = arm_sme.move_vector_to_tile_slice %a, %tile, %index : vector<[4]xf32> into vector<[4]x[4]xf32> + %tile_b = arm_sme.move_vector_to_tile_slice %b, %tile, %index : vector<[4]xf32> into vector<[4]x[4]xf32> + "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> () + "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> () + return +} + +// ----- + +// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: +// CHECK-LIVE-RANGE-NEXT: @reuse_tiles_after_initial_use +// CHECK-LIVE-RANGE: ^bb0: +// CHECK-LIVE-RANGE-NEXT: S arm_sme.get_tile +// CHECK-LIVE-RANGE-NEXT: |S arm_sme.get_tile +// CHECK-LIVE-RANGE-NEXT: ||S arm_sme.get_tile +// CHECK-LIVE-RANGE-NEXT: |||S arm_sme.get_tile +// CHECK-LIVE-RANGE-NEXT: |||| test.dummy +// CHECK-LIVE-RANGE-NEXT: |||| test.dummy +// CHECK-LIVE-RANGE-NEXT: |||| test.dummy +// CHECK-LIVE-RANGE-NEXT: E||| test.some_use +// CHECK-LIVE-RANGE-NEXT: E|| test.some_use +// CHECK-LIVE-RANGE-NEXT: E| test.some_use +// CHECK-LIVE-RANGE-NEXT: E test.some_use +// CHECK-LIVE-RANGE-NEXT: S arm_sme.zero +// CHECK-LIVE-RANGE-NEXT: |S arm_sme.zero +// CHECK-LIVE-RANGE-NEXT: ||S arm_sme.zero +// CHECK-LIVE-RANGE-NEXT: |||S arm_sme.zero +// CHECK-LIVE-RANGE-NEXT: |||| test.dummy +// CHECK-LIVE-RANGE-NEXT: |||| test.dummy +// CHECK-LIVE-RANGE-NEXT: |||| test.dummy +// CHECK-LIVE-RANGE-NEXT: E||| test.some_use +// CHECK-LIVE-RANGE-NEXT: E|| test.some_use +// CHECK-LIVE-RANGE-NEXT: E| test.some_use +// CHECK-LIVE-RANGE-NEXT: E test.some_use + +// CHECK-LABEL: @reuse_tiles_after_initial_use +func.func @reuse_tiles_after_initial_use() { + // CHECK: arm_sme.get_tile {tile_id = 0 : i32} + // CHECK: arm_sme.get_tile {tile_id = 1 : i32} + // CHECK: arm_sme.get_tile {tile_id = 2 : i32} + // CHECK: arm_sme.get_tile {tile_id = 3 : i32} + %tile_a = arm_sme.get_tile : vector<[4]x[4]xf32> + 
%tile_b = arm_sme.get_tile : vector<[4]x[4]xf32> + %tile_c = arm_sme.get_tile : vector<[4]x[4]xf32> + %tile_d = arm_sme.get_tile : vector<[4]x[4]xf32> + "test.dummy"(): () -> () + "test.dummy"(): () -> () + "test.dummy"(): () -> () + "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> () + "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> () + "test.some_use"(%tile_c) : (vector<[4]x[4]xf32>) -> () + "test.some_use"(%tile_d) : (vector<[4]x[4]xf32>) -> () + // CHECK: arm_sme.zero {tile_id = 0 : i32} + // CHECK: arm_sme.zero {tile_id = 1 : i32} + // CHECK: arm_sme.zero {tile_id = 2 : i32} + // CHECK: arm_sme.zero {tile_id = 3 : i32} + %tile_1 = arm_sme.zero : vector<[4]x[4]xf32> + %tile_2 = arm_sme.zero : vector<[4]x[4]xf32> + %tile_3 = arm_sme.zero : vector<[4]x[4]xf32> + %tile_4 = arm_sme.zero : vector<[4]x[4]xf32> + "test.dummy"(): () -> () + "test.dummy"(): () -> () + "test.dummy"(): () -> () + "test.some_use"(%tile_1) : (vector<[4]x[4]xf32>) -> () + "test.some_use"(%tile_2) : (vector<[4]x[4]xf32>) -> () + "test.some_use"(%tile_3) : (vector<[4]x[4]xf32>) -> () + "test.some_use"(%tile_4) : (vector<[4]x[4]xf32>) -> () + return +} + +// ----- + +// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: +// CHECK-LIVE-RANGE-NEXT: @non_overlapping_branches +// CHECK-LIVE-RANGE: ^bb1: +// CHECK-LIVE-RANGE-NEXT: S arm_sme.zero +// CHECK-LIVE-RANGE-NEXT: | arm_sme.copy_tile +// CHECK-LIVE-RANGE-NEXT: E cf.br +// CHECK-LIVE-RANGE-NEXT: ^bb2: +// CHECK-LIVE-RANGE-NEXT: S arm_sme.get_tile +// CHECK-LIVE-RANGE-NEXT: | arm_sme.copy_tile +// CHECK-LIVE-RANGE-NEXT: E cf.br + +// CHECK-LABEL: @non_overlapping_branches +func.func @non_overlapping_branches(%cond: i1) { + // CHECK: arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32> + // CHECK: arm_sme.get_tile {tile_id = 0 : i32} : vector<[4]x[4]xf32> + %tile = scf.if %cond -> vector<[4]x[4]xf32> { + // ^bb1: + %zero = arm_sme.zero : vector<[4]x[4]xf32> + scf.yield %zero : vector<[4]x[4]xf32> + } else { + // ^bb2: + %undef = 
arm_sme.get_tile : vector<[4]x[4]xf32> + scf.yield %undef : vector<[4]x[4]xf32> + } + "test.some_use"(%tile) : (vector<[4]x[4]xf32>) -> () + return +} + +// ----- + +// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: +// + +// CHECK-LABEL: @constant_loop_init_with_multiple_users +func.func @constant_loop_init_with_multiple_users(%a: vector<[4]xf32>, %b: vector<[4]xf32>) { + // CHECK: arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32> + // CHECK: arm_sme.zero {tile_id = 1 : i32} : vector<[4]x[4]xf32> + // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 1 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> + // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c10 = arith.constant 10 : index + %init = arm_sme.zero : vector<[4]x[4]xf32> + %tile_a = scf.for %i = %c0 to %c10 step %c1 iter_args(%iter = %init) -> vector<[4]x[4]xf32> { + %new_tile = arm_sme.move_vector_to_tile_slice %a, %iter, %i : vector<[4]xf32> into vector<[4]x[4]xf32> + scf.yield %new_tile : vector<[4]x[4]xf32> + } + %tile_b = scf.for %i = %c0 to %c10 step %c1 iter_args(%iter = %init) -> vector<[4]x[4]xf32> { + %new_tile = arm_sme.move_vector_to_tile_slice %a, %iter, %i : vector<[4]xf32> into vector<[4]x[4]xf32> + scf.yield %new_tile : vector<[4]x[4]xf32> + } + "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> () + "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> () + return +} + +// ----- + +// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: +// CHECK-LIVE-RANGE-NEXT: @run_out_of_tiles_but_avoid_spill +// CHECK-LIVE-RANGE: ^bb2: +// CHECK-LIVE-RANGE-NEXT: |S arm_sme.copy_tile +// CHECK-LIVE-RANGE-NEXT: ||S arm_sme.copy_tile +// CHECK-LIVE-RANGE-NEXT: |||S arm_sme.copy_tile +// CHECK-LIVE-RANGE-NEXT: ||||S arm_sme.copy_tile +// CHECK-LIVE-RANGE-NEXT: EEEEE cf.br + +// Note in the live ranges (above) there is five tile values, but we only have four 
tiles. + +// CHECK-LABEL: @run_out_of_tiles_but_avoid_spill +func.func @run_out_of_tiles_but_avoid_spill(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %c: vector<[4]xf32>, %d: vector<[4]xf32>) { + %init = arm_sme.zero : vector<[4]x[4]xf32> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c10 = arith.constant 10 : index + // Live = %init + scf.for %i = %c0 to %c10 step %c1 { + // CHECK: arm_sme.zero {tile_id = 1 : i32} + // CHECK: arm_sme.zero {tile_id = 2 : i32} + // CHECK: arm_sme.zero {tile_id = 3 : i32} + // CHECK: arm_sme.zero {tile_id = 0 : i32} + %tile_a, %tile_b, %tile_c, %tile_d = scf.for %j = %c0 to %c10 step %c1 + iter_args(%iter_a = %init, %iter_b = %init, %iter_c = %init, %iter_d = %init) + -> (vector<[4]x[4]xf32>, vector<[4]x[4]xf32> , vector<[4]x[4]xf32> , vector<[4]x[4]xf32>) { + // ^bb2: + // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 1 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> + // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 2 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> + // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 3 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> + // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> + %new_a = arm_sme.move_vector_to_tile_slice %a, %iter_a, %i : vector<[4]xf32> into vector<[4]x[4]xf32> + %new_b = arm_sme.move_vector_to_tile_slice %b, %iter_b, %i : vector<[4]xf32> into vector<[4]x[4]xf32> + %new_c = arm_sme.move_vector_to_tile_slice %c, %iter_c, %i : vector<[4]xf32> into vector<[4]x[4]xf32> + %new_d = arm_sme.move_vector_to_tile_slice %d, %iter_d, %i : vector<[4]xf32> into vector<[4]x[4]xf32> + scf.yield %new_a, %new_b, %new_c, %new_d : vector<[4]x[4]xf32>, vector<[4]x[4]xf32>, vector<[4]x[4]xf32>, vector<[4]x[4]xf32> + } + // Live = %init, %tile_a, %tile_b, %tile_c, %tile_d (out of tiles!) 
+ // This should be resolved by duplicating the arm_sme.zero (from folding + // arm_sme.copy_tile operations inserted by the tile allocator). + "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> () + "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> () + "test.some_use"(%tile_c) : (vector<[4]x[4]xf32>) -> () + "test.some_use"(%tile_d) : (vector<[4]x[4]xf32>) -> () + } + return +} + +// ----- + +// We should be able to avoid spills like this, but logic handling this case is +// not implemented yet. Note tile ID >= 16 means a spill/in-memory tile. + +// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: +// CHECK-LIVE-RANGE-NEXT: @avoidable_spill +// CHECK-LIVE-RANGE: ^bb2: +// CHECK-LIVE-RANGE-NEXT: || test.some_use +// CHECK-LIVE-RANGE-NEXT: ||S arm_sme.move_vector_to_tile_slice +// CHECK-LIVE-RANGE-NEXT: |||S arm_sme.move_vector_to_tile_slice +// CHECK-LIVE-RANGE-NEXT: ||||S arm_sme.move_vector_to_tile_slice +// CHECK-LIVE-RANGE-NEXT: |||||S arm_sme.move_vector_to_tile_slice +// CHECK-LIVE-RANGE-NEXT: ||E||| test.some_use +// CHECK-LIVE-RANGE-NEXT: || E|| test.some_use +// CHECK-LIVE-RANGE-NEXT: || E| test.some_use +// CHECK-LIVE-RANGE-NEXT: || E test.some_use +// CHECK-LIVE-RANGE-NEXT: || arith.addi +// CHECK-LIVE-RANGE-NEXT: EE cf.br + +// Note in the live ranges (above) there is two constant live-ins (first two ranges), +// which gives six overlapping live ranges. The allocator currently will spill the +// first constant (which results in a real spill at it's use), however, this could +// be avoided by using the knowledge that at the first "test.some_use" there's +// actually only two live ranges (so we can fix this be duplicating the constant). 
+ +// CHECK-LABEL: @avoidable_spill +func.func @avoidable_spill(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %c: vector<[4]xf32>, %d: vector<[4]xf32>) { + // CHECK: arm_sme.zero {tile_id = 16 : i32} : vector<[4]x[4]xf32> + %zero = arm_sme.zero : vector<[4]x[4]xf32> + %tile = arm_sme.get_tile : vector<[4]x[4]xf32> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c10 = arith.constant 10 : index + scf.for %i = %c0 to %c10 step %c1 { + // So spilled here (unnecessarily). + // The arm_sme.zero op could be moved into the loop to avoid this. + "test.some_use"(%zero) : (vector<[4]x[4]xf32>) -> () + %tile_a = arm_sme.move_vector_to_tile_slice %a, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32> + %tile_b = arm_sme.move_vector_to_tile_slice %b, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32> + %tile_c = arm_sme.move_vector_to_tile_slice %c, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32> + %tile_d = arm_sme.move_vector_to_tile_slice %d, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32> + // %zero is still live here (due the the backedge) + "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> () + "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> () + "test.some_use"(%tile_c) : (vector<[4]x[4]xf32>) -> () + "test.some_use"(%tile_d) : (vector<[4]x[4]xf32>) -> () + } + return +} From fde6523ab7e46b646b813d576e28168138a7dc3d Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Mon, 29 Apr 2024 16:15:52 +0000 Subject: [PATCH 2/4] Add `CHECK-BAD` tests for the current tile allocator --- .../ArmSME/tile-allocation-liveness.mlir | 72 ++++++++++++++++--- 1 file changed, 64 insertions(+), 8 deletions(-) diff --git a/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir index e7ca77f08ecc3..5fc80e0ab3a89 100644 --- a/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir +++ b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir @@ -1,10 +1,12 @@ -// RUN: mlir-opt %s 
-allocate-arm-sme-tiles -split-input-file -verify-diagnostics | FileCheck %s +// RUN: mlir-opt %s -allocate-arm-sme-tiles -split-input-file -verify-diagnostics | FileCheck %s --check-prefix=CHECK-BAD -// This file tests some simple aspects of using liveness in the SME tile allocator. - -// Note: This is an XFAIL the new allocator is not yet upstream, and the current -// allocator gives incorrect results for these tests. -// XFAIL: * +// This file tests some aspects of liveness issues in the SME tile allocator. +// These tests were designed with a new liveness-based tile allocator in mind, +// with the current tile allocator these tests all give incorrect results (which +// is documented by `CHECK-BAD`). +// +// Currently only the `CHECK-BAD` tests are run (as the new liveness based +// allocator is not yet available -- so all other tests fail). // CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: // CHECK-LIVE-RANGE-NEXT: @constant_with_multiple_users @@ -15,6 +17,14 @@ // CHECK-LIVE-RANGE-NEXT: |E test.some_use // CHECK-LIVE-RANGE-NEXT: E test.some_use +// Incorrect result! The second `move_vector_to_tile_slice` overwrites the first (which is still live). 
+// +// CHECK-BAD-LABEL: @constant_with_multiple_users( +// CHECK-BAD-SAME: %[[VECTOR_A:.*]]: vector<[4]xf32>, %[[VECTOR_B:.*]]: vector<[4]xf32> +// CHECK-BAD: %[[ZERO_TILE:.*]] = arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32> +// CHECK-BAD: %[[INSERT_TILE_1:.*]] = arm_sme.move_vector_to_tile_slice %[[VECTOR_A]], %[[ZERO_TILE]], %{{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> +// CHECK-BAD: %[[INSERT_TILE_0:.*]] = arm_sme.move_vector_to_tile_slice %[[VECTOR_B]], %[[ZERO_TILE]], %{{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> + // CHECK-LABEL: @constant_with_multiple_users( // CHECK-SAME: %[[VECTOR_A:.*]]: vector<[4]xf32>, %[[VECTOR_B:.*]]: vector<[4]xf32> func.func @constant_with_multiple_users(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %index: index) { @@ -40,8 +50,10 @@ func.func @constant_with_multiple_users(%a: vector<[4]xf32>, %b: vector<[4]xf32> // CHECK-LIVE-RANGE-NEXT: |E test.some_use // CHECK-LIVE-RANGE-NEXT: E test.some_use +// (No CHECK-BAD -- the current tile allocator ignores this case) + func.func @value_with_multiple_users(%tile: vector<[4]x[4]xf32>, %a: vector<[4]xf32>, %b: vector<[4]xf32>, %index: index) { - // expected-error@below {{op failed to rectify tile operand with tile result (move required)}} + // A future allocator should error here (as `%`tile would need to be copied). 
%tile_a = arm_sme.move_vector_to_tile_slice %a, %tile, %index : vector<[4]xf32> into vector<[4]x[4]xf32> %tile_b = arm_sme.move_vector_to_tile_slice %b, %tile, %index : vector<[4]xf32> into vector<[4]x[4]xf32> "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> () @@ -77,6 +89,19 @@ func.func @value_with_multiple_users(%tile: vector<[4]x[4]xf32>, %a: vector<[4]x // CHECK-LIVE-RANGE-NEXT: E| test.some_use // CHECK-LIVE-RANGE-NEXT: E test.some_use +// CHECK-BAD-LABEL: @reuse_tiles_after_initial_use +// CHECK-BAD: arm_sme.get_tile {tile_id = 0 : i32} +// CHECK-BAD: arm_sme.get_tile {tile_id = 1 : i32} +// CHECK-BAD: arm_sme.get_tile {tile_id = 2 : i32} +// CHECK-BAD: arm_sme.get_tile {tile_id = 3 : i32} +// +// -> Spills after the fourth tile (unnecessary): +// +// CHECK-BAD: arm_sme.zero {tile_id = 16 : i32} +// CHECK-BAD: arm_sme.zero {tile_id = 17 : i32} +// CHECK-BAD: arm_sme.zero {tile_id = 18 : i32} +// CHECK-BAD: arm_sme.zero {tile_id = 19 : i32} + // CHECK-LABEL: @reuse_tiles_after_initial_use func.func @reuse_tiles_after_initial_use() { // CHECK: arm_sme.get_tile {tile_id = 0 : i32} @@ -98,9 +123,14 @@ func.func @reuse_tiles_after_initial_use() { // CHECK: arm_sme.zero {tile_id = 1 : i32} // CHECK: arm_sme.zero {tile_id = 2 : i32} // CHECK: arm_sme.zero {tile_id = 3 : i32} + // Unnecessary spills: + // expected-warning @below {{failed to allocate SME virtual tile to operation, all tile operations will go through memory, expect degraded performance}} %tile_1 = arm_sme.zero : vector<[4]x[4]xf32> + // expected-warning @below {{failed to allocate SME virtual tile to operation, all tile operations will go through memory, expect degraded performance}} %tile_2 = arm_sme.zero : vector<[4]x[4]xf32> + // expected-warning @below {{failed to allocate SME virtual tile to operation, all tile operations will go through memory, expect degraded performance}} %tile_3 = arm_sme.zero : vector<[4]x[4]xf32> + // expected-warning @below {{failed to allocate SME virtual tile to 
operation, all tile operations will go through memory, expect degraded performance}} %tile_4 = arm_sme.zero : vector<[4]x[4]xf32> "test.dummy"(): () -> () "test.dummy"(): () -> () @@ -125,6 +155,12 @@ func.func @reuse_tiles_after_initial_use() { // CHECK-LIVE-RANGE-NEXT: | arm_sme.copy_tile // CHECK-LIVE-RANGE-NEXT: E cf.br +// Incorrect result! Both branches should yield the result via the same tile. +// +// CHECK-BAD-LABEL: @non_overlapping_branches +// CHECK-BAD: arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32> +// CHECK-BAD: arm_sme.get_tile {tile_id = 1 : i32} : vector<[4]x[4]xf32> + // CHECK-LABEL: @non_overlapping_branches func.func @non_overlapping_branches(%cond: i1) { // CHECK: arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32> @@ -147,6 +183,13 @@ func.func @non_overlapping_branches(%cond: i1) { // CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: // +// Incorrect result! Everything assigned to tile 0 (which means values that are still live are overwritten). +// +// CHECK-BAD-LABEL: @constant_loop_init_with_multiple_users +// CHECK-BAD: arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32> +// CHECK-BAD: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> +// CHECK-BAD: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> + // CHECK-LABEL: @constant_loop_init_with_multiple_users func.func @constant_loop_init_with_multiple_users(%a: vector<[4]xf32>, %b: vector<[4]xf32>) { // CHECK: arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32> @@ -183,6 +226,12 @@ func.func @constant_loop_init_with_multiple_users(%a: vector<[4]xf32>, %b: vecto // Note in the live ranges (above) there is five tile values, but we only have four tiles. +// Incorrect result! Everything assigned to tile 0 (which means values that are still live are overwritten). 
+// +// CHECK-BAD-LABEL: @run_out_of_tiles_but_avoid_spill +// CHECK-BAD: arm_sme.zero {tile_id = 0 : i32} +// CHECK-BAD-COUNT-4: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> + // CHECK-LABEL: @run_out_of_tiles_but_avoid_spill func.func @run_out_of_tiles_but_avoid_spill(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %c: vector<[4]xf32>, %d: vector<[4]xf32>) { %init = arm_sme.zero : vector<[4]x[4]xf32> @@ -239,13 +288,20 @@ func.func @run_out_of_tiles_but_avoid_spill(%a: vector<[4]xf32>, %b: vector<[4]x // CHECK-LIVE-RANGE-NEXT: || E test.some_use // CHECK-LIVE-RANGE-NEXT: || arith.addi // CHECK-LIVE-RANGE-NEXT: EE cf.br - +// // Note in the live ranges (above) there is two constant live-ins (first two ranges), // which gives six overlapping live ranges. The allocator currently will spill the // first constant (which results in a real spill at it's use), however, this could // be avoided by using the knowledge that at the first "test.some_use" there's // actually only two live ranges (so we can fix this be duplicating the constant). +// Incorrect result! Everything other than zero assigned to tile 1 (which means values that are still live are overwritten). 
+// +// CHECK-BAD-LABEL: @avoidable_spill +// CHECK-BAD: arm_sme.zero {tile_id = 0 : i32} +// CHECK-BAD: arm_sme.get_tile {tile_id = 1 : i32} +// CHECK-BAD-COUNT-4: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 1 : i32} + // CHECK-LABEL: @avoidable_spill func.func @avoidable_spill(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %c: vector<[4]xf32>, %d: vector<[4]xf32>) { // CHECK: arm_sme.zero {tile_id = 16 : i32} : vector<[4]x[4]xf32> From 277f684015ddbbd83ce3eb366607b159add0c5f6 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Tue, 30 Apr 2024 15:01:08 +0000 Subject: [PATCH 3/4] Remove new allocator checks --- .../ArmSME/tile-allocation-liveness.mlir | 182 ++---------------- 1 file changed, 17 insertions(+), 165 deletions(-) diff --git a/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir index 5fc80e0ab3a89..b5dac8733e61c 100644 --- a/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir +++ b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir @@ -1,37 +1,18 @@ // RUN: mlir-opt %s -allocate-arm-sme-tiles -split-input-file -verify-diagnostics | FileCheck %s --check-prefix=CHECK-BAD // This file tests some aspects of liveness issues in the SME tile allocator. -// These tests were designed with a new liveness-based tile allocator in mind, -// with the current tile allocator these tests all give incorrect results (which -// is documented by `CHECK-BAD`). -// -// Currently only the `CHECK-BAD` tests are run (as the new liveness based -// allocator is not yet available -- so all other tests fail). 
- -// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: -// CHECK-LIVE-RANGE-NEXT: @constant_with_multiple_users -// CHECK-LIVE-RANGE: ^bb0: -// CHECK-LIVE-RANGE: S arm_sme.zero -// CHECK-LIVE-RANGE-NEXT: |S arm_sme.move_vector_to_tile_slice -// CHECK-LIVE-RANGE-NEXT: || arm_sme.move_vector_to_tile_slice -// CHECK-LIVE-RANGE-NEXT: |E test.some_use -// CHECK-LIVE-RANGE-NEXT: E test.some_use +// These tests were designed with a new liveness-based tile allocator in mind +// (where the names of test cases make more sense), with the current tile +// allocator these tests all give incorrect results (which is documented by +// `CHECK-BAD`). // Incorrect result! The second `move_vector_to_tile_slice` overwrites the first (which is still live). // -// CHECK-BAD-LABEL: @constant_with_multiple_users( -// CHECK-BAD-SAME: %[[VECTOR_A:.*]]: vector<[4]xf32>, %[[VECTOR_B:.*]]: vector<[4]xf32> +// CHECK-BAD-LABEL: @constant_with_multiple_users // CHECK-BAD: %[[ZERO_TILE:.*]] = arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32> -// CHECK-BAD: %[[INSERT_TILE_1:.*]] = arm_sme.move_vector_to_tile_slice %[[VECTOR_A]], %[[ZERO_TILE]], %{{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> -// CHECK-BAD: %[[INSERT_TILE_0:.*]] = arm_sme.move_vector_to_tile_slice %[[VECTOR_B]], %[[ZERO_TILE]], %{{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> - -// CHECK-LABEL: @constant_with_multiple_users( -// CHECK-SAME: %[[VECTOR_A:.*]]: vector<[4]xf32>, %[[VECTOR_B:.*]]: vector<[4]xf32> +// CHECK-BAD: %[[INSERT_TILE_1:.*]] = arm_sme.move_vector_to_tile_slice %{{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> +// CHECK-BAD: %[[INSERT_TILE_0:.*]] = arm_sme.move_vector_to_tile_slice %{{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> func.func @constant_with_multiple_users(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %index: index) { - // CHECK-NEXT: %[[ZERO_TILE_0:.*]] = arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32> - 
// CHECK-NEXT: %[[ZERO_TILE_1:.*]] = arm_sme.zero {tile_id = 1 : i32} : vector<[4]x[4]xf32> - // CHECK-NEXT: %[[INSERT_TILE_1:.*]] = arm_sme.move_vector_to_tile_slice %[[VECTOR_A]], %[[ZERO_TILE_1]], %{{.*}} {tile_id = 1 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> - // CHECK-NEXT: %[[INSERT_TILE_0:.*]] = arm_sme.move_vector_to_tile_slice %[[VECTOR_B]], %[[ZERO_TILE_0]], %{{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> %zero = arm_sme.zero : vector<[4]x[4]xf32> %tile_a = arm_sme.move_vector_to_tile_slice %a, %zero, %index : vector<[4]xf32> into vector<[4]x[4]xf32> %tile_b = arm_sme.move_vector_to_tile_slice %b, %zero, %index : vector<[4]xf32> into vector<[4]x[4]xf32> @@ -42,18 +23,9 @@ func.func @constant_with_multiple_users(%a: vector<[4]xf32>, %b: vector<[4]xf32> // ----- -// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: -// CHECK-LIVE-RANGE-NEXT: @value_with_multiple_users -// CHECK-LIVE-RANGE: ^bb0: -// CHECK-LIVE-RANGE-NEXT: |S arm_sme.move_vector_to_tile_slice -// CHECK-LIVE-RANGE-NEXT: || arm_sme.move_vector_to_tile_slice -// CHECK-LIVE-RANGE-NEXT: |E test.some_use -// CHECK-LIVE-RANGE-NEXT: E test.some_use - // (No CHECK-BAD -- the current tile allocator ignores this case) - func.func @value_with_multiple_users(%tile: vector<[4]x[4]xf32>, %a: vector<[4]xf32>, %b: vector<[4]xf32>, %index: index) { - // A future allocator should error here (as `%`tile would need to be copied). + // A future allocator should error here (as `%tile` would need to be copied). 
%tile_a = arm_sme.move_vector_to_tile_slice %a, %tile, %index : vector<[4]xf32> into vector<[4]x[4]xf32> %tile_b = arm_sme.move_vector_to_tile_slice %b, %tile, %index : vector<[4]xf32> into vector<[4]x[4]xf32> "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> () @@ -63,51 +35,12 @@ func.func @value_with_multiple_users(%tile: vector<[4]x[4]xf32>, %a: vector<[4]x // ----- -// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: -// CHECK-LIVE-RANGE-NEXT: @reuse_tiles_after_initial_use -// CHECK-LIVE-RANGE: ^bb0: -// CHECK-LIVE-RANGE-NEXT: S arm_sme.get_tile -// CHECK-LIVE-RANGE-NEXT: |S arm_sme.get_tile -// CHECK-LIVE-RANGE-NEXT: ||S arm_sme.get_tile -// CHECK-LIVE-RANGE-NEXT: |||S arm_sme.get_tile -// CHECK-LIVE-RANGE-NEXT: |||| test.dummy -// CHECK-LIVE-RANGE-NEXT: |||| test.dummy -// CHECK-LIVE-RANGE-NEXT: |||| test.dummy -// CHECK-LIVE-RANGE-NEXT: E||| test.some_use -// CHECK-LIVE-RANGE-NEXT: E|| test.some_use -// CHECK-LIVE-RANGE-NEXT: E| test.some_use -// CHECK-LIVE-RANGE-NEXT: E test.some_use -// CHECK-LIVE-RANGE-NEXT: S arm_sme.zero -// CHECK-LIVE-RANGE-NEXT: |S arm_sme.zero -// CHECK-LIVE-RANGE-NEXT: ||S arm_sme.zero -// CHECK-LIVE-RANGE-NEXT: |||S arm_sme.zero -// CHECK-LIVE-RANGE-NEXT: |||| test.dummy -// CHECK-LIVE-RANGE-NEXT: |||| test.dummy -// CHECK-LIVE-RANGE-NEXT: |||| test.dummy -// CHECK-LIVE-RANGE-NEXT: E||| test.some_use -// CHECK-LIVE-RANGE-NEXT: E|| test.some_use -// CHECK-LIVE-RANGE-NEXT: E| test.some_use -// CHECK-LIVE-RANGE-NEXT: E test.some_use - // CHECK-BAD-LABEL: @reuse_tiles_after_initial_use -// CHECK-BAD: arm_sme.get_tile {tile_id = 0 : i32} -// CHECK-BAD: arm_sme.get_tile {tile_id = 1 : i32} -// CHECK-BAD: arm_sme.get_tile {tile_id = 2 : i32} -// CHECK-BAD: arm_sme.get_tile {tile_id = 3 : i32} -// -// -> Spills after the fourth tile (unnecessary): -// -// CHECK-BAD: arm_sme.zero {tile_id = 16 : i32} -// CHECK-BAD: arm_sme.zero {tile_id = 17 : i32} -// CHECK-BAD: arm_sme.zero {tile_id = 18 : i32} -// CHECK-BAD: arm_sme.zero {tile_id 
= 19 : i32} - -// CHECK-LABEL: @reuse_tiles_after_initial_use func.func @reuse_tiles_after_initial_use() { - // CHECK: arm_sme.get_tile {tile_id = 0 : i32} - // CHECK: arm_sme.get_tile {tile_id = 1 : i32} - // CHECK: arm_sme.get_tile {tile_id = 2 : i32} - // CHECK: arm_sme.get_tile {tile_id = 3 : i32} + // CHECK-BAD: arm_sme.get_tile {tile_id = 0 : i32} + // CHECK-BAD: arm_sme.get_tile {tile_id = 1 : i32} + // CHECK-BAD: arm_sme.get_tile {tile_id = 2 : i32} + // CHECK-BAD: arm_sme.get_tile {tile_id = 3 : i32} %tile_a = arm_sme.get_tile : vector<[4]x[4]xf32> %tile_b = arm_sme.get_tile : vector<[4]x[4]xf32> %tile_c = arm_sme.get_tile : vector<[4]x[4]xf32> @@ -119,10 +52,11 @@ func.func @reuse_tiles_after_initial_use() { "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> () "test.some_use"(%tile_c) : (vector<[4]x[4]xf32>) -> () "test.some_use"(%tile_d) : (vector<[4]x[4]xf32>) -> () - // CHECK: arm_sme.zero {tile_id = 0 : i32} - // CHECK: arm_sme.zero {tile_id = 1 : i32} - // CHECK: arm_sme.zero {tile_id = 2 : i32} - // CHECK: arm_sme.zero {tile_id = 3 : i32} + // -> Spills after the fourth tile (unnecessary): + // CHECK-BAD: arm_sme.zero {tile_id = 16 : i32} + // CHECK-BAD: arm_sme.zero {tile_id = 17 : i32} + // CHECK-BAD: arm_sme.zero {tile_id = 18 : i32} + // CHECK-BAD: arm_sme.zero {tile_id = 19 : i32} // Unnecessary spills: // expected-warning @below {{failed to allocate SME virtual tile to operation, all tile operations will go through memory, expect degraded performance}} %tile_1 = arm_sme.zero : vector<[4]x[4]xf32> @@ -144,33 +78,16 @@ func.func @reuse_tiles_after_initial_use() { // ----- -// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: -// CHECK-LIVE-RANGE-NEXT: @non_overlapping_branches -// CHECK-LIVE-RANGE: ^bb1: -// CHECK-LIVE-RANGE-NEXT: S arm_sme.zero -// CHECK-LIVE-RANGE-NEXT: | arm_sme.copy_tile -// CHECK-LIVE-RANGE-NEXT: E cf.br -// CHECK-LIVE-RANGE-NEXT: ^bb2: -// CHECK-LIVE-RANGE-NEXT: S arm_sme.get_tile -// CHECK-LIVE-RANGE-NEXT: | 
arm_sme.copy_tile -// CHECK-LIVE-RANGE-NEXT: E cf.br - // Incorrect result! Both branches should yield the result via the same tile. // // CHECK-BAD-LABEL: @non_overlapping_branches // CHECK-BAD: arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32> // CHECK-BAD: arm_sme.get_tile {tile_id = 1 : i32} : vector<[4]x[4]xf32> - -// CHECK-LABEL: @non_overlapping_branches func.func @non_overlapping_branches(%cond: i1) { - // CHECK: arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32> - // CHECK: arm_sme.get_tile {tile_id = 0 : i32} : vector<[4]x[4]xf32> %tile = scf.if %cond -> vector<[4]x[4]xf32> { - // ^bb1: %zero = arm_sme.zero : vector<[4]x[4]xf32> scf.yield %zero : vector<[4]x[4]xf32> } else { - // ^bb2: %undef = arm_sme.get_tile : vector<[4]x[4]xf32> scf.yield %undef : vector<[4]x[4]xf32> } @@ -180,22 +97,13 @@ func.func @non_overlapping_branches(%cond: i1) { // ----- -// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: -// - // Incorrect result! Everything assigned to tile 0 (which means values that are still live are overwritten). 
// // CHECK-BAD-LABEL: @constant_loop_init_with_multiple_users // CHECK-BAD: arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32> // CHECK-BAD: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> // CHECK-BAD: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> - -// CHECK-LABEL: @constant_loop_init_with_multiple_users func.func @constant_loop_init_with_multiple_users(%a: vector<[4]xf32>, %b: vector<[4]xf32>) { - // CHECK: arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32> - // CHECK: arm_sme.zero {tile_id = 1 : i32} : vector<[4]x[4]xf32> - // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 1 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> - // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c10 = arith.constant 10 : index @@ -215,52 +123,26 @@ func.func @constant_loop_init_with_multiple_users(%a: vector<[4]xf32>, %b: vecto // ----- -// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: -// CHECK-LIVE-RANGE-NEXT: @run_out_of_tiles_but_avoid_spill -// CHECK-LIVE-RANGE: ^bb2: -// CHECK-LIVE-RANGE-NEXT: |S arm_sme.copy_tile -// CHECK-LIVE-RANGE-NEXT: ||S arm_sme.copy_tile -// CHECK-LIVE-RANGE-NEXT: |||S arm_sme.copy_tile -// CHECK-LIVE-RANGE-NEXT: ||||S arm_sme.copy_tile -// CHECK-LIVE-RANGE-NEXT: EEEEE cf.br - -// Note in the live ranges (above) there is five tile values, but we only have four tiles. - // Incorrect result! Everything assigned to tile 0 (which means values that are still live are overwritten). 
// // CHECK-BAD-LABEL: @run_out_of_tiles_but_avoid_spill // CHECK-BAD: arm_sme.zero {tile_id = 0 : i32} // CHECK-BAD-COUNT-4: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> - -// CHECK-LABEL: @run_out_of_tiles_but_avoid_spill func.func @run_out_of_tiles_but_avoid_spill(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %c: vector<[4]xf32>, %d: vector<[4]xf32>) { %init = arm_sme.zero : vector<[4]x[4]xf32> %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c10 = arith.constant 10 : index - // Live = %init scf.for %i = %c0 to %c10 step %c1 { - // CHECK: arm_sme.zero {tile_id = 1 : i32} - // CHECK: arm_sme.zero {tile_id = 2 : i32} - // CHECK: arm_sme.zero {tile_id = 3 : i32} - // CHECK: arm_sme.zero {tile_id = 0 : i32} %tile_a, %tile_b, %tile_c, %tile_d = scf.for %j = %c0 to %c10 step %c1 iter_args(%iter_a = %init, %iter_b = %init, %iter_c = %init, %iter_d = %init) -> (vector<[4]x[4]xf32>, vector<[4]x[4]xf32> , vector<[4]x[4]xf32> , vector<[4]x[4]xf32>) { - // ^bb2: - // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 1 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> - // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 2 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> - // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 3 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> - // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> %new_a = arm_sme.move_vector_to_tile_slice %a, %iter_a, %i : vector<[4]xf32> into vector<[4]x[4]xf32> %new_b = arm_sme.move_vector_to_tile_slice %b, %iter_b, %i : vector<[4]xf32> into vector<[4]x[4]xf32> %new_c = arm_sme.move_vector_to_tile_slice %c, %iter_c, %i : vector<[4]xf32> into vector<[4]x[4]xf32> %new_d = arm_sme.move_vector_to_tile_slice %d, %iter_d, %i : vector<[4]xf32> into vector<[4]x[4]xf32> scf.yield %new_a, %new_b, %new_c, %new_d : vector<[4]x[4]xf32>, vector<[4]x[4]xf32>, 
vector<[4]x[4]xf32>, vector<[4]x[4]xf32> } - // Live = %init, %tile_a, %tile_b, %tile_c, %tile_d (out of tiles!) - // This should be resolved by duplicating the arm_sme.zero (from folding - // arm_sme.copy_tile operations inserted by the tile allocator). "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> () "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> () "test.some_use"(%tile_c) : (vector<[4]x[4]xf32>) -> () @@ -271,54 +153,24 @@ func.func @run_out_of_tiles_but_avoid_spill(%a: vector<[4]xf32>, %b: vector<[4]x // ----- -// We should be able to avoid spills like this, but logic handling this case is -// not implemented yet. Note tile ID >= 16 means a spill/in-memory tile. - -// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: -// CHECK-LIVE-RANGE-NEXT: @avoidable_spill -// CHECK-LIVE-RANGE: ^bb2: -// CHECK-LIVE-RANGE-NEXT: || test.some_use -// CHECK-LIVE-RANGE-NEXT: ||S arm_sme.move_vector_to_tile_slice -// CHECK-LIVE-RANGE-NEXT: |||S arm_sme.move_vector_to_tile_slice -// CHECK-LIVE-RANGE-NEXT: ||||S arm_sme.move_vector_to_tile_slice -// CHECK-LIVE-RANGE-NEXT: |||||S arm_sme.move_vector_to_tile_slice -// CHECK-LIVE-RANGE-NEXT: ||E||| test.some_use -// CHECK-LIVE-RANGE-NEXT: || E|| test.some_use -// CHECK-LIVE-RANGE-NEXT: || E| test.some_use -// CHECK-LIVE-RANGE-NEXT: || E test.some_use -// CHECK-LIVE-RANGE-NEXT: || arith.addi -// CHECK-LIVE-RANGE-NEXT: EE cf.br -// -// Note in the live ranges (above) there is two constant live-ins (first two ranges), -// which gives six overlapping live ranges. The allocator currently will spill the -// first constant (which results in a real spill at it's use), however, this could -// be avoided by using the knowledge that at the first "test.some_use" there's -// actually only two live ranges (so we can fix this be duplicating the constant). - // Incorrect result! Everything other than zero assigned to tile 1 (which means values that are still live are overwritten). 
// // CHECK-BAD-LABEL: @avoidable_spill // CHECK-BAD: arm_sme.zero {tile_id = 0 : i32} // CHECK-BAD: arm_sme.get_tile {tile_id = 1 : i32} // CHECK-BAD-COUNT-4: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 1 : i32} - -// CHECK-LABEL: @avoidable_spill func.func @avoidable_spill(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %c: vector<[4]xf32>, %d: vector<[4]xf32>) { - // CHECK: arm_sme.zero {tile_id = 16 : i32} : vector<[4]x[4]xf32> %zero = arm_sme.zero : vector<[4]x[4]xf32> %tile = arm_sme.get_tile : vector<[4]x[4]xf32> %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c10 = arith.constant 10 : index scf.for %i = %c0 to %c10 step %c1 { - // So spilled here (unnecessarily). - // The arm_sme.zero op could be moved into the loop to avoid this. "test.some_use"(%zero) : (vector<[4]x[4]xf32>) -> () %tile_a = arm_sme.move_vector_to_tile_slice %a, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32> %tile_b = arm_sme.move_vector_to_tile_slice %b, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32> %tile_c = arm_sme.move_vector_to_tile_slice %c, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32> %tile_d = arm_sme.move_vector_to_tile_slice %d, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32> - // %zero is still live here (due the the backedge) "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> () "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> () "test.some_use"(%tile_c) : (vector<[4]x[4]xf32>) -> () From 6fd383dcb45736aebabc1138a85bec9a3ef61905 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 1 May 2024 12:54:45 +0000 Subject: [PATCH 4/4] Add CHECK-NOT --- mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir index b5dac8733e61c..2dedcb2fbc24e 100644 --- a/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir +++ 
b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir @@ -23,7 +23,10 @@ func.func @constant_with_multiple_users(%a: vector<[4]xf32>, %b: vector<[4]xf32> // ----- -// (No CHECK-BAD -- the current tile allocator ignores this case) +// (No tile IDs -- the current tile allocator ignores this case) + +// CHECK-BAD-LABEL: @value_with_multiple_users +// CHECK-BAD-NOT: tile_id func.func @value_with_multiple_users(%tile: vector<[4]x[4]xf32>, %a: vector<[4]xf32>, %b: vector<[4]xf32>, %index: index) { // A future allocator should error here (as `%tile` would need to be copied). %tile_a = arm_sme.move_vector_to_tile_slice %a, %tile, %index : vector<[4]xf32> into vector<[4]x[4]xf32>