From 09d12e9c0c3f3730c47858ce8c28c73ed34fbbeb Mon Sep 17 00:00:00 2001 From: Nilanjana Basu Date: Fri, 17 Nov 2023 17:38:04 -0800 Subject: [PATCH 1/9] [LV] Pre-committing tests for changing loop interleaving count computation (#70272) Added tests for evaluating changes to loop interleaving count computation and for removing loop interleaving threshold in subsequent patches. --- .../LoopVectorize/AArch64/interleave_count.ll | 107 ++++++++ .../LoopVectorize/X86/unroll-small-loops.ll | 254 ++++++++++++++++-- 2 files changed, 344 insertions(+), 17 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll new file mode 100644 index 0000000000000..061cdb5643671 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll @@ -0,0 +1,107 @@ +; RUN: opt < %s -tiny-trip-count-interleave-threshold=32 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s +; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed + +target triple = "aarch64-linux-gnu" + +%pair = type { i8, i8 } + +; For this loop with known TC of 32, when the auto-vectorizer chooses VF 16, it should choose +; IC 2 since there is no remainder loop run needed when the vector loop runs. 
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2) +define void @loop_with_tc_32(ptr noalias %p, ptr noalias %q) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 + %tmp1 = load i8, ptr %tmp0, align 1 + %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 + %tmp3 = load i8, ptr %tmp2, align 1 + %add = add i8 %tmp1, %tmp3 + %qi = getelementptr i8, ptr %q, i64 %i + store i8 %add, ptr %qi, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, 32 + br i1 %cond, label %for.end, label %for.body + +for.end: + ret void +} + +; TODO: For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose +; IC 1 since there may be a remainder loop that needs to run after the vector loop. +; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2) +define void @loop_with_tc_33(ptr noalias %p, ptr noalias %q) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 + %tmp1 = load i8, ptr %tmp0, align 1 + %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 + %tmp3 = load i8, ptr %tmp2, align 1 + %add = add i8 %tmp1, %tmp3 + %qi = getelementptr i8, ptr %q, i64 %i + store i8 %add, ptr %qi, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, 33 + br i1 %cond, label %for.end, label %for.body + +for.end: + ret void +} + +; For a loop with unknown trip count but a profile showing an approx TC estimate of 32, when the +; auto-vectorizer chooses VF 16, it should choose IC 2 since chances are high that the remainder loop +; won't need to run +; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2) +define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], 
[ %i.next, %for.body ] + %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 + %tmp1 = load i8, ptr %tmp0, align 1 + %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 + %tmp3 = load i8, ptr %tmp2, align 1 + %add = add i8 %tmp1, %tmp3 + %qi = getelementptr i8, ptr %q, i64 %i + store i8 %add, ptr %qi, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, %n + br i1 %cond, label %for.end, label %for.body, !prof !0 + +for.end: + ret void +} + +; TODO: For a loop with unknown trip count but a profile showing an approx TC estimate of 33, +; when the auto-vectorizer chooses VF 16, it should choose IC 1 since chances are high that the +; remainder loop will need to run +; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2) +define void @loop_with_profile_tc_33(ptr noalias %p, ptr noalias %q, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 + %tmp1 = load i8, ptr %tmp0, align 1 + %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 + %tmp3 = load i8, ptr %tmp2, align 1 + %add = add i8 %tmp1, %tmp3 + %qi = getelementptr i8, ptr %q, i64 %i + store i8 %add, ptr %qi, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, %n + br i1 %cond, label %for.end, label %for.body, !prof !1 + +for.end: + ret void +} + +!0 = !{!"branch_weights", i32 1, i32 31} +!1 = !{!"branch_weights", i32 1, i32 32} diff --git a/llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll b/llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll index 290be569bc125..5b79d6af9ed9c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll @@ -6,37 +6,257 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = 
"x86_64-apple-macosx10.8.0" -; We don't unroll this loop because it has a small constant trip count. +; We don't unroll this loop because it has a small constant trip count +; that is not profitable for generating a scalar epilogue ; -; CHECK-VECTOR-LABEL: @foo( +; CHECK-VECTOR-LABEL: @foo_trip_count_8( ; CHECK-VECTOR: load <4 x i32> ; CHECK-VECTOR-NOT: load <4 x i32> ; CHECK-VECTOR: store <4 x i32> ; CHECK-VECTOR-NOT: store <4 x i32> ; CHECK-VECTOR: ret ; -; CHECK-SCALAR-LABEL: @foo( +; CHECK-SCALAR-LABEL: @foo_trip_count_8( ; CHECK-SCALAR: load i32, ptr ; CHECK-SCALAR-NOT: load i32, ptr ; CHECK-SCALAR: store i32 ; CHECK-SCALAR-NOT: store i32 ; CHECK-SCALAR: ret -define i32 @foo(ptr nocapture %A) nounwind uwtable ssp { - br label %1 +define void @foo_trip_count_8(ptr nocapture %A) nounwind uwtable ssp { +entry: + br label %for.body -;