diff --git a/MicroBenchmarks/LoopVectorization/CMakeLists.txt b/MicroBenchmarks/LoopVectorization/CMakeLists.txt index 6bb5a2e3b1..1c7a03eada 100644 --- a/MicroBenchmarks/LoopVectorization/CMakeLists.txt +++ b/MicroBenchmarks/LoopVectorization/CMakeLists.txt @@ -17,3 +17,12 @@ llvm_test_executable(LoopVectorizationBenchmarks ) target_link_libraries(LoopVectorizationBenchmarks benchmark) + +llvm_test_run() + +llvm_test_executable(LoopInterleavingBenchmarks + main.cpp + LoopInterleaving.cpp +) + +target_link_libraries(LoopInterleavingBenchmarks benchmark) diff --git a/MicroBenchmarks/LoopVectorization/LoopInterleaving.cpp b/MicroBenchmarks/LoopVectorization/LoopInterleaving.cpp new file mode 100644 index 0000000000..2e8111c9a3 --- /dev/null +++ b/MicroBenchmarks/LoopVectorization/LoopInterleaving.cpp @@ -0,0 +1,403 @@ +// This program tests performance impact of Interleaving Count with varying loop +// iteration count for different types of loops, such as loops with or +// without reductions inside it, loops with different vectorization widths. +#include +#include +#include + +#include "benchmark/benchmark.h" + +#define ELEMENTS 2048 +#define ALIGNED16 __attribute__((aligned(16))) + +static std::mt19937 rng; +unsigned int g_sum = 0; + +int A[ELEMENTS] ALIGNED16; +int B[ELEMENTS] ALIGNED16; +int C[ELEMENTS] ALIGNED16; +int D[ELEMENTS] ALIGNED16; +int E[ELEMENTS] ALIGNED16; +int F[ELEMENTS] ALIGNED16; + +// Initialize arrays with random numbers. +static void init_data(unsigned N) { + std::uniform_int_distribution distrib(std::numeric_limits::min(), + std::numeric_limits::max()); + for (unsigned I = 0; I < N; I++) { + A[I] = distrib(rng); + B[I] = distrib(rng); + C[I] = distrib(rng); + D[I] = distrib(rng); + E[I] = distrib(rng); + F[I] = distrib(rng); + } +} + +static void __attribute__((always_inline)) +runBenchForLoopInterleaving(benchmark::State &state, int (*Fn)(int), + int Iterations) { + std::uniform_int_distribution distrib(std::numeric_limits::min(), + std::numeric_limits::max()); + init_data(ELEMENTS); + for (auto _ : state) { + benchmark::DoNotOptimize(A); + benchmark::DoNotOptimize(B); + benchmark::DoNotOptimize(C); + benchmark::DoNotOptimize(D); + benchmark::DoNotOptimize(E); + benchmark::DoNotOptimize(F); + benchmark::ClobberMemory(); + g_sum += Fn(Iterations); + } +} + +#define STRINGIFY(a) #a + +// Loops without Reduction with different vectorization configurations + +static int __attribute__((noinline)) loopNoReductionAutoVec(int Iterations) { +#pragma clang loop unroll(disable) + for (int J = 0; J < Iterations; J++) { + A[J] = B[J] + C[J]; + } + return 0; +} + +static int __attribute__((noinline)) bigLoopNoReductionAutoVec(int Iterations) { +#pragma clang loop unroll(disable) + for (int J = 0; J < Iterations; J++) { + A[J] = B[J] + C[J]; + D[J]++; + E[J] *= 2; + F[J] /= 5; + } + return 0; +} + +#define loopNoReductionWithVecHint(vw, ic) \ + static int __attribute__((noinline)) \ + loopWithVW##vw##IC##ic(int Iterations) { \ + _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count( \ + ic))) for (int J = 0; J < Iterations; J++) { \ + A[J] = B[J] + C[J]; \ + } \ + return 0; \ + } + +#define bigLoopNoReductionWithVecHint(vw, ic) \ + static int __attribute__((noinline)) \ + bigLoopWithVW##vw##IC##ic(int Iterations) { \ + _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count( \ + ic))) for (int J = 0; J < Iterations; J++) { \ + A[J] = B[J] + C[J]; \ + D[J]++; \ + E[J] *= 2; \ + F[J] /= 5; \ + } \ + return 0; \ + } + +// Loops with Reduction with different vectorization configurations + +static int __attribute__((noinline)) loopWithReductionAutoVec(int Iterations) { + unsigned sum = 0; +#pragma clang loop unroll(disable) + for (int J = 0; J < Iterations; J++) { + sum += A[J]; + } + return sum; +} + +static int __attribute__((noinline)) +bigLoopWithReductionAutoVec(int Iterations) { + unsigned sum = 0; +#pragma clang loop unroll(disable) + for (int J = 0; J < Iterations; J++) { + sum += A[J]; + D[J]++; + E[J] *= 2; + F[J] /= 5; + } + return sum; +} + +#define loopWithReductionWithVecHint(vw, ic) \ + static int __attribute__((noinline)) \ + loopWithReductionWithVW##vw##IC##ic(int Iterations) { \ + unsigned sum = 0; \ + _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count( \ + ic))) for (int J = 0; J < Iterations; J++) { \ + sum += A[J]; \ + } \ + return sum; \ + } + +#define bigLoopWithReductionWithVecHint(vw, ic) \ + static int __attribute__((noinline)) \ + bigLoopWithReductionWithVW##vw##IC##ic(int Iterations) { \ + unsigned sum = 0; \ + _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count( \ + ic))) for (int J = 0; J < Iterations; J++) { \ + sum += A[J]; \ + D[J]++; \ + E[J] *= 2; \ + F[J] /= 5; \ + } \ + return sum; \ + } + +// We are evaluating 4 types of loops for different vectorization configurations +// 1) Loops without reductions +// 2) Loops with reductions +// 3) Bigger loop bodies without reductions +// 4) Bigger loop bodies with some reductions +// For each, we are evaluating the following vectorization configurations of +// vectorization width (VW), interleaving count (IC): +// 1) automatically selected by the compiler (without vectorization hint) +// 2) VW=4, IC=1 +// 3) VW=4, IC=2 +// 4) VW=4, IC=4 +// 5) VW=1, IC=1 +// 6) VW=1, IC=2 +// 7) VW=1, IC=4 +// Of these, configurations 5-7 are skipped for loop type 1 & 3). +// Creating a function for the above configurations with different Vectorization +// Hints: +loopNoReductionWithVecHint(4, 1); +loopNoReductionWithVecHint(4, 2); +loopNoReductionWithVecHint(4, 4); +loopWithReductionWithVecHint(4, 1); +loopWithReductionWithVecHint(4, 2); +loopWithReductionWithVecHint(4, 4); +loopWithReductionWithVecHint(1, 1); +loopWithReductionWithVecHint(1, 2); +loopWithReductionWithVecHint(1, 4); +bigLoopNoReductionWithVecHint(4, 1); +bigLoopNoReductionWithVecHint(4, 2); +bigLoopNoReductionWithVecHint(4, 4); +bigLoopWithReductionWithVecHint(4, 1); +bigLoopWithReductionWithVecHint(4, 2); +bigLoopWithReductionWithVecHint(4, 4); +bigLoopWithReductionWithVecHint(1, 1); +bigLoopWithReductionWithVecHint(1, 2); +bigLoopWithReductionWithVecHint(1, 4); + +#define ADD_BENCHMARK(Itr) \ + void benchAutoVecForLoopTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &loopNoReductionAutoVec, Itr); \ + } \ + BENCHMARK(benchAutoVecForLoopTC##Itr); \ + void benchForIC1VW4LoopTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &loopWithVW4IC1, Itr); \ + } \ + BENCHMARK(benchForIC1VW4LoopTC##Itr); \ + void benchForIC2VW4LoopTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &loopWithVW4IC2, Itr); \ + } \ + BENCHMARK(benchForIC2VW4LoopTC##Itr); \ + void benchForIC4VW4LoopTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &loopWithVW4IC4, Itr); \ + } \ + BENCHMARK(benchForIC4VW4LoopTC##Itr); \ + void benchForLoopWithReductionAutoVecTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &loopWithReductionAutoVec, Itr); \ + } \ + BENCHMARK(benchForLoopWithReductionAutoVecTC##Itr); \ + void benchForIC1VW4LoopWithReductionTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &loopWithReductionWithVW4IC1, Itr); \ + } \ + BENCHMARK(benchForIC1VW4LoopWithReductionTC##Itr); \ + void benchForIC2VW4LoopWithReductionTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &loopWithReductionWithVW4IC2, Itr); \ + } \ + BENCHMARK(benchForIC2VW4LoopWithReductionTC##Itr); \ + void benchForIC4VW4LoopWithReductionTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &loopWithReductionWithVW4IC4, Itr); \ + } \ + BENCHMARK(benchForIC4VW4LoopWithReductionTC##Itr); \ + void benchForIC1VW1LoopWithReductionTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &loopWithReductionWithVW1IC1, Itr); \ + } \ + BENCHMARK(benchForIC1VW1LoopWithReductionTC##Itr); \ + void benchForIC2VW1LoopWithReductionTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &loopWithReductionWithVW1IC2, Itr); \ + } \ + BENCHMARK(benchForIC2VW1LoopWithReductionTC##Itr); \ + void benchForIC4VW1LoopWithReductionTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &loopWithReductionWithVW1IC4, Itr); \ + } \ + BENCHMARK(benchForIC4VW1LoopWithReductionTC##Itr); \ + void benchAutoVecForBigLoopTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &loopNoReductionAutoVec, Itr); \ + } \ + BENCHMARK(benchAutoVecForBigLoopTC##Itr); \ + void benchForIC1VW4BigLoopTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &bigLoopWithVW4IC1, Itr); \ + } \ + BENCHMARK(benchForIC1VW4BigLoopTC##Itr); \ + void benchForIC2VW4BigLoopTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &bigLoopWithVW4IC2, Itr); \ + } \ + BENCHMARK(benchForIC2VW4BigLoopTC##Itr); \ + void benchForIC4VW4BigLoopTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &bigLoopWithVW4IC4, Itr); \ + } \ + BENCHMARK(benchForIC4VW4BigLoopTC##Itr); \ + void benchForBigLoopWithReductionAutoVecTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &bigLoopWithReductionAutoVec, Itr); \ + } \ + BENCHMARK(benchForBigLoopWithReductionAutoVecTC##Itr); \ + void benchForIC1VW4BigLoopWithReductionTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW4IC1, Itr); \ + } \ + BENCHMARK(benchForIC1VW4BigLoopWithReductionTC##Itr); \ + void benchForIC2VW4BigLoopWithReductionTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW4IC2, Itr); \ + } \ + BENCHMARK(benchForIC2VW4BigLoopWithReductionTC##Itr); \ + void benchForIC4VW4BigLoopWithReductionTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW4IC4, Itr); \ + } \ + BENCHMARK(benchForIC4VW4BigLoopWithReductionTC##Itr); \ + void benchForIC1VW1BigLoopWithReductionTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW1IC1, Itr); \ + } \ + BENCHMARK(benchForIC1VW1BigLoopWithReductionTC##Itr); \ + void benchForIC2VW1BigLoopWithReductionTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW1IC2, Itr); \ + } \ + BENCHMARK(benchForIC2VW1BigLoopWithReductionTC##Itr); \ + void benchForIC4VW1BigLoopWithReductionTC##Itr(benchmark::State &state) { \ + runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW1IC4, Itr); \ + } \ + BENCHMARK(benchForIC4VW1BigLoopWithReductionTC##Itr); + +ADD_BENCHMARK(1) +ADD_BENCHMARK(2) +ADD_BENCHMARK(3) +ADD_BENCHMARK(4) +ADD_BENCHMARK(5) +ADD_BENCHMARK(6) +ADD_BENCHMARK(7) +ADD_BENCHMARK(8) +ADD_BENCHMARK(9) +ADD_BENCHMARK(10) +ADD_BENCHMARK(11) +ADD_BENCHMARK(12) +ADD_BENCHMARK(13) +ADD_BENCHMARK(14) +ADD_BENCHMARK(15) +ADD_BENCHMARK(16) +ADD_BENCHMARK(17) +ADD_BENCHMARK(18) +ADD_BENCHMARK(19) +ADD_BENCHMARK(20) +ADD_BENCHMARK(21) +ADD_BENCHMARK(22) +ADD_BENCHMARK(23) +ADD_BENCHMARK(24) +ADD_BENCHMARK(25) +ADD_BENCHMARK(26) +ADD_BENCHMARK(27) +ADD_BENCHMARK(28) +ADD_BENCHMARK(29) +ADD_BENCHMARK(30) +ADD_BENCHMARK(31) +ADD_BENCHMARK(32) +ADD_BENCHMARK(33) +ADD_BENCHMARK(34) +ADD_BENCHMARK(35) +ADD_BENCHMARK(36) +ADD_BENCHMARK(37) +ADD_BENCHMARK(38) +ADD_BENCHMARK(39) +ADD_BENCHMARK(40) +ADD_BENCHMARK(41) +ADD_BENCHMARK(42) +ADD_BENCHMARK(43) +ADD_BENCHMARK(44) +ADD_BENCHMARK(45) +ADD_BENCHMARK(46) +ADD_BENCHMARK(47) +ADD_BENCHMARK(48) +ADD_BENCHMARK(49) +ADD_BENCHMARK(50) +ADD_BENCHMARK(51) +ADD_BENCHMARK(52) +ADD_BENCHMARK(53) +ADD_BENCHMARK(54) +ADD_BENCHMARK(55) +ADD_BENCHMARK(56) +ADD_BENCHMARK(57) +ADD_BENCHMARK(58) +ADD_BENCHMARK(59) +ADD_BENCHMARK(60) +ADD_BENCHMARK(61) +ADD_BENCHMARK(62) +ADD_BENCHMARK(63) +ADD_BENCHMARK(64) +ADD_BENCHMARK(65) +ADD_BENCHMARK(66) +ADD_BENCHMARK(67) +ADD_BENCHMARK(68) +ADD_BENCHMARK(69) +ADD_BENCHMARK(70) +ADD_BENCHMARK(71) +ADD_BENCHMARK(72) +ADD_BENCHMARK(73) +ADD_BENCHMARK(74) +ADD_BENCHMARK(75) +ADD_BENCHMARK(76) +ADD_BENCHMARK(77) +ADD_BENCHMARK(78) +ADD_BENCHMARK(79) +ADD_BENCHMARK(80) +ADD_BENCHMARK(81) +ADD_BENCHMARK(82) +ADD_BENCHMARK(83) +ADD_BENCHMARK(84) +ADD_BENCHMARK(85) +ADD_BENCHMARK(86) +ADD_BENCHMARK(87) +ADD_BENCHMARK(88) +ADD_BENCHMARK(89) +ADD_BENCHMARK(90) +ADD_BENCHMARK(91) +ADD_BENCHMARK(92) +ADD_BENCHMARK(93) +ADD_BENCHMARK(94) +ADD_BENCHMARK(95) +ADD_BENCHMARK(96) +ADD_BENCHMARK(97) +ADD_BENCHMARK(98) +ADD_BENCHMARK(99) +ADD_BENCHMARK(100) +ADD_BENCHMARK(101) +ADD_BENCHMARK(102) +ADD_BENCHMARK(103) +ADD_BENCHMARK(104) +ADD_BENCHMARK(105) +ADD_BENCHMARK(106) +ADD_BENCHMARK(107) +ADD_BENCHMARK(108) +ADD_BENCHMARK(109) +ADD_BENCHMARK(110) +ADD_BENCHMARK(111) +ADD_BENCHMARK(112) +ADD_BENCHMARK(113) +ADD_BENCHMARK(114) +ADD_BENCHMARK(115) +ADD_BENCHMARK(116) +ADD_BENCHMARK(117) +ADD_BENCHMARK(118) +ADD_BENCHMARK(119) +ADD_BENCHMARK(120) +ADD_BENCHMARK(121) +ADD_BENCHMARK(122) +ADD_BENCHMARK(123) +ADD_BENCHMARK(124) +ADD_BENCHMARK(125) +ADD_BENCHMARK(126) +ADD_BENCHMARK(127) +ADD_BENCHMARK(128)