From f365ac737c27d3c9e5978d3c64787749d89355a7 Mon Sep 17 00:00:00 2001 From: Nicholas Guy Date: Wed, 5 Jun 2024 16:43:04 +0100 Subject: [PATCH 01/12] Add partial reduction add intrinsic --- llvm/docs/LangRef.rst | 33 +++++++- llvm/include/llvm/IR/Intrinsics.td | 6 ++ .../SelectionDAG/SelectionDAGBuilder.cpp | 21 +++++ .../CodeGen/AArch64/partial-reduce-sdot-ir.ll | 76 +++++++++++++++++++ 4 files changed, 134 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 9d7ade8eb523b..95f839e35b673 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -14250,7 +14250,7 @@ Arguments: """""""""" The first 4 arguments are similar to ``llvm.instrprof.increment``. The indexing is specific to callsites, meaning callsites are indexed from 0, independent from -the indexes used by the other intrinsics (such as +the indexes used by the other intrinsics (such as ``llvm.instrprof.increment[.step]``). The last argument is the called value of the callsite this intrinsic precedes. @@ -14264,7 +14264,7 @@ a buffer LLVM can use to perform counter increments (i.e. the lowering of ``llvm.instrprof.increment[.step]``. The address range following the counter buffer, ```` x ``sizeof(ptr)`` - sized, is expected to contain pointers to contexts of functions called from this function ("subcontexts"). -LLVM does not dereference into that memory region, just calculates GEPs. +LLVM does not dereference into that memory region, just calculates GEPs. The lowering of ``llvm.instrprof.callsite`` consists of: @@ -19209,6 +19209,35 @@ will be on any later loop iteration. This intrinsic will only return 0 if the input count is also 0. A non-zero input count will produce a non-zero result. +'``llvm.experimental.vector.partial.reduce.add.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. 
+ +:: + + declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<8 x i32> %in) + declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<16 x i32> %in) + declare @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( %in) + declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %in) + +Overview: +""""""""" + +The '``llvm.vector.experimental.partial.reduce.add.*``' intrinsics do an integer +``ADD`` reduction of subvectors within a vector, returning each scalar result as +a lane within a vector. The return type is a vector type with an +element-type of the vector input and a width a factor of the vector input +(typically either half or quarter). + +Arguments: +"""""""""" + +The argument to this intrinsic must be a vector of integer values. + + '``llvm.experimental.vector.histogram.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 107442623ab7b..08c516bd1cea1 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2635,6 +2635,12 @@ def int_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType [llvm_anyvector_ty], [IntrNoMem]>; +//===-------------- Intrinsics to perform partial reduction ---------------===// + +def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; + //===----------------- Pointer Authentication Intrinsics ------------------===// // diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index ba76456b5836a..f24723a45237d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7914,6 +7914,27 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, setValue(&I, Trunc); return; } + case 
Intrinsic::experimental_vector_partial_reduce_add: { + auto DL = getCurSDLoc(); + auto ReducedTy = EVT::getEVT(I.getType()); + auto OpNode = getValue(I.getOperand(0)); + auto Index = DAG.getVectorIdxConstant(0, DL); + auto FullTy = OpNode.getValueType(); + + auto ResultVector = DAG.getSplat(ReducedTy, DL, DAG.getConstant(0, DL, ReducedTy.getScalarType())); + unsigned ScaleFactor = FullTy.getVectorMinNumElements() / ReducedTy.getVectorMinNumElements(); + + for(unsigned i = 0; i < ScaleFactor; i++) { + auto SourceIndex = DAG.getVectorIdxConstant(i * ScaleFactor, DL); + auto TargetIndex = DAG.getVectorIdxConstant(i, DL); + auto N = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, SourceIndex}); + N = DAG.getNode(ISD::VECREDUCE_ADD, DL, ReducedTy.getScalarType(), N); + ResultVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReducedTy, {ResultVector, N, TargetIndex}); + } + + setValue(&I, ResultVector); + return; + } case Intrinsic::experimental_cttz_elts: { auto DL = getCurSDLoc(); SDValue Op = getValue(I.getOperand(0)); diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll b/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll new file mode 100644 index 0000000000000..6a5b3bd5ace2e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll @@ -0,0 +1,76 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -force-vector-interleave=1 %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define void @partial_reduce_add( %wide.load.pre, %0, %1, i64 %index) #0 { +; CHECK-LABEL: partial_reduce_add: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: index z2.s, #0, #1 +; CHECK-NEXT: mov z4.s, w8 +; CHECK-NEXT: mov w8, #2 // =0x2 +; CHECK-NEXT: ptrue p2.s, vl1 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, 
#1, mul vl] +; CHECK-NEXT: ld1w { z5.s }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: mov z6.s, w8 +; CHECK-NEXT: cmpeq p1.s, p0/z, z2.s, z4.s +; CHECK-NEXT: uaddv d3, p0, z0.s +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: uaddv d7, p0, z1.s +; CHECK-NEXT: uaddv d4, p0, z5.s +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: fmov x8, d3 +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, #3, mul vl] +; CHECK-NEXT: mov z1.s, p2/m, w8 +; CHECK-NEXT: mov w8, #3 // =0x3 +; CHECK-NEXT: cmpeq p2.s, p0/z, z2.s, z6.s +; CHECK-NEXT: mov z5.s, w8 +; CHECK-NEXT: fmov x8, d7 +; CHECK-NEXT: uaddv d3, p0, z3.s +; CHECK-NEXT: mov z1.s, p1/m, w8 +; CHECK-NEXT: fmov x8, d4 +; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z5.s +; CHECK-NEXT: mov z1.s, p2/m, w8 +; CHECK-NEXT: fmov x8, d3 +; CHECK-NEXT: mov z1.s, p0/m, w8 +; CHECK-NEXT: addvl x8, x1, #1 +; CHECK-NEXT: .LBB0_1: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: cbnz x8, .LBB0_1 +; CHECK-NEXT: // %bb.2: // %middle.block +; CHECK-NEXT: ret +entry: + %2 = call i64 @llvm.vscale.i64() + %3 = mul i64 %2, 16 + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %4, %vector.body ] + %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %1) + %4 = or %partial.reduce, %vec.phi + %index.next = add i64 %index, %3 + %5 = icmp eq i64 %index.next, 0 + br i1 %5, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %6 = call i32 @llvm.vector.reduce.add.nxv4i32( %4) + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 
@llvm.vector.reduce.add.nxv4i32() #2 + +attributes #0 = { "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+v8a" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } From 102f9e40ddb33c2b4edaa0d1604d9d3dbe582928 Mon Sep 17 00:00:00 2001 From: Nicholas Guy Date: Mon, 10 Jun 2024 15:44:54 +0100 Subject: [PATCH 02/12] Change partial reduction intrinsic to take the accumulator as an operand --- llvm/docs/LangRef.rst | 22 ++- llvm/include/llvm/IR/Intrinsics.td | 4 +- .../SelectionDAG/SelectionDAGBuilder.cpp | 11 +- llvm/lib/IR/Verifier.cpp | 13 ++ .../CodeGen/AArch64/partial-reduce-sdot-ir.ll | 176 +++++++++++++----- 5 files changed, 164 insertions(+), 62 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 95f839e35b673..640b5062090f2 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -19218,24 +19218,26 @@ This is an overloaded intrinsic. :: - declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<8 x i32> %in) - declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<16 x i32> %in) - declare @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( %in) - declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %in) + declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accum, <8 x i32> %in) + declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %accum, <16 x i32> %in) + declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32( %accum, %in) + declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32( %accum, %in) Overview: """"""""" -The '``llvm.vector.experimental.partial.reduce.add.*``' intrinsics do an integer -``ADD`` reduction of subvectors within a vector, returning each scalar result as -a lane within a vector. 
The return type is a vector type with an -element-type of the vector input and a width a factor of the vector input -(typically either half or quarter). +The '``llvm.vector.experimental.partial.reduce.add.*``' intrinsics perform an integer +``ADD`` reduction of subvectors within a vector, before adding the resulting vector +to the provided accumulator vector. The return type is a vector type that matches +the type of the accumulator vector. Arguments: """""""""" -The argument to this intrinsic must be a vector of integer values. +The first argument is the accumulator vector, or a `zeroinitializer`. The type of +this argument must match the return type. The second argument is the vector to reduce +into the accumulator, the width of this vector must be a positive integer multiple of +the accumulator vector/return type. '``llvm.experimental.vector.histogram.*``' Intrinsic diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 08c516bd1cea1..5b3e3d2387463 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2637,8 +2637,8 @@ def int_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType //===-------------- Intrinsics to perform partial reduction ---------------===// -def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty], +def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>], + [llvm_anyvector_ty, llvm_anyvector_ty], [IntrNoMem]>; //===----------------- Pointer Authentication Intrinsics ------------------===// diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index f24723a45237d..b9f7aa80b48cc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7917,22 +7917,23 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case 
Intrinsic::experimental_vector_partial_reduce_add: { auto DL = getCurSDLoc(); auto ReducedTy = EVT::getEVT(I.getType()); - auto OpNode = getValue(I.getOperand(0)); - auto Index = DAG.getVectorIdxConstant(0, DL); + auto OpNode = getValue(I.getOperand(1)); auto FullTy = OpNode.getValueType(); - auto ResultVector = DAG.getSplat(ReducedTy, DL, DAG.getConstant(0, DL, ReducedTy.getScalarType())); + auto Accumulator = getValue(I.getOperand(0)); unsigned ScaleFactor = FullTy.getVectorMinNumElements() / ReducedTy.getVectorMinNumElements(); for(unsigned i = 0; i < ScaleFactor; i++) { auto SourceIndex = DAG.getVectorIdxConstant(i * ScaleFactor, DL); auto TargetIndex = DAG.getVectorIdxConstant(i, DL); + auto ExistingValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ReducedTy.getScalarType(), {Accumulator, TargetIndex}); auto N = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, SourceIndex}); N = DAG.getNode(ISD::VECREDUCE_ADD, DL, ReducedTy.getScalarType(), N); - ResultVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReducedTy, {ResultVector, N, TargetIndex}); + N = DAG.getNode(ISD::ADD, DL, ReducedTy.getScalarType(), ExistingValue, N); + Accumulator = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReducedTy, {Accumulator, N, TargetIndex}); } - setValue(&I, ResultVector); + setValue(&I, Accumulator); return; } case Intrinsic::experimental_cttz_elts: { diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 684e54444621b..21371bdd4bf6d 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6131,6 +6131,19 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } break; } + case Intrinsic::experimental_vector_partial_reduce_add: { + VectorType *AccTy = cast(Call.getArgOperand(0)->getType()); + VectorType *VecTy = cast(Call.getArgOperand(1)->getType()); + + auto VecWidth = VecTy->getElementCount().getKnownMinValue(); + auto AccWidth = AccTy->getElementCount().getKnownMinValue(); + + Check((VecWidth % AccWidth) == 0, "Invalid 
vector widths for partial " + "reduction. The width of the input vector " + "must be a postive integer multiple of " + "the width of the accumulator vector."); + break; + } case Intrinsic::experimental_noalias_scope_decl: { NoAliasScopeDecls.push_back(cast(&Call)); break; diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll b/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll index 6a5b3bd5ace2e..ccdcd1b740a27 100644 --- a/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll +++ b/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll @@ -4,72 +4,158 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-none-unknown-elf" -define void @partial_reduce_add( %wide.load.pre, %0, %1, i64 %index) #0 { +define <4 x i32> @partial_reduce_add_fixed(<4 x i32> %accumulator, <4 x i32> %0) #0 { +; CHECK-LABEL: partial_reduce_add_fixed: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: addv s1, v1.4s +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: add w8, w9, w8 +; CHECK-NEXT: mov v0.s[0], w8 +; CHECK-NEXT: ret +entry: + %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v4i32(<4 x i32> %accumulator, <4 x i32> %0) + ret <4 x i32> %partial.reduce +} + +define <4 x i32> @partial_reduce_add_fixed_half(<4 x i32> %accumulator, <8 x i32> %0) #0 { +; CHECK-LABEL: partial_reduce_add_fixed_half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: addv s1, v1.4s +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov w10, v0.s[1] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: add w9, w9, w8 +; CHECK-NEXT: add w8, w10, w8 +; CHECK-NEXT: mov v0.s[0], w9 +; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: ret +entry: + %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accumulator, <8 x i32> %0) + ret <4 x i32> %partial.reduce +} + +define @partial_reduce_add( %accumulator, %0) #0 { ; CHECK-LABEL: partial_reduce_add: ; CHECK: // %bb.0: // %entry ; 
CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: uaddv d1, p0, z1.s +; CHECK-NEXT: ptrue p0.s, vl1 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov z0.s, p0/m, w8 +; CHECK-NEXT: ret +entry: + %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32( %accumulator, %0) + ret %partial.reduce +} + +define @partial_reduce_add_half( %accumulator, %0) #0 { +; CHECK-LABEL: partial_reduce_add_half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z2.s, #0, #1 -; CHECK-NEXT: mov z4.s, w8 -; CHECK-NEXT: mov w8, #2 // =0x2 -; CHECK-NEXT: ptrue p2.s, vl1 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: ld1w { z5.s }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: mov z3.s, w8 +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: mov w9, v0.s[1] +; CHECK-NEXT: uaddv d1, p0, z1.s +; CHECK-NEXT: ptrue p1.s, vl1 +; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z3.s +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: add w10, w10, w8 +; CHECK-NEXT: add w8, w9, w8 +; CHECK-NEXT: mov z0.s, p1/m, w10 +; CHECK-NEXT: mov z0.s, p0/m, w8 +; CHECK-NEXT: ret +entry: + %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32( %accumulator, %0) + ret %partial.reduce +} + +define @partial_reduce_add_quart( %accumulator, %0) #0 { +; CHECK-LABEL: partial_reduce_add_quart: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: fmov w10, s0 ; CHECK-NEXT: mov z6.s, w8 -; CHECK-NEXT: cmpeq p1.s, p0/z, z2.s, z4.s -; CHECK-NEXT: uaddv d3, p0, z0.s -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: uaddv d7, p0, z1.s -; CHECK-NEXT: uaddv d4, p0, z5.s +; CHECK-NEXT: index z5.s, #0, #1 +; CHECK-NEXT: ptrue p2.s, vl1 +; CHECK-NEXT: uaddv d1, p0, z1.s +; CHECK-NEXT: mov w9, v0.s[1] +; CHECK-NEXT: uaddv d2, p0, z2.s +; CHECK-NEXT: uaddv d3, p0, z3.s +; CHECK-NEXT: cmpeq 
p1.s, p0/z, z5.s, z6.s +; CHECK-NEXT: uaddv d4, p0, z4.s +; CHECK-NEXT: fmov x8, d1 ; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: fmov x8, d3 -; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, #3, mul vl] +; CHECK-NEXT: add w8, w10, w8 +; CHECK-NEXT: mov w10, #2 // =0x2 ; CHECK-NEXT: mov z1.s, p2/m, w8 -; CHECK-NEXT: mov w8, #3 // =0x3 -; CHECK-NEXT: cmpeq p2.s, p0/z, z2.s, z6.s -; CHECK-NEXT: mov z5.s, w8 -; CHECK-NEXT: fmov x8, d7 -; CHECK-NEXT: uaddv d3, p0, z3.s +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: mov z6.s, w10 +; CHECK-NEXT: mov w10, v0.s[2] +; CHECK-NEXT: add w8, w9, w8 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: cmpeq p2.s, p0/z, z5.s, z6.s +; CHECK-NEXT: mov z2.s, w9 +; CHECK-NEXT: fmov x9, d3 ; CHECK-NEXT: mov z1.s, p1/m, w8 -; CHECK-NEXT: fmov x8, d4 -; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z5.s -; CHECK-NEXT: mov z1.s, p2/m, w8 -; CHECK-NEXT: fmov x8, d3 +; CHECK-NEXT: mov w8, v0.s[3] +; CHECK-NEXT: add w9, w10, w9 +; CHECK-NEXT: cmpeq p0.s, p0/z, z5.s, z2.s +; CHECK-NEXT: mov z1.s, p2/m, w9 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: add w8, w8, w9 ; CHECK-NEXT: mov z1.s, p0/m, w8 -; CHECK-NEXT: addvl x8, x1, #1 -; CHECK-NEXT: .LBB0_1: // %vector.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: orr z0.d, z1.d, z0.d -; CHECK-NEXT: cbnz x8, .LBB0_1 -; CHECK-NEXT: // %bb.2: // %middle.block +; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: - %2 = call i64 @llvm.vscale.i64() - %3 = mul i64 %2, 16 - br label %vector.body - -vector.body: ; preds = %vector.body, %entry - %vec.phi = phi [ zeroinitializer, %entry ], [ %4, %vector.body ] - %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %1) - %4 = or %partial.reduce, %vec.phi - %index.next = add i64 %index, %3 - %5 = icmp eq i64 %index.next, 0 - br i1 %5, label %middle.block, label %vector.body + %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32( %accumulator, %0) + ret %partial.reduce +} -middle.block: ; preds 
= %vector.body - %6 = call i32 @llvm.vector.reduce.add.nxv4i32( %4) - ret void +define @partial_reduce_add_half_8( %accumulator, %0) #0 { +; CHECK-LABEL: partial_reduce_add_half_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add z2.s, z2.s, z3.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: index z3.s, #0, #1 +; CHECK-NEXT: mov z4.s, w8 +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: mov w9, v0.s[1] +; CHECK-NEXT: ptrue p1.s, vl1 +; CHECK-NEXT: uaddv d2, p0, z2.s +; CHECK-NEXT: cmpeq p0.s, p0/z, z3.s, z4.s +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: add w10, w10, w8 +; CHECK-NEXT: add w8, w9, w8 +; CHECK-NEXT: mov z0.s, p1/m, w10 +; CHECK-NEXT: mov z0.s, p0/m, w8 +; CHECK-NEXT: ret +entry: + %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32( %accumulator, %0) + ret %partial.reduce } ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) -declare i64 @llvm.vscale.i64() #1 +declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(, ) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(, ) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(, ) #1 ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) -declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32() #1 +declare @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(, ) #1 ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare i32 @llvm.vector.reduce.add.nxv4i32() #2 +declare i32 @llvm.vector.reduce.add.nxv8i32() #2 attributes #0 = { "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+v8a" } attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } From 
b0c126ec70ded502201854668d37926a8b526572 Mon Sep 17 00:00:00 2001 From: Nicholas Guy Date: Mon, 10 Jun 2024 16:00:04 +0100 Subject: [PATCH 03/12] Rename test file --- .../{partial-reduce-sdot-ir.ll => partial-reduction-add.ll} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename llvm/test/CodeGen/AArch64/{partial-reduce-sdot-ir.ll => partial-reduction-add.ll} (100%) diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll similarity index 100% rename from llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll rename to llvm/test/CodeGen/AArch64/partial-reduction-add.ll From b8115585df7ca16a3fdb8f66016b4c9037d6865a Mon Sep 17 00:00:00 2001 From: Nicholas Guy Date: Wed, 12 Jun 2024 15:06:52 +0100 Subject: [PATCH 04/12] Fix docs build error --- llvm/docs/LangRef.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 640b5062090f2..5b89c3973ff18 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -19210,7 +19210,7 @@ This intrinsic will only return 0 if the input count is also 0. A non-zero input count will produce a non-zero result. 
'``llvm.experimental.vector.partial.reduce.add.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" From 0786587c7c45de756da55c20e47e45996ef243ea Mon Sep 17 00:00:00 2001 From: Nicholas Guy Date: Mon, 17 Jun 2024 13:35:04 +0100 Subject: [PATCH 05/12] Update LangRef.rst for partial reduction intrinsic --- llvm/docs/LangRef.rst | 21 ++++++++-------- .../CodeGen/AArch64/partial-reduction-add.ll | 24 ++++++------------- 2 files changed, 18 insertions(+), 27 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 5b89c3973ff18..0f77b8cc77f60 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -14250,7 +14250,7 @@ Arguments: """""""""" The first 4 arguments are similar to ``llvm.instrprof.increment``. The indexing is specific to callsites, meaning callsites are indexed from 0, independent from -the indexes used by the other intrinsics (such as +the indexes used by the other intrinsics (such as ``llvm.instrprof.increment[.step]``). The last argument is the called value of the callsite this intrinsic precedes. @@ -14264,7 +14264,7 @@ a buffer LLVM can use to perform counter increments (i.e. the lowering of ``llvm.instrprof.increment[.step]``. The address range following the counter buffer, ```` x ``sizeof(ptr)`` - sized, is expected to contain pointers to contexts of functions called from this function ("subcontexts"). -LLVM does not dereference into that memory region, just calculates GEPs. +LLVM does not dereference into that memory region, just calculates GEPs. The lowering of ``llvm.instrprof.callsite`` consists of: @@ -19226,18 +19226,19 @@ This is an overloaded intrinsic. Overview: """"""""" -The '``llvm.vector.experimental.partial.reduce.add.*``' intrinsics perform an integer -``ADD`` reduction of subvectors within a vector, before adding the resulting vector -to the provided accumulator vector. 
The return type is a vector type that matches -the type of the accumulator vector. +The '``llvm.vector.experimental.partial.reduce.add.*``' intrinsics reduce the +input vector down to the number of elements dictated by the result vector, and +then adds the resulting vector to the accumulator vector. The return type is a +vector type that matches the type of the accumulator vector. Arguments: """""""""" -The first argument is the accumulator vector, or a `zeroinitializer`. The type of -this argument must match the return type. The second argument is the vector to reduce -into the accumulator, the width of this vector must be a positive integer multiple of -the accumulator vector/return type. +The first argument is the accumulator vector. The type of this argument must match the +return type. The second argument is the vector to reduceinto the accumulator, the length +of this vector must be a positive integer multiple of the accumulator vector/return type. +The arguments must be either be both fixed or both scalable vectors, and must have +matching element types. 
'``llvm.experimental.vector.histogram.*``' Intrinsic diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll index ccdcd1b740a27..325efbf757c47 100644 --- a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll +++ b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll @@ -141,22 +141,12 @@ entry: ret %partial.reduce } -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) -declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(, ) #1 +declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(, ) +declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(, ) +declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(, ) +declare @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(, ) -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) -declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(, ) #1 +declare i32 @llvm.vector.reduce.add.nxv4i32() +declare i32 @llvm.vector.reduce.add.nxv8i32() -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) -declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(, ) #1 - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) -declare @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(, ) #1 - -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare i32 @llvm.vector.reduce.add.nxv4i32() #2 -declare i32 @llvm.vector.reduce.add.nxv8i32() #2 - -attributes #0 = { "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+v8a" } -attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } -attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "target-features"="+sve2" } From a9a1028cbee9c04d6e976ffd3ea486e6fd389cb2 Mon 
Sep 17 00:00:00 2001 From: Nicholas Guy Date: Mon, 17 Jun 2024 15:31:39 +0100 Subject: [PATCH 06/12] Update LangRef.rst for partial reduction intrinsic --- llvm/docs/LangRef.rst | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 0f77b8cc77f60..076be0308c0a8 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -19218,27 +19218,26 @@ This is an overloaded intrinsic. :: - declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accum, <8 x i32> %in) - declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %accum, <16 x i32> %in) - declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32( %accum, %in) - declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32( %accum, %in) + declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b) + declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b) + declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32( %a, %b) + declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32( %a, %b) Overview: """"""""" The '``llvm.vector.experimental.partial.reduce.add.*``' intrinsics reduce the -input vector down to the number of elements dictated by the result vector, and -then adds the resulting vector to the accumulator vector. The return type is a -vector type that matches the type of the accumulator vector. +second operand vector down to the number of elements dictated by the result +vector, and then adds the resulting vector to the first operand vector. The +return type is a vector type that matches the type of the first operand vector. Arguments: """""""""" -The first argument is the accumulator vector. The type of this argument must match the -return type. 
The second argument is the vector to reduceinto the accumulator, the length -of this vector must be a positive integer multiple of the accumulator vector/return type. -The arguments must be either be both fixed or both scalable vectors, and must have -matching element types. +Both arguments must be vectors of matching element types. The first argument type must +match the return type, while the second argument type must have a vector length that is a +positive integer multiple of the first vector/return type. The arguments must be either be +both fixed or both scalable vectors. '``llvm.experimental.vector.histogram.*``' Intrinsic From c01d6c6f7def7d0917c3d67bcd560dc6118a1cab Mon Sep 17 00:00:00 2001 From: Nicholas Guy Date: Tue, 18 Jun 2024 11:52:21 +0100 Subject: [PATCH 07/12] Update LangRef.rst for partial reduction intrinsic --- llvm/docs/LangRef.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 076be0308c0a8..02537fe835083 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -19227,16 +19227,16 @@ Overview: """"""""" The '``llvm.vector.experimental.partial.reduce.add.*``' intrinsics reduce the -second operand vector down to the number of elements dictated by the result -vector, and then adds the resulting vector to the first operand vector. The -return type is a vector type that matches the type of the first operand vector. +concatenation of the two vector operands down to the number of elements dictated +by the result type. The result type is a vector type that matches the type of the +first operand vector. Arguments: """""""""" Both arguments must be vectors of matching element types. The first argument type must -match the return type, while the second argument type must have a vector length that is a -positive integer multiple of the first vector/return type. 
The arguments must be either be +match the result type, while the second argument type must have a vector length that is a +positive integer multiple of the first vector/result type. The arguments must either be both fixed or both scalable vectors. From fadffccc476e18562f12e768c071904d80f9c913 Mon Sep 17 00:00:00 2001 From: Nicholas Guy Date: Thu, 20 Jun 2024 16:27:25 +0100 Subject: [PATCH 08/12] Implement generic lowering for the partial reduction intrinsic --- .../SelectionDAG/SelectionDAGBuilder.cpp | 38 +++++-- .../CodeGen/AArch64/partial-reduction-add.ll | 99 +++---------------- 2 files changed, 43 insertions(+), 94 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index b9f7aa80b48cc..7e7b9eba96510 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7920,20 +7920,38 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, auto OpNode = getValue(I.getOperand(1)); auto FullTy = OpNode.getValueType(); - auto Accumulator = getValue(I.getOperand(0)); - unsigned ScaleFactor = FullTy.getVectorMinNumElements() / ReducedTy.getVectorMinNumElements(); + unsigned Stride = ReducedTy.getVectorMinNumElements(); + unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride; + // Collect all of the subvectors + SmallVector Subvectors; + Subvectors.push_back(getValue(I.getOperand(0))); for(unsigned i = 0; i < ScaleFactor; i++) { - auto SourceIndex = DAG.getVectorIdxConstant(i * ScaleFactor, DL); - auto TargetIndex = DAG.getVectorIdxConstant(i, DL); - auto ExistingValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ReducedTy.getScalarType(), {Accumulator, TargetIndex}); - auto N = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, SourceIndex}); - N = DAG.getNode(ISD::VECREDUCE_ADD, DL, ReducedTy.getScalarType(), N); - N = DAG.getNode(ISD::ADD, DL, ReducedTy.getScalarType(), ExistingValue,
N); - Accumulator = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReducedTy, {Accumulator, N, TargetIndex}); + auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, DL); + Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, SourceIndex})); + } + + while(Subvectors.size() >= 2) { + SmallVector NewSubvectors; + for(unsigned i = 0; i < Subvectors.size(); i+=2) { + unsigned j = i + 1; + auto A = Subvectors[i]; + if(j >= Subvectors.size()) { + unsigned OldLastIdx = NewSubvectors.size()-1; + auto OldLast = NewSubvectors[OldLastIdx]; + NewSubvectors[OldLastIdx] = DAG.getNode(ISD::ADD, DL, ReducedTy, {OldLast, A}); + break; + } + auto B = Subvectors[j]; + auto Node = DAG.getNode(ISD::ADD, DL, ReducedTy, {A, B}); + NewSubvectors.push_back(Node); + } + Subvectors = NewSubvectors; } + + assert(Subvectors.size() == 1 && "There should only be one subvector after tree flattening"); - setValue(&I, Accumulator); + setValue(&I, Subvectors[0]); return; } case Intrinsic::experimental_cttz_elts: { diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll index 325efbf757c47..ee57026eeb350 100644 --- a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll +++ b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -force-vector-interleave=1 %s | FileCheck %s +; RUN: llc -force-vector-interleave=1 -o - %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-none-unknown-elf" @@ -7,11 +7,7 @@ target triple = "aarch64-none-unknown-elf" define <4 x i32> @partial_reduce_add_fixed(<4 x i32> %accumulator, <4 x i32> %0) #0 { ; CHECK-LABEL: partial_reduce_add_fixed: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addv s1, v1.4s -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: add w8, w9, w8 -; CHECK-NEXT: mov 
v0.s[0], w8 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret entry: %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v4i32(<4 x i32> %accumulator, <4 x i32> %0) @@ -21,14 +17,8 @@ entry: define <4 x i32> @partial_reduce_add_fixed_half(<4 x i32> %accumulator, <8 x i32> %0) #0 { ; CHECK-LABEL: partial_reduce_add_fixed_half: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addv s1, v1.4s -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov w10, v0.s[1] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: add w9, w9, w8 -; CHECK-NEXT: add w8, w10, w8 -; CHECK-NEXT: mov v0.s[0], w9 -; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accumulator, <8 x i32> %0) @@ -38,13 +28,7 @@ entry: define @partial_reduce_add( %accumulator, %0) #0 { ; CHECK-LABEL: partial_reduce_add: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: uaddv d1, p0, z1.s -; CHECK-NEXT: ptrue p0.s, vl1 -; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: mov z0.s, p0/m, w8 +; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: ret entry: %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32( %accumulator, %0) @@ -54,20 +38,8 @@ entry: define @partial_reduce_add_half( %accumulator, %0) #0 { ; CHECK-LABEL: partial_reduce_add_half: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov w8, #1 // =0x1 -; CHECK-NEXT: index z2.s, #0, #1 -; CHECK-NEXT: mov z3.s, w8 -; CHECK-NEXT: fmov w10, s0 -; CHECK-NEXT: mov w9, v0.s[1] -; CHECK-NEXT: uaddv d1, p0, z1.s -; CHECK-NEXT: ptrue p1.s, vl1 -; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z3.s -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: add w10, w10, w8 -; CHECK-NEXT: add w8, w9, w8 -; CHECK-NEXT: mov z0.s, p1/m, w10 -; CHECK-NEXT: mov z0.s, p0/m, w8 
+; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: add z0.s, z0.s, z2.s ; CHECK-NEXT: ret entry: %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32( %accumulator, %0) @@ -77,40 +49,10 @@ entry: define @partial_reduce_add_quart( %accumulator, %0) #0 { ; CHECK-LABEL: partial_reduce_add_quart: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov w8, #1 // =0x1 -; CHECK-NEXT: fmov w10, s0 -; CHECK-NEXT: mov z6.s, w8 -; CHECK-NEXT: index z5.s, #0, #1 -; CHECK-NEXT: ptrue p2.s, vl1 -; CHECK-NEXT: uaddv d1, p0, z1.s -; CHECK-NEXT: mov w9, v0.s[1] -; CHECK-NEXT: uaddv d2, p0, z2.s -; CHECK-NEXT: uaddv d3, p0, z3.s -; CHECK-NEXT: cmpeq p1.s, p0/z, z5.s, z6.s -; CHECK-NEXT: uaddv d4, p0, z4.s -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: add w8, w10, w8 -; CHECK-NEXT: mov w10, #2 // =0x2 -; CHECK-NEXT: mov z1.s, p2/m, w8 -; CHECK-NEXT: fmov x8, d2 -; CHECK-NEXT: mov z6.s, w10 -; CHECK-NEXT: mov w10, v0.s[2] -; CHECK-NEXT: add w8, w9, w8 -; CHECK-NEXT: mov w9, #3 // =0x3 -; CHECK-NEXT: cmpeq p2.s, p0/z, z5.s, z6.s -; CHECK-NEXT: mov z2.s, w9 -; CHECK-NEXT: fmov x9, d3 -; CHECK-NEXT: mov z1.s, p1/m, w8 -; CHECK-NEXT: mov w8, v0.s[3] -; CHECK-NEXT: add w9, w10, w9 -; CHECK-NEXT: cmpeq p0.s, p0/z, z5.s, z2.s -; CHECK-NEXT: mov z1.s, p2/m, w9 -; CHECK-NEXT: fmov x9, d4 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: mov z1.s, p0/m, w8 -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: add z2.s, z2.s, z3.s +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: add z1.s, z2.s, z4.s +; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: ret entry: %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32( %accumulator, %0) @@ -120,21 +62,10 @@ entry: define @partial_reduce_add_half_8( %accumulator, %0) #0 { ; CHECK-LABEL: partial_reduce_add_half_8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: add z2.s, z2.s, z3.s -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov w8, #1 // =0x1 -; 
CHECK-NEXT: index z3.s, #0, #1 -; CHECK-NEXT: mov z4.s, w8 -; CHECK-NEXT: fmov w10, s0 -; CHECK-NEXT: mov w9, v0.s[1] -; CHECK-NEXT: ptrue p1.s, vl1 -; CHECK-NEXT: uaddv d2, p0, z2.s -; CHECK-NEXT: cmpeq p0.s, p0/z, z3.s, z4.s -; CHECK-NEXT: fmov x8, d2 -; CHECK-NEXT: add w10, w10, w8 -; CHECK-NEXT: add w8, w9, w8 -; CHECK-NEXT: mov z0.s, p1/m, w10 -; CHECK-NEXT: mov z0.s, p0/m, w8 +; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: add z1.s, z1.s, z3.s +; CHECK-NEXT: add z0.s, z0.s, z4.s +; CHECK-NEXT: add z1.s, z1.s, z5.s ; CHECK-NEXT: ret entry: %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32( %accumulator, %0) From 7c428bd884212993b9d9c5f274d251246bfa6842 Mon Sep 17 00:00:00 2001 From: Nicholas Guy Date: Tue, 25 Jun 2024 16:20:22 +0100 Subject: [PATCH 09/12] Use deque instead of SmallVector for generic lowering --- .../SelectionDAG/SelectionDAGBuilder.cpp | 24 ++++++------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 7e7b9eba96510..42c2b98b07306 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -108,6 +108,7 @@ #include #include #include +#include using namespace llvm; using namespace PatternMatch; @@ -7924,29 +7925,18 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride; // Collect all of the subvectors - SmallVector Subvectors; + std::deque Subvectors; Subvectors.push_back(getValue(I.getOperand(0))); for(unsigned i = 0; i < ScaleFactor; i++) { auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, DL); Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, SourceIndex})); } - while(Subvectors.size() >= 2) { - SmallVector NewSubvectors; - for(unsigned i = 0; i < Subvectors.size(); i+=2) { - 
unsigned j = i + 1; - auto A = Subvectors[i]; - if(j >= Subvectors.size()) { - unsigned OldLastIdx = NewSubvectors.size()-1; - auto OldLast = NewSubvectors[OldLastIdx]; - NewSubvectors[OldLastIdx] = DAG.getNode(ISD::ADD, DL, ReducedTy, {OldLast, A}); - break; - } - auto B = Subvectors[j]; - auto Node = DAG.getNode(ISD::ADD, DL, ReducedTy, {A, B}); - NewSubvectors.push_back(Node); - } - Subvectors = NewSubvectors; + // Flatten the subvector tree + while(Subvectors.size() > 1) { + Subvectors.push_back(DAG.getNode(ISD::ADD, DL, ReducedTy, {Subvectors[0], Subvectors[1]})); + Subvectors.pop_front(); + Subvectors.pop_front(); } assert(Subvectors.size() == 1 && "There should only be one subvector after tree flattening"); From 913ac876d4b3203fc19a4b12691ec96d24f1d7f6 Mon Sep 17 00:00:00 2001 From: Nicholas Guy Date: Wed, 3 Jul 2024 14:18:36 +0100 Subject: [PATCH 10/12] Address final comments --- .../CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 13 ++++++------- llvm/lib/IR/Verifier.cpp | 6 +++--- llvm/test/CodeGen/AArch64/partial-reduction-add.ll | 14 +++++++------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 42c2b98b07306..dc2a670085256 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7916,10 +7916,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; } case Intrinsic::experimental_vector_partial_reduce_add: { - auto DL = getCurSDLoc(); - auto ReducedTy = EVT::getEVT(I.getType()); - auto OpNode = getValue(I.getOperand(1)); - auto FullTy = OpNode.getValueType(); + SDValue OpNode = getValue(I.getOperand(1)); + EVT ReducedTy = EVT::getEVT(I.getType()); + EVT FullTy = OpNode.getValueType(); unsigned Stride = ReducedTy.getVectorMinNumElements(); unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride; @@ -7928,13 +7927,13 @@ void 
SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, std::deque Subvectors; Subvectors.push_back(getValue(I.getOperand(0))); for(unsigned i = 0; i < ScaleFactor; i++) { - auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, DL); - Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, SourceIndex})); + auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, sdl); + Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ReducedTy, {OpNode, SourceIndex})); } // Flatten the subvector tree while(Subvectors.size() > 1) { - Subvectors.push_back(DAG.getNode(ISD::ADD, DL, ReducedTy, {Subvectors[0], Subvectors[1]})); + Subvectors.push_back(DAG.getNode(ISD::ADD, sdl, ReducedTy, {Subvectors[0], Subvectors[1]})); Subvectors.pop_front(); Subvectors.pop_front(); } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 21371bdd4bf6d..c34389fb7ee01 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6135,12 +6135,12 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { VectorType *AccTy = cast(Call.getArgOperand(0)->getType()); VectorType *VecTy = cast(Call.getArgOperand(1)->getType()); - auto VecWidth = VecTy->getElementCount().getKnownMinValue(); - auto AccWidth = AccTy->getElementCount().getKnownMinValue(); + unsigned VecWidth = VecTy->getElementCount().getKnownMinValue(); + unsigned AccWidth = AccTy->getElementCount().getKnownMinValue(); Check((VecWidth % AccWidth) == 0, "Invalid vector widths for partial " "reduction. 
The width of the input vector " - "must be a postive integer multiple of " + "must be a positive integer multiple of " "the width of the accumulator vector."); break; } diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll index ee57026eeb350..ae681ee54e687 100644 --- a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll +++ b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll @@ -18,7 +18,7 @@ define <4 x i32> @partial_reduce_add_fixed_half(<4 x i32> %accumulator, <8 x i32 ; CHECK-LABEL: partial_reduce_add_fixed_half: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v0.4s, v2.4s, v0.4s ; CHECK-NEXT: ret entry: %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accumulator, <8 x i32> %0) @@ -39,7 +39,7 @@ define @partial_reduce_add_half( %accumulat ; CHECK-LABEL: partial_reduce_add_half: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: add z0.s, z0.s, z1.s -; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: add z0.s, z2.s, z0.s ; CHECK-NEXT: ret entry: %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32( %accumulator, %0) @@ -49,10 +49,10 @@ entry: define @partial_reduce_add_quart( %accumulator, %0) #0 { ; CHECK-LABEL: partial_reduce_add_quart: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: add z2.s, z2.s, z3.s -; CHECK-NEXT: add z0.s, z0.s, z1.s -; CHECK-NEXT: add z1.s, z2.s, z4.s ; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: add z2.s, z2.s, z3.s +; CHECK-NEXT: add z0.s, z4.s, z0.s +; CHECK-NEXT: add z0.s, z2.s, z0.s ; CHECK-NEXT: ret entry: %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32( %accumulator, %0) @@ -64,8 +64,8 @@ define @partial_reduce_add_half_8( %accumul ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: add z0.s, z0.s, z2.s ; CHECK-NEXT: add z1.s, z1.s, z3.s -; CHECK-NEXT: add z0.s, z0.s, 
z4.s -; CHECK-NEXT: add z1.s, z1.s, z5.s +; CHECK-NEXT: add z0.s, z4.s, z0.s +; CHECK-NEXT: add z1.s, z5.s, z1.s ; CHECK-NEXT: ret entry: %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32( %accumulator, %0) From 631208f94358e58c35e5e016c270f59a9d0b7fda Mon Sep 17 00:00:00 2001 From: Nicholas Guy Date: Wed, 3 Jul 2024 14:33:57 +0100 Subject: [PATCH 11/12] Fix formatting --- .../CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 13 ++++++++----- llvm/lib/IR/Verifier.cpp | 9 +++++---- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index dc2a670085256..075cb77d933e7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -104,11 +104,11 @@ #include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/Local.h" #include +#include #include #include #include #include -#include using namespace llvm; using namespace PatternMatch; @@ -7926,19 +7926,22 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, // Collect all of the subvectors std::deque Subvectors; Subvectors.push_back(getValue(I.getOperand(0))); - for(unsigned i = 0; i < ScaleFactor; i++) { + for (unsigned i = 0; i < ScaleFactor; i++) { auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, sdl); - Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ReducedTy, {OpNode, SourceIndex})); + Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ReducedTy, + {OpNode, SourceIndex})); } // Flatten the subvector tree while(Subvectors.size() > 1) { - Subvectors.push_back(DAG.getNode(ISD::ADD, sdl, ReducedTy, {Subvectors[0], Subvectors[1]})); + Subvectors.push_back(DAG.getNode(ISD::ADD, sdl, ReducedTy, + {Subvectors[0], Subvectors[1]})); Subvectors.pop_front(); Subvectors.pop_front(); } - assert(Subvectors.size() == 1 && "There should only be one 
subvector after tree flattening"); + assert(Subvectors.size() == 1 && + "There should only be one subvector after tree flattening"); setValue(&I, Subvectors[0]); return; diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index c34389fb7ee01..ddefeb5a03e97 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6138,10 +6138,11 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { unsigned VecWidth = VecTy->getElementCount().getKnownMinValue(); unsigned AccWidth = AccTy->getElementCount().getKnownMinValue(); - Check((VecWidth % AccWidth) == 0, "Invalid vector widths for partial " - "reduction. The width of the input vector " - "must be a positive integer multiple of " - "the width of the accumulator vector."); + Check((VecWidth % AccWidth) == 0, + "Invalid vector widths for partial " + "reduction. The width of the input vector " + "must be a positive integer multiple of " + "the width of the accumulator vector."); break; } case Intrinsic::experimental_noalias_scope_decl: { From f8eec059bb4e7706cdf525271e147f82e319fc89 Mon Sep 17 00:00:00 2001 From: Nicholas Guy Date: Wed, 3 Jul 2024 14:54:11 +0100 Subject: [PATCH 12/12] Fix formatting using git-clang-format --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 075cb77d933e7..76bac44f61e9c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7929,17 +7929,17 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, for (unsigned i = 0; i < ScaleFactor; i++) { auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, sdl); Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ReducedTy, - {OpNode, SourceIndex})); + {OpNode, SourceIndex})); } // Flatten the subvector tree - 
while(Subvectors.size() > 1) { + while (Subvectors.size() > 1) { Subvectors.push_back(DAG.getNode(ISD::ADD, sdl, ReducedTy, {Subvectors[0], Subvectors[1]})); Subvectors.pop_front(); Subvectors.pop_front(); } - + assert(Subvectors.size() == 1 && "There should only be one subvector after tree flattening");