[IR][LangRef] Add partial reduction add intrinsic #94499
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -104,6 +104,7 @@
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cstddef>
#include <deque>
#include <iterator>
#include <limits>
#include <optional>

@@ -7914,6 +7915,37 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
    setValue(&I, Trunc);
    return;
  }
  case Intrinsic::experimental_vector_partial_reduce_add: {
Review comment: I think we can pass this through as an INTRINSIC_WO_CHAIN node, at least for targets that support it.

Review comment: We need to be careful because I don't think common code exists to type-legalise arbitrary INTRINSIC_WO_CHAIN calls (given their nature). Presumably we'll just follow the precedent set for … I can't help but think at some point we'll just want to relax the "same element type" restriction of …
    SDValue OpNode = getValue(I.getOperand(1));
    EVT ReducedTy = EVT::getEVT(I.getType());
    EVT FullTy = OpNode.getValueType();

    unsigned Stride = ReducedTy.getVectorMinNumElements();
    unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride;

    // Collect all of the subvectors
    std::deque<SDValue> Subvectors;
    Subvectors.push_back(getValue(I.getOperand(0)));
    for (unsigned i = 0; i < ScaleFactor; i++) {
      auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, sdl);
      Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ReducedTy,
                                       {OpNode, SourceIndex}));
    }

    // Flatten the subvector tree
    while (Subvectors.size() > 1) {
      Subvectors.push_back(DAG.getNode(ISD::ADD, sdl, ReducedTy,
                                       {Subvectors[0], Subvectors[1]}));
      Subvectors.pop_front();
      Subvectors.pop_front();
    }

    assert(Subvectors.size() == 1 &&
           "There should only be one subvector after tree flattening");

    setValue(&I, Subvectors[0]);
    return;
  }
  case Intrinsic::experimental_cttz_elts: {
    auto DL = getCurSDLoc();
    SDValue Op = getValue(I.getOperand(0));
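A minimal sketch of what the lowering added above computes, expressed in IR terms (not part of the patch; the function name and the use of llvm.vector.extract are for illustration only): the wide operand is split into accumulator-sized pieces, which are then added into the accumulator as a tree.

; Sketch only: the expansion for a <vscale x 8 x i32> operand reduced into a
; <vscale x 4 x i32> accumulator (Stride = 4, ScaleFactor = 2).
define <vscale x 4 x i32> @partial_reduce_expansion_sketch(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %vec) {
  ; Extract the two accumulator-sized subvectors of %vec.
  %lo = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %vec, i64 0)
  %hi = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %vec, i64 4)
  ; Tree of adds: (%acc + %lo) first, then %hi folded in, matching the
  ; deque-based flattening in the SelectionDAGBuilder code.
  %sum0 = add <vscale x 4 x i32> %acc, %lo
  %sum1 = add <vscale x 4 x i32> %hi, %sum0
  ret <vscale x 4 x i32> %sum1
}

declare <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32>, i64 immarg)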
llvm/lib/IR/Verifier.cpp
@@ -6131,6 +6131,20 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
    }
    break;
  }
  case Intrinsic::experimental_vector_partial_reduce_add: {
Review comment: I guess my matcher class suggestion would remove the need for this code.

Review comment: See above for my 2c.
    VectorType *AccTy = cast<VectorType>(Call.getArgOperand(0)->getType());
    VectorType *VecTy = cast<VectorType>(Call.getArgOperand(1)->getType());

    unsigned VecWidth = VecTy->getElementCount().getKnownMinValue();
    unsigned AccWidth = AccTy->getElementCount().getKnownMinValue();

    Check((VecWidth % AccWidth) == 0,
          "Invalid vector widths for partial "
          "reduction. The width of the input vector "
          "must be a positive integer multiple of "
          "the width of the accumulator vector.");
    break;
  }
  case Intrinsic::experimental_noalias_scope_decl: {
    NoAliasScopeDecls.push_back(cast<IntrinsicInst>(&Call));
    break;
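To illustrate the new check, here is a hypothetical module that now fails verification (not taken from the patch's tests; the function name and types are made up): the operand has six elements, which is not a positive integer multiple of the four-element accumulator.

; Rejected by the verifier: 6 is not a multiple of 4, so the width check
; added above fires with the "Invalid vector widths for partial reduction" message.
define <4 x i32> @bad_partial_reduce(<4 x i32> %acc, <6 x i32> %vec) {
  %res = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v6i32(<4 x i32> %acc, <6 x i32> %vec)
  ret <4 x i32> %res
}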
New AArch64 CodeGen test (added file)
@@ -0,0 +1,83 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -force-vector-interleave=1 -o - %s | FileCheck %s

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"

define <4 x i32> @partial_reduce_add_fixed(<4 x i32> %accumulator, <4 x i32> %0) #0 {
; CHECK-LABEL: partial_reduce_add_fixed:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
entry:
  %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v4i32(<4 x i32> %accumulator, <4 x i32> %0)
  ret <4 x i32> %partial.reduce
}

define <4 x i32> @partial_reduce_add_fixed_half(<4 x i32> %accumulator, <8 x i32> %0) #0 {
; CHECK-LABEL: partial_reduce_add_fixed_half:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-NEXT: ret
entry:
  %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accumulator, <8 x i32> %0)
  ret <4 x i32> %partial.reduce
}

define <vscale x 4 x i32> @partial_reduce_add(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0) #0 {
; CHECK-LABEL: partial_reduce_add:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEXT: ret
entry:
  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 4 x i32> @partial_reduce_add_half(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0) #0 {
; CHECK-LABEL: partial_reduce_add_half:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEXT: add z0.s, z2.s, z0.s
; CHECK-NEXT: ret
entry:
  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 4 x i32> @partial_reduce_add_quart(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
Review comment: This is reducing into the first 4 elements of the accumulator; it doesn't work correctly with vscale.
; CHECK-LABEL: partial_reduce_add_quart:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEXT: add z2.s, z2.s, z3.s
; CHECK-NEXT: add z0.s, z4.s, z0.s
; CHECK-NEXT: add z0.s, z2.s, z0.s
; CHECK-NEXT: ret
entry:
  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 8 x i32> @partial_reduce_add_half_8(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
; CHECK-LABEL: partial_reduce_add_half_8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: add z0.s, z0.s, z2.s
; CHECK-NEXT: add z1.s, z1.s, z3.s
; CHECK-NEXT: add z0.s, z4.s, z0.s
; CHECK-NEXT: add z1.s, z5.s, z1.s
; CHECK-NEXT: ret
entry:
  %partial.reduce = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0)
  ret <vscale x 8 x i32> %partial.reduce
}

declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32>, <vscale x 8 x i32>)
declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
declare <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32>, <vscale x 16 x i32>)

declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
declare i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32>)

attributes #0 = { "target-features"="+sve2" }
Review comment: I think adding a new matcher class to constrain the second parameter to the restrictions you defined in the LangRef would be helpful (same element type, width an integer multiple).

Review comment: Given this is an experimental intrinsic, is it worth implementing that plumbing? Also, the matcher classes typically exist to allow for fewer explicit types when creating a call, which in this instance is not possible because both vector lengths are unknown (or, to put it another way, there's no 1-1 link between them). Personally I think the verifier route is better, plus it allows for a more user-friendly error message.
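The "no 1-1 link between them" point can be seen in the test file above: the same accumulator type is used with operands of several widths, so neither type determines the other. A short sketch (hypothetical function name; intrinsic declarations exactly as in the test file):

; The same <vscale x 4 x i32> accumulator/result type pairs with operands of
; different widths, all of which are valid calls of the intrinsic.
define <vscale x 4 x i32> @same_acc_different_operands(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %a, <vscale x 16 x i32> %b) {
  %r1 = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %a)
  %r2 = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %r1, <vscale x 16 x i32> %b)
  ret <vscale x 4 x i32> %r2
}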