From f365ac737c27d3c9e5978d3c64787749d89355a7 Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy@arm.com>
Date: Wed, 5 Jun 2024 16:43:04 +0100
Subject: [PATCH 01/12] Add partial reduction add intrinsic

---
 llvm/docs/LangRef.rst                         | 33 +++++++-
 llvm/include/llvm/IR/Intrinsics.td            |  6 ++
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 21 +++++
 .../CodeGen/AArch64/partial-reduce-sdot-ir.ll | 76 +++++++++++++++++++
 4 files changed, 134 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 9d7ade8eb523b..95f839e35b673 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -14250,7 +14250,7 @@ Arguments:
 """"""""""
 The first 4 arguments are similar to ``llvm.instrprof.increment``. The indexing
 is specific to callsites, meaning callsites are indexed from 0, independent from
-the indexes used by the other intrinsics (such as 
+the indexes used by the other intrinsics (such as
 ``llvm.instrprof.increment[.step]``).
 
 The last argument is the called value of the callsite this intrinsic precedes.
@@ -14264,7 +14264,7 @@ a buffer LLVM can use to perform counter increments (i.e. the lowering of
 ``llvm.instrprof.increment[.step]``. The address range following the counter
 buffer, ``<num-counters>`` x ``sizeof(ptr)`` - sized, is expected to contain
 pointers to contexts of functions called from this function ("subcontexts").
-LLVM does not dereference into that memory region, just calculates GEPs. 
+LLVM does not dereference into that memory region, just calculates GEPs.
 
 The lowering of ``llvm.instrprof.callsite`` consists of:
 
@@ -19209,6 +19209,35 @@ will be on any later loop iteration.
 This intrinsic will only return 0 if the input count is also 0. A non-zero input
 count will produce a non-zero result.
 
+'``llvm.experimental.vector.partial.reduce.add.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<8 x i32> %in)
+      declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<16 x i32> %in)
+      declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 8 x i32> %in)
+      declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 16 x i32> %in)
+
+Overview:
+"""""""""
+
+The '``llvm.vector.experimental.partial.reduce.add.*``' intrinsics do an integer
+``ADD`` reduction of subvectors within a vector, returning each scalar result as
+a lane within a vector. The return type is a vector type with an
+element-type of the vector input and a width a factor of the vector input
+(typically either half or quarter).
+
+Arguments:
+""""""""""
+
+The argument to this intrinsic must be a vector of integer values.
+
+
 '``llvm.experimental.vector.histogram.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 107442623ab7b..08c516bd1cea1 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2635,6 +2635,12 @@ def int_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType
                                                      [llvm_anyvector_ty],
                                                      [IntrNoMem]>;
 
+//===-------------- Intrinsics to perform partial reduction ---------------===//
+
+def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                                                                       [llvm_anyvector_ty],
+                                                                       [IntrNoMem]>;
+
 //===----------------- Pointer Authentication Intrinsics ------------------===//
 //
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ba76456b5836a..f24723a45237d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7914,6 +7914,27 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     setValue(&I, Trunc);
     return;
   }
+  case Intrinsic::experimental_vector_partial_reduce_add: {
+    auto DL = getCurSDLoc();
+    auto ReducedTy = EVT::getEVT(I.getType());
+    auto OpNode = getValue(I.getOperand(0));
+    auto Index = DAG.getVectorIdxConstant(0, DL);
+    auto FullTy = OpNode.getValueType();
+
+    auto ResultVector = DAG.getSplat(ReducedTy, DL, DAG.getConstant(0, DL, ReducedTy.getScalarType()));
+    unsigned ScaleFactor = FullTy.getVectorMinNumElements() / ReducedTy.getVectorMinNumElements();
+
+    for(unsigned i = 0; i < ScaleFactor; i++) {
+      auto SourceIndex = DAG.getVectorIdxConstant(i * ScaleFactor, DL);
+      auto TargetIndex = DAG.getVectorIdxConstant(i, DL);
+      auto N = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, SourceIndex});
+      N = DAG.getNode(ISD::VECREDUCE_ADD, DL, ReducedTy.getScalarType(), N);
+      ResultVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReducedTy, {ResultVector, N, TargetIndex});
+    }
+
+    setValue(&I, ResultVector);
+    return;
+  }
   case Intrinsic::experimental_cttz_elts: {
     auto DL = getCurSDLoc();
     SDValue Op = getValue(I.getOperand(0));
diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll b/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll
new file mode 100644
index 0000000000000..6a5b3bd5ace2e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -force-vector-interleave=1 %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define void @partial_reduce_add(<vscale x 16 x i8> %wide.load.pre, <vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %index) #0 {
+; CHECK-LABEL: partial_reduce_add:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    index z2.s, #0, #1
+; CHECK-NEXT:    mov z4.s, w8
+; CHECK-NEXT:    mov w8, #2 // =0x2
+; CHECK-NEXT:    ptrue p2.s, vl1
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ld1w { z5.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT:    mov z6.s, w8
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z2.s, z4.s
+; CHECK-NEXT:    uaddv d3, p0, z0.s
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    uaddv d7, p0, z1.s
+; CHECK-NEXT:    uaddv d4, p0, z5.s
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    fmov x8, d3
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT:    mov z1.s, p2/m, w8
+; CHECK-NEXT:    mov w8, #3 // =0x3
+; CHECK-NEXT:    cmpeq p2.s, p0/z, z2.s, z6.s
+; CHECK-NEXT:    mov z5.s, w8
+; CHECK-NEXT:    fmov x8, d7
+; CHECK-NEXT:    uaddv d3, p0, z3.s
+; CHECK-NEXT:    mov z1.s, p1/m, w8
+; CHECK-NEXT:    fmov x8, d4
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z2.s, z5.s
+; CHECK-NEXT:    mov z1.s, p2/m, w8
+; CHECK-NEXT:    fmov x8, d3
+; CHECK-NEXT:    mov z1.s, p0/m, w8
+; CHECK-NEXT:    addvl x8, x1, #1
+; CHECK-NEXT:  .LBB0_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    orr z0.d, z1.d, z0.d
+; CHECK-NEXT:    cbnz x8, .LBB0_1
+; CHECK-NEXT:  // %bb.2: // %middle.block
+; CHECK-NEXT:    ret
+entry:
+  %2 = call i64 @llvm.vscale.i64()
+  %3 = mul i64 %2, 16
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %4, %vector.body ]
+  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 16 x i32> %1)
+  %4 = or <vscale x 4 x i32> %partial.reduce, %vec.phi
+  %index.next = add i64 %index, %3
+  %5 = icmp eq i64 %index.next, 0
+  br i1 %5, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %6 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %4)
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare i64 @llvm.vscale.i64() #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 16 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>) #2
+
+attributes #0 = { "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+v8a" }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

From 102f9e40ddb33c2b4edaa0d1604d9d3dbe582928 Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy@arm.com>
Date: Mon, 10 Jun 2024 15:44:54 +0100
Subject: [PATCH 02/12] Change partial reduction intrinsic to take the
 accumulator as an operand

---
 llvm/docs/LangRef.rst                         |  22 ++-
 llvm/include/llvm/IR/Intrinsics.td            |   4 +-
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  11 +-
 llvm/lib/IR/Verifier.cpp                      |  13 ++
 .../CodeGen/AArch64/partial-reduce-sdot-ir.ll | 176 +++++++++++++-----
 5 files changed, 164 insertions(+), 62 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 95f839e35b673..640b5062090f2 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -19218,24 +19218,26 @@ This is an overloaded intrinsic.
 
 ::
 
-      declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<8 x i32> %in)
-      declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<16 x i32> %in)
-      declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 8 x i32> %in)
-      declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 16 x i32> %in)
+      declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accum, <8 x i32> %in)
+      declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %accum, <16 x i32> %in)
+      declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %accum, <vscale x 8 x i32> %in)
+      declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %accum, <vscale x 16 x i32> %in)
 
 Overview:
 """""""""
 
-The '``llvm.vector.experimental.partial.reduce.add.*``' intrinsics do an integer
-``ADD`` reduction of subvectors within a vector, returning each scalar result as
-a lane within a vector. The return type is a vector type with an
-element-type of the vector input and a width a factor of the vector input
-(typically either half or quarter).
+The '``llvm.vector.experimental.partial.reduce.add.*``' intrinsics perform an integer
+``ADD`` reduction of subvectors within a vector, before adding the resulting vector
+to the provided accumulator vector. The return type is a vector type that matches
+the type of the accumulator vector.
 
 Arguments:
 """"""""""
 
-The argument to this intrinsic must be a vector of integer values.
+The first argument is the accumulator vector, or a `zeroinitializer`. The type of
+this argument must match the return type. The second argument is the vector to reduce
+into the accumulator, the width of this vector must be a positive integer multiple of
+the accumulator vector/return type.
 
 
 '``llvm.experimental.vector.histogram.*``' Intrinsic
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 08c516bd1cea1..5b3e3d2387463 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2637,8 +2637,8 @@ def int_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType
 
 //===-------------- Intrinsics to perform partial reduction ---------------===//
 
-def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                                                                       [llvm_anyvector_ty],
+def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>],
+                                                                       [llvm_anyvector_ty, llvm_anyvector_ty],
                                                                        [IntrNoMem]>;
 
 //===----------------- Pointer Authentication Intrinsics ------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index f24723a45237d..b9f7aa80b48cc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7917,22 +7917,23 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
   case Intrinsic::experimental_vector_partial_reduce_add: {
     auto DL = getCurSDLoc();
     auto ReducedTy = EVT::getEVT(I.getType());
-    auto OpNode = getValue(I.getOperand(0));
-    auto Index = DAG.getVectorIdxConstant(0, DL);
+    auto OpNode = getValue(I.getOperand(1));
     auto FullTy = OpNode.getValueType();
 
-    auto ResultVector = DAG.getSplat(ReducedTy, DL, DAG.getConstant(0, DL, ReducedTy.getScalarType()));
+    auto Accumulator = getValue(I.getOperand(0));
     unsigned ScaleFactor = FullTy.getVectorMinNumElements() / ReducedTy.getVectorMinNumElements();
 
     for(unsigned i = 0; i < ScaleFactor; i++) {
       auto SourceIndex = DAG.getVectorIdxConstant(i * ScaleFactor, DL);
       auto TargetIndex = DAG.getVectorIdxConstant(i, DL);
+      auto ExistingValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ReducedTy.getScalarType(), {Accumulator, TargetIndex});
       auto N = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, SourceIndex});
       N = DAG.getNode(ISD::VECREDUCE_ADD, DL, ReducedTy.getScalarType(), N);
-      ResultVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReducedTy, {ResultVector, N, TargetIndex});
+      N = DAG.getNode(ISD::ADD, DL, ReducedTy.getScalarType(), ExistingValue, N);
+      Accumulator = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReducedTy, {Accumulator, N, TargetIndex});
     }
 
-    setValue(&I, ResultVector);
+    setValue(&I, Accumulator);
     return;
   }
   case Intrinsic::experimental_cttz_elts: {
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 684e54444621b..21371bdd4bf6d 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6131,6 +6131,19 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
     }
     break;
   }
+  case Intrinsic::experimental_vector_partial_reduce_add: {
+    VectorType *AccTy = cast<VectorType>(Call.getArgOperand(0)->getType());
+    VectorType *VecTy = cast<VectorType>(Call.getArgOperand(1)->getType());
+
+    auto VecWidth = VecTy->getElementCount().getKnownMinValue();
+    auto AccWidth = AccTy->getElementCount().getKnownMinValue();
+
+    Check((VecWidth % AccWidth) == 0, "Invalid vector widths for partial "
+                                      "reduction. The width of the input vector "
+                                      "must be a positive integer multiple of "
+                                      "the width of the accumulator vector.");
+    break;
+  }
   case Intrinsic::experimental_noalias_scope_decl: {
     NoAliasScopeDecls.push_back(cast<IntrinsicInst>(&Call));
     break;
diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll b/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll
index 6a5b3bd5ace2e..ccdcd1b740a27 100644
--- a/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll
+++ b/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll
@@ -4,72 +4,158 @@
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-none-unknown-elf"
 
-define void @partial_reduce_add(<vscale x 16 x i8> %wide.load.pre, <vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %index) #0 {
+define <4 x i32> @partial_reduce_add_fixed(<4 x i32> %accumulator, <4 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_fixed:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    addv s1, v1.4s
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    add w8, w9, w8
+; CHECK-NEXT:    mov v0.s[0], w8
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v4i32(<4 x i32> %accumulator, <4 x i32> %0)
+  ret <4 x i32> %partial.reduce
+}
+
+define <4 x i32> @partial_reduce_add_fixed_half(<4 x i32> %accumulator, <8 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_fixed_half:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    addv s1, v1.4s
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    mov w10, v0.s[1]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    add w9, w9, w8
+; CHECK-NEXT:    add w8, w10, w8
+; CHECK-NEXT:    mov v0.s[0], w9
+; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accumulator, <8 x i32> %0)
+  ret <4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0) #0 {
 ; CHECK-LABEL: partial_reduce_add:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    uaddv d1, p0, z1.s
+; CHECK-NEXT:    ptrue p0.s, vl1
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    add w8, w8, w9
+; CHECK-NEXT:    mov z0.s, p0/m, w8
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0)
+  ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add_half(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_half:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    index z2.s, #0, #1
-; CHECK-NEXT:    mov z4.s, w8
-; CHECK-NEXT:    mov w8, #2 // =0x2
-; CHECK-NEXT:    ptrue p2.s, vl1
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT:    ld1w { z5.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT:    mov z3.s, w8
+; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    mov w9, v0.s[1]
+; CHECK-NEXT:    uaddv d1, p0, z1.s
+; CHECK-NEXT:    ptrue p1.s, vl1
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z2.s, z3.s
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    add w10, w10, w8
+; CHECK-NEXT:    add w8, w9, w8
+; CHECK-NEXT:    mov z0.s, p1/m, w10
+; CHECK-NEXT:    mov z0.s, p0/m, w8
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0)
+  ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add_quart(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_quart:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    fmov w10, s0
 ; CHECK-NEXT:    mov z6.s, w8
-; CHECK-NEXT:    cmpeq p1.s, p0/z, z2.s, z4.s
-; CHECK-NEXT:    uaddv d3, p0, z0.s
-; CHECK-NEXT:    mov z0.s, #0 // =0x0
-; CHECK-NEXT:    uaddv d7, p0, z1.s
-; CHECK-NEXT:    uaddv d4, p0, z5.s
+; CHECK-NEXT:    index z5.s, #0, #1
+; CHECK-NEXT:    ptrue p2.s, vl1
+; CHECK-NEXT:    uaddv d1, p0, z1.s
+; CHECK-NEXT:    mov w9, v0.s[1]
+; CHECK-NEXT:    uaddv d2, p0, z2.s
+; CHECK-NEXT:    uaddv d3, p0, z3.s
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z5.s, z6.s
+; CHECK-NEXT:    uaddv d4, p0, z4.s
+; CHECK-NEXT:    fmov x8, d1
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    fmov x8, d3
-; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT:    add w8, w10, w8
+; CHECK-NEXT:    mov w10, #2 // =0x2
 ; CHECK-NEXT:    mov z1.s, p2/m, w8
-; CHECK-NEXT:    mov w8, #3 // =0x3
-; CHECK-NEXT:    cmpeq p2.s, p0/z, z2.s, z6.s
-; CHECK-NEXT:    mov z5.s, w8
-; CHECK-NEXT:    fmov x8, d7
-; CHECK-NEXT:    uaddv d3, p0, z3.s
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    mov z6.s, w10
+; CHECK-NEXT:    mov w10, v0.s[2]
+; CHECK-NEXT:    add w8, w9, w8
+; CHECK-NEXT:    mov w9, #3 // =0x3
+; CHECK-NEXT:    cmpeq p2.s, p0/z, z5.s, z6.s
+; CHECK-NEXT:    mov z2.s, w9
+; CHECK-NEXT:    fmov x9, d3
 ; CHECK-NEXT:    mov z1.s, p1/m, w8
-; CHECK-NEXT:    fmov x8, d4
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z2.s, z5.s
-; CHECK-NEXT:    mov z1.s, p2/m, w8
-; CHECK-NEXT:    fmov x8, d3
+; CHECK-NEXT:    mov w8, v0.s[3]
+; CHECK-NEXT:    add w9, w10, w9
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z5.s, z2.s
+; CHECK-NEXT:    mov z1.s, p2/m, w9
+; CHECK-NEXT:    fmov x9, d4
+; CHECK-NEXT:    add w8, w8, w9
 ; CHECK-NEXT:    mov z1.s, p0/m, w8
-; CHECK-NEXT:    addvl x8, x1, #1
-; CHECK-NEXT:  .LBB0_1: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    orr z0.d, z1.d, z0.d
-; CHECK-NEXT:    cbnz x8, .LBB0_1
-; CHECK-NEXT:  // %bb.2: // %middle.block
+; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
-  %2 = call i64 @llvm.vscale.i64()
-  %3 = mul i64 %2, 16
-  br label %vector.body
-
-vector.body:                                      ; preds = %vector.body, %entry
-  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %4, %vector.body ]
-  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 16 x i32> %1)
-  %4 = or <vscale x 4 x i32> %partial.reduce, %vec.phi
-  %index.next = add i64 %index, %3
-  %5 = icmp eq i64 %index.next, 0
-  br i1 %5, label %middle.block, label %vector.body
+  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0)
+  ret <vscale x 4 x i32> %partial.reduce
+}
 
-middle.block:                                     ; preds = %vector.body
-  %6 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %4)
-  ret void
+define <vscale x 8 x i32> @partial_reduce_add_half_8(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_half_8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add z2.s, z2.s, z3.s
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    index z3.s, #0, #1
+; CHECK-NEXT:    mov z4.s, w8
+; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    mov w9, v0.s[1]
+; CHECK-NEXT:    ptrue p1.s, vl1
+; CHECK-NEXT:    uaddv d2, p0, z2.s
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z3.s, z4.s
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    add w10, w10, w8
+; CHECK-NEXT:    add w8, w9, w8
+; CHECK-NEXT:    mov z0.s, p1/m, w10
+; CHECK-NEXT:    mov z0.s, p0/m, w8
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0)
+  ret <vscale x 8 x i32> %partial.reduce
 }
 
 ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
-declare i64 @llvm.vscale.i64() #1
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32>, <vscale x 8 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>) #1
 
 ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
-declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 16 x i32>) #1
+declare <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32>, <vscale x 16 x i32>) #1
 
 ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
 declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>) #2
+declare i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32>) #2
 
 attributes #0 = { "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+v8a" }
 attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }

From b0c126ec70ded502201854668d37926a8b526572 Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy@arm.com>
Date: Mon, 10 Jun 2024 16:00:04 +0100
Subject: [PATCH 03/12] Rename test file

---
 .../{partial-reduce-sdot-ir.ll => partial-reduction-add.ll}       | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename llvm/test/CodeGen/AArch64/{partial-reduce-sdot-ir.ll => partial-reduction-add.ll} (100%)

diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
similarity index 100%
rename from llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll
rename to llvm/test/CodeGen/AArch64/partial-reduction-add.ll

From b8115585df7ca16a3fdb8f66016b4c9037d6865a Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy@arm.com>
Date: Wed, 12 Jun 2024 15:06:52 +0100
Subject: [PATCH 04/12] Fix docs build error

---
 llvm/docs/LangRef.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 640b5062090f2..5b89c3973ff18 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -19210,7 +19210,7 @@ This intrinsic will only return 0 if the input count is also 0. A non-zero input
 count will produce a non-zero result.
 
 '``llvm.experimental.vector.partial.reduce.add.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""

From 0786587c7c45de756da55c20e47e45996ef243ea Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy@arm.com>
Date: Mon, 17 Jun 2024 13:35:04 +0100
Subject: [PATCH 05/12] Update LangRef.rst for partial reduction intrinsic

---
 llvm/docs/LangRef.rst                         | 21 ++++++++--------
 .../CodeGen/AArch64/partial-reduction-add.ll  | 24 ++++++-------------
 2 files changed, 18 insertions(+), 27 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 5b89c3973ff18..0f77b8cc77f60 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -14250,7 +14250,7 @@ Arguments:
 """"""""""
 The first 4 arguments are similar to ``llvm.instrprof.increment``. The indexing
 is specific to callsites, meaning callsites are indexed from 0, independent from
-the indexes used by the other intrinsics (such as
+the indexes used by the other intrinsics (such as 
 ``llvm.instrprof.increment[.step]``).
 
 The last argument is the called value of the callsite this intrinsic precedes.
@@ -14264,7 +14264,7 @@ a buffer LLVM can use to perform counter increments (i.e. the lowering of
 ``llvm.instrprof.increment[.step]``. The address range following the counter
 buffer, ``<num-counters>`` x ``sizeof(ptr)`` - sized, is expected to contain
 pointers to contexts of functions called from this function ("subcontexts").
-LLVM does not dereference into that memory region, just calculates GEPs.
+LLVM does not dereference into that memory region, just calculates GEPs. 
 
 The lowering of ``llvm.instrprof.callsite`` consists of:
 
@@ -19226,18 +19226,19 @@ This is an overloaded intrinsic.
 Overview:
 """""""""
 
-The '``llvm.vector.experimental.partial.reduce.add.*``' intrinsics perform an integer
-``ADD`` reduction of subvectors within a vector, before adding the resulting vector
-to the provided accumulator vector. The return type is a vector type that matches
-the type of the accumulator vector.
+The '``llvm.vector.experimental.partial.reduce.add.*``' intrinsics reduce the
+input vector down to the number of elements dictated by the result vector, and
+then adds the resulting vector to the accumulator vector. The return type is a
+vector type that matches the type of the accumulator vector.
 
 Arguments:
 """"""""""
 
-The first argument is the accumulator vector, or a `zeroinitializer`. The type of
-this argument must match the return type. The second argument is the vector to reduce
-into the accumulator, the width of this vector must be a positive integer multiple of
-the accumulator vector/return type.
+The first argument is the accumulator vector. The type of this argument must match the
+return type. The second argument is the vector to reduceinto the accumulator, the length
+of this vector must be a positive integer multiple of the accumulator vector/return type.
+The arguments must be either be both fixed or both scalable vectors, and must have
+matching element types.
 
 
 '``llvm.experimental.vector.histogram.*``' Intrinsic
diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
index ccdcd1b740a27..325efbf757c47 100644
--- a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
+++ b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
@@ -141,22 +141,12 @@ entry:
   ret <vscale x 8 x i32> %partial.reduce
 }
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
-declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>) #1
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32>, <vscale x 8 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
+declare <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32>, <vscale x 16 x i32>)
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
-declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32>, <vscale x 8 x i32>) #1
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32>)
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
-declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>) #1
-
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
-declare <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32>, <vscale x 16 x i32>) #1
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>) #2
-declare i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32>) #2
-
-attributes #0 = { "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+v8a" }
-attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
-attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #0 = { "target-features"="+sve2" }

From a9a1028cbee9c04d6e976ffd3ea486e6fd389cb2 Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy@arm.com>
Date: Mon, 17 Jun 2024 15:31:39 +0100
Subject: [PATCH 06/12] Update LangRef.rst for partial reduction intrinsic

---
 llvm/docs/LangRef.rst | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 0f77b8cc77f60..076be0308c0a8 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -19218,27 +19218,26 @@ This is an overloaded intrinsic.
 
 ::
 
-      declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accum, <8 x i32> %in)
-      declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %accum, <16 x i32> %in)
-      declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %accum, <vscale x 8 x i32> %in)
-      declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %accum, <vscale x 16 x i32> %in)
+      declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b)
+      declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b)
+      declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %a, <vscale x 8 x i32> %b)
+      declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %a, <vscale x 16 x i32> %b)
 
 Overview:
 """""""""
 
 The '``llvm.vector.experimental.partial.reduce.add.*``' intrinsics reduce the
-input vector down to the number of elements dictated by the result vector, and
-then adds the resulting vector to the accumulator vector. The return type is a
-vector type that matches the type of the accumulator vector.
+second operand vector down to the number of elements dictated by the result
+vector, and then adds the resulting vector to the first operand vector. The
+return type is a vector type that matches the type of the first operand vector.
 
 Arguments:
 """"""""""
 
-The first argument is the accumulator vector. The type of this argument must match the
-return type. The second argument is the vector to reduceinto the accumulator, the length
-of this vector must be a positive integer multiple of the accumulator vector/return type.
-The arguments must be either be both fixed or both scalable vectors, and must have
-matching element types.
+Both arguments must be vectors of matching element types. The first argument type must
+match the return type, while the second argument type must have a vector length that is a
+positive integer multiple of the first vector/return type. The arguments must be either be
+both fixed or both scalable vectors.
 
 
 '``llvm.experimental.vector.histogram.*``' Intrinsic

From c01d6c6f7def7d0917c3d67bcd560dc6118a1cab Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy@arm.com>
Date: Tue, 18 Jun 2024 11:52:21 +0100
Subject: [PATCH 07/12] Update LangRef.rst for partial reduction intrinsic

---
 llvm/docs/LangRef.rst | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 076be0308c0a8..02537fe835083 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -19227,16 +19227,16 @@ Overview:
 """""""""
 
 The '``llvm.vector.experimental.partial.reduce.add.*``' intrinsics reduce the
-second operand vector down to the number of elements dictated by the result
-vector, and then adds the resulting vector to the first operand vector. The
-return type is a vector type that matches the type of the first operand vector.
+concatenation of the two vector operands down to the number of elements dictated
+by the result type. The result type is a vector type that matches the type of the
+first operand vector.
 
 Arguments:
 """"""""""
 
 Both arguments must be vectors of matching element types. The first argument type must
-match the return type, while the second argument type must have a vector length that is a
-positive integer multiple of the first vector/return type. The arguments must be either be
+match the result type, while the second argument type must have a vector length that is a
+positive integer multiple of the first vector/result type. The arguments must either be
 both fixed or both scalable vectors.
 
 

From fadffccc476e18562f12e768c071904d80f9c913 Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy@arm.com>
Date: Thu, 20 Jun 2024 16:27:25 +0100
Subject: [PATCH 08/12] Implement generic lowering for the partial reduction
 intrinsic

---
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 38 +++++--
 .../CodeGen/AArch64/partial-reduction-add.ll  | 99 +++----------------
 2 files changed, 43 insertions(+), 94 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index b9f7aa80b48cc..7e7b9eba96510 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7920,20 +7920,38 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     auto OpNode = getValue(I.getOperand(1));
     auto FullTy = OpNode.getValueType();
 
-    auto Accumulator = getValue(I.getOperand(0));
-    unsigned ScaleFactor = FullTy.getVectorMinNumElements() / ReducedTy.getVectorMinNumElements();
+    unsigned Stride = ReducedTy.getVectorMinNumElements();
+    unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride;
 
+    // Collect all of the subvectors
+    SmallVector<SDValue> Subvectors;
+    Subvectors.push_back(getValue(I.getOperand(0)));
     for(unsigned i = 0; i < ScaleFactor; i++) {
-      auto SourceIndex = DAG.getVectorIdxConstant(i * ScaleFactor, DL);
-      auto TargetIndex = DAG.getVectorIdxConstant(i, DL);
-      auto ExistingValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ReducedTy.getScalarType(), {Accumulator, TargetIndex});
-      auto N = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, SourceIndex});
-      N = DAG.getNode(ISD::VECREDUCE_ADD, DL, ReducedTy.getScalarType(), N);
-      N = DAG.getNode(ISD::ADD, DL, ReducedTy.getScalarType(), ExistingValue, N);
-      Accumulator = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReducedTy, {Accumulator, N, TargetIndex});
+      auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, DL);
+      Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, SourceIndex}));
+    }
+
+    while(Subvectors.size() >= 2) {
+      SmallVector<SDValue> NewSubvectors;
+      for(unsigned i = 0; i < Subvectors.size(); i+=2) {
+        unsigned j = i + 1;
+        auto A = Subvectors[i];
+        if(j >= Subvectors.size()) {
+          unsigned OldLastIdx = NewSubvectors.size()-1;
+          auto OldLast = NewSubvectors[OldLastIdx];
+          NewSubvectors[OldLastIdx] = DAG.getNode(ISD::ADD, DL, ReducedTy, {OldLast, A});
+          break;
+        }
+        auto B = Subvectors[j];
+        auto Node = DAG.getNode(ISD::ADD, DL, ReducedTy, {A, B});
+        NewSubvectors.push_back(Node);
+      }
+      Subvectors = NewSubvectors;
     }
+    
+    assert(Subvectors.size() == 1 && "There should only be one subvector after tree flattening");
 
-    setValue(&I, Accumulator);
+    setValue(&I, Subvectors[0]);
     return;
   }
   case Intrinsic::experimental_cttz_elts: {
diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
index 325efbf757c47..ee57026eeb350 100644
--- a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
+++ b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -force-vector-interleave=1 %s | FileCheck %s
+; RUN: llc -force-vector-interleave=1 -o - %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-none-unknown-elf"
@@ -7,11 +7,7 @@ target triple = "aarch64-none-unknown-elf"
 define <4 x i32> @partial_reduce_add_fixed(<4 x i32> %accumulator, <4 x i32> %0) #0 {
 ; CHECK-LABEL: partial_reduce_add_fixed:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    addv s1, v1.4s
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    add w8, w9, w8
-; CHECK-NEXT:    mov v0.s[0], w8
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
 entry:
   %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v4i32(<4 x i32> %accumulator, <4 x i32> %0)
@@ -21,14 +17,8 @@ entry:
 define <4 x i32> @partial_reduce_add_fixed_half(<4 x i32> %accumulator, <8 x i32> %0) #0 {
 ; CHECK-LABEL: partial_reduce_add_fixed_half:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    addv s1, v1.4s
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov w10, v0.s[1]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    add w9, w9, w8
-; CHECK-NEXT:    add w8, w10, w8
-; CHECK-NEXT:    mov v0.s[0], w9
-; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    ret
 entry:
   %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accumulator, <8 x i32> %0)
@@ -38,13 +28,7 @@ entry:
 define <vscale x 4 x i32> @partial_reduce_add(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0) #0 {
 ; CHECK-LABEL: partial_reduce_add:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    uaddv d1, p0, z1.s
-; CHECK-NEXT:    ptrue p0.s, vl1
-; CHECK-NEXT:    fmov x9, d1
-; CHECK-NEXT:    add w8, w8, w9
-; CHECK-NEXT:    mov z0.s, p0/m, w8
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0)
@@ -54,20 +38,8 @@ entry:
 define <vscale x 4 x i32> @partial_reduce_add_half(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0) #0 {
 ; CHECK-LABEL: partial_reduce_add_half:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    index z2.s, #0, #1
-; CHECK-NEXT:    mov z3.s, w8
-; CHECK-NEXT:    fmov w10, s0
-; CHECK-NEXT:    mov w9, v0.s[1]
-; CHECK-NEXT:    uaddv d1, p0, z1.s
-; CHECK-NEXT:    ptrue p1.s, vl1
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z2.s, z3.s
-; CHECK-NEXT:    fmov x8, d1
-; CHECK-NEXT:    add w10, w10, w8
-; CHECK-NEXT:    add w8, w9, w8
-; CHECK-NEXT:    mov z0.s, p1/m, w10
-; CHECK-NEXT:    mov z0.s, p0/m, w8
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEXT:    add z0.s, z0.s, z2.s
 ; CHECK-NEXT:    ret
 entry:
   %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0)
@@ -77,40 +49,10 @@ entry:
 define <vscale x 4 x i32> @partial_reduce_add_quart(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
 ; CHECK-LABEL: partial_reduce_add_quart:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    fmov w10, s0
-; CHECK-NEXT:    mov z6.s, w8
-; CHECK-NEXT:    index z5.s, #0, #1
-; CHECK-NEXT:    ptrue p2.s, vl1
-; CHECK-NEXT:    uaddv d1, p0, z1.s
-; CHECK-NEXT:    mov w9, v0.s[1]
-; CHECK-NEXT:    uaddv d2, p0, z2.s
-; CHECK-NEXT:    uaddv d3, p0, z3.s
-; CHECK-NEXT:    cmpeq p1.s, p0/z, z5.s, z6.s
-; CHECK-NEXT:    uaddv d4, p0, z4.s
-; CHECK-NEXT:    fmov x8, d1
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    add w8, w10, w8
-; CHECK-NEXT:    mov w10, #2 // =0x2
-; CHECK-NEXT:    mov z1.s, p2/m, w8
-; CHECK-NEXT:    fmov x8, d2
-; CHECK-NEXT:    mov z6.s, w10
-; CHECK-NEXT:    mov w10, v0.s[2]
-; CHECK-NEXT:    add w8, w9, w8
-; CHECK-NEXT:    mov w9, #3 // =0x3
-; CHECK-NEXT:    cmpeq p2.s, p0/z, z5.s, z6.s
-; CHECK-NEXT:    mov z2.s, w9
-; CHECK-NEXT:    fmov x9, d3
-; CHECK-NEXT:    mov z1.s, p1/m, w8
-; CHECK-NEXT:    mov w8, v0.s[3]
-; CHECK-NEXT:    add w9, w10, w9
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z5.s, z2.s
-; CHECK-NEXT:    mov z1.s, p2/m, w9
-; CHECK-NEXT:    fmov x9, d4
-; CHECK-NEXT:    add w8, w8, w9
-; CHECK-NEXT:    mov z1.s, p0/m, w8
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    add z2.s, z2.s, z3.s
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEXT:    add z1.s, z2.s, z4.s
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0)
@@ -120,21 +62,10 @@ entry:
 define <vscale x 8 x i32> @partial_reduce_add_half_8(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
 ; CHECK-LABEL: partial_reduce_add_half_8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add z2.s, z2.s, z3.s
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    index z3.s, #0, #1
-; CHECK-NEXT:    mov z4.s, w8
-; CHECK-NEXT:    fmov w10, s0
-; CHECK-NEXT:    mov w9, v0.s[1]
-; CHECK-NEXT:    ptrue p1.s, vl1
-; CHECK-NEXT:    uaddv d2, p0, z2.s
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z3.s, z4.s
-; CHECK-NEXT:    fmov x8, d2
-; CHECK-NEXT:    add w10, w10, w8
-; CHECK-NEXT:    add w8, w9, w8
-; CHECK-NEXT:    mov z0.s, p1/m, w10
-; CHECK-NEXT:    mov z0.s, p0/m, w8
+; CHECK-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-NEXT:    add z1.s, z1.s, z3.s
+; CHECK-NEXT:    add z0.s, z0.s, z4.s
+; CHECK-NEXT:    add z1.s, z1.s, z5.s
 ; CHECK-NEXT:    ret
 entry:
   %partial.reduce = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0)

From 7c428bd884212993b9d9c5f274d251246bfa6842 Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy@arm.com>
Date: Tue, 25 Jun 2024 16:20:22 +0100
Subject: [PATCH 09/12] Use deque instead of SmallVector for generic lowering

---
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 24 ++++++-------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 7e7b9eba96510..42c2b98b07306 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -108,6 +108,7 @@
 #include <limits>
 #include <optional>
 #include <tuple>
+#include <deque>
 
 using namespace llvm;
 using namespace PatternMatch;
@@ -7924,29 +7925,18 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride;
 
     // Collect all of the subvectors
-    SmallVector<SDValue> Subvectors;
+    std::deque<SDValue> Subvectors;
     Subvectors.push_back(getValue(I.getOperand(0)));
     for(unsigned i = 0; i < ScaleFactor; i++) {
       auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, DL);
       Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, SourceIndex}));
     }
 
-    while(Subvectors.size() >= 2) {
-      SmallVector<SDValue> NewSubvectors;
-      for(unsigned i = 0; i < Subvectors.size(); i+=2) {
-        unsigned j = i + 1;
-        auto A = Subvectors[i];
-        if(j >= Subvectors.size()) {
-          unsigned OldLastIdx = NewSubvectors.size()-1;
-          auto OldLast = NewSubvectors[OldLastIdx];
-          NewSubvectors[OldLastIdx] = DAG.getNode(ISD::ADD, DL, ReducedTy, {OldLast, A});
-          break;
-        }
-        auto B = Subvectors[j];
-        auto Node = DAG.getNode(ISD::ADD, DL, ReducedTy, {A, B});
-        NewSubvectors.push_back(Node);
-      }
-      Subvectors = NewSubvectors;
+    // Flatten the subvector tree
+    while(Subvectors.size() > 1) {
+      Subvectors.push_back(DAG.getNode(ISD::ADD, DL, ReducedTy, {Subvectors[0], Subvectors[1]}));
+      Subvectors.pop_front();
+      Subvectors.pop_front();
     }
     
     assert(Subvectors.size() == 1 && "There should only be one subvector after tree flattening");

From 913ac876d4b3203fc19a4b12691ec96d24f1d7f6 Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy@arm.com>
Date: Wed, 3 Jul 2024 14:18:36 +0100
Subject: [PATCH 10/12] Address final comments

---
 .../CodeGen/SelectionDAG/SelectionDAGBuilder.cpp   | 13 ++++++-------
 llvm/lib/IR/Verifier.cpp                           |  6 +++---
 llvm/test/CodeGen/AArch64/partial-reduction-add.ll | 14 +++++++-------
 3 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 42c2b98b07306..dc2a670085256 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7916,10 +7916,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     return;
   }
   case Intrinsic::experimental_vector_partial_reduce_add: {
-    auto DL = getCurSDLoc();
-    auto ReducedTy = EVT::getEVT(I.getType());
-    auto OpNode = getValue(I.getOperand(1));
-    auto FullTy = OpNode.getValueType();
+    SDValue OpNode = getValue(I.getOperand(1));
+    EVT ReducedTy = EVT::getEVT(I.getType());
+    EVT FullTy = OpNode.getValueType();
 
     unsigned Stride = ReducedTy.getVectorMinNumElements();
     unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride;
@@ -7928,13 +7927,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     std::deque<SDValue> Subvectors;
     Subvectors.push_back(getValue(I.getOperand(0)));
     for(unsigned i = 0; i < ScaleFactor; i++) {
-      auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, DL);
-      Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, SourceIndex}));
+      auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, sdl);
+      Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ReducedTy, {OpNode, SourceIndex}));
     }
 
     // Flatten the subvector tree
     while(Subvectors.size() > 1) {
-      Subvectors.push_back(DAG.getNode(ISD::ADD, DL, ReducedTy, {Subvectors[0], Subvectors[1]}));
+      Subvectors.push_back(DAG.getNode(ISD::ADD, sdl, ReducedTy, {Subvectors[0], Subvectors[1]}));
       Subvectors.pop_front();
       Subvectors.pop_front();
     }
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 21371bdd4bf6d..c34389fb7ee01 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6135,12 +6135,12 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
     VectorType *AccTy = cast<VectorType>(Call.getArgOperand(0)->getType());
     VectorType *VecTy = cast<VectorType>(Call.getArgOperand(1)->getType());
 
-    auto VecWidth = VecTy->getElementCount().getKnownMinValue();
-    auto AccWidth = AccTy->getElementCount().getKnownMinValue();
+    unsigned VecWidth = VecTy->getElementCount().getKnownMinValue();
+    unsigned AccWidth = AccTy->getElementCount().getKnownMinValue();
 
     Check((VecWidth % AccWidth) == 0, "Invalid vector widths for partial "
                                       "reduction. The width of the input vector "
-                                      "must be a postive integer multiple of "
+                                      "must be a positive integer multiple of "
                                       "the width of the accumulator vector.");
     break;
   }
diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
index ee57026eeb350..ae681ee54e687 100644
--- a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
+++ b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
@@ -18,7 +18,7 @@ define <4 x i32> @partial_reduce_add_fixed_half(<4 x i32> %accumulator, <8 x i32
 ; CHECK-LABEL: partial_reduce_add_fixed_half:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    add v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    ret
 entry:
   %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accumulator, <8 x i32> %0)
@@ -39,7 +39,7 @@ define <vscale x 4 x i32> @partial_reduce_add_half(<vscale x 4 x i32> %accumulat
 ; CHECK-LABEL: partial_reduce_add_half:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-NEXT:    add z0.s, z2.s, z0.s
 ; CHECK-NEXT:    ret
 entry:
   %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0)
@@ -49,10 +49,10 @@ entry:
 define <vscale x 4 x i32> @partial_reduce_add_quart(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
 ; CHECK-LABEL: partial_reduce_add_quart:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add z2.s, z2.s, z3.s
-; CHECK-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEXT:    add z1.s, z2.s, z4.s
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEXT:    add z2.s, z2.s, z3.s
+; CHECK-NEXT:    add z0.s, z4.s, z0.s
+; CHECK-NEXT:    add z0.s, z2.s, z0.s
 ; CHECK-NEXT:    ret
 entry:
   %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0)
@@ -64,8 +64,8 @@ define <vscale x 8 x i32> @partial_reduce_add_half_8(<vscale x 8 x i32> %accumul
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    add z0.s, z0.s, z2.s
 ; CHECK-NEXT:    add z1.s, z1.s, z3.s
-; CHECK-NEXT:    add z0.s, z0.s, z4.s
-; CHECK-NEXT:    add z1.s, z1.s, z5.s
+; CHECK-NEXT:    add z0.s, z4.s, z0.s
+; CHECK-NEXT:    add z1.s, z5.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %partial.reduce = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0)

From 631208f94358e58c35e5e016c270f59a9d0b7fda Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy@arm.com>
Date: Wed, 3 Jul 2024 14:33:57 +0100
Subject: [PATCH 11/12] Fix formatting

---
 .../CodeGen/SelectionDAG/SelectionDAGBuilder.cpp    | 13 ++++++++-----
 llvm/lib/IR/Verifier.cpp                            |  9 +++++----
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index dc2a670085256..075cb77d933e7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -104,11 +104,11 @@
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <cstddef>
+#include <deque>
 #include <iterator>
 #include <limits>
 #include <optional>
 #include <tuple>
-#include <deque>
 
 using namespace llvm;
 using namespace PatternMatch;
@@ -7926,19 +7926,22 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     // Collect all of the subvectors
     std::deque<SDValue> Subvectors;
     Subvectors.push_back(getValue(I.getOperand(0)));
-    for(unsigned i = 0; i < ScaleFactor; i++) {
+    for (unsigned i = 0; i < ScaleFactor; i++) {
       auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, sdl);
-      Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ReducedTy, {OpNode, SourceIndex}));
+      Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ReducedTy,
+        {OpNode, SourceIndex}));
     }
 
     // Flatten the subvector tree
     while(Subvectors.size() > 1) {
-      Subvectors.push_back(DAG.getNode(ISD::ADD, sdl, ReducedTy, {Subvectors[0], Subvectors[1]}));
+      Subvectors.push_back(DAG.getNode(ISD::ADD, sdl, ReducedTy,
+                                       {Subvectors[0], Subvectors[1]}));
       Subvectors.pop_front();
       Subvectors.pop_front();
     }
     
-    assert(Subvectors.size() == 1 && "There should only be one subvector after tree flattening");
+    assert(Subvectors.size() == 1 &&
+           "There should only be one subvector after tree flattening");
 
     setValue(&I, Subvectors[0]);
     return;
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index c34389fb7ee01..ddefeb5a03e97 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6138,10 +6138,11 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
     unsigned VecWidth = VecTy->getElementCount().getKnownMinValue();
     unsigned AccWidth = AccTy->getElementCount().getKnownMinValue();
 
-    Check((VecWidth % AccWidth) == 0, "Invalid vector widths for partial "
-                                      "reduction. The width of the input vector "
-                                      "must be a positive integer multiple of "
-                                      "the width of the accumulator vector.");
+    Check((VecWidth % AccWidth) == 0,
+          "Invalid vector widths for partial "
+          "reduction. The width of the input vector "
+          "must be a positive integer multiple of "
+          "the width of the accumulator vector.");
     break;
   }
   case Intrinsic::experimental_noalias_scope_decl: {

From f8eec059bb4e7706cdf525271e147f82e319fc89 Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy@arm.com>
Date: Wed, 3 Jul 2024 14:54:11 +0100
Subject: [PATCH 12/12] Fix formatting using git-clang-format

---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 075cb77d933e7..76bac44f61e9c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7929,17 +7929,17 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     for (unsigned i = 0; i < ScaleFactor; i++) {
       auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, sdl);
       Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ReducedTy,
-        {OpNode, SourceIndex}));
+                                       {OpNode, SourceIndex}));
     }
 
     // Flatten the subvector tree
-    while(Subvectors.size() > 1) {
+    while (Subvectors.size() > 1) {
       Subvectors.push_back(DAG.getNode(ISD::ADD, sdl, ReducedTy,
                                        {Subvectors[0], Subvectors[1]}));
       Subvectors.pop_front();
       Subvectors.pop_front();
     }
-    
+
     assert(Subvectors.size() == 1 &&
            "There should only be one subvector after tree flattening");