From efd07e93aed51049ad3783c701284617ae446330 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 17 Jan 2024 11:11:59 +0000 Subject: [PATCH 1/7] [AArch64] Combine store (trunc X to <3 x i8>) to sequence of ST1.b. Improve codegen for (trunc X to <3 x i8>) by converting it to a sequence of 3 ST1.b, but first converting the truncate operand to either v8i8 or v16i8, extracting the lanes for the truncate results and storing them. At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization: https://github.com/llvm/llvm-project/pull/77790 --- .../Target/AArch64/AArch64ISelLowering.cpp | 50 +++++++++++++++++++ .../AArch64/vec3-loads-ext-trunc-stores.ll | 33 +++++------- 2 files changed, 62 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8a6f1dc7487ba..4be78f61fe7b6 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -21318,6 +21318,53 @@ bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) { (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32); } +// Combine store (trunc X to <3 x i8>) to sequence of ST1.b. +static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + SDValue Value = ST->getValue(); + EVT ValueVT = Value.getValueType(); + + if (ST->isVolatile() || !Subtarget->isLittleEndian() || + ST->getOriginalAlign() >= 4 || Value.getOpcode() != ISD::TRUNCATE || + ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3)) + return SDValue(); + + SDLoc DL(ST); + auto WideVT = EVT::getVectorVT( + *DAG.getContext(), + Value->getOperand(0).getValueType().getVectorElementType(), 4); + SDValue UndefVector = DAG.getUNDEF(WideVT); + SDValue WideTrunc = DAG.getNode( + ISD::INSERT_SUBVECTOR, DL, WideVT, + {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)}); + SDValue Cast = DAG.getNode( + ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8, + WideTrunc); + + SDValue Chain = ST->getChain(); + SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast, + DAG.getConstant(8, DL, MVT::i64)); + + SDValue Ptr2 = + DAG.getMemBasePlusOffset(ST->getBasePtr(), TypeSize::getFixed(2), DL); + Chain = DAG.getStore(Chain, DL, E2, Ptr2, ST->getPointerInfo(), + ST->getOriginalAlign()); + + SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast, + DAG.getConstant(4, DL, MVT::i64)); + + SDValue Ptr1 = + DAG.getMemBasePlusOffset(ST->getBasePtr(), TypeSize::getFixed(1), DL); + Chain = DAG.getStore(Chain, DL, E1, Ptr1, ST->getPointerInfo(), + ST->getOriginalAlign()); + SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast, + DAG.getConstant(0, DL, MVT::i64)); + Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(), ST->getPointerInfo(), + ST->getOriginalAlign()); + + return Chain; +} + static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, @@ -21333,6 +21380,9 @@ static SDValue performSTORECombine(SDNode *N, return EltVT == MVT::f32 || EltVT == MVT::f64; }; + if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget)) + return Res; + // If this is an FP_ROUND followed by a store, fold this into a truncating // store. We can do this even if this is already a truncstore. // We purposefully don't care about legality of the nodes here as we know diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 9eeb194409df6..60639ea91fbaa 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -154,17 +154,12 @@ define <3 x i32> @load_v3i32(ptr %src) { define void @store_trunc_from_64bits(ptr %src, ptr %dst) { ; CHECK-LABEL: store_trunc_from_64bits: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ldrh w8, [x0, #4] -; CHECK-NEXT: mov.h v0[2], w8 -; CHECK-NEXT: xtn.8b v0, v0 -; CHECK-NEXT: str s0, [sp, #12] -; CHECK-NEXT: ldrh w9, [sp, #12] -; CHECK-NEXT: strb w8, [x1, #2] -; CHECK-NEXT: strh w9, [x1] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: add x8, x0, #4 +; CHECK-NEXT: ld1r.4h { v0 }, [x8] +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: strb w8, [x1] +; CHECK-NEXT: add x8, x1, #1 +; CHECK-NEXT: st1.b { v0 }[4], [x8] ; CHECK-NEXT: ret ; ; BE-LABEL: store_trunc_from_64bits: @@ -236,17 +231,13 @@ entry: define void @shift_trunc_store(ptr %src, ptr %dst) { ; CHECK-LABEL: shift_trunc_store: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: shrn.4h v0, v0, #16 -; CHECK-NEXT: xtn.8b v1, v0 -; CHECK-NEXT: umov.h w8, v0[2] -; CHECK-NEXT: str s1, [sp, #12] -; CHECK-NEXT: ldrh w9, [sp, #12] -; CHECK-NEXT: strb w8, [x1, #2] -; CHECK-NEXT: strh w9, [x1] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: add x8, x1, #1 +; CHECK-NEXT: add x9, x1, #2 +; CHECK-NEXT: ushr.4s v0, v0, #16 +; CHECK-NEXT: st1.b { v0 }[4], [x8] +; CHECK-NEXT: st1.b { v0 }[8], [x9] +; CHECK-NEXT: st1.b { v0 }[0], [x1] ; CHECK-NEXT: ret ; ; BE-LABEL: shift_trunc_store: From 3e747fe3f7000631af555f1cdaa90bd17b93c345 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 19 Jan 2024 16:13:29 +0000 Subject: [PATCH 2/7] !fixup fix extract index computation, adjust pointer info and alignment by offset --- .../Target/AArch64/AArch64ISelLowering.cpp | 27 ++++++++++--------- .../AArch64/vec3-loads-ext-trunc-stores.ll | 7 +++-- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 4be78f61fe7b6..badea7537373c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -21341,26 +21341,29 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8, WideTrunc); + unsigned IdxScale = WideVT.getScalarSizeInBits() / 8; + Align Align = ST->getOriginalAlign(); SDValue Chain = ST->getChain(); SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast, - DAG.getConstant(8, DL, MVT::i64)); - - SDValue Ptr2 = - DAG.getMemBasePlusOffset(ST->getBasePtr(), TypeSize::getFixed(2), DL); - Chain = DAG.getStore(Chain, DL, E2, Ptr2, ST->getPointerInfo(), - ST->getOriginalAlign()); + DAG.getConstant(2 * IdxScale, DL, MVT::i64)); + TypeSize Offset2 = TypeSize::getFixed(2); + SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL); + Chain = DAG.getStore(Chain, DL, E2, Ptr2, + ST->getPointerInfo().getWithOffset(Offset2), + commonAlignment(Align, Offset2)); SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast, - DAG.getConstant(4, DL, MVT::i64)); + DAG.getConstant(1 * IdxScale, DL, MVT::i64)); + TypeSize Offset1 = TypeSize::getFixed(1); + SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL); + Chain = DAG.getStore(Chain, DL, E1, Ptr1, + ST->getPointerInfo().getWithOffset(Offset2), + commonAlignment(Align, Offset1)); - SDValue Ptr1 = - DAG.getMemBasePlusOffset(ST->getBasePtr(), TypeSize::getFixed(1), DL); - Chain = DAG.getStore(Chain, DL, E1, Ptr1, ST->getPointerInfo(), - ST->getOriginalAlign()); SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast, DAG.getConstant(0, DL, MVT::i64)); Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(), ST->getPointerInfo(), - ST->getOriginalAlign()); + Align); return Chain; } diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 60639ea91fbaa..9f783badf38f9 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -154,11 +154,14 @@ define <3 x i32> @load_v3i32(ptr %src) { define void @store_trunc_from_64bits(ptr %src, ptr %dst) { ; CHECK-LABEL: store_trunc_from_64bits: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: add x8, x0, #4 -; CHECK-NEXT: ld1r.4h { v0 }, [x8] ; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: add x9, x0, #4 +; CHECK-NEXT: ld1r.4h { v0 }, [x9] +; CHECK-NEXT: fmov s1, w8 ; CHECK-NEXT: strb w8, [x1] ; CHECK-NEXT: add x8, x1, #1 +; CHECK-NEXT: st1.b { v1 }[2], [x8] +; CHECK-NEXT: add x8, x1, #2 ; CHECK-NEXT: st1.b { v0 }[4], [x8] ; CHECK-NEXT: ret ; From 6a9feb6d3764a7eb3c0a7a7cb2a521822013dd63 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 22 Jan 2024 14:54:13 +0000 Subject: [PATCH 3/7] !fixup add assert that offset is undef. --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 089b7c18f4edd..9a9b5042b7187 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -21391,6 +21391,7 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3)) return SDValue(); + assert(ST->getOffset().isUndef() && "undef offset expected"); SDLoc DL(ST); auto WideVT = EVT::getVectorVT( *DAG.getContext(), From dbfe059901ce693e4da8b9e2e940f2e3e9bd73bb Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 22 Jan 2024 16:08:44 +0000 Subject: [PATCH 4/7] !fixup update additional tests. --- .../AArch64/vec3-loads-ext-trunc-stores.ll | 34 ++++++++----------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 28bf017b24689..cf2995de1c85f 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -356,17 +356,14 @@ define void @shift_trunc_store(ptr %src, ptr %dst) { define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) { ; CHECK-LABEL: shift_trunc_store_const_offset_1: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: shrn.4h v0, v0, #16 -; CHECK-NEXT: xtn.8b v1, v0 -; CHECK-NEXT: umov.h w8, v0[2] -; CHECK-NEXT: str s1, [sp, #12] -; CHECK-NEXT: ldrh w9, [sp, #12] -; CHECK-NEXT: strb w8, [x1, #3] -; CHECK-NEXT: sturh w9, [x1, #1] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: add x8, x1, #2 +; CHECK-NEXT: add x9, x1, #3 +; CHECK-NEXT: ushr.4s v0, v0, #16 +; CHECK-NEXT: st1.b { v0 }[4], [x8] +; CHECK-NEXT: add x8, x1, #1 +; CHECK-NEXT: st1.b { v0 }[8], [x9] +; CHECK-NEXT: st1.b { v0 }[0], [x8] ; CHECK-NEXT: ret ; ; BE-LABEL: shift_trunc_store_const_offset_1: @@ -395,17 +392,14 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) { define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) { ; CHECK-LABEL: shift_trunc_store_const_offset_3: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: shrn.4h v0, v0, #16 -; CHECK-NEXT: xtn.8b v1, v0 -; CHECK-NEXT: umov.h w8, v0[2] -; CHECK-NEXT: str s1, [sp, #12] -; CHECK-NEXT: ldrh w9, [sp, #12] -; CHECK-NEXT: strb w8, [x1, #5] -; CHECK-NEXT: sturh w9, [x1, #3] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: add x8, x1, #4 +; CHECK-NEXT: add x9, x1, #5 +; CHECK-NEXT: ushr.4s v0, v0, #16 +; CHECK-NEXT: st1.b { v0 }[4], [x8] +; CHECK-NEXT: add x8, x1, #3 +; CHECK-NEXT: st1.b { v0 }[8], [x9] +; CHECK-NEXT: st1.b { v0 }[0], [x8] ; CHECK-NEXT: ret ; ; BE-LABEL: shift_trunc_store_const_offset_3: From 16c4044d9dfe55cc9bd16fff6a15b36fd2c3a855 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 23 Jan 2024 14:12:28 +0000 Subject: [PATCH 5/7] !fixup update on top of new test coverage. Update checks after adding more tests in e7b4ff8119403509da3e7941dcb86b1c6a6d61c5 --- .../CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 72752aefae1a5..642535939e042 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -279,23 +279,19 @@ entry: define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) { ; CHECK-LABEL: store_trunc_add_from_64bits: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: add x9, x0, #4 ; CHECK-NEXT: Lloh0: ; CHECK-NEXT: adrp x8, lCPI7_0@PAGE ; CHECK-NEXT: Lloh1: ; CHECK-NEXT: ldr d1, [x8, lCPI7_0@PAGEOFF] +; CHECK-NEXT: add x8, x1, #1 ; CHECK-NEXT: ld1.h { v0 }[2], [x9] +; CHECK-NEXT: add x9, x1, #2 ; CHECK-NEXT: add.4h v0, v0, v1 -; CHECK-NEXT: xtn.8b v1, v0 -; CHECK-NEXT: umov.h w8, v0[2] -; CHECK-NEXT: str s1, [sp, #12] -; CHECK-NEXT: ldrh w9, [sp, #12] -; CHECK-NEXT: strb w8, [x1, #2] -; CHECK-NEXT: strh w9, [x1] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: st1.b { v0 }[2], [x8] +; CHECK-NEXT: st1.b { v0 }[4], [x9] +; CHECK-NEXT: st1.b { v0 }[0], [x1] ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1 ; From f2c926206543117bb97e9a6cf39c26a343aa2d7b Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 24 Jan 2024 14:37:17 +0000 Subject: [PATCH 6/7] !fixup remove alignment check, update after new tests, use MMO. --- .../Target/AArch64/AArch64ISelLowering.cpp | 18 ++++------ .../AArch64/vec3-loads-ext-trunc-stores.ll | 33 ++++++++----------- 2 files changed, 20 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 36d516a418a85..69e9cce5dba55 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -21478,7 +21478,7 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, EVT ValueVT = Value.getValueType(); if (ST->isVolatile() || !Subtarget->isLittleEndian() || - ST->getOriginalAlign() >= 4 || Value.getOpcode() != ISD::TRUNCATE || + Value.getOpcode() != ISD::TRUNCATE || ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3)) return SDValue(); @@ -21495,29 +21495,25 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8, WideTrunc); - unsigned IdxScale = WideVT.getScalarSizeInBits() / 8; - Align Align = ST->getOriginalAlign(); + MachineFunction &MF = DAG.getMachineFunction(); SDValue Chain = ST->getChain(); + MachineMemOperand *MMO = ST->getMemOperand(); + unsigned IdxScale = WideVT.getScalarSizeInBits() / 8; SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast, DAG.getConstant(2 * IdxScale, DL, MVT::i64)); TypeSize Offset2 = TypeSize::getFixed(2); SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL); - Chain = DAG.getStore(Chain, DL, E2, Ptr2, - ST->getPointerInfo().getWithOffset(Offset2), - commonAlignment(Align, Offset2)); + Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1)); SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast, DAG.getConstant(1 * IdxScale, DL, MVT::i64)); TypeSize Offset1 = TypeSize::getFixed(1); SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL); - Chain = DAG.getStore(Chain, DL, E1, Ptr1, - ST->getPointerInfo().getWithOffset(Offset2), - commonAlignment(Align, Offset1)); + Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1)); SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast, DAG.getConstant(0, DL, MVT::i64)); - Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(), ST->getPointerInfo(), - Align); + Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(), ST->getMemOperand()); return Chain; } diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index f99ae78418a23..21079ef778776 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -391,6 +391,7 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) { ; CHECK-NEXT: adrp x8, lCPI9_0@PAGE ; CHECK-NEXT: Lloh1: ; CHECK-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF] +; CHECK-NEXT: add x8, x1, #1 ; CHECK-NEXT: ld1.h { v0 }[2], [x9] ; CHECK-NEXT: add x9, x1, #2 ; CHECK-NEXT: add.4h v0, v0, v1 @@ -621,17 +622,13 @@ define void @shift_trunc_store(ptr %src, ptr %dst) { define void @shift_trunc_store_default_align(ptr %src, ptr %dst) { ; CHECK-LABEL: shift_trunc_store_default_align: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: shrn.4h v0, v0, #16 -; CHECK-NEXT: xtn.8b v1, v0 -; CHECK-NEXT: umov.h w8, v0[2] -; CHECK-NEXT: str s1, [sp, #12] -; CHECK-NEXT: ldrh w9, [sp, #12] -; CHECK-NEXT: strb w8, [x1, #2] -; CHECK-NEXT: strh w9, [x1] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: add x8, x1, #1 +; CHECK-NEXT: add x9, x1, #2 +; CHECK-NEXT: ushr.4s v0, v0, #16 +; CHECK-NEXT: st1.b { v0 }[4], [x8] +; CHECK-NEXT: st1.b { v0 }[8], [x9] +; CHECK-NEXT: st1.b { v0 }[0], [x1] ; CHECK-NEXT: ret ; ; BE-LABEL: shift_trunc_store_default_align: @@ -659,17 +656,13 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) { define void @shift_trunc_store_align_4(ptr %src, ptr %dst) { ; CHECK-LABEL: shift_trunc_store_align_4: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: shrn.4h v0, v0, #16 -; CHECK-NEXT: xtn.8b v1, v0 -; CHECK-NEXT: umov.h w8, v0[2] -; CHECK-NEXT: str s1, [sp, #12] -; CHECK-NEXT: ldrh w9, [sp, #12] -; CHECK-NEXT: strb w8, [x1, #2] -; CHECK-NEXT: strh w9, [x1] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: add x8, x1, #1 +; CHECK-NEXT: add x9, x1, #2 +; CHECK-NEXT: ushr.4s v0, v0, #16 +; CHECK-NEXT: st1.b { v0 }[4], [x8] +; CHECK-NEXT: st1.b { v0 }[8], [x9] +; CHECK-NEXT: st1.b { v0 }[0], [x1] ; CHECK-NEXT: ret ; ; BE-LABEL: shift_trunc_store_align_4: From f9d824be1351e14edfb61bc663c1918299493f1f Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 25 Jan 2024 11:49:21 +0000 Subject: [PATCH 7/7] !fixup also adjust mem-operand of store to base address. --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 69e9cce5dba55..48bb2d346b483 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -21513,8 +21513,8 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast, DAG.getConstant(0, DL, MVT::i64)); - Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(), ST->getMemOperand()); - + Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(), + MF.getMachineMemOperand(MMO, 0, 1)); return Chain; }