Skip to content

Commit eb678d8

Browse files
authored
[AArch64] Combine store (trunc X to <3 x i8>) to sequence of ST1.b. (llvm#78637)
Improve codegen for store (trunc X to <3 x i8>) by converting it to a sequence of 3 ST1.b stores: first convert the truncate operand to either v8i8 or v16i8, then extract the lanes that hold the truncate results and store them. At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization (llvm#77790). PR: llvm#78637
1 parent c92ad41 commit eb678d8

File tree

2 files changed

+96
-70
lines changed

2 files changed

+96
-70
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+50
Original file line numberDiff line numberDiff line change
@@ -21471,6 +21471,53 @@ bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
2147121471
(SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
2147221472
}
2147321473

21474+
// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
//
// Rewrites a store whose stored value is a TRUNCATE producing <3 x i8> into
// three single-byte stores of lanes extracted from the (widened, bitcast)
// truncate operand.
//
// \param ST        the store node to inspect.
// \param DAG       the DAG used to create the replacement nodes.
// \param Subtarget queried only for endianness.
// \returns the chain of the last byte store on success, or an empty SDValue
//          when the store is volatile, the subtarget is not little-endian,
//          the stored value is not a TRUNCATE, or its type is not <3 x i8>.
21475+
static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
21476+
const AArch64Subtarget *Subtarget) {
21477+
SDValue Value = ST->getValue();
21478+
EVT ValueVT = Value.getValueType();
21479+
21480+
// Bail out unless this is a non-volatile, little-endian store of
// (trunc X to <3 x i8>).
if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
21481+
Value.getOpcode() != ISD::TRUNCATE ||
21482+
ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
21483+
return SDValue();
21484+
21485+
assert(ST->getOffset().isUndef() && "undef offset expected");
21486+
SDLoc DL(ST);
21487+
// Widen the truncate operand to 4 elements of its own element type by
// inserting it at index 0 of an undef vector.
auto WideVT = EVT::getVectorVT(
21488+
*DAG.getContext(),
21489+
Value->getOperand(0).getValueType().getVectorElementType(), 4);
21490+
SDValue UndefVector = DAG.getUNDEF(WideVT);
21491+
SDValue WideTrunc = DAG.getNode(
21492+
ISD::INSERT_SUBVECTOR, DL, WideVT,
21493+
{UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
21494+
// Reinterpret the widened vector as bytes so individual i8 lanes can be
// extracted.
// NOTE(review): every WideVT size other than 64 bits is bitcast to v16i8;
// this presumably assumes WideVT is 128 bits in that case - confirm that
// wider source element types (e.g. i64) cannot reach this point.
SDValue Cast = DAG.getNode(
21495+
ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
21496+
WideTrunc);
21497+
21498+
MachineFunction &MF = DAG.getMachineFunction();
21499+
SDValue Chain = ST->getChain();
21500+
MachineMemOperand *MMO = ST->getMemOperand();
21501+
// Bytes per element of the widened vector: element i begins at byte lane
// i * IdxScale of Cast. With the little-endian guard above, that lane is the
// element's low byte, i.e. its truncated value.
unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
21502+
// Store element 2 at base + 2. The memory operand is derived from the
// original one with arguments (2, 1) - presumably byte offset and size.
SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
21503+
DAG.getConstant(2 * IdxScale, DL, MVT::i64));
21504+
TypeSize Offset2 = TypeSize::getFixed(2);
21505+
SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
21506+
Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
21507+
21508+
// Store element 1 at base + 1, chained after the store of element 2.
SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
21509+
DAG.getConstant(1 * IdxScale, DL, MVT::i64));
21510+
TypeSize Offset1 = TypeSize::getFixed(1);
21511+
SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
21512+
Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
21513+
21514+
// Store element 0 at the original base pointer, chained after element 1.
SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
21515+
DAG.getConstant(0, DL, MVT::i64));
21516+
Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
21517+
MF.getMachineMemOperand(MMO, 0, 1));
// The chain of the final store stands in for the original store.
21518+
return Chain;
21519+
}
21520+
2147421521
static SDValue performSTORECombine(SDNode *N,
2147521522
TargetLowering::DAGCombinerInfo &DCI,
2147621523
SelectionDAG &DAG,
@@ -21486,6 +21533,9 @@ static SDValue performSTORECombine(SDNode *N,
2148621533
return EltVT == MVT::f32 || EltVT == MVT::f64;
2148721534
};
2148821535

21536+
if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
21537+
return Res;
21538+
2148921539
// If this is an FP_ROUND followed by a store, fold this into a truncating
2149021540
// store. We can do this even if this is already a truncstore.
2149121541
// We purposefully don't care about legality of the nodes here as we know

llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll

+46-70
Original file line numberDiff line numberDiff line change
@@ -348,17 +348,15 @@ define <3 x i32> @load_v3i32(ptr %src) {
348348
define void @store_trunc_from_64bits(ptr %src, ptr %dst) {
349349
; CHECK-LABEL: store_trunc_from_64bits:
350350
; CHECK: ; %bb.0: ; %entry
351-
; CHECK-NEXT: sub sp, sp, #16
352-
; CHECK-NEXT: .cfi_def_cfa_offset 16
353-
; CHECK-NEXT: ldr s0, [x0]
354-
; CHECK-NEXT: ldrh w8, [x0, #4]
355-
; CHECK-NEXT: mov.h v0[2], w8
356-
; CHECK-NEXT: xtn.8b v0, v0
357-
; CHECK-NEXT: str s0, [sp, #12]
358-
; CHECK-NEXT: ldrh w9, [sp, #12]
359-
; CHECK-NEXT: strb w8, [x1, #2]
360-
; CHECK-NEXT: strh w9, [x1]
361-
; CHECK-NEXT: add sp, sp, #16
351+
; CHECK-NEXT: ldr w8, [x0]
352+
; CHECK-NEXT: add x9, x0, #4
353+
; CHECK-NEXT: ld1r.4h { v0 }, [x9]
354+
; CHECK-NEXT: fmov s1, w8
355+
; CHECK-NEXT: strb w8, [x1]
356+
; CHECK-NEXT: add x8, x1, #1
357+
; CHECK-NEXT: st1.b { v1 }[2], [x8]
358+
; CHECK-NEXT: add x8, x1, #2
359+
; CHECK-NEXT: st1.b { v0 }[4], [x8]
362360
; CHECK-NEXT: ret
363361
;
364362
; BE-LABEL: store_trunc_from_64bits:
@@ -387,23 +385,19 @@ entry:
387385
define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) {
388386
; CHECK-LABEL: store_trunc_add_from_64bits:
389387
; CHECK: ; %bb.0: ; %entry
390-
; CHECK-NEXT: sub sp, sp, #16
391-
; CHECK-NEXT: .cfi_def_cfa_offset 16
392388
; CHECK-NEXT: ldr s0, [x0]
393389
; CHECK-NEXT: add x9, x0, #4
394390
; CHECK-NEXT: Lloh0:
395391
; CHECK-NEXT: adrp x8, lCPI9_0@PAGE
396392
; CHECK-NEXT: Lloh1:
397393
; CHECK-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF]
394+
; CHECK-NEXT: add x8, x1, #1
398395
; CHECK-NEXT: ld1.h { v0 }[2], [x9]
396+
; CHECK-NEXT: add x9, x1, #2
399397
; CHECK-NEXT: add.4h v0, v0, v1
400-
; CHECK-NEXT: xtn.8b v1, v0
401-
; CHECK-NEXT: umov.h w8, v0[2]
402-
; CHECK-NEXT: str s1, [sp, #12]
403-
; CHECK-NEXT: ldrh w9, [sp, #12]
404-
; CHECK-NEXT: strb w8, [x1, #2]
405-
; CHECK-NEXT: strh w9, [x1]
406-
; CHECK-NEXT: add sp, sp, #16
398+
; CHECK-NEXT: st1.b { v0 }[2], [x8]
399+
; CHECK-NEXT: st1.b { v0 }[4], [x9]
400+
; CHECK-NEXT: st1.b { v0 }[0], [x1]
407401
; CHECK-NEXT: ret
408402
; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1
409403
;
@@ -594,17 +588,13 @@ entry:
594588
define void @shift_trunc_store(ptr %src, ptr %dst) {
595589
; CHECK-LABEL: shift_trunc_store:
596590
; CHECK: ; %bb.0:
597-
; CHECK-NEXT: sub sp, sp, #16
598-
; CHECK-NEXT: .cfi_def_cfa_offset 16
599591
; CHECK-NEXT: ldr q0, [x0]
600-
; CHECK-NEXT: shrn.4h v0, v0, #16
601-
; CHECK-NEXT: xtn.8b v1, v0
602-
; CHECK-NEXT: umov.h w8, v0[2]
603-
; CHECK-NEXT: str s1, [sp, #12]
604-
; CHECK-NEXT: ldrh w9, [sp, #12]
605-
; CHECK-NEXT: strb w8, [x1, #2]
606-
; CHECK-NEXT: strh w9, [x1]
607-
; CHECK-NEXT: add sp, sp, #16
592+
; CHECK-NEXT: add x8, x1, #1
593+
; CHECK-NEXT: add x9, x1, #2
594+
; CHECK-NEXT: ushr.4s v0, v0, #16
595+
; CHECK-NEXT: st1.b { v0 }[4], [x8]
596+
; CHECK-NEXT: st1.b { v0 }[8], [x9]
597+
; CHECK-NEXT: st1.b { v0 }[0], [x1]
608598
; CHECK-NEXT: ret
609599
;
610600
; BE-LABEL: shift_trunc_store:
@@ -632,17 +622,13 @@ define void @shift_trunc_store(ptr %src, ptr %dst) {
632622
define void @shift_trunc_store_default_align(ptr %src, ptr %dst) {
633623
; CHECK-LABEL: shift_trunc_store_default_align:
634624
; CHECK: ; %bb.0:
635-
; CHECK-NEXT: sub sp, sp, #16
636-
; CHECK-NEXT: .cfi_def_cfa_offset 16
637625
; CHECK-NEXT: ldr q0, [x0]
638-
; CHECK-NEXT: shrn.4h v0, v0, #16
639-
; CHECK-NEXT: xtn.8b v1, v0
640-
; CHECK-NEXT: umov.h w8, v0[2]
641-
; CHECK-NEXT: str s1, [sp, #12]
642-
; CHECK-NEXT: ldrh w9, [sp, #12]
643-
; CHECK-NEXT: strb w8, [x1, #2]
644-
; CHECK-NEXT: strh w9, [x1]
645-
; CHECK-NEXT: add sp, sp, #16
626+
; CHECK-NEXT: add x8, x1, #1
627+
; CHECK-NEXT: add x9, x1, #2
628+
; CHECK-NEXT: ushr.4s v0, v0, #16
629+
; CHECK-NEXT: st1.b { v0 }[4], [x8]
630+
; CHECK-NEXT: st1.b { v0 }[8], [x9]
631+
; CHECK-NEXT: st1.b { v0 }[0], [x1]
646632
; CHECK-NEXT: ret
647633
;
648634
; BE-LABEL: shift_trunc_store_default_align:
@@ -670,17 +656,13 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) {
670656
define void @shift_trunc_store_align_4(ptr %src, ptr %dst) {
671657
; CHECK-LABEL: shift_trunc_store_align_4:
672658
; CHECK: ; %bb.0:
673-
; CHECK-NEXT: sub sp, sp, #16
674-
; CHECK-NEXT: .cfi_def_cfa_offset 16
675659
; CHECK-NEXT: ldr q0, [x0]
676-
; CHECK-NEXT: shrn.4h v0, v0, #16
677-
; CHECK-NEXT: xtn.8b v1, v0
678-
; CHECK-NEXT: umov.h w8, v0[2]
679-
; CHECK-NEXT: str s1, [sp, #12]
680-
; CHECK-NEXT: ldrh w9, [sp, #12]
681-
; CHECK-NEXT: strb w8, [x1, #2]
682-
; CHECK-NEXT: strh w9, [x1]
683-
; CHECK-NEXT: add sp, sp, #16
660+
; CHECK-NEXT: add x8, x1, #1
661+
; CHECK-NEXT: add x9, x1, #2
662+
; CHECK-NEXT: ushr.4s v0, v0, #16
663+
; CHECK-NEXT: st1.b { v0 }[4], [x8]
664+
; CHECK-NEXT: st1.b { v0 }[8], [x9]
665+
; CHECK-NEXT: st1.b { v0 }[0], [x1]
684666
; CHECK-NEXT: ret
685667
;
686668
; BE-LABEL: shift_trunc_store_align_4:
@@ -708,17 +690,14 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) {
708690
define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) {
709691
; CHECK-LABEL: shift_trunc_store_const_offset_1:
710692
; CHECK: ; %bb.0:
711-
; CHECK-NEXT: sub sp, sp, #16
712-
; CHECK-NEXT: .cfi_def_cfa_offset 16
713693
; CHECK-NEXT: ldr q0, [x0]
714-
; CHECK-NEXT: shrn.4h v0, v0, #16
715-
; CHECK-NEXT: xtn.8b v1, v0
716-
; CHECK-NEXT: umov.h w8, v0[2]
717-
; CHECK-NEXT: str s1, [sp, #12]
718-
; CHECK-NEXT: ldrh w9, [sp, #12]
719-
; CHECK-NEXT: strb w8, [x1, #3]
720-
; CHECK-NEXT: sturh w9, [x1, #1]
721-
; CHECK-NEXT: add sp, sp, #16
694+
; CHECK-NEXT: add x8, x1, #2
695+
; CHECK-NEXT: add x9, x1, #3
696+
; CHECK-NEXT: ushr.4s v0, v0, #16
697+
; CHECK-NEXT: st1.b { v0 }[4], [x8]
698+
; CHECK-NEXT: add x8, x1, #1
699+
; CHECK-NEXT: st1.b { v0 }[8], [x9]
700+
; CHECK-NEXT: st1.b { v0 }[0], [x8]
722701
; CHECK-NEXT: ret
723702
;
724703
; BE-LABEL: shift_trunc_store_const_offset_1:
@@ -747,17 +726,14 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) {
747726
define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) {
748727
; CHECK-LABEL: shift_trunc_store_const_offset_3:
749728
; CHECK: ; %bb.0:
750-
; CHECK-NEXT: sub sp, sp, #16
751-
; CHECK-NEXT: .cfi_def_cfa_offset 16
752729
; CHECK-NEXT: ldr q0, [x0]
753-
; CHECK-NEXT: shrn.4h v0, v0, #16
754-
; CHECK-NEXT: xtn.8b v1, v0
755-
; CHECK-NEXT: umov.h w8, v0[2]
756-
; CHECK-NEXT: str s1, [sp, #12]
757-
; CHECK-NEXT: ldrh w9, [sp, #12]
758-
; CHECK-NEXT: strb w8, [x1, #5]
759-
; CHECK-NEXT: sturh w9, [x1, #3]
760-
; CHECK-NEXT: add sp, sp, #16
730+
; CHECK-NEXT: add x8, x1, #4
731+
; CHECK-NEXT: add x9, x1, #5
732+
; CHECK-NEXT: ushr.4s v0, v0, #16
733+
; CHECK-NEXT: st1.b { v0 }[4], [x8]
734+
; CHECK-NEXT: add x8, x1, #3
735+
; CHECK-NEXT: st1.b { v0 }[8], [x9]
736+
; CHECK-NEXT: st1.b { v0 }[0], [x8]
761737
; CHECK-NEXT: ret
762738
;
763739
; BE-LABEL: shift_trunc_store_const_offset_3:

0 commit comments

Comments
 (0)