Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 132 additions & 0 deletions llvm/lib/Target/PowerPC/PPCISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1363,6 +1363,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v512i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v512i1, Custom);
}
if (Subtarget.isISAFuture()) {
setOperationAction(ISD::LOAD, MVT::v1024i1, Custom);
setOperationAction(ISD::STORE, MVT::v1024i1, Custom);
addRegisterClass(MVT::v1024i1, &PPC::DMRRCRegClass);
}
Copy link
Contributor

@lei137 lei137 Feb 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be within the if (Subtarget.hasMMA()) block on line 1357? I am basing this on the fact that we need hasMMA() for this support.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to PPC.cpp, the future CPU should include all of the features of Power11, and Power11 includes all the same features as Power10, for which the mma feature is set to true, so I think isISAFuture implies hasMMA.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All the checks of isISAFuture should I think be interpreted as placeholders. At some point a concrete target processor needs to be added and at that point all the checks for future should be updated to check specific features rather than just future - "future + 1" would also have those features but a check for future would fail.
Also for that reason I think the single condition is more correct, since we should be asking about one feature and not a cpu name in addition.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, Power11 implies hasMMA. I was thinking more of issues where users manually turn features on or off for a specific CPU. This patch uses lxvp|stxvp, so if the user explicitly turns off mma or paired-vector-memops, we shouldn't be generating this code.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see that you have the guards in the custom lowering functions though.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the checks are inconsistent. I could make them consistent. The final switching code is TBD because the target features are TBD.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For paired support not being there, I think hitting the assert is better. It would be a case of manually forcing it off since ISA level supports it.


if (Subtarget.has64BitSupport())
setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
Expand Down Expand Up @@ -11758,6 +11763,64 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
return Op;
}

/// Custom-lower a v1024i1 load into 256-bit vector-pair loads.
///
/// v1024i1 is the type used for Dense Math dmr registers. The value is read
/// as VT.getSizeInBits() / 256 vector pairs through the ppc_vsx_lxvp
/// intrinsic, each one 32 bytes past the previous address. On little-endian
/// subtargets the list of loaded pairs is reversed. The first two pairs are
/// then combined with DMXXINSTFDMR512 (placed at sub_wacc_lo), the last two
/// with DMXXINSTFDMR512_HI (placed at sub_wacc_hi), and both halves are
/// joined by a REG_SEQUENCE using the DMRRC register class.
///
/// \returns the merged values {v1024i1 value, TokenFactor of the load chains}.
SDValue PPCTargetLowering::LowerDMFVectorLoad(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc dl(Op);
  LoadSDNode *LdN = cast<LoadSDNode>(Op.getNode());
  SDValue InChain = LdN->getChain();
  SDValue Addr = LdN->getBasePtr();
  EVT VT = Op.getValueType();

  // Type v1024i1 is used for Dense Math dmr registers.
  assert(VT == MVT::v1024i1 && "Unsupported type.");
  assert((Subtarget.hasMMA() && Subtarget.isISAFuture()) &&
         "Dense Math support required.");
  assert(Subtarget.pairedVectorMemops() && "Vector pair support required.");

  SmallVector<SDValue, 4> Pairs;
  SmallVector<SDValue, 4> PairChains;
  SDValue LxvpID = DAG.getConstant(Intrinsic::ppc_vsx_lxvp, dl, MVT::i32);
  MachineMemOperand *OrigMMO = LdN->getMemOperand();
  unsigned NumPairs = VT.getSizeInBits() / 256;
  for (unsigned Idx = 0; Idx != NumPairs; ++Idx) {
    MachineMemOperand *PairMMO =
        DAG.getMachineFunction().getMachineMemOperand(OrigMMO, Idx * 32, 32);
    // Every pair after the first is addressed 32 bytes past the previous one.
    if (Idx != 0)
      Addr = DAG.getNode(ISD::ADD, dl, Addr.getValueType(), Addr,
                         DAG.getConstant(32, dl, Addr.getValueType()));
    // All pair loads hang off the incoming chain of the original load.
    SDValue PairOps[] = {InChain, LxvpID, Addr};
    SDValue Pair = DAG.getMemIntrinsicNode(
        ISD::INTRINSIC_W_CHAIN, dl, DAG.getVTList(MVT::v256i1, MVT::Other),
        PairOps, MVT::v256i1, PairMMO);
    PairChains.push_back(Pair.getValue(1));
    Pairs.push_back(Pair);
  }

  if (Subtarget.isLittleEndian()) {
    std::reverse(Pairs.begin(), Pairs.end());
    std::reverse(PairChains.begin(), PairChains.end());
  }

  SDValue OutChain =
      DAG.getNode(ISD::TokenFactor, dl, MVT::Other, PairChains);

  // Build the two 512-bit halves, then glue them into the full register.
  SDValue LoHalf = SDValue(DAG.getMachineNode(PPC::DMXXINSTFDMR512, dl,
                                              MVT::v512i1, Pairs[0], Pairs[1]),
                           0);
  SDValue LoIdx = DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32);
  SDValue HiHalf = SDValue(DAG.getMachineNode(PPC::DMXXINSTFDMR512_HI, dl,
                                              MVT::v512i1, Pairs[2], Pairs[3]),
                           0);
  SDValue HiIdx = DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32);
  SDValue RegClass = DAG.getTargetConstant(PPC::DMRRCRegClassID, dl, MVT::i32);
  const SDValue SeqOps[] = {RegClass, LoHalf, LoIdx, HiHalf, HiIdx};
  SDValue Dmr = SDValue(
      DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1, SeqOps), 0);

  SDValue Results[] = {Dmr, OutChain};
  return DAG.getMergeValues(Results, dl);
}

SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
Expand All @@ -11766,6 +11829,9 @@ SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
SDValue BasePtr = LN->getBasePtr();
EVT VT = Op.getValueType();

if (VT == MVT::v1024i1)
return LowerDMFVectorLoad(Op, DAG);

if (VT != MVT::v256i1 && VT != MVT::v512i1)
return Op;

Expand Down Expand Up @@ -11803,6 +11869,69 @@ SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
return DAG.getMergeValues(RetOps, dl);
}

/// Custom-lower a v1024i1 store into 256-bit vector-pair stores.
///
/// v1024i1 is the type used for Dense Math dmr registers. The stored value
/// (operand 1 of \p Op) is split into its sub_wacc_lo and sub_wacc_hi
/// 512-bit halves with EXTRACT_SUBREG; each half is split into two v256i1
/// values with DMXXEXTFDMR512 / DMXXEXTFDMR512_HI. On little-endian
/// subtargets the list of pairs is reversed. Each pair is then written
/// through the ppc_vsx_stxvp intrinsic, each one 32 bytes past the previous
/// address.
///
/// \returns a TokenFactor joining all of the emitted stores.
SDValue PPCTargetLowering::LowerDMFVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc dl(Op);
  StoreSDNode *StN = cast<StoreSDNode>(Op.getNode());
  SDValue InChain = StN->getChain();
  SDValue Addr = StN->getBasePtr();
  SmallVector<SDValue, 4> Pairs;
  SmallVector<SDValue, 4> PairStores;
  EVT VT = StN->getValue().getValueType();

  // Type v1024i1 is used for Dense Math dmr registers.
  assert(VT == MVT::v1024i1 && "Unsupported type.");
  assert((Subtarget.hasMMA() && Subtarget.isISAFuture()) &&
         "Dense Math support required.");
  assert(Subtarget.pairedVectorMemops() && "Vector pair support required.");

  // Pull the two 512-bit halves out of the value being stored.
  SDValue LoIdx = DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32);
  SDValue LoHalf = SDValue(
      DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::v512i1,
                         Op.getOperand(1), LoIdx),
      0);
  SDValue HiIdx = DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32);
  SDValue HiHalf = SDValue(
      DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::v512i1,
                         Op.getOperand(1), HiIdx),
      0);

  // Each extract node yields two v256i1 results.
  EVT PairTys[] = {MVT::v256i1, MVT::v256i1};
  MachineSDNode *LoExt =
      DAG.getMachineNode(PPC::DMXXEXTFDMR512, dl, PairTys, LoHalf);
  Pairs.push_back(SDValue(LoExt, 0));
  Pairs.push_back(SDValue(LoExt, 1));
  MachineSDNode *HiExt =
      DAG.getMachineNode(PPC::DMXXEXTFDMR512_HI, dl, PairTys, HiHalf);
  Pairs.push_back(SDValue(HiExt, 0));
  Pairs.push_back(SDValue(HiExt, 1));

  if (Subtarget.isLittleEndian())
    std::reverse(Pairs.begin(), Pairs.end());

  SDVTList StoreTys = DAG.getVTList(MVT::Other);
  SDValue StxvpID = DAG.getConstant(Intrinsic::ppc_vsx_stxvp, dl, MVT::i32);
  MachineMemOperand *OrigMMO = StN->getMemOperand();
  unsigned NumPairs = VT.getSizeInBits() / 256;
  for (unsigned Idx = 0; Idx != NumPairs; ++Idx) {
    MachineMemOperand *PairMMO =
        DAG.getMachineFunction().getMachineMemOperand(OrigMMO, Idx * 32, 32);
    // Every pair after the first is addressed 32 bytes past the previous one.
    if (Idx != 0)
      Addr = DAG.getNode(ISD::ADD, dl, Addr.getValueType(), Addr,
                         DAG.getConstant(32, dl, Addr.getValueType()));
    // All pair stores hang off the incoming chain of the original store.
    SmallVector<SDValue, 4> PairOps{InChain, StxvpID, Pairs[Idx], Addr};
    PairStores.push_back(DAG.getMemIntrinsicNode(
        ISD::INTRINSIC_VOID, dl, StoreTys, PairOps, MVT::v256i1, PairMMO));
  }

  return DAG.getTokenFactor(dl, PairStores);
}

SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
Expand All @@ -11813,6 +11942,9 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
SDValue Value2 = SN->getValue();
EVT StoreVT = Value.getValueType();

if (StoreVT == MVT::v1024i1)
return LowerDMFVectorStore(Op, DAG);

if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1)
return Op;

Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/PowerPC/PPCISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -1344,6 +1344,8 @@ namespace llvm {

SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
// Custom lowering of v1024i1 (Dense Math dmr register) loads and stores.
SDValue LowerDMFVectorLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDMFVectorStore(SDValue Op, SelectionDAG &DAG) const;

SDValue LowerCallResult(SDValue Chain, SDValue InGlue,
CallingConv::ID CallConv, bool isVarArg,
Expand Down
47 changes: 47 additions & 0 deletions llvm/test/CodeGen/PowerPC/v1024ls.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=future -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
; RUN: -mcpu=future -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE

; Load a <1024 x i1> value from %vqp and store it to %resp. The CHECK lines
; cover the little-endian run and the CHECK-BE lines the big-endian run; both
; expect four lxvp loads assembled with dmxxinstfdmr512 and four stxvp stores
; fed by dmxxextfdmr512.
define void @v1024ls(ptr nocapture readonly %vqp, ptr nocapture %resp) {
; CHECK-LABEL: v1024ls:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxvp vsp34, 0(r3)
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: lxvp vsp34, 64(r3)
; CHECK-NEXT: lxvp vsp36, 96(r3)
; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
; CHECK-NEXT: stxvp vsp34, 96(r4)
; CHECK-NEXT: stxvp vsp36, 64(r4)
; CHECK-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
; CHECK-NEXT: stxvp vsp34, 32(r4)
; CHECK-NEXT: stxvp vsp36, 0(r4)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: v1024ls:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxvp vsp34, 96(r3)
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: lxvp vsp34, 32(r3)
; CHECK-BE-NEXT: lxvp vsp36, 0(r3)
; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
; CHECK-BE-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
; CHECK-BE-NEXT: stxvp vsp36, 96(r4)
; CHECK-BE-NEXT: stxvp vsp34, 64(r4)
; CHECK-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
; CHECK-BE-NEXT: stxvp vsp36, 32(r4)
; CHECK-BE-NEXT: stxvp vsp34, 0(r4)
; CHECK-BE-NEXT: blr
entry:
%0 = load <1024 x i1>, ptr %vqp, align 64
store <1024 x i1> %0, ptr %resp, align 64
ret void
}

declare <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
Loading