Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions llvm/lib/Target/LoongArch/LoongArch.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,11 @@ bool lowerLoongArchMachineOperandToMCOperand(const MachineOperand &MO,
FunctionPass *createLoongArchExpandAtomicPseudoPass();
FunctionPass *createLoongArchISelDag(LoongArchTargetMachine &TM);
FunctionPass *createLoongArchPreRAExpandPseudoPass();
FunctionPass *createLoongArchExpandPseudoPass();
void initializeLoongArchDAGToDAGISelPass(PassRegistry &);
void initializeLoongArchExpandAtomicPseudoPass(PassRegistry &);
void initializeLoongArchPreRAExpandPseudoPass(PassRegistry &);
void initializeLoongArchExpandPseudoPass(PassRegistry &);
} // end namespace llvm

#endif // LLVM_LIB_TARGET_LOONGARCH_LOONGARCH_H
121 changes: 121 additions & 0 deletions llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ using namespace llvm;

#define LOONGARCH_PRERA_EXPAND_PSEUDO_NAME \
"LoongArch Pre-RA pseudo instruction expansion pass"
#define LOONGARCH_EXPAND_PSEUDO_NAME \
"LoongArch pseudo instruction expansion pass"

namespace {

Expand Down Expand Up @@ -513,15 +515,134 @@ bool LoongArchPreRAExpandPseudo::expandFunctionCALL(
return true;
}

/// Machine function pass that expands LoongArch pseudo instructions into real
/// instruction sequences. It is added to the pipeline from
/// LoongArchPassConfig::addPreEmitPass2() and currently handles a single
/// pseudo, PseudoCopyCFR (see expandMI).
class LoongArchExpandPseudo : public MachineFunctionPass {
public:
  /// Instruction info of the function being processed. It is assigned at the
  /// start of runOnMachineFunction() and is null before the first run, rather
  /// than holding an indeterminate value.
  const LoongArchInstrInfo *TII = nullptr;
  static char ID;

  LoongArchExpandPseudo() : MachineFunctionPass(ID) {
    initializeLoongArchExpandPseudoPass(*PassRegistry::getPassRegistry());
  }

  /// Expands the supported pseudos in every basic block of \p MF.
  /// \returns true if any instruction was expanded.
  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return LOONGARCH_EXPAND_PSEUDO_NAME;
  }

private:
  /// Walks \p MBB and expands each supported pseudo found in it.
  bool expandMBB(MachineBasicBlock &MBB);

  /// Dispatches on the opcode at \p MBBI. \p NextMBBI is the iterator the
  /// caller resumes from; an expansion may overwrite it.
  bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                MachineBasicBlock::iterator &NextMBBI);

  /// Expands a PseudoCopyCFR at \p MBBI into a compare/branch sequence.
  bool expandCopyCFR(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                     MachineBasicBlock::iterator &NextMBBI);
};

// Out-of-class definition of the static pass ID declared in
// LoongArchExpandPseudo; the constructor hands it to MachineFunctionPass.
char LoongArchExpandPseudo::ID = 0;

/// Pass entry point: caches the subtarget's instruction info in TII and then
/// expands pseudos block by block. Returns true if anything was changed.
bool LoongArchExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
  const auto &Subtarget = MF.getSubtarget();
  TII = static_cast<const LoongArchInstrInfo *>(Subtarget.getInstrInfo());

  bool Changed = false;
  for (auto &Block : MF) {
    // Every block is visited regardless of earlier results.
    if (expandMBB(Block))
      Changed = true;
  }
  return Changed;
}

/// Visits every instruction of \p MBB once and tries to expand it.
/// Returns true if at least one instruction was expanded.
bool LoongArchExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
  bool Changed = false;

  for (MachineBasicBlock::iterator Cur = MBB.begin(), End = MBB.end();
       Cur != End;) {
    // The successor is computed before expanding because the expansion may
    // remove the current instruction; expandMI may also redirect it.
    MachineBasicBlock::iterator Next = std::next(Cur);
    if (expandMI(MBB, Cur, Next))
      Changed = true;
    Cur = Next;
  }

  return Changed;
}

/// Expands the instruction at \p MBBI if it is a pseudo this pass handles.
/// Returns false, leaving the instruction untouched, for any other opcode.
bool LoongArchExpandPseudo::expandMI(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     MachineBasicBlock::iterator &NextMBBI) {
  switch (MBBI->getOpcode()) {
  default:
    // Not one of the pseudos expanded here.
    return false;
  case LoongArch::PseudoCopyCFR:
    return expandCopyCFR(MBB, MBBI, NextMBBI);
  }
}

// Expand a PseudoCopyCFR (operand 0 = destination CFR, operand 1 = source CFR)
// into a compare/branch sequence. MBB is split in three: MBB keeps everything
// before the pseudo plus the new compare and branch, FalseBB sets the
// destination to true, and SinkBB receives the instructions that followed the
// pseudo. NextMBBI is set to MBB.end(). Always returns true.
bool LoongArchExpandPseudo::expandCopyCFR(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI) {
MachineFunction *MF = MBB.getParent();
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();

// NOTE: $dst is cleared with fcmp.caf.s rather than `movgr2cf $dst, $zero`.
// A micro-benchmark posted during review (10^8 executions of each instruction)
// took about 0.02s with fcmp.caf.s versus about 0.69s with movgr2cf.
// Expand:
// MBB:
// fcmp.caf.s $dst, $fa0, $fa0 # set $dst 0(false)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know if movgr2cf $dst, $zero would be better micro-architecture-wise, perhaps you know better?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW GCC uses movgr2cf %0,$r0 for zeroing an fcc. But I'm not sure which is better micro-architecture-wise either.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some tests show movgr2cf is slower than other insns, but not sure about fcmp.caf.s.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Phew:

$ cat t.S
.globl main
main:
  li.w		$a0, 1000000
.L0:
.rept 100
#if USE_MOVGR2CF
  movgr2cf	$fcc0, $r0
#else
  fcmp.caf.s	$fcc0, $f0, $f0
#endif
.endr
  addi.w	$a0, $a0, -1
  bnez		$a0, .L0
  li.w		$a0, 0
  jr		$ra
$ gcc t.S -DUSE_MOVGR2CF
$ time ./a.out

real	0m0.688s
user	0m0.687s
sys	0m0.001s
$ gcc t.S
$ time ./a.out

real	0m0.024s
user	0m0.023s
sys	0m0.000s

So fcmp.caf.s is indeed better...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then maybe documenting this finding would be beneficial (and GCC could use some micro-optimization too)!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(And you could inform the HW team to add short-circuiting for movgr2cf *, $zero so it doesn't naïvely go to the ALU unconditionally (which I expect to be the reason for the slowdown) in that case. So we can have the semantically-equivalent patterns execute at the same speed without surprises on future models...)

// bceqz $src, SinkBB
// FalseBB:
// fcmp.cueq.s $dst, $fa0, $fa0 # set $dst 1(true)
// SinkBB:
// fallthrough

// Both new blocks are created for the same IR basic block as MBB.
const BasicBlock *LLVM_BB = MBB.getBasicBlock();
auto *FalseBB = MF->CreateMachineBasicBlock(LLVM_BB);
auto *SinkBB = MF->CreateMachineBasicBlock(LLVM_BB);

// Lay the blocks out as MBB, FalseBB, SinkBB.
MF->insert(++MBB.getIterator(), FalseBB);
MF->insert(++FalseBB->getIterator(), SinkBB);

Register DestReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
// DestReg = 0
BuildMI(MBB, MBBI, DL, TII->get(LoongArch::SET_CFR_FALSE), DestReg);
// Insert branch instruction.
// If SrcReg is zero the branch skips FalseBB, leaving DestReg at 0.
BuildMI(MBB, MBBI, DL, TII->get(LoongArch::BCEQZ))
.addReg(SrcReg)
.addMBB(SinkBB);
// DestReg = 1
BuildMI(FalseBB, DL, TII->get(LoongArch::SET_CFR_TRUE), DestReg);

FalseBB->addSuccessor(SinkBB);

// Move the pseudo and every instruction after it into SinkBB (the pseudo
// itself is erased below), and let SinkBB take over MBB's successors.
SinkBB->splice(SinkBB->end(), &MBB, MI, MBB.end());
SinkBB->transferSuccessors(&MBB);

MBB.addSuccessor(FalseBB);
MBB.addSuccessor(SinkBB);

// Nothing is left in MBB after the inserted branch, so the caller's scan of
// MBB stops here. The moved instructions now live in SinkBB; presumably they
// are scanned when the function-level loop reaches that block.
NextMBBI = MBB.end();
MI.eraseFromParent();

// Make sure live-ins are correctly attached to this new basic block.
LivePhysRegs LiveRegs;
computeAndAddLiveIns(LiveRegs, *FalseBB);
computeAndAddLiveIns(LiveRegs, *SinkBB);

return true;
}

} // end namespace

// Pass registration for the pre-RA expansion pass: pass class, short pass
// name, descriptive name. NOTE(review): the two trailing `false` flags are
// presumably "CFG only" and "is analysis" - confirm against PassSupport.h.
INITIALIZE_PASS(LoongArchPreRAExpandPseudo, "loongarch-prera-expand-pseudo",
LOONGARCH_PRERA_EXPAND_PSEUDO_NAME, false, false)

// Pass registration for LoongArchExpandPseudo, with the same argument layout
// as above.
INITIALIZE_PASS(LoongArchExpandPseudo, "loongarch-expand-pseudo",
LOONGARCH_EXPAND_PSEUDO_NAME, false, false)

namespace llvm {

/// Creates a new instance of the pre-RA pseudo instruction expansion pass.
FunctionPass *createLoongArchPreRAExpandPseudoPass() {
  return new LoongArchPreRAExpandPseudo;
}

/// Creates a new instance of the pseudo instruction expansion pass.
FunctionPass *createLoongArchExpandPseudoPass() {
  return new LoongArchExpandPseudo;
}

} // end namespace llvm
17 changes: 17 additions & 0 deletions llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,23 @@ def PseudoST_CFR : Pseudo<(outs),
let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
def PseudoLD_CFR : Pseudo<(outs CFR:$ccd),
(ins GPR:$rj, grlenimm:$imm)>;

// SET_CFR_{FALSE,TRUE}
// Codegen-only instructions that set a CFR to a constant; they are what
// PseudoCopyCFR is expanded into. They are defined separately, with no input
// operands, in order to avoid an expensive-check error that occurs if the
// regular instruction patterns are used.
// NOTE(review): presumably the error comes from the machine verifier seeing a
// read of $fa0 in expensive-checks builds - confirm.
// fcmp.caf.s $dst, $fa0, $fa0 (sets $dst to 0, false)
def SET_CFR_FALSE : SET_CFR<0x0c100000, "fcmp.caf.s">;
// fcmp.cueq.s $dst, $fa0, $fa0 (sets $dst to 1, true)
def SET_CFR_TRUE : SET_CFR<0x0c160000, "fcmp.cueq.s">;

// Pseudo instruction for a CFR-to-CFR copy ($dst = $src). It touches no memory
// and has no side effects. Size is 12 bytes to cover the three instructions it
// is expanded into (presumably 4 bytes each).
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 12 in
def PseudoCopyCFR : Pseudo<(outs CFR:$dst), (ins CFR:$src)>;

} // Predicates = [HasBasicF]

//===----------------------------------------------------------------------===//
Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Target/LoongArch/LoongArchFloatInstrFormats.td
Original file line number Diff line number Diff line change
Expand Up @@ -218,3 +218,15 @@ class FP_STORE_2RI12<bits<32> op, RegisterClass rc = FPR32>
: FPFmt2RI12<op, (outs), (ins rc:$fd, GPR:$rj, simm12:$imm12),
"$fd, $rj, $imm12">;
} // hasSideEffects = 0, mayLoad = 0, mayStore = 1

// Base class of the `SET_CFR_{FALSE,TRUE}` instructions, which are the
// building blocks of the `PseudoCopyCFR` expansion. `op` is the instruction
// encoding and `opcstr` the mnemonic printed in assembly.
class SET_CFR<bits<32> op, string opcstr> : FP_CMP<op> {
  // The destination condition flag is the only operand; there are no inputs.
  let OutOperandList = (outs CFR:$cd);
  let InOperandList = (ins);
  // Both source register fields are fixed to 0 (fa0), which is why the
  // assembly string spells out $fa0 literally.
  let fj = 0;
  let fk = 0;
  let AsmString = opcstr # "\t$cd, $$fa0, $$fa0";
  let isCodeGenOnly = 1;
}
6 changes: 6 additions & 0 deletions llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,12 @@ void LoongArchInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
// CFR->CFR copy.
if (LoongArch::CFRRegClass.contains(DstReg, SrcReg)) {
BuildMI(MBB, MBBI, DL, get(LoongArch::PseudoCopyCFR), DstReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}

// FPR->FPR copies.
unsigned Opc;
Expand Down
7 changes: 0 additions & 7 deletions llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,6 @@ LoongArchRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
if (TFI->hasBP(MF))
markSuperRegs(Reserved, LoongArchABI::getBPReg()); // bp

// FIXME: To avoid generating COPY instructions between CFRs, only use $fcc0.
// This is required to work around the fact that COPY instruction between CFRs
// is not provided in LoongArch.
if (MF.getSubtarget<LoongArchSubtarget>().hasBasicF())
for (size_t Reg = LoongArch::FCC1; Reg <= LoongArch::FCC7; ++Reg)
markSuperRegs(Reserved, Reg);

assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
}
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ LoongArchTargetMachine::getTargetTransformInfo(const Function &F) const {
/// Adds the branch relaxation pass to the pre-emit stage.
void LoongArchPassConfig::addPreEmitPass() {
  addPass(&BranchRelaxationPassID);
}

void LoongArchPassConfig::addPreEmitPass2() {
addPass(createLoongArchExpandPseudoPass());
// Schedule the expansion of AtomicPseudos at the last possible moment,
// avoiding the possibility for other passes to break the requirements for
// forward progress in the LL/SC block.
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/LoongArch/O0-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Machine Optimization Remark Emitter
; CHECK-NEXT: Stack Frame Layout Analysis
; CHECK-NEXT: LoongArch pseudo instruction expansion pass
; CHECK-NEXT: LoongArch atomic pseudo instruction expansion pass
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Machine Optimization Remark Emitter
Expand Down
34 changes: 34 additions & 0 deletions llvm/test/CodeGen/LoongArch/cfr-copy.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
# RUN: llc --mtriple=loongarch64 --mattr=+d %s -o - | FileCheck %s

## Check the expansion of the PseudoCopyCFR instruction.

--- |
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
target triple = "loongarch64"

define void @test() {
; CHECK-LABEL: test:
; CHECK: # %bb.0:
; CHECK-NEXT: fcmp.caf.s $fcc1, $fa0, $fa0
; CHECK-NEXT: bceqz $fcc0, .LBB0_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: fcmp.cueq.s $fcc1, $fa0, $fa0
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: movcf2gr $a0, $fcc1
; CHECK-NEXT: ret
ret void
}
...
---
name: test
tracksRegLiveness: true
body: |
bb.0:
liveins: $fcc0

$fcc1 = COPY $fcc0
$r4 = COPY $fcc1
PseudoRET implicit killed $r4

...
26 changes: 26 additions & 0 deletions llvm/test/CodeGen/LoongArch/cfr-pseudo-copy.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc --mtriple=loongarch64 --mattr=+d --stop-after=postrapseudos %s \
# RUN: -o - | FileCheck %s

## Check the COPY instruction between CFRs.
## A pseudo (PseudoCopyCFR) is generated after postrapseudos pass.

...
---
name: test
tracksRegLiveness: true
body: |
bb.0.entry:
liveins: $fcc0

; CHECK-LABEL: name: test
; CHECK: liveins: $fcc0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $fcc1 = PseudoCopyCFR $fcc0
; CHECK-NEXT: $r4 = MOVCF2GR killed $fcc1
; CHECK-NEXT: PseudoRET implicit killed $r4
$fcc1 = COPY $fcc0
$r4 = COPY $fcc1
PseudoRET implicit killed $r4

...
13 changes: 6 additions & 7 deletions llvm/test/CodeGen/LoongArch/inline-asm-clobbers-fcc.mir
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc --mtriple=loongarch64 --mattr=+d --run-pass=greedy %s -o - | FileCheck %s
# RUN: llc --mtriple=loongarch64 --mattr=+d --regalloc=fast \
# RUN: --stop-before=postra-machine-sink %s -o - | FileCheck %s

## Verify that the fcc register clobbered by inlineasm is correctly saved by checking that
## a pair of pseudos (PseudoST_CFR and PseudoLD_CFR) are generated before and
Expand All @@ -15,13 +16,11 @@ body: |
; CHECK-LABEL: name: test
; CHECK: liveins: $f0_64, $f1_64
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY $f1_64
; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr64 = COPY $f0_64
; CHECK-NEXT: [[FCMP_CLT_D:%[0-9]+]]:cfr = FCMP_CLT_D [[COPY]], [[COPY1]]
; CHECK-NEXT: PseudoST_CFR [[FCMP_CLT_D]], %stack.0, 0 :: (store (s64) into %stack.0)
; CHECK-NEXT: renamable $fcc0 = FCMP_CLT_D renamable $f1_64, renamable $f0_64
; CHECK-NEXT: PseudoST_CFR $fcc0, %stack.0, 0 :: (store (s64) into %stack.0)
; CHECK-NEXT: INLINEASM &nop, 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $fcc0
; CHECK-NEXT: [[PseudoLD_CFR:%[0-9]+]]:cfr = PseudoLD_CFR %stack.0, 0 :: (load (s64) from %stack.0)
; CHECK-NEXT: $r4 = COPY [[PseudoLD_CFR]]
; CHECK-NEXT: $fcc0 = PseudoLD_CFR %stack.0, 0 :: (load (s64) from %stack.0)
; CHECK-NEXT: $r4 = COPY killed renamable $fcc0
; CHECK-NEXT: PseudoRET implicit killed $r4
%1:fpr64 = COPY $f1_64
%0:fpr64 = COPY $f0_64
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/LoongArch/opt-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Machine Optimization Remark Emitter
; CHECK-NEXT: Stack Frame Layout Analysis
; CHECK-NEXT: LoongArch pseudo instruction expansion pass
; CHECK-NEXT: LoongArch atomic pseudo instruction expansion pass
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Machine Optimization Remark Emitter
Expand Down