Skip to content

Commit 11fb09e

Browse files
committed
[X86] Change precision control to FP80 during u64->fp32 conversion on Windows.
This is an alternative to D141074 that fixes the problem by adjusting the precision control dynamically.
Reviewed By: icedrocket
Differential Revision: https://reviews.llvm.org/D142178
1 parent 402981e commit 11fb09e

File tree

4 files changed

+145
-5
lines changed

4 files changed

+145
-5
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

+78-3
Original file line numberDiff line numberDiff line change
@@ -22048,15 +22048,25 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
2204822048
// Extend everything to 80 bits to force it to be done on x87.
2204922049
// TODO: Are there any fast-math-flags to propagate here?
2205022050
if (IsStrict) {
22051-
SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
22052-
{Chain, Fild, Fudge});
22051+
unsigned Opc = ISD::STRICT_FADD;
22052+
// Windows needs the precision control changed to 80 bits around this add.
22053+
if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22054+
Opc = X86ISD::STRICT_FP80_ADD;
22055+
22056+
SDValue Add =
22057+
DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
2205322058
// STRICT_FP_ROUND can't handle equal types.
2205422059
if (DstVT == MVT::f80)
2205522060
return Add;
2205622061
return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
2205722062
{Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
2205822063
}
22059-
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
22064+
unsigned Opc = ISD::FADD;
22065+
// Windows needs the precision control changed to 80 bits around this add.
22066+
if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22067+
Opc = X86ISD::FP80_ADD;
22068+
22069+
SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
2206022070
return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
2206122071
DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
2206222072
}
@@ -34881,6 +34891,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
3488134891
NODE_NAME_CASE(AESDECWIDE256KL)
3488234892
NODE_NAME_CASE(CMPCCXADD)
3488334893
NODE_NAME_CASE(TESTUI)
34894+
NODE_NAME_CASE(FP80_ADD)
34895+
NODE_NAME_CASE(STRICT_FP80_ADD)
3488434896
}
3488534897
return nullptr;
3488634898
#undef NODE_NAME_CASE
@@ -37356,6 +37368,69 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
3735637368
case X86::CMOV_VK64:
3735737369
return EmitLoweredSelect(MI, BB);
3735837370

37371+
case X86::FP80_ADDr:
37372+
case X86::FP80_ADDm32: {
37373+
// Change the floating point control register to use double extended
37374+
// precision when performing the addition.
37375+
int OrigCWFrameIdx =
37376+
MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37377+
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)),
37378+
OrigCWFrameIdx);
37379+
37380+
// Load the old value of the control word...
37381+
Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37382+
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
37383+
OrigCWFrameIdx);
37384+
37385+
// OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended
37386+
// precision.
37387+
Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37388+
BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
37389+
.addReg(OldCW, RegState::Kill)
37390+
.addImm(0x300);
37391+
37392+
// Extract to 16 bits.
37393+
Register NewCW16 =
37394+
MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37395+
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
37396+
.addReg(NewCW, RegState::Kill, X86::sub_16bit);
37397+
37398+
// Prepare memory for FLDCW.
37399+
int NewCWFrameIdx =
37400+
MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37401+
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
37402+
NewCWFrameIdx)
37403+
.addReg(NewCW16, RegState::Kill);
37404+
37405+
// Reload the modified control word now...
37406+
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
37407+
NewCWFrameIdx);
37408+
37409+
// Do the addition.
37410+
if (MI.getOpcode() == X86::FP80_ADDr) {
37411+
BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80))
37412+
.add(MI.getOperand(0))
37413+
.add(MI.getOperand(1))
37414+
.add(MI.getOperand(2));
37415+
} else {
37416+
BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80m32))
37417+
.add(MI.getOperand(0))
37418+
.add(MI.getOperand(1))
37419+
.add(MI.getOperand(2))
37420+
.add(MI.getOperand(3))
37421+
.add(MI.getOperand(4))
37422+
.add(MI.getOperand(5))
37423+
.add(MI.getOperand(6));
37424+
}
37425+
37426+
// Reload the original control word now.
37427+
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
37428+
OrigCWFrameIdx);
37429+
37430+
MI.eraseFromParent(); // The pseudo instruction is gone now.
37431+
return BB;
37432+
}
37433+
3735937434
case X86::FP32_TO_INT16_IN_MEM:
3736037435
case X86::FP32_TO_INT32_IN_MEM:
3736137436
case X86::FP32_TO_INT64_IN_MEM:

llvm/lib/Target/X86/X86ISelLowering.h

+6
Original file line numberDiff line numberDiff line change
@@ -740,6 +740,9 @@ namespace llvm {
740740
// User level interrupts - testui
741741
TESTUI,
742742

743+
// Perform an FP80 add after changing precision control in FPCW.
744+
FP80_ADD,
745+
743746
/// X86 strict FP compare instructions.
744747
STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
745748
STRICT_FCMPS,
@@ -779,6 +782,9 @@ namespace llvm {
779782
STRICT_CVTPS2PH,
780783
STRICT_CVTPH2PS,
781784

785+
// Perform an FP80 add after changing precision control in FPCW.
786+
STRICT_FP80_ADD,
787+
782788
// WARNING: Only add nodes here if they are strict FP nodes. Non-memory and
783789
// non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE.
784790

llvm/lib/Target/X86/X86InstrFPStack.td

+15
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,13 @@ def SDTX86Fist : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
2626
def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
2727
def SDTX86CwdLoad : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
2828

29+
def X86fp80_add : SDNode<"X86ISD::FP80_ADD", SDTFPBinOp, [SDNPCommutative]>;
30+
def X86strict_fp80_add : SDNode<"X86ISD::STRICT_FP80_ADD", SDTFPBinOp,
31+
[SDNPHasChain,SDNPCommutative]>;
32+
def any_X86fp80_add : PatFrags<(ops node:$lhs, node:$rhs),
33+
[(X86strict_fp80_add node:$lhs, node:$rhs),
34+
(X86fp80_add node:$lhs, node:$rhs)]>;
35+
2936
def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld,
3037
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
3138
def X86fst : SDNode<"X86ISD::FST", SDTX86Fst,
@@ -141,6 +148,14 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [EFLAGS] in {
141148
[(X86fp_to_i32mem RFP80:$src, addr:$dst)]>;
142149
def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src),
143150
[(X86fp_to_i64mem RFP80:$src, addr:$dst)]>;
151+
152+
def FP80_ADDr : PseudoI<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
153+
[(set RFP80:$dst,
154+
(any_X86fp80_add RFP80:$src1, RFP80:$src2))]>;
155+
def FP80_ADDm32 : PseudoI<(outs RFP80:$dst), (ins RFP80:$src1, f32mem:$src2),
156+
[(set RFP80:$dst,
157+
(any_X86fp80_add RFP80:$src1,
158+
(f80 (extloadf32 addr:$src2))))]>;
144159
}
145160

146161
// All FP Stack operations are represented with four instructions here. The

llvm/test/CodeGen/X86/uint64-to-float.ll

+46-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc < %s -mtriple=i686-apple-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
3-
; RUN: llc < %s -mtriple=x86_64-apple-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
2+
; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
3+
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
4+
; RUN: llc < %s -mtriple=i686-windows -mattr=+sse2 | FileCheck %s --check-prefix=X86-WIN
5+
; RUN: llc < %s -mtriple=x86_64-windows -mattr=+sse2 | FileCheck %s --check-prefix=X64-WIN
46

57
; Verify that we are using the efficient uitofp --> sitofp lowering illustrated
68
; by the compiler_rt implementation of __floatundisf.
@@ -42,6 +44,48 @@ define float @test(i64 %a) nounwind {
4244
; X64-NEXT: cvtsi2ss %rdi, %xmm0
4345
; X64-NEXT: addss %xmm0, %xmm0
4446
; X64-NEXT: retq
47+
;
48+
; X86-WIN-LABEL: test:
49+
; X86-WIN: # %bb.0: # %entry
50+
; X86-WIN-NEXT: pushl %ebp
51+
; X86-WIN-NEXT: movl %esp, %ebp
52+
; X86-WIN-NEXT: andl $-8, %esp
53+
; X86-WIN-NEXT: subl $24, %esp
54+
; X86-WIN-NEXT: movl 12(%ebp), %eax
55+
; X86-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
56+
; X86-WIN-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
57+
; X86-WIN-NEXT: shrl $31, %eax
58+
; X86-WIN-NEXT: fildll {{[0-9]+}}(%esp)
59+
; X86-WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
60+
; X86-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
61+
; X86-WIN-NEXT: orl $768, %ecx # imm = 0x300
62+
; X86-WIN-NEXT: movw %cx, {{[0-9]+}}(%esp)
63+
; X86-WIN-NEXT: fldcw {{[0-9]+}}(%esp)
64+
; X86-WIN-NEXT: fadds __real@5f80000000000000(,%eax,4)
65+
; X86-WIN-NEXT: fldcw {{[0-9]+}}(%esp)
66+
; X86-WIN-NEXT: fstps {{[0-9]+}}(%esp)
67+
; X86-WIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
68+
; X86-WIN-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
69+
; X86-WIN-NEXT: flds {{[0-9]+}}(%esp)
70+
; X86-WIN-NEXT: movl %ebp, %esp
71+
; X86-WIN-NEXT: popl %ebp
72+
; X86-WIN-NEXT: retl
73+
;
74+
; X64-WIN-LABEL: test:
75+
; X64-WIN: # %bb.0: # %entry
76+
; X64-WIN-NEXT: testq %rcx, %rcx
77+
; X64-WIN-NEXT: js .LBB0_1
78+
; X64-WIN-NEXT: # %bb.2: # %entry
79+
; X64-WIN-NEXT: cvtsi2ss %rcx, %xmm0
80+
; X64-WIN-NEXT: retq
81+
; X64-WIN-NEXT: .LBB0_1:
82+
; X64-WIN-NEXT: movq %rcx, %rax
83+
; X64-WIN-NEXT: shrq %rax
84+
; X64-WIN-NEXT: andl $1, %ecx
85+
; X64-WIN-NEXT: orq %rax, %rcx
86+
; X64-WIN-NEXT: cvtsi2ss %rcx, %xmm0
87+
; X64-WIN-NEXT: addss %xmm0, %xmm0
88+
; X64-WIN-NEXT: retq
4589
entry:
4690
%b = uitofp i64 %a to float
4791
ret float %b

0 commit comments

Comments
 (0)