Skip to content

Commit 00d591c

Browse files
author
v01dxyz
committed
[X86][NVPTX][LegalizeDAG] If i16 legal, legalize FABS/FNEG/FCOPYSIGN (f16) with Expand
For the affected targets, `Expand` clears/flips/copies the sign bit after bitcasting to i16. This is cheaper than `Promote`, which requires converting back and forth between f16 and f32 (possibly with expensive libcalls) in order to call FABS/FNEG/FCOPYSIGN (f32).
1 parent 2847020 commit 00d591c

File tree

6 files changed

+31
-56
lines changed

6 files changed

+31
-56
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -849,7 +849,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
849849
AddPromotedToType(Op, MVT::bf16, MVT::f32);
850850
}
851851
for (const auto &Op : {ISD::FABS}) {
852-
setOperationAction(Op, MVT::f16, Promote);
852+
// Expand instead of Promote to clear sign bit by bitcasting to i16
853+
setOperationAction(Op, MVT::f16, Expand);
853854
setOperationAction(Op, MVT::f32, Legal);
854855
setOperationAction(Op, MVT::f64, Legal);
855856
setOperationAction(Op, MVT::v2f16, Expand);

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -601,7 +601,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
601601
auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
602602
setOperationAction(ISD::FABS, VT, Action);
603603
setOperationAction(ISD::FNEG, VT, Action);
604-
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
604+
setOperationAction(ISD::FCOPYSIGN, VT, Action);
605605
setOperationAction(ISD::FREM, VT, Action);
606606
setOperationAction(ISD::FMA, VT, Action);
607607
setOperationAction(ISD::FMINNUM, VT, Action);
@@ -672,6 +672,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
672672

673673
// Half type will be promoted by default.
674674
setF16Action(MVT::f16, Promote);
675+
// Expand instead of Promote to clear/flip/copy sign bit by bitcasting to
676+
// i16.
677+
setOperationAction(ISD::FABS, MVT::f16, Expand);
678+
setOperationAction(ISD::FNEG, MVT::f16, Expand);
679+
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
675680
setOperationAction(ISD::FADD, MVT::f16, Promote);
676681
setOperationAction(ISD::FSUB, MVT::f16, Promote);
677682
setOperationAction(ISD::FMUL, MVT::f16, Promote);

llvm/test/CodeGen/NVPTX/f16-instructions.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -981,14 +981,10 @@ define half @test_fma(half %a, half %b, half %c) #0 {
981981
}
982982

983983
; CHECK-LABEL: test_fabs(
984-
; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fabs_param_0];
985-
; CHECK-NOFTZ: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
986-
; CHECK-NOFTZ: abs.f32 [[RF:%f[0-9]+]], [[AF]];
987-
; CHECK-F16-FTZ: cvt.ftz.f32.f16 [[AF:%f[0-9]+]], [[A]];
988-
; CHECK-F16-FTZ: abs.ftz.f32 [[RF:%f[0-9]+]], [[AF]];
989-
; CHECK: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[RF]];
990-
; CHECK: st.param.b16 [func_retval0+0], [[R]];
991-
; CHECK: ret;
984+
; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fabs_param_0];
985+
; CHECK: and.b16 [[RF:%rs[0-9]+]], [[A]], 32767;
986+
; CHECK: st.param.b16 [func_retval0+0], [[RF]];
987+
; CHECK: ret;
992988
define half @test_fabs(half %a) #0 {
993989
%r = call half @llvm.fabs.f16(half %a)
994990
ret half %r

llvm/test/CodeGen/NVPTX/f16x2-instructions.ll

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1183,17 +1183,13 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
11831183
}
11841184

11851185
; CHECK-LABEL: test_fabs(
1186-
; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_fabs_param_0];
1187-
; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
1188-
; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
1189-
; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
1190-
; CHECK-DAG: abs.f32 [[RF0:%f[0-9]+]], [[AF0]];
1191-
; CHECK-DAG: abs.f32 [[RF1:%f[0-9]+]], [[AF1]];
1192-
; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[RF0]];
1193-
; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]];
1194-
; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
1195-
; CHECK: st.param.b32 [func_retval0+0], [[R]];
1196-
; CHECK: ret;
1186+
; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_fabs_param_0];
1187+
; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
1188+
; CHECK: and.b16 [[A2:%rs[0-9]+]], [[A1]], 32767;
1189+
; CHECK: and.b16 [[A3:%rs[0-9]+]], [[A0]], 32767;
1190+
; CHECK: mov.b32 [[B:%r[0-9]+]], {[[A3]], [[A2]]};
1191+
; CHECK: st.param.b32 [func_retval0+0], [[B]];
1192+
; CHECK: ret;
11971193
define <2 x half> @test_fabs(<2 x half> %a) #0 {
11981194
%r = call <2 x half> @llvm.fabs.f16(<2 x half> %a)
11991195
ret <2 x half> %r

llvm/test/CodeGen/X86/fp16-libcalls.ll

Lines changed: 7 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -124,11 +124,7 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
124124
; F16C-LABEL: test_half_fabs:
125125
; F16C: # %bb.0:
126126
; F16C-NEXT: vpextrw $0, %xmm0, %eax
127-
; F16C-NEXT: vmovd %eax, %xmm0
128-
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
129-
; F16C-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
130-
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
131-
; F16C-NEXT: vmovd %xmm0, %eax
127+
; F16C-NEXT: andl $32767, %eax # imm = 0x7FFF
132128
; F16C-NEXT: movw %ax, (%rdi)
133129
; F16C-NEXT: retq
134130
;
@@ -141,34 +137,17 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
141137
;
142138
; X64-LABEL: test_half_fabs:
143139
; X64: # %bb.0:
144-
; X64-NEXT: pushq %rbx
145-
; X64-NEXT: movq %rdi, %rbx
146-
; X64-NEXT: callq __extendhfsf2@PLT
147-
; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
148-
; X64-NEXT: callq __truncsfhf2@PLT
149140
; X64-NEXT: pextrw $0, %xmm0, %eax
150-
; X64-NEXT: movw %ax, (%rbx)
151-
; X64-NEXT: popq %rbx
141+
; X64-NEXT: andl $32767, %eax # imm = 0x7FFF
142+
; X64-NEXT: movw %ax, (%rdi)
152143
; X64-NEXT: retq
153144
;
154145
; X86-LABEL: test_half_fabs:
155146
; X86: # %bb.0:
156-
; X86-NEXT: pushl %esi
157-
; X86-NEXT: subl $8, %esp
158-
; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
159-
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
160-
; X86-NEXT: pextrw $0, %xmm0, %eax
161-
; X86-NEXT: movw %ax, (%esp)
162-
; X86-NEXT: calll __extendhfsf2
163-
; X86-NEXT: fstps {{[0-9]+}}(%esp)
164-
; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
165-
; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
166-
; X86-NEXT: movd %xmm0, (%esp)
167-
; X86-NEXT: calll __truncsfhf2
168-
; X86-NEXT: pextrw $0, %xmm0, %eax
169-
; X86-NEXT: movw %ax, (%esi)
170-
; X86-NEXT: addl $8, %esp
171-
; X86-NEXT: popl %esi
147+
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
148+
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
149+
; X86-NEXT: andl $32767, %ecx # imm = 0x7FFF
150+
; X86-NEXT: movw %cx, (%eax)
172151
; X86-NEXT: retl
173152
%res = call half @llvm.fabs.half(half %a0)
174153
store half %res, ptr %p0, align 2

llvm/test/CodeGen/X86/half.ll

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1059,7 +1059,6 @@ define void @main.158() #0 {
10591059
; CHECK-LIBCALL: # %bb.0: # %entry
10601060
; CHECK-LIBCALL-NEXT: pushq %rax
10611061
; CHECK-LIBCALL-NEXT: xorps %xmm0, %xmm0
1062-
; CHECK-LIBCALL-NEXT: callq __truncsfhf2@PLT
10631062
; CHECK-LIBCALL-NEXT: callq __extendhfsf2@PLT
10641063
; CHECK-LIBCALL-NEXT: movss {{.*#+}} xmm1 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
10651064
; CHECK-LIBCALL-NEXT: ucomiss %xmm0, %xmm1
@@ -1077,10 +1076,10 @@ define void @main.158() #0 {
10771076
; BWON-F16C-LABEL: main.158:
10781077
; BWON-F16C: # %bb.0: # %entry
10791078
; BWON-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0
1080-
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1081-
; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
1082-
; BWON-F16C-NEXT: vmovss {{.*#+}} xmm2 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
1083-
; BWON-F16C-NEXT: vucomiss %xmm1, %xmm2
1079+
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
1080+
; BWON-F16C-NEXT: vmovss {{.*#+}} xmm1 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
1081+
; BWON-F16C-NEXT: vucomiss %xmm0, %xmm1
1082+
; BWON-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0
10841083
; BWON-F16C-NEXT: jae .LBB20_2
10851084
; BWON-F16C-NEXT: # %bb.1: # %entry
10861085
; BWON-F16C-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
@@ -1093,8 +1092,7 @@ define void @main.158() #0 {
10931092
; CHECK-I686-LABEL: main.158:
10941093
; CHECK-I686: # %bb.0: # %entry
10951094
; CHECK-I686-NEXT: subl $12, %esp
1096-
; CHECK-I686-NEXT: movl $0, (%esp)
1097-
; CHECK-I686-NEXT: calll __truncsfhf2
1095+
; CHECK-I686-NEXT: pxor %xmm0, %xmm0
10981096
; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax
10991097
; CHECK-I686-NEXT: movw %ax, (%esp)
11001098
; CHECK-I686-NEXT: calll __extendhfsf2

0 commit comments

Comments
 (0)