diff --git a/llvm/test/CodeGen/AMDGPU/fabs-r600.ll b/llvm/test/CodeGen/AMDGPU/fabs-r600.ll new file mode 100644 index 0000000000000..7e1aa99c3ec40 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fabs-r600.ll @@ -0,0 +1,159 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s + + +; DAGCombiner will transform: +; (fabsf (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF)) +; unless isFabsFree returns true +define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, i32 %in) { +; R600-LABEL: s_fabsf_fn_free: +; R600: ; %bb.0: +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: MOV * T0.W, KC0[2].Z, +; R600-NEXT: MOV T0.X, |PV.W|, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %bc= bitcast i32 %in to float + %fabs = call float @fabsf(float %bc) + store float %fabs, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) { +; R600-LABEL: s_fabsf_free: +; R600: ; %bb.0: +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: MOV * T0.W, KC0[2].Z, +; R600-NEXT: MOV T0.X, |PV.W|, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %bc= bitcast i32 %in to float + %fabs = call float @llvm.fabs.f32(float %bc) + store float %fabs, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) { +; R600-LABEL: s_fabsf_f32: +; R600: ; %bb.0: +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: MOV * T0.W, KC0[2].Z, +; R600-NEXT: MOV T0.X, |PV.W|, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %fabs = call float @llvm.fabs.f32(float %in) + store float %fabs, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { +; R600-LABEL: fabs_v2f32: +; R600: ; %bb.0: +; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: MOV * T0.W, KC0[3].X, +; R600-NEXT: MOV T0.Y, |PV.W|, +; R600-NEXT: MOV * T0.W, KC0[2].W, +; R600-NEXT: MOV T0.X, |PV.W|, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) + store <2 x float> %fabs, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) { +; R600-LABEL: fabsf_v4f32: +; R600: ; %bb.0: +; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: MOV T0.W, KC0[4].X, +; R600-NEXT: MOV * T1.W, KC0[3].W, +; R600-NEXT: MOV * T0.W, |PV.W|, +; R600-NEXT: MOV T0.Z, |T1.W|, +; R600-NEXT: MOV * T1.W, KC0[3].Z, +; R600-NEXT: MOV T0.Y, |PV.W|, +; R600-NEXT: MOV * T1.W, KC0[3].Y, +; R600-NEXT: 
MOV T0.X, |PV.W|, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) + store <4 x float> %fabs, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, float %in1) { +; R600-LABEL: fabsf_fn_fold: +; R600: ; %bb.0: +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: MUL_IEEE * T1.X, |KC0[2].Z|, KC0[2].W, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %fabs = call float @fabsf(float %in0) + %fmul = fmul float %fabs, %in1 + store float %fmul, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %in1) { +; R600-LABEL: fabs_fold: +; R600: ; %bb.0: +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: MUL_IEEE * T1.X, |KC0[2].Z|, KC0[2].W, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %fabs = call float @llvm.fabs.f32(float %in0) + %fmul = fmul float %fabs, %in1 + store float %fmul, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @bitpreserve_fabsf_f32(ptr addrspace(1) %out, float %in) { +; R600-LABEL: bitpreserve_fabsf_f32: +; R600: ; %bb.0: +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: ADD * T1.X, |KC0[2].Z|, 1.0, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %in.bc = bitcast float %in to i32 + %int.abs = and i32 %in.bc, 2147483647 + %bc = bitcast i32 %int.abs to float + %fadd = fadd float %bc, 1.0 + store float %fadd, ptr addrspace(1) %out + ret void +} + +declare float @fabsf(float) readnone +declare float @llvm.fabs.f32(float) readnone +declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index e18c76f89b6c7..07581ade57ccd 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -1,104 +1,256 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global,-xnack -enable-misched=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -march=amdgcn -enable-misched=0 < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -enable-misched=0 < %s | FileCheck -check-prefixes=GCN,VI %s ; DAGCombiner will transform: ; (fabsf (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF)) ; unless isFabsFree returns true - -; FUNC-LABEL: {{^}}s_fabsf_fn_free: -; R600-NOT: AND -; R600: |PV.{{[XYZW]}}| - -; GCN: s_bitset0_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, 
i32 %in) { +; SI-LABEL: s_fabsf_fn_free: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s4, 31 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_fabsf_fn_free: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_bitset0_b32 s2, 31 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm %bc= bitcast i32 %in to float %fabs = call float @fabsf(float %bc) store float %fabs, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}s_fabsf_free: -; R600-NOT: AND -; R600: |PV.{{[XYZW]}}| - -; GCN: s_bitset0_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_fabsf_free: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s0, 31 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_fabsf_free: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_bitset0_b32 s0, 31 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm %bc= bitcast i32 %in to float %fabs = call float @llvm.fabs.f32(float %bc) store float %fabs, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}s_fabsf_f32: -; R600: |{{(PV|T[0-9])\.[XYZW]}}| - -; GCN: s_bitset0_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) { +; SI-LABEL: s_fabsf_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s0, 31 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_fabsf_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_bitset0_b32 s0, 31 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm %fabs = call float @llvm.fabs.f32(float %in) store float %fabs, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}fabs_v2f32: -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: |{{(PV|T[0-9])\.[XYZW]}}| - -; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff -; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { +; SI-LABEL: fabs_v2f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_and_b32 s0, s3, 0x7fffffff +; SI-NEXT: s_and_b32 s1, s2, 0x7fffffff +; SI-NEXT: v_mov_b32_e32 v0, 
s1 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fabs_v2f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_bitset0_b32 s3, 31 +; VI-NEXT: s_bitset0_b32 s2, 31 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) store <2 x float> %fabs, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}fabsf_v4f32: -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: |{{(PV|T[0-9])\.[XYZW]}}| - -; GCN: s_bitset0_b32 -; GCN: s_bitset0_b32 -; GCN: s_bitset0_b32 -; GCN: s_bitset0_b32 define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) { +; SI-LABEL: fabsf_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s3, 31 +; SI-NEXT: s_bitset0_b32 s2, 31 +; SI-NEXT: s_bitset0_b32 s1, 31 +; SI-NEXT: s_bitset0_b32 s0, 31 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fabsf_v4f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_bitset0_b32 s3, 31 +; VI-NEXT: s_bitset0_b32 s2, 31 +; VI-NEXT: s_bitset0_b32 s1, 31 +; VI-NEXT: s_bitset0_b32 s0, 31 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) store <4 x float> %fabs, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}fabsf_fn_fold: -; SI: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[{{[0-9]+:[0-9]+}}], 0x9 -; VI: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[{{[0-9]+:[0-9]+}}], 0x24 -; GCN-NOT: and -; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[#LOAD + 3]] -; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[#LOAD + 2]]|, [[V_MUL_VI]] define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, float %in1) { +; SI-LABEL: fabsf_fn_fold: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mul_f32_e64 v0, |s2|, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fabsf_fn_fold: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mul_f32_e64 v2, |s2|, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm %fabs = call float @fabsf(float %in0) %fmul = fmul float %fabs, %in1 store float %fmul, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}fabs_fold: -; SI: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], 
s[{{[0-9]+:[0-9]+}}], 0x9 -; VI: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[{{[0-9]+:[0-9]+}}], 0x24 -; GCN-NOT: and -; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[#LOAD + 3]] -; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[#LOAD + 2]]|, [[V_MUL_VI]] define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %in1) { +; SI-LABEL: fabs_fold: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mul_f32_e64 v0, |s2|, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fabs_fold: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mul_f32_e64 v2, |s2|, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm %fabs = call float @llvm.fabs.f32(float %in0) %fmul = fmul float %fabs, %in1 store float %fmul, ptr addrspace(1) %out ret void } -; Make sure we turn some integer operations back into fabsf -; FUNC-LABEL: {{^}}bitpreserve_fabsf_f32: -; GCN: v_add_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|, 1.0 define amdgpu_kernel void @bitpreserve_fabsf_f32(ptr addrspace(1) %out, float %in) { +; SI-LABEL: bitpreserve_fabsf_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f32_e64 v0, |s0|, 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bitpreserve_fabsf_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_add_f32_e64 v2, |s0|, 1.0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm %in.bc = bitcast float %in to i32 %int.abs = and i32 %in.bc, 2147483647 %bc = bitcast i32 %int.abs to float @@ -111,3 +263,5 @@ declare float @fabsf(float) readnone declare float @llvm.fabs.f32(float) readnone declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs-r600.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs-r600.ll new file mode 100644 index 0000000000000..4f5271ed23252 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs-r600.ll @@ -0,0 +1,180 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 %s + +define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, float %y) { +; R600-LABEL: fneg_fabsf_fadd_f32: +; R600: ; %bb.0: +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: ADD * T1.X, KC0[2].W, -|KC0[2].Z|, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %fabs = call float @llvm.fabs.f32(float %x) + %fsub = fsub float -0.000000e+00, %fabs + %fadd = fadd float %y, %fsub + store float %fadd, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, float %y) { +; R600-LABEL: fneg_fabsf_fmul_f32: +; R600: ; %bb.0: +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: MUL_IEEE * T1.X, KC0[2].W, -|KC0[2].Z|, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %fabs = call float @llvm.fabs.f32(float %x) + %fsub = fsub float -0.000000e+00, %fabs + %fmul = fmul float %y, %fsub + store float %fmul, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) { +; R600-LABEL: fneg_fabsf_free_f32: +; R600: ; %bb.0: +; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: MOV * T0.W, KC0[2].Z, +; R600-NEXT: MOV * T0.W, |PV.W|, +; R600-NEXT: MOV T0.X, -PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %bc = bitcast i32 %in to float + %fabs = call float @llvm.fabs.f32(float %bc) + %fsub = fsub float -0.000000e+00, %fabs + store float %fsub, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fneg_fabsf_fn_free_f32(ptr addrspace(1) %out, i32 %in) { +; R600-LABEL: fneg_fabsf_fn_free_f32: +; R600: ; %bb.0: +; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: MOV * T0.W, KC0[2].Z, +; R600-NEXT: MOV * T0.W, |PV.W|, +; R600-NEXT: MOV T0.X, -PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %bc = bitcast i32 %in to float + %fabs = call float @fabsf(float %bc) + %fsub = fsub float -0.000000e+00, %fabs + store float %fsub, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { +; R600-LABEL: fneg_fabsf_f32: +; R600: ; %bb.0: +; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: MOV * T0.W, KC0[2].Z, 
+; R600-NEXT: MOV * T0.W, |PV.W|, +; R600-NEXT: MOV T0.X, -PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %fabs = call float @llvm.fabs.f32(float %in) + %fsub = fsub float -0.000000e+00, %fabs + store float %fsub, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; R600-LABEL: v_fneg_fabsf_f32: +; R600: ; %bb.0: +; R600-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 0 @6 +; R600-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: Fetch clause starting at 6: +; R600-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; R600-NEXT: ALU clause starting at 8: +; R600-NEXT: MOV * T0.X, KC0[2].Z, +; R600-NEXT: ALU clause starting at 9: +; R600-NEXT: MOV * T0.W, |T0.X|, +; R600-NEXT: MOV T0.X, -PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %val = load float, ptr addrspace(1) %in, align 4 + %fabs = call float @llvm.fabs.f32(float %val) + %fsub = fsub float -0.000000e+00, %fabs + store float %fsub, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %in) { +; R600-LABEL: fneg_fabsf_v2f32: +; R600: ; %bb.0: +; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: MOV T0.W, KC0[3].X, +; R600-NEXT: MOV * T1.W, KC0[2].W, +; R600-NEXT: MOV * T0.W, |PV.W|, +; R600-NEXT: MOV T0.Y, -PV.W, +; R600-NEXT: MOV * T0.W, |T1.W|, +; R600-NEXT: MOV T0.X, -PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) + %fsub = fsub <2 x float> , %fabs + store <2 x float> %fsub, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) { +; R600-LABEL: fneg_fabsf_v4f32: +; R600: ; %bb.0: +; R600-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: MOV * T0.W, KC0[4].X, +; R600-NEXT: MOV T0.W, |PV.W|, +; R600-NEXT: MOV * T1.W, KC0[3].W, +; R600-NEXT: MOV T0.Z, KC0[3].Z, +; R600-NEXT: MOV T1.W, |PS|, +; R600-NEXT: MOV * T2.W, -PV.W, +; R600-NEXT: MOV T2.Z, -PV.W, +; R600-NEXT: MOV T0.W, KC0[3].Y, +; R600-NEXT: MOV * T1.W, |PV.Z|, +; R600-NEXT: MOV T2.Y, -PS, +; R600-NEXT: MOV * T0.W, |PV.W|, +; R600-NEXT: MOV T2.X, -PV.W, +; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) + %fsub = fsub <4 x float> , %fabs + store <4 x float> %fsub, ptr addrspace(1) %out + ret void +} + +declare float @fabsf(float) readnone +declare float @llvm.fabs.f32(float) readnone +declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdhsa_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll index 5f1d232daabe5..2c9042ec17da8 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -1,12 +1,35 @@ -; RUN: llc 
-march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -march=amdgcn < %s | FileCheck -check-prefixes=SI,GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI,GCN %s -; FIXME: Check something here. Currently it seems fabs + fneg aren't -; into 2 modifiers, although theoretically that should work. - -; GCN-LABEL: {{^}}fneg_fabs_fadd_f64: -; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}| define amdgpu_kernel void @fneg_fabs_fadd_f64(ptr addrspace(1) %out, double %x, double %y) { +; SI-LABEL: fneg_fabs_fadd_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_add_f64 v[0:1], s[8:9], -|v[0:1]| +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fabs_fadd_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_f64 v[0:1], s[0:1], -|v[0:1]| +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm %fabs = call double @llvm.fabs.f64(double %x) %fsub = fsub double -0.000000e+00, %fabs %fadd = fadd double %y, %fsub @@ -15,6 +38,29 @@ define amdgpu_kernel void @fneg_fabs_fadd_f64(ptr addrspace(1) %out, double %x, } define amdgpu_kernel void @v_fneg_fabs_fadd_f64(ptr addrspace(1) %out, ptr addrspace(1) %xptr, ptr addrspace(1) %yptr) { +; SI-LABEL: v_fneg_fabs_fadd_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f64 v[0:1], s[4:5], -|s[4:5]| +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_fneg_fabs_fadd_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[2:3], -|s[2:3]| +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm %x = load double, ptr addrspace(1) %xptr, align 8 %y = load double, ptr addrspace(1) %xptr, align 8 %fabs = call double @llvm.fabs.f64(double %x) @@ -24,9 +70,34 @@ define amdgpu_kernel void @v_fneg_fabs_fadd_f64(ptr addrspace(1) %out, ptr addrs ret void } -; GCN-LABEL: {{^}}fneg_fabs_fmul_f64: -; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}| define amdgpu_kernel void @fneg_fabs_fmul_f64(ptr addrspace(1) %out, double %x, double %y) { +; SI-LABEL: fneg_fabs_fmul_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; 
SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mul_f64 v[0:1], s[8:9], -|v[0:1]| +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fabs_fmul_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mul_f64 v[0:1], s[0:1], -|v[0:1]| +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm %fabs = call double @llvm.fabs.f64(double %x) %fsub = fsub double -0.000000e+00, %fabs %fmul = fmul double %y, %fsub @@ -34,8 +105,32 @@ define amdgpu_kernel void @fneg_fabs_fmul_f64(ptr addrspace(1) %out, double %x, ret void } -; GCN-LABEL: {{^}}fneg_fabs_free_f64: define amdgpu_kernel void @fneg_fabs_free_f64(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: fneg_fabs_free_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset1_b32 s3, 31 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fabs_free_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_or_b32 s0, s3, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm %bc = bitcast i64 %in to double %fabs = call double @llvm.fabs.f64(double %bc) %fsub = fsub double -0.000000e+00, %fabs @@ -43,10 +138,32 @@ define amdgpu_kernel void @fneg_fabs_free_f64(ptr addrspace(1) %out, i64 %in) { ret void } -; GCN-LABEL: {{^}}fneg_fabs_fn_free_f64: -; SI: s_bitset1_b32 -; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 define amdgpu_kernel void @fneg_fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: fneg_fabs_fn_free_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset1_b32 s3, 31 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fabs_fn_free_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_or_b32 s0, s3, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm %bc = bitcast i64 %in to double %fabs = call double @fabs(double %bc) %fsub = fsub double -0.000000e+00, %fabs @@ -54,38 +171,126 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in) ret void } -; GCN-LABEL: {{^}}fneg_fabs_f64: -; SI-DAG: s_load_dwordx2 s[[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]], s[{{[0-9]+:[0-9]+}}], 0x13 -; VI-DAG: s_load_dwordx2 s[[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]], s[{{[0-9]+:[0-9]+}}], 0x4c -; 
GCN-DAG: s_bitset1_b32 s[[HI_X]], 31 -; GCN-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]] -; GCN-DAG: v_mov_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]] -; GCN: buffer_store_dwordx2 v[[[LO_V]]:[[HI_V]]] define amdgpu_kernel void @fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], double %in) { +; SI-LABEL: fneg_fabs_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset1_b32 s5, 31 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fabs_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_bitset1_b32 s3, 31 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm %fabs = call double @llvm.fabs.f64(double %in) %fsub = fsub double -0.000000e+00, %fabs store double %fsub, ptr addrspace(1) %out, align 8 ret void } -; GCN-LABEL: {{^}}fneg_fabs_v2f64: -; GCN-NOT: 0x80000000 -; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 -; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabs_v2f64(ptr addrspace(1) %out, <2 x double> %in) { +; SI-LABEL: fneg_fabs_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset1_b32 s7, 31 +; SI-NEXT: s_bitset1_b32 s5, 31 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fabs_v2f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_or_b32 s2, s7, 0x80000000 +; VI-NEXT: s_or_b32 s3, s5, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) %fsub = fsub <2 x double> , %fabs store <2 x double> %fsub, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}fneg_fabs_v4f64: -; GCN-NOT: 0x80000000 -; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 -; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 -; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 -; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabs_v4f64(ptr addrspace(1) %out, <4 x double> %in) { +; SI-LABEL: fneg_fabs_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset1_b32 s7, 31 +; SI-NEXT: s_bitset1_b32 s11, 31 +; SI-NEXT: s_bitset1_b32 s9, 31 +; SI-NEXT: s_bitset1_b32 s5, 31 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fabs_v4f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_bitset1_b32 s7, 31 +; VI-NEXT: s_bitset1_b32 s5, 31 +; VI-NEXT: s_or_b32 s2, s11, 0x80000000 +; VI-NEXT: s_or_b32 s3, s9, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) %fsub = fsub <4 x double> , %fabs store <4 x double> %fsub, ptr addrspace(1) %out @@ -96,3 +301,5 @@ declare double @fabs(double) readnone declare double @llvm.fabs.f64(double) readnone declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll index b0c17828cb13b..3c000d4fa63a3 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -1,11 +1,31 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=SI,FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=VI,FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600,FUNC %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck --check-prefixes=SI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefixes=VI,FUNC %s -; FUNC-LABEL: {{^}}fneg_fabsf_fadd_f32: -; SI-NOT: and -; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}| define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, float %y) { +; SI-LABEL: fneg_fabsf_fadd_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_sub_f32_e64 v0, s3, |v0| +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fabsf_fadd_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_sub_f32_e64 v2, s3, |v0| +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm %fabs = call float @llvm.fabs.f32(float %x) %fsub = fsub float 
-0.000000e+00, %fabs %fadd = fadd float %y, %fsub @@ -13,11 +33,30 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, ret void } -; FUNC-LABEL: {{^}}fneg_fabsf_fmul_f32: -; SI-NOT: and -; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}| -; SI-NOT: and define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, float %y) { +; SI-LABEL: fneg_fabsf_fmul_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mul_f32_e64 v0, s3, -|v0| +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fabsf_fmul_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mul_f32_e64 v2, s3, -|v0| +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm %fabs = call float @llvm.fabs.f32(float %x) %fsub = fsub float -0.000000e+00, %fabs %fmul = fmul float %y, %fsub @@ -25,18 +64,30 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, ret void } -; DAGCombiner will transform: -; (fabsf (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF)) -; unless isFabsFree returns true - -; FUNC-LABEL: {{^}}fneg_fabsf_free_f32: -; R600-NOT: AND -; R600: |PV.{{[XYZW]}}| -; R600: -PV - -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -; VI: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: fneg_fabsf_free_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_or_b32 s4, s2, 0x80000000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fabsf_free_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_bitset1_b32 s2, 31 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm %bc = bitcast i32 %in to float %fabs = call float @llvm.fabs.f32(float %bc) %fsub = fsub float -0.000000e+00, %fabs @@ -44,13 +95,30 @@ define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) { ret void } -; FUNC-LABEL: {{^}}fneg_fabsf_fn_free_f32: -; R600-NOT: AND -; R600: |PV.{{[XYZW]}}| -; R600: -PV - -; SI: s_load_dwordx2 s[0:1], s[2:3], 0x9 define amdgpu_kernel void @fneg_fabsf_fn_free_f32(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: fneg_fabsf_fn_free_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset1_b32 s4, 31 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fabsf_fn_free_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_or_b32 s2, s4, 0x80000000 +; VI-NEXT: 
v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm %bc = bitcast i32 %in to float %fabs = call float @fabsf(float %bc) %fsub = fsub float -0.000000e+00, %fabs @@ -58,18 +126,68 @@ define amdgpu_kernel void @fneg_fabsf_fn_free_f32(ptr addrspace(1) %out, i32 %in ret void } -; FUNC-LABEL: {{^}}fneg_fabsf_f32: -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { +; SI-LABEL: fneg_fabsf_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_or_b32 s4, s2, 0x80000000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fabsf_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_bitset1_b32 s2, 31 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm %fabs = call float @llvm.fabs.f32(float %in) %fsub = fsub float -0.000000e+00, %fabs store float %fsub, ptr addrspace(1) %out, align 4 ret void } -; FUNC-LABEL: {{^}}v_fneg_fabsf_f32: -; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: v_fneg_fabsf_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_fneg_fabsf_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v2, 0x80000000, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm %val = load float, ptr addrspace(1) %in, align 4 %fabs = call float @llvm.fabs.f32(float %val) %fsub = fsub float -0.000000e+00, %fabs @@ -77,28 +195,76 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace ret void } -; FUNC-LABEL: {{^}}fneg_fabsf_v2f32: -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: -PV -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: -PV - -; FIXME: In this case two uses of the constant should be folded -; SI: s_bitset1_b32 s{{[0-9]+}}, 31 -; SI: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %in) { +; SI-LABEL: fneg_fabsf_v2f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset1_b32 s3, 31 +; SI-NEXT: s_bitset1_b32 s2, 31 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, 
s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fabsf_v2f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_bitset1_b32 s3, 31 +; VI-NEXT: s_bitset1_b32 s2, 31 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) %fsub = fsub <2 x float> , %fabs store <2 x float> %fsub, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}fneg_fabsf_v4f32: -; SI: s_bitset1_b32 s{{[0-9]+}}, 31 -; SI: s_bitset1_b32 s{{[0-9]+}}, 31 -; SI: s_bitset1_b32 s{{[0-9]+}}, 31 -; SI: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) { +; SI-LABEL: fneg_fabsf_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset1_b32 s7, 31 +; SI-NEXT: s_bitset1_b32 s6, 31 +; SI-NEXT: s_bitset1_b32 s5, 31 +; SI-NEXT: s_bitset1_b32 s4, 31 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fabsf_v4f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_or_b32 s2, s7, 0x80000000 +; VI-NEXT: s_or_b32 s3, s6, 0x80000000 +; VI-NEXT: s_bitset1_b32 s5, 31 +; VI-NEXT: s_bitset1_b32 s4, 31 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) %fsub = fsub <4 x float> , %fabs store <4 x float> %fsub, ptr addrspace(1) %out @@ -112,3 +278,5 @@ declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; FUNC: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index 03ca780c90322..d78bdfe08772a 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -1,89 +1,279 @@ -; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,GCN,SI %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,GCN,VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,GCN,GFX11 %s -; RUN: not llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,R600 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s +; RUN: not llc -mtriple=r600 -mcpu=redwood < %s -; FUNC-LABEL: {{^}}s_fneg_f32: -; R600: -PV - -; GCN: s_load_{{dword|b32}} [[VAL:s[0-9]+]] -; GCN: s_xor_b32 [[NEG_VAL:s[0-9]+]], [[VAL]], 0x80000000 -; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[NEG_VAL]] define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { +; SI-LABEL: s_fneg_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_xor_b32 s4, s4, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_fneg_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fneg = fsub float -0.000000e+00, %in store float %fneg, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}s_fneg_v2f32: -; R600: -PV -; R600: -PV - -; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 -; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x float> %in) { +; SI-LABEL: s_fneg_v2f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_xor_b32 s0, s3, 0x80000000 +; SI-NEXT: s_xor_b32 s1, s2, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_fneg_v2f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_xor_b32 s3, s3, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fneg = fsub <2 x float> , %in store <2 x float> %fneg, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}s_fneg_v4f32: -; R600: -PV -; R600: -T -; R600: -PV -; R600: -PV - -; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 -; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 -; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 -; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x float> %in) { +; SI-LABEL: s_fneg_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_xor_b32 s7, s7, 0x80000000 +; SI-NEXT: s_xor_b32 s6, s6, 0x80000000 +; SI-NEXT: s_xor_b32 s5, s5, 0x80000000 +; SI-NEXT: s_xor_b32 s4, s4, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_fneg_v4f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_xor_b32 s2, s7, 0x80000000 +; VI-NEXT: s_xor_b32 s3, s6, 0x80000000 +; VI-NEXT: s_xor_b32 s5, s5, 0x80000000 +; VI-NEXT: s_xor_b32 s4, s4, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s2, s7, 0x80000000 +; GFX11-NEXT: s_xor_b32 s3, s6, 0x80000000 +; GFX11-NEXT: s_xor_b32 s4, s4, 0x80000000 +; GFX11-NEXT: s_xor_b32 s5, s5, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fneg = fsub <4 x float> , %in store <4 x float> %fneg, ptr addrspace(1) %out ret void } -; DAGCombiner will 
transform: -; (fneg (f32 bitcast (i32 a))) => (f32 bitcast (xor (i32 a), 0x80000000)) -; unless the target returns true for isNegFree() - -; FUNC-LABEL: {{^}}fsub0_f32: - -; GCN: v_sub_f32_e64 v{{[0-9]}}, 0, s{{[0-9]+$}} - -; R600-NOT: XOR -; R600: -KC0[2].Z define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: fsub0_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_sub_f32_e64 v0, 0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fsub0_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_sub_f32_e64 v2, 0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fsub0_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_sub_f32_e64 v1, 0, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %bc = bitcast i32 %in to float %fsub = fsub float 0.0, %bc store float %fsub, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}fneg_free_f32: -; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb -; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c -; GFX11: s_load_b32 [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c - -; GCN: s_xor_b32 [[RES:s[0-9]+]], [[NEG_VALUE]], 0x80000000 -; GCN: v_mov_b32_e32 [[V_RES:v[0-9]+]], [[RES]] -; GCN: buffer_store_{{dword|b32}} [[V_RES]] -; R600-NOT: XOR -; R600: -PV.W define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: fneg_free_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_xor_b32 s4, s4, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fneg_free_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fneg_free_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %bc = bitcast i32 %in to float %fsub = fsub float -0.0, %bc store float %fsub, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}fneg_fold_f32: -; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb -; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], 
s[{{[0-9]+:[0-9]+}}], 0x2c -; GFX11: s_load_{{dword|b32}} [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c -; GCN-NOT: xor -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]] define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { +; SI-LABEL: fneg_fold_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, -s4, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fold_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mul_f32_e64 v2, -s2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fneg_fold_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mul_f32_e64 v1, -s2, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fsub = fsub float -0.0, %in %fmul = fmul float %fsub, %in store float %fmul, ptr addrspace(1) %out @@ -91,9 +281,41 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { } ; Make sure we turn some integer operations back into fabs -; FUNC-LABEL: {{^}}bitpreserve_fneg_f32: -; GCN: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -4.0 define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in) { +; SI-LABEL: bitpreserve_fneg_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, s4, -4.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bitpreserve_fneg_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mul_f32_e64 v2, s2, -4.0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: bitpreserve_fneg_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mul_f32_e64 v1, s2, -4.0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %in.bc = bitcast float %in to i32 %int.abs = xor i32 %in.bc, 2147483648 %bc = bitcast i32 %int.abs to float @@ -102,29 +324,94 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in ret void } -; FUNC-LABEL: {{^}}s_fneg_i32: -; GCN: s_load_{{dword|b32}} [[IN:s[0-9]+]] -; GCN: s_xor_b32 [[FNEG:s[0-9]+]], [[IN]], 0x80000000 -; GCN: v_mov_b32_e32 [[V_FNEG:v[0-9]+]], [[FNEG]] define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_fneg_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 
s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_xor_b32 s4, s4, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_fneg_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fneg = xor i32 %in, -2147483648 store i32 %fneg, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}v_fneg_i32: -; GCN: s_waitcnt -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GCN-NEXT: s_setpc_b64 define i32 @v_fneg_i32(i32 %in) { +; GCN-LABEL: v_fneg_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %fneg = xor i32 %in, -2147483648 ret i32 %fneg } -; FUNC-LABEL: {{^}}s_fneg_i32_fp_use: -; GCN: s_load_{{dword|b32}} [[IN:s[0-9]+]] -; GCN: v_sub_f32_e64 v{{[0-9]+}}, 2.0, [[IN]] define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_fneg_i32_fp_use: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_sub_f32_e64 v0, 2.0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_fneg_i32_fp_use: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_sub_f32_e64 v2, 2.0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_i32_fp_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fneg = xor i32 %in, -2147483648 %bitcast = bitcast i32 %fneg to float %fadd = fadd float %bitcast, 2.0 @@ -132,37 +419,105 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { ret void } -; FUNC-LABEL: {{^}}v_fneg_i32_fp_use: -; GCN: s_waitcnt -; GCN-NEXT: v_sub_f32_e32 v0, 2.0, v0 -; GCN-NEXT: s_setpc_b64 define float @v_fneg_i32_fp_use(i32 %in) { +; GCN-LABEL: v_fneg_i32_fp_use: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sub_f32_e32 v0, 2.0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %fneg = xor i32 %in, -2147483648 %bitcast = bitcast i32 %fneg to float %fadd = fadd float %bitcast, 2.0 ret float %fadd } 
-; FUNC-LABEL: {{^}}s_fneg_i64: -; GCN: s_xor_b32 s[[NEG_HI:[0-9]+]], s{{[0-9]+}}, 0x80000000 define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: s_fneg_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_xor_b32 s0, s3, 0x80000000 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_fneg_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_xor_b32 s0, s3, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fneg = xor i64 %in, -9223372036854775808 store i64 %fneg, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}v_fneg_i64: -; GCN: s_waitcnt -; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GCN-NEXT: s_setpc_b64 define i64 @v_fneg_i64(i64 %in) { +; GCN-LABEL: v_fneg_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] %fneg = xor i64 %in, -9223372036854775808 ret i64 %fneg } -; FUNC-LABEL: {{^}}s_fneg_i64_fp_use: -; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, -s{{\[[0-9]+:[0-9]+\]}}, 2.0 define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: s_fneg_i64_fp_use: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_fneg_i64_fp_use: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_i64_fp_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fneg = xor i64 %in, -9223372036854775808 %bitcast = bitcast i64 %fneg to double %fadd = fadd double %bitcast, 2.0 @@ -170,34 +525,65 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ret void } -; FUNC-LABEL: {{^}}v_fneg_i64_fp_use: -; GCN: s_waitcnt -; GCN-NEXT: v_add_f64 v[0:1], -v[0:1], 2.0 -; GCN-NEXT: s_setpc_b64 define double @v_fneg_i64_fp_use(i64 %in) { +; 
GCN-LABEL: v_fneg_i64_fp_use: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_add_f64 v[0:1], -v[0:1], 2.0 +; GCN-NEXT: s_setpc_b64 s[30:31] %fneg = xor i64 %in, -9223372036854775808 %bitcast = bitcast i64 %fneg to double %fadd = fadd double %bitcast, 2.0 ret double %fadd } -; FUNC-LABEL: {{^}}v_fneg_i16: -; GCN: s_waitcnt -; GCN-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 -; GCN-NEXT: s_setpc_b64 define i16 @v_fneg_i16(i16 %in) { +; GCN-LABEL: v_fneg_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %fneg = xor i16 %in, -32768 ret i16 %fneg } -; FUNC-LABEL: {{^}}s_fneg_i16_fp_use: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], s{{[0-9]+}} -; SI: v_sub_f32_e32 [[ADD:v[0-9]+]], 2.0, [[CVT0]] -; SI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[ADD]] - -; VI: s_load_dword [[IN:s[0-9]+]] -; VI: v_sub_f16_e64 v{{[0-9]+}}, 2.0, [[IN]] define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) { +; SI-LABEL: s_fneg_i16_fp_use: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_fneg_i16_fp_use: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_sub_f16_e64 v2, 2.0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_i16_fp_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fneg = xor i16 %in, -32768 %bitcast = bitcast i16 %fneg to half %fadd = fadd half %bitcast, 2.0 @@ -205,69 +591,157 @@ define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) { ret void } -; FUNC-LABEL: {{^}}v_fneg_i16_fp_use: -; SI: s_waitcnt -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 -; SI-NEXT: s_setpc_b64 - -; VI: s_waitcnt -; VI-NEXT: v_sub_f16_e32 v0, 2.0, v0 -; VI-NEXT: s_setpc_b64 define half @v_fneg_i16_fp_use(i16 %in) { +; SI-LABEL: v_fneg_i16_fp_use: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_fneg_i16_fp_use: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_sub_f16_e32 v0, 2.0, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fneg_i16_fp_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_f16_e32 v0, 2.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg = xor i16 %in, -32768 %bitcast = bitcast i16 %fneg to half %fadd = fadd half %bitcast, 2.0 ret half %fadd } -; FUNC-LABEL: {{^}}s_fneg_v2i16: -; SI: s_xor_b32 s4, s4, 0x80008000 - -; VI: s_lshr_b32 s5, s4, 16 -; VI: s_xor_b32 s4, s4, 0x8000 -; VI: s_xor_b32 
s5, s5, 0x8000 -; VI: s_and_b32 s4, s4, 0xffff -; VI: s_lshl_b32 s5, s5, 16 -; VI: s_or_b32 s4, s4, s5 define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { +; SI-LABEL: s_fneg_v2i16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_xor_b32 s4, s4, 0x80008000 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_fneg_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: s_xor_b32 s2, s2, 0x8000 +; VI-NEXT: s_xor_b32 s3, s3, 0x8000 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_or_b32 s2, s2, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %in = bitcast i32 %arg to <2 x i16> %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768> store <2 x i16> %fneg, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}v_fneg_v2i16: -; SI: v_xor_b32_e32 v1, 0x8000, v1 -; SI: v_xor_b32_e32 v0, 0x8000, v0 -; SI: v_lshlrev_b32_e32 v2, 16, v1 -; SI: v_and_b32_e32 v0, 0xffff, v0 -; SI: v_or_b32_e32 v0, v0, v2 -; SI: v_and_b32_e32 v1, 0xffff, v1 - -; VI: s_waitcnt -; VI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; VI-NEXT: s_setpc_b64 define <2 x i16> @v_fneg_v2i16(<2 x i16> %in) { +; SI-LABEL: v_fneg_v2i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_fneg_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fneg_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768> ret <2 x i16> %fneg } -; FUNC-LABEL: {{^}}s_fneg_v2i16_fp_use: -; SI: s_lshr_b32 s3, s2, 16 -; SI: v_cvt_f32_f16_e32 v0, s3 -; SI: v_cvt_f32_f16_e32 v1, s2 -; SI: v_sub_f32_e32 v0, 2.0, v0 -; SI: v_sub_f32_e32 v1, 2.0, v1 - -; VI: s_lshr_b32 s5, s4, 16 -; VI: s_xor_b32 s5, s5, 0x8000 -; VI: s_xor_b32 s4, s4, 0x8000 -; VI: v_mov_b32_e32 v0, s5 -; VI: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_add_f16_e64 v1, s4, 2.0 -; VI: v_or_b32_e32 v0, v1, v0 define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) { +; SI-LABEL: s_fneg_v2i16_fp_use: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT:
s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_sub_f32_e32 v1, 2.0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_fneg_v2i16_fp_use: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x4000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: s_xor_b32 s3, s3, 0x8000 +; VI-NEXT: s_xor_b32 s2, s2, 0x8000 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_f16_e64 v1, s2, 2.0 +; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v1, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_v2i16_fp_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v1, s2, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %in = bitcast i32 %arg to <2 x i16> %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768> %bitcast = bitcast <2 x i16> %fneg to <2 x half> @@ -276,20 +750,31 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) ret void } -; FUNC-LABEL: {{^}}v_fneg_v2i16_fp_use: -; SI: v_lshrrev_b32_e32 v1, 16, v0 -; SI: v_cvt_f32_f16_e32 v0, v0 -; SI: v_cvt_f32_f16_e32 v1, v1 -; SI: v_sub_f32_e32 v0, 2.0, v0 -; SI: v_sub_f32_e32 v1, 2.0, v1 - -; VI: s_waitcnt -; VI: v_mov_b32_e32 v1, 0x4000 -; VI: v_sub_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_sub_f16_e32 v0, 2.0, v0 -; VI: v_or_b32_e32 v0, v0, v1 -; VI: s_setpc_b64 define <2 x half> @v_fneg_v2i16_fp_use(i32 %arg) { +; SI-LABEL: v_fneg_v2i16_fp_use: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 +; SI-NEXT: v_sub_f32_e32 v1, 2.0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_fneg_v2i16_fp_use: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, 0x4000 +; VI-NEXT: v_sub_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_sub_f16_e32 v0, 2.0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fneg_v2i16_fp_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] %in = bitcast i32 %arg to <2 x i16> %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768> %bitcast = bitcast <2 x i16> %fneg to <2 x half> diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll index
ab035b9de04b9..826862e124920 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -1,63 +1,160 @@ -; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=SI,GCN,SI-NOHSA,FUNC %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI,VI-NOHSA,GCN,FUNC %s -; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=EG,FUNC %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefixes=SI,GCN,FUNC %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefixes=VI,GCN,FUNC %s +; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600,FUNC %s - -; FUNC-LABEL: {{^}}local_size_x: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV * [[VAL]], KC0[1].Z - -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 -; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x1 -; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x4 - -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] define amdgpu_kernel void @local_size_x(ptr addrspace(1) %out) { +; SI-LABEL: local_size_x: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s4, s[0:1], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_size_x: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x18 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; R600-LABEL: local_size_x: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: MOV * T1.X, KC0[1].Z, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = call i32 @llvm.r600.read.local.size.x() #0 store i32 %0, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}local_size_y: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV * [[VAL]], KC0[1].W - -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] define amdgpu_kernel void @local_size_y(ptr addrspace(1) %out) { +; SI-LABEL: local_size_y: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s4, s[0:1], 0x7 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_size_y: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x1c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: 
s_endpgm +; +; R600-LABEL: local_size_y: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: MOV * T1.X, KC0[1].W, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = call i32 @llvm.r600.read.local.size.y() #0 store i32 %0, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}local_size_z: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV * [[VAL]], KC0[2].X - -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] define amdgpu_kernel void @local_size_z(ptr addrspace(1) %out) { +; SI-LABEL: local_size_z: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s4, s[0:1], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_size_z: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x20 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; R600-LABEL: local_size_z: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: MOV * T1.X, KC0[2].X, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = call i32 @llvm.r600.read.local.size.z() #0 store i32 %0, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}local_size_xy: -; SI-NOHSA-DAG: s_load_dwordx2 s[[[X:[0-9]+]]:[[Y:[0-9+]]]], s[0:1], 0x6 -; VI-NOHSA-DAG: s_load_dwordx2 s[[[X:[0-9]+]]:[[Y:[0-9+]]]], s[0:1], 0x18 -; GCN: s_mul_i32 [[VAL:s[0-9]+]], s[[X]], s[[Y]] -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) { +; SI-LABEL: local_size_xy: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mul_i32 s4, s4, s5 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_size_xy: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mul_i32 s2, s2, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; R600-LABEL: local_size_xy: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: MULLO_INT * T1.X, KC0[1].Z, KC0[1].W, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %x = call 
i32 @llvm.r600.read.local.size.x() #0 %y = call i32 @llvm.r600.read.local.size.y() #0 @@ -66,17 +163,43 @@ entry: ret void } -; FUNC-LABEL: {{^}}local_size_xz: - -; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 -; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 -; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 -; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 -; HSA-DAG: s_and_b32 [[X:s[0-9]+]], [[XY]], 0xffff -; GCN: s_mul_i32 [[VAL:s[0-9]+]], [[X]], [[Z]] -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) { +; SI-LABEL: local_size_xz: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s2, s[0:1], 0x6 +; SI-NEXT: s_load_dword s4, s[0:1], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mul_i32 s4, s2, s4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_size_xz: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s2, s[0:1], 0x18 +; VI-NEXT: s_load_dword s3, s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mul_i32 s2, s2, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; R600-LABEL: local_size_xz: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: MULLO_INT * T1.X, KC0[1].Z, KC0[2].X, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %x = call i32 @llvm.r600.read.local.size.x() #0 %z = call i32 @llvm.r600.read.local.size.z() #0 @@ -85,16 +208,41 @@ entry: ret void } -; FUNC-LABEL: {{^}}local_size_yz: -; HSA: enable_sgpr_private_segment_buffer = 1 -; HSA: enable_sgpr_dispatch_ptr = 1 - -; SI-NOHSA-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x7 -; VI-NOHSA-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x1c -; GCN: s_mul_i32 [[VAL:s[0-9]+]], s[[#LOAD + 0]], s[[#LOAD + 1]] -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) { +; SI-LABEL: local_size_yz: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x7 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mul_i32 s0, s0, s1 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_size_yz: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x1c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mul_i32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; R600-LABEL: local_size_yz: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: MULLO_INT * T1.X, KC0[1].W, KC0[2].X, +; R600-NEXT: 
2(2.802597e-45), 0(0.000000e+00) entry: %y = call i32 @llvm.r600.read.local.size.y() #0 %z = call i32 @llvm.r600.read.local.size.z() #0 @@ -103,19 +251,46 @@ entry: ret void } -; FUNC-LABEL: {{^}}local_size_xyz: -; HSA: enable_sgpr_private_segment_buffer = 1 -; HSA: enable_sgpr_dispatch_ptr = 1 - -; SI-NOHSA-DAG: s_load_dwordx2 s[[[X:[0-9]+]]:[[Y:[0-9]+]]], s[0:1], 0x6 -; SI-NOHSA-DAG: s_load_dword s[[Z:[0-9]+]], s[0:1], 0x8 -; VI-NOHSA-DAG: s_load_dwordx2 s[[[X:[0-9]+]]:[[Y:[0-9]+]]], s[0:1], 0x18 -; VI-NOHSA-DAG: s_load_dword s[[Z:[0-9]+]], s[0:1], 0x20 -; GCN: s_mul_i32 [[M:s[0-9]+]], s[[X]], s[[Y]] -; GCN: s_add_i32 [[VAL:s[0-9]+]], [[M]], s[[Z]] -; GCN-DAG: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) { +; SI-LABEL: local_size_xyz: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x6 +; SI-NEXT: s_load_dword s2, s[0:1], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mul_i32 s4, s4, s5 +; SI-NEXT: s_add_i32 s4, s4, s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_size_xyz: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18 +; VI-NEXT: s_load_dword s4, s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mul_i32 s2, s2, s3 +; VI-NEXT: s_add_i32 s2, s2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; R600-LABEL: local_size_xyz: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: MULLO_INT * T0.X, KC0[1].Z, KC0[1].W, +; R600-NEXT: ADD_INT T0.X, PS, KC0[2].X, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %x = call i32 @llvm.r600.read.local.size.x() #0 %y = call i32 @llvm.r600.read.local.size.y() #0 @@ -126,13 +301,39 @@ entry: ret void } -; FUNC-LABEL: {{^}}local_size_x_known_bits: -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 -; GCN-NOT: 0xffff -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN-NEXT: buffer_store_dword [[VVAL]] define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) { +; SI-LABEL: local_size_x_known_bits: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s4, s[0:1], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_size_x_known_bits: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x18 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; R600-LABEL: local_size_x_known_bits: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU 
clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: AND_INT * T1.X, KC0[1].Z, literal.y, +; R600-NEXT: 2(2.802597e-45), 65535(9.183409e-41) entry: %size = call i32 @llvm.r600.read.local.size.x() #0 %shl = shl i32 %size, 16 @@ -141,13 +342,39 @@ entry: ret void } -; FUNC-LABEL: {{^}}local_size_y_known_bits: -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c -; GCN-NOT: 0xffff -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN-NEXT: buffer_store_dword [[VVAL]] define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) { +; SI-LABEL: local_size_y_known_bits: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s4, s[0:1], 0x7 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_size_y_known_bits: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x1c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; R600-LABEL: local_size_y_known_bits: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: AND_INT * T1.X, KC0[1].W, literal.y, +; R600-NEXT: 2(2.802597e-45), 65535(9.183409e-41) entry: %size = call i32 @llvm.r600.read.local.size.y() #0 %shl = shl i32 %size, 16 @@ -156,13 +383,39 @@ entry: ret void } -; FUNC-LABEL: {{^}}local_size_z_known_bits: -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 -; GCN-NOT: 0xffff -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN-NEXT: buffer_store_dword [[VVAL]] define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) { +; SI-LABEL: local_size_z_known_bits: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s4, s[0:1], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_size_z_known_bits: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x20 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; R600-LABEL: local_size_z_known_bits: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: AND_INT * T1.X, KC0[2].X, literal.y, +; R600-NEXT: 2(2.802597e-45), 65535(9.183409e-41) entry: %size = call i32 @llvm.r600.read.local.size.z() #0 %shl = shl i32 %size, 16 @@ -176,3 +429,6 @@ declare i32 @llvm.r600.read.local.size.y() #0 declare i32 @llvm.r600.read.local.size.z() #0 attributes #0 = { nounwind readnone } +;; NOTE: These prefixes are unused and the list is 
autogenerated. Do not add tests below this line: +; FUNC: {{.*}} +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll index 4d6adc7cc9417..a3f7906a05f6b 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll @@ -1,17 +1,21 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; GCN-NOT: or - -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}} define amdgpu_kernel void @fma_vector_vector_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { +; GCN-LABEL: fma_vector_vector_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -28,18 +32,21 @@ bb: } ; Apply fneg to broadcasted vector -; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_scalar_lo: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; GCN-NOT: or - -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}} define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { +; GCN-LABEL: fma_vector_vector_neg_broadcast_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -57,18 +64,21 @@ bb: } ; Apply fneg before broadcast -; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; GCN-NOT: or - -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}} define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { +; GCN-LABEL: fma_vector_vector_neg_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -86,18 +96,21 @@ bb: } ; Apply fneg before and after broadcast, and should cancel out. -; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_neg_scalar_lo: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; GCN-NOT: or - -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}} define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { +; GCN-LABEL: fma_vector_vector_neg_broadcast_neg_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -116,18 +129,21 @@ bb: } ; Add scalar, but negate low component -; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_lo: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; GCN-NOT: or - -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}} define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { +; GCN-LABEL: fma_vector_vector_scalar_neg_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -144,18 +160,21 @@ bb: } ; Add scalar, but negate high component -; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_hi: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; GCN-NOT: or - -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_hi:[0,0,1]{{$}} define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { +; GCN-LABEL: fma_vector_vector_scalar_neg_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: 
v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_hi:[0,0,1] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -172,17 +191,20 @@ bb: } ; Apply fneg before broadcast with bitcast -; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; GCN-NOT: or - -; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[SCALAR0]] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}} define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { +; GCN-LABEL: add_vector_neg_bitcast_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_add_u16 v0, v0, v1 op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; GCN-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4 %scalar0 = load volatile half, ptr addrspace(3) %arg2, align 2 @@ -197,19 +219,26 @@ bb: ret void } -; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo_neg_scalar_hi: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]] -; GCN: ds_read_u16 [[SCALAR1:v[0-9]+]] - -; FIXME: Remove and -; GCN-DAG: v_and_b32_e32 [[SCALAR0]], 0xffff, [[SCALAR0]] -; GCN-DAG: v_xor_b32_e32 [[SCALAR1]], 0x8000, [[SCALAR1]] -; GCN: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[SCALAR1]], 16, [[SCALAR0]] - -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]]{{$}} define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { +; GCN-LABEL: fma_vector_vector_scalar_lo_neg_scalar_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v3, v1 +; GCN-NEXT: ds_read_u16 v1, v1 offset:4 +; GCN-NEXT: s_waitcnt lgkmcnt(1) +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GCN-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 +; GCN-NEXT: global_store_dword v4, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 %arg2.gep = getelementptr inbounds half, ptr addrspace(3) %arg2, i32 2 @@ -229,15 +258,23 @@ bb: } ; FIXME: Can we avoid waitcnt between the two halves? 
-; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_u16 [[PACKED:v[0-9]+]] -; GCN: s_waitcnt -; GCN: ds_read_u16_d16_hi [[PACKED]] - -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}} define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { +; GCN-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v3, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ds_read_u16_d16_hi v3, v1 offset:4 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v3 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 %arg2.gep = getelementptr inbounds half, ptr addrspace(3) %arg2, i32 2 @@ -257,18 +294,21 @@ bb: ret void } -; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_hi: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_b32 [[VEC2:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; GCN-NOT: or - -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}} define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GCN-LABEL: fma_vector_vector_neg_vector_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2 @@ -285,18 +325,21 @@ bb: ret void } -; GCN-LABEL: {{^}}fma_vector_vector_vector_neg_hi: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_b32 [[VEC2:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; GCN-NOT: or - -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}} define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GCN-LABEL: fma_vector_vector_vector_neg_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 %lds.gep2 = 
getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2 @@ -314,17 +357,20 @@ bb: ret void } -; GCN-LABEL: {{^}}add_vector_scalar_hi: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; GCN-NOT: or - -; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[VEC1]] op_sel:[0,1]{{$}} define amdgpu_kernel void @add_vector_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GCN-LABEL: add_vector_scalar_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_add_u16 v0, v1, v0 op_sel:[0,1] +; GCN-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x i16>, ptr addrspace(3) %lds, i32 1 @@ -338,18 +384,21 @@ bb: ret void } -; GCN-LABEL: {{^}}fma_vector_vector_scalar_hi: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_b32 [[VEC2:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; GCN-NOT: or - -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1]{{$}} define amdgpu_kernel void @fma_vector_vector_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GCN-LABEL: fma_vector_vector_scalar_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2 @@ -366,18 +415,21 @@ bb: ret void } -; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_lo_neg_hi: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_b32 [[VEC2:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; GCN-NOT: or - -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]]{{$}} define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GCN-LABEL: fma_vector_vector_neg_vector_lo_neg_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 +; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2 @@ -396,18 +448,21 @@ bb: ret void } -; GCN-LABEL: {{^}}fma_vector_vector_swap_vector: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_b32 [[VEC2:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; 
GCN-NOT: or - -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}} define amdgpu_kernel void @fma_vector_vector_swap_vector(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GCN-LABEL: fma_vector_vector_swap_vector: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2 @@ -423,19 +478,21 @@ bb: ret void } -; GCN-LABEL: {{^}}fma_vector_vector_swap_neg_vector: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_b32 [[VEC2:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; GCN-NOT: or -; GCN-NOT: xor - -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}} define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GCN-LABEL: fma_vector_vector_swap_neg_vector: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2 @@ -452,19 +509,21 @@ bb: ret void } -; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_0: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_b32 [[VEC2:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; GCN-NOT: or -; GCN-NOT: xor - -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}} define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2 @@ -480,19 +539,21 @@ bb: ret void } -; GCN-LABEL: 
{{^}}fma_vector_vector_blend_vector_neg_vector_1: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_b32 [[VEC2:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; GCN-NOT: or -; GCN-NOT: xor - -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_lo:[0,0,1]{{$}} define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_lo:[0,0,1] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2 @@ -508,19 +569,21 @@ bb: ret void } -; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_2: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_b32 [[VEC2:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; GCN-NOT: or -; GCN-NOT: xor - -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}} define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_2: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2 @@ -536,19 +599,21 @@ bb: ret void } -; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_3: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_b32 [[VEC2:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; GCN-NOT: or -; GCN-NOT: xor - -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1]{{$}} define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_3: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 %lds.gep2 = getelementptr inbounds 
<2 x half>, ptr addrspace(3) %lds, i32 2 @@ -564,9 +629,22 @@ bb: ret void } -; GCN-LABEL: {{^}}bitcast_fneg_f32: -; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}} define amdgpu_kernel void @bitcast_fneg_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GCN-LABEL: bitcast_fneg_f32: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GCN-NEXT: v_pk_add_f16 v0, v0, v1 +; GCN-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4 %f32 = load volatile float, ptr addrspace(3) undef, align 4 @@ -578,9 +656,22 @@ bb: ret void } -; GCN-LABEL: {{^}}shuffle_bitcast_fneg_f32: -; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} op_sel:[0,1] op_sel_hi:[1,0]{{$}} define amdgpu_kernel void @shuffle_bitcast_fneg_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GCN-LABEL: shuffle_bitcast_fneg_f32: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GCN-NEXT: v_pk_add_f16 v0, v0, v1 op_sel:[0,1] op_sel_hi:[1,0] +; GCN-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4 @@ -593,10 +684,24 @@ bb: ret void } -; GCN-LABEL: {{^}}extract_from_i64: -; GCN: v_lshl_or_b32 -; GCN: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}} define amdgpu_kernel void @extract_from_i64(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GCN-LABEL: extract_from_i64: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v3, 0xffff +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_and_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GCN-NEXT: v_lshl_or_b32 v0, v0, 16, v3 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_add_u16 v0, v2, v0 +; GCN-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4 %i64 = load volatile i64, ptr addrspace(1) undef @@ -612,21 +717,24 @@ bb: ret void } - -; Bitcast is final obstacle to identifying same source register -; GCN-LABEL: {{^}}bitcast_lo_elt_op_sel: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_b32 [[VEC2:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; GCN-NOT: _or - -; GCN: v_pk_add_f16 [[FADD:v[0-9]+]] -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}} define amdgpu_kernel void @bitcast_lo_elt_op_sel(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GCN-LABEL: bitcast_lo_elt_op_sel: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: 
s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_load_ushort v3, v[0:1], off glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2 @@ -647,21 +755,29 @@ bb: ret void } - -; Bitcast is final obstacle to identifying same source register -; GCN-LABEL: {{^}}mix_elt_types_op_sel: -; GCN: ds_read_b32 [[VEC0:v[0-9]+]] -; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_b32 [[VEC2:v[0-9]+]] - -; GCN-NOT: pack -; GCN-NOT: and -; GCN-NOT: shl -; GCN-NOT: _or - -; GCN: v_pk_add_f16 [[FADD:v[0-9]+]] -; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}} define amdgpu_kernel void @mix_elt_types_op_sel(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GCN-LABEL: mix_elt_types_op_sel: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: ; kill: killed $vgpr0_vgpr1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_load_ushort v3, v[0:1], off glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_load_ushort v3, v[0:1], off glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; kill: killed $vgpr0_vgpr1 +; GCN-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2 diff --git a/llvm/test/CodeGen/AMDGPU/xor-r600.ll b/llvm/test/CodeGen/AMDGPU/xor-r600.ll new file mode 100644 index 0000000000000..3fb11f4484bd2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/xor-r600.ll @@ -0,0 +1,478 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefixes=R600 %s + +define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; R600-LABEL: xor_v2i32: +; R600: ; %bb.0: +; R600-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 1 @6 +; R600-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: Fetch clause starting at 6: +; R600-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 +; R600-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; R600-NEXT: ALU clause starting at 10: +; R600-NEXT: MOV T0.X, KC0[2].Z, +; R600-NEXT: MOV * T1.X, KC0[2].W, +; R600-NEXT: ALU clause starting at 12: +; R600-NEXT: XOR_INT * T0.Y, T0.Y, T1.Y, +; R600-NEXT: XOR_INT T0.X, T0.X, T1.X, 
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %a = load <2 x i32>, ptr addrspace(1) %in0 + %b = load <2 x i32>, ptr addrspace(1) %in1 + %result = xor <2 x i32> %a, %b + store <2 x i32> %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; R600-LABEL: xor_v4i32: +; R600: ; %bb.0: +; R600-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 1 @6 +; R600-NEXT: ALU 5, @12, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: Fetch clause starting at 6: +; R600-NEXT: VTX_READ_128 T1.XYZW, T1.X, 0, #1 +; R600-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 +; R600-NEXT: ALU clause starting at 10: +; R600-NEXT: MOV T0.X, KC0[2].Z, +; R600-NEXT: MOV * T1.X, KC0[2].W, +; R600-NEXT: ALU clause starting at 12: +; R600-NEXT: XOR_INT * T0.W, T0.W, T1.W, +; R600-NEXT: XOR_INT * T0.Z, T0.Z, T1.Z, +; R600-NEXT: XOR_INT * T0.Y, T0.Y, T1.Y, +; R600-NEXT: XOR_INT T0.X, T0.X, T1.X, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %a = load <4 x i32>, ptr addrspace(1) %in0 + %b = load <4 x i32>, ptr addrspace(1) %in1 + %result = xor <4 x i32> %a, %b + store <4 x i32> %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; R600-LABEL: xor_i1: +; R600: ; %bb.0: +; R600-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 0 @8 +; R600-NEXT: ALU 0, @13, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 0 @10 +; R600-NEXT: ALU 5, @14, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: Fetch clause starting at 8: +; R600-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; R600-NEXT: Fetch clause starting at 10: +; R600-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 +; R600-NEXT: ALU clause starting at 12: +; R600-NEXT: MOV * T0.X, KC0[2].W, +; R600-NEXT: ALU clause starting at 13: +; R600-NEXT: MOV * T1.X, KC0[2].Z, +; R600-NEXT: ALU clause starting at 14: +; R600-NEXT: SETGE_DX10 T0.W, T0.X, 1.0, +; R600-NEXT: SETGE_DX10 * T1.W, T1.X, 0.0, +; R600-NEXT: XOR_INT * T0.W, PS, PV.W, +; R600-NEXT: CNDE_INT T0.X, PV.W, T0.X, T1.X, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %a = load float, ptr addrspace(1) %in0 + %b = load float, ptr addrspace(1) %in1 + %acmp = fcmp oge float %a, 0.000000e+00 + %bcmp = fcmp oge float %b, 1.000000e+00 + %xor = xor i1 %acmp, %bcmp + %result = select i1 %xor, float %a, float %b + store float %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; R600-LABEL: v_xor_i1: +; R600: ; %bb.0: +; R600-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 0 @8 +; R600-NEXT: ALU 0, @13, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 0 @10 +; R600-NEXT: ALU 12, @14, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: Fetch clause starting at 8: +; R600-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 +; R600-NEXT: Fetch clause starting at 10: +; R600-NEXT: VTX_READ_8 T1.X, T1.X, 0, #1 +; R600-NEXT: ALU clause starting at 12: +; R600-NEXT: MOV * T0.X, KC0[2].Z, +; R600-NEXT: ALU clause starting at 13: +; R600-NEXT: MOV * T1.X, KC0[2].W, +; R600-NEXT: ALU clause starting at 14: +; 
R600-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, +; R600-NEXT: XOR_INT * T1.W, T0.X, T1.X, +; R600-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; R600-NEXT: AND_INT T1.W, PS, 1, +; R600-NEXT: LSHL * T0.W, PV.W, literal.x, +; R600-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; R600-NEXT: LSHL T0.X, PV.W, PS, +; R600-NEXT: LSHL * T0.W, literal.x, PS, +; R600-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; R600-NEXT: MOV T0.Y, 0.0, +; R600-NEXT: MOV * T0.Z, 0.0, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %a = load volatile i1, ptr addrspace(1) %in0 + %b = load volatile i1, ptr addrspace(1) %in1 + %xor = xor i1 %a, %b + store i1 %xor, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; R600-LABEL: vector_xor_i32: +; R600: ; %bb.0: +; R600-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 1 @6 +; R600-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: Fetch clause starting at 6: +; R600-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 +; R600-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; R600-NEXT: ALU clause starting at 10: +; R600-NEXT: MOV T0.X, KC0[2].Z, +; R600-NEXT: MOV * T1.X, KC0[2].W, +; R600-NEXT: ALU clause starting at 12: +; R600-NEXT: XOR_INT T0.X, T0.X, T1.X, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %a = load i32, ptr addrspace(1) %in0 + %b = load i32, ptr addrspace(1) %in1 + %result = xor i32 %a, %b + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { +; R600-LABEL: scalar_xor_i32: +; R600: ; %bb.0: +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: NOT_INT * T1.X, KC0[2].Z, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %result = xor i32 %a, -1 + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; R600-LABEL: vector_not_i32: +; R600: ; %bb.0: +; R600-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 0 @6 +; R600-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: Fetch clause starting at 6: +; R600-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; R600-NEXT: ALU clause starting at 8: +; R600-NEXT: MOV * T0.X, KC0[2].Z, +; R600-NEXT: ALU clause starting at 9: +; R600-NEXT: NOT_INT T0.X, T0.X, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %a = load i32, ptr addrspace(1) %in0 + %b = load i32, ptr addrspace(1) %in1 + %result = xor i32 %a, -1 + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; R600-LABEL: vector_xor_i64: +; R600: ; %bb.0: +; R600-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 1 @6 +; R600-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: Fetch clause starting at 6: +; R600-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 +; R600-NEXT: 
VTX_READ_64 T0.XY, T0.X, 0, #1 +; R600-NEXT: ALU clause starting at 10: +; R600-NEXT: MOV T0.X, KC0[2].Z, +; R600-NEXT: MOV * T1.X, KC0[2].W, +; R600-NEXT: ALU clause starting at 12: +; R600-NEXT: XOR_INT * T0.Y, T0.Y, T1.Y, +; R600-NEXT: XOR_INT T0.X, T0.X, T1.X, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %a = load i64, ptr addrspace(1) %in0 + %b = load i64, ptr addrspace(1) %in1 + %result = xor i64 %a, %b + store i64 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { +; R600-LABEL: scalar_xor_i64: +; R600: ; %bb.0: +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: XOR_INT * T0.Y, KC0[3].X, KC0[3].Z, +; R600-NEXT: XOR_INT * T0.X, KC0[2].W, KC0[3].Y, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %result = xor i64 %a, %b + store i64 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { +; R600-LABEL: scalar_not_i64: +; R600: ; %bb.0: +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: NOT_INT * T0.Y, KC0[3].X, +; R600-NEXT: NOT_INT T0.X, KC0[2].W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %result = xor i64 %a, -1 + store i64 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; R600-LABEL: vector_not_i64: +; R600: ; %bb.0: +; R600-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 0 @6 +; R600-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: Fetch clause starting at 6: +; R600-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; R600-NEXT: ALU clause starting at 8: +; R600-NEXT: MOV * T0.X, KC0[2].Z, +; R600-NEXT: ALU clause starting at 9: +; R600-NEXT: NOT_INT * T0.Y, T0.Y, +; R600-NEXT: NOT_INT T0.X, T0.X, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %a = load i64, ptr addrspace(1) %in0 + %b = load i64, ptr addrspace(1) %in1 + %result = xor i64 %a, -1 + store i64 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b) { +; R600-LABEL: xor_cf: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[] +; R600-NEXT: JUMP @5 POP:1 +; R600-NEXT: ALU 0, @19, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 0 @12 +; R600-NEXT: ALU_POP_AFTER 1, @20, KC0[], KC1[] +; R600-NEXT: ALU_PUSH_BEFORE 2, @22, KC0[CB0:0-32], KC1[] +; R600-NEXT: JUMP @8 POP:1 +; R600-NEXT: ALU_POP_AFTER 5, @25, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 1, @31, KC0[], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: Fetch clause starting at 12: +; R600-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; R600-NEXT: ALU clause starting at 14: +; R600-NEXT: OR_INT T0.W, KC0[2].W, KC0[3].X, +; R600-NEXT: MOV * T1.W, literal.x, +; R600-NEXT: 1(1.401298e-45), 0(0.000000e+00) +; R600-NEXT: SETNE_INT * T0.W, PV.W, 0.0, +; R600-NEXT: 
PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0, +; R600-NEXT: ALU clause starting at 19: +; R600-NEXT: MOV * T0.X, KC0[2].Z, +; R600-NEXT: ALU clause starting at 20: +; R600-NEXT: MOV * T1.W, literal.x, +; R600-NEXT: 0(0.000000e+00), 0(0.000000e+00) +; R600-NEXT: ALU clause starting at 22: +; R600-NEXT: MOV T0.W, KC0[2].Y, +; R600-NEXT: SETE_INT * T1.W, T1.W, 0.0, +; R600-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, +; R600-NEXT: ALU clause starting at 25: +; R600-NEXT: MOV T1.W, KC0[2].W, +; R600-NEXT: MOV * T2.W, KC0[3].Y, +; R600-NEXT: XOR_INT T0.X, PV.W, PS, +; R600-NEXT: MOV T1.W, KC0[3].X, +; R600-NEXT: MOV * T2.W, KC0[3].Z, +; R600-NEXT: XOR_INT * T0.Y, PV.W, PS, +; R600-NEXT: ALU clause starting at 31: +; R600-NEXT: LSHR * T1.X, T0.W, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = xor i64 %a, %b + br label %endif + +else: + %2 = load i64, ptr addrspace(1) %in + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { +; R600-LABEL: scalar_xor_literal_i64: +; R600: ; %bb.0: +; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: XOR_INT * T0.Y, KC0[5].X, literal.x, +; R600-NEXT: 992123(1.390260e-39), 0(0.000000e+00) +; R600-NEXT: XOR_INT T0.X, KC0[4].W, literal.x, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; R600-NEXT: 12345(1.729903e-41), 2(2.802597e-45) + %or = xor i64 %a, 4261135838621753 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, i64 %b) { +; R600-LABEL: scalar_xor_literal_multi_use_i64: +; R600: ; %bb.0: +; R600-NEXT: ALU 12, @6, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XY, T4.X, 0 +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 6: +; R600-NEXT: ADDC_UINT * T0.W, KC0[5].Y, literal.x, +; R600-NEXT: 12345(1.729903e-41), 0(0.000000e+00) +; R600-NEXT: ADD_INT T0.X, KC0[5].Y, literal.x, +; R600-NEXT: ADD_INT * T0.W, KC0[5].Z, PV.W, +; R600-NEXT: 12345(1.729903e-41), 0(0.000000e+00) +; R600-NEXT: ADD_INT T1.X, PV.W, literal.x, +; R600-NEXT: MOV * T2.X, literal.y, +; R600-NEXT: 992123(1.390260e-39), 0(0.000000e+00) +; R600-NEXT: XOR_INT * T3.Y, KC0[5].X, literal.x, +; R600-NEXT: 992123(1.390260e-39), 0(0.000000e+00) +; R600-NEXT: XOR_INT T3.X, KC0[4].W, literal.x, +; R600-NEXT: LSHR * T4.X, KC0[2].Y, literal.y, +; R600-NEXT: 12345(1.729903e-41), 2(2.802597e-45) + %or = xor i64 %a, 4261135838621753 + store i64 %or, ptr addrspace(1) %out + + %foo = add i64 %b, 4261135838621753 + store volatile i64 %foo, ptr addrspace(1) undef + ret void +} + +define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { +; R600-LABEL: scalar_xor_inline_imm_i64: +; R600: ; %bb.0: +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: MOV * T0.Y, KC0[5].X, +; R600-NEXT: XOR_INT T0.X, KC0[4].W, literal.x, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, 
literal.y, +; R600-NEXT: 63(8.828180e-44), 2(2.802597e-45) + %or = xor i64 %a, 63 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { +; R600-LABEL: scalar_xor_neg_inline_imm_i64: +; R600: ; %bb.0: +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: NOT_INT * T0.Y, KC0[5].X, +; R600-NEXT: XOR_INT T0.X, KC0[4].W, literal.x, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; R600-NEXT: -8(nan), 2(2.802597e-45) + %or = xor i64 %a, -8 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; R600-LABEL: vector_xor_i64_neg_inline_imm: +; R600: ; %bb.0: +; R600-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 0 @6 +; R600-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: Fetch clause starting at 6: +; R600-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; R600-NEXT: ALU clause starting at 8: +; R600-NEXT: MOV * T0.X, KC0[2].Z, +; R600-NEXT: ALU clause starting at 9: +; R600-NEXT: NOT_INT * T0.Y, T0.Y, +; R600-NEXT: XOR_INT T0.X, T0.X, literal.x, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; R600-NEXT: -8(nan), 2(2.802597e-45) + %loada = load i64, ptr addrspace(1) %a, align 8 + %or = xor i64 %loada, -8 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; R600-LABEL: vector_xor_literal_i64: +; R600: ; %bb.0: +; R600-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 0 @6 +; R600-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: Fetch clause starting at 6: +; R600-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; R600-NEXT: ALU clause starting at 8: +; R600-NEXT: MOV * T0.X, KC0[2].Z, +; R600-NEXT: ALU clause starting at 9: +; R600-NEXT: XOR_INT * T0.Y, T0.Y, literal.x, +; R600-NEXT: 5231(7.330192e-42), 0(0.000000e+00) +; R600-NEXT: XOR_INT T0.X, T0.X, literal.x, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; R600-NEXT: -545810305(-1.784115e+19), 2(2.802597e-45) + %loada = load i64, ptr addrspace(1) %a, align 8 + %or = xor i64 %loada, 22470723082367 + store i64 %or, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll index 1315c0b52af43..e15fd7f29671a 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -1,16 +1,49 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}xor_v2i32: -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; 
SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=SI,GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN %s define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; SI-LABEL: xor_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v1, v3, v1 +; SI-NEXT: v_xor_b32_e32 v0, v2, v0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: xor_v2i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v1, v1, v3 +; VI-NEXT: v_xor_b32_e32 v0, v0, v2 +; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(1) %in0 %b = load <2 x i32>, ptr addrspace(1) %in1 %result = xor <2 x i32> %a, %b @@ -18,18 +51,52 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in ret void } -; FUNC-LABEL: {{^}}xor_v4i32: -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} - define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; SI-LABEL: xor_v4i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v3, v7, v3 +; SI-NEXT: v_xor_b32_e32 v2, v6, v2 +; 
SI-NEXT: v_xor_b32_e32 v1, v5, v1 +; SI-NEXT: v_xor_b32_e32 v0, v4, v0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: xor_v4i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v3, v3, v7 +; VI-NEXT: v_xor_b32_e32 v2, v2, v6 +; VI-NEXT: v_xor_b32_e32 v1, v1, v5 +; VI-NEXT: v_xor_b32_e32 v0, v0, v4 +; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; VI-NEXT: s_endpgm %a = load <4 x i32>, ptr addrspace(1) %in0 %b = load <4 x i32>, ptr addrspace(1) %in1 %result = xor <4 x i32> %a, %b @@ -37,16 +104,54 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in ret void } -; FUNC-LABEL: {{^}}xor_i1: -; EG: XOR_INT {{\** *}}{{T[0-9]+\.[XYZW]}}, {{PS|PV\.[XYZW]}}, {{PS|PV\.[XYZW]}} - -; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 1.0, {{v[0-9]+}} -; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}} -; SI: s_xor_b64 [[XOR:vcc]], [[CMP1]], [[CMP0]] -; SI: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; SI-LABEL: xor_i1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: xor_i1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_le_f32_e64 s[0:1], 1.0, v2 +; VI-NEXT: s_xor_b64 vcc, vcc, s[0:1] +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm %a = load float, ptr addrspace(1) %in0 %b = load float, ptr addrspace(1) %in1 %acmp = fcmp oge float %a, 0.000000e+00 @@ -57,13 +162,50 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ret void } -; FUNC-LABEL: {{^}}v_xor_i1: -; SI: buffer_load_ubyte [[B:v[0-9]+]] -; SI: buffer_load_ubyte [[A:v[0-9]+]] -; 
SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[B]], [[A]] -; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]] -; SI: buffer_store_byte [[RESULT]] define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; SI-LABEL: v_xor_i1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_xor_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_xor_i1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: flat_load_ubyte v4, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_ubyte v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_xor_b32_e32 v2, v4, v2 +; VI-NEXT: v_and_b32_e32 v2, 1, v2 +; VI-NEXT: flat_store_byte v[0:1], v2 +; VI-NEXT: s_endpgm %a = load volatile i1, ptr addrspace(1) %in0 %b = load volatile i1, ptr addrspace(1) %in1 %xor = xor i1 %a, %b @@ -71,9 +213,46 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0 ret void } -; FUNC-LABEL: {{^}}vector_xor_i32: -; SI: v_xor_b32_e32 define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; SI-LABEL: vector_xor_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: vector_xor_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v2, v4, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %in0 %b = load i32, ptr addrspace(1) %in1 %result = xor i32 %a, %b @@ -81,25 +260,96 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr 
addrspace(1 ret void } -; FUNC-LABEL: {{^}}scalar_xor_i32: -; SI: s_xor_b32 define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { +; SI-LABEL: scalar_xor_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_xor_b32 s0, s2, s3 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: scalar_xor_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_xor_b32 s2, s2, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm %result = xor i32 %a, %b store i32 %result, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}scalar_not_i32: -; SI: s_not_b32 define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { +; SI-LABEL: scalar_not_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_not_b32 s4, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: scalar_not_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_not_b32 s2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm %result = xor i32 %a, -1 store i32 %result, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}vector_not_i32: -; SI: v_not_b32 define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; SI-LABEL: vector_not_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_not_b32_e32 v0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: vector_not_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_not_b32_e32 v2, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %in0 %b = load i32, ptr addrspace(1) %in1 %result = xor i32 %a, -1 @@ -107,11 +357,48 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1 ret void } -; FUNC-LABEL: {{^}}vector_xor_i64: -; SI: v_xor_b32_e32 -; SI: v_xor_b32_e32 -; SI: s_endpgm define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; SI-LABEL: vector_xor_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], 
s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v0, v2, v0 +; SI-NEXT: v_xor_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: vector_xor_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v0, v0, v2 +; VI-NEXT: v_xor_b32_e32 v1, v1, v3 +; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_endpgm %a = load i64, ptr addrspace(1) %in0 %b = load i64, ptr addrspace(1) %in1 %result = xor i64 %a, %b @@ -119,27 +406,104 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1 ret void } -; FUNC-LABEL: {{^}}scalar_xor_i64: -; SI: s_xor_b64 -; SI: s_endpgm define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { +; SI-LABEL: scalar_xor_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_xor_b64 s[4:5], s[6:7], s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: scalar_xor_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm %result = xor i64 %a, %b store i64 %result, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}scalar_not_i64: -; SI: s_not_b64 define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { +; SI-LABEL: scalar_not_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_not_b64 s[0:1], s[2:3] +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: scalar_not_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_not_b64 s[0:1], s[2:3] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm %result = xor 
i64 %a, -1 store i64 %result, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}vector_not_i64: -; SI: v_not_b32 -; SI: v_not_b32 define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; SI-LABEL: vector_not_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_not_b32_e32 v0, v0 +; SI-NEXT: v_not_b32_e32 v1, v1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: vector_not_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_not_b32_e32 v0, v0 +; VI-NEXT: v_not_b32_e32 v1, v1 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm %a = load i64, ptr addrspace(1) %in0 %b = load i64, ptr addrspace(1) %in1 %result = xor i64 %a, -1 @@ -147,13 +511,65 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1 ret void } -; Test that we have a pattern to match xor inside a branch. -; Note that in the future the backend may be smart enough to -; use an SALU instruction for this. - -; FUNC-LABEL: {{^}}xor_cf: -; SI: s_xor_b64 define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b) { +; SI-LABEL: xor_cf: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 +; SI-NEXT: s_and_b64 vcc, exec, s[10:11] +; SI-NEXT: s_cbranch_vccz .LBB12_4 +; SI-NEXT: ; %bb.1: ; %else +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; SI-NEXT: s_cbranch_vccnz .LBB12_3 +; SI-NEXT: .LBB12_2: ; %if +; SI-NEXT: s_xor_b64 s[2:3], s[4:5], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: .LBB12_3: ; %endif +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB12_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_branch .LBB12_2 +; +; VI-LABEL: xor_cf: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 +; VI-NEXT: s_cbranch_scc0 .LBB12_4 +; VI-NEXT: ; %bb.1: ; %else +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; VI-NEXT: s_cbranch_vccnz .LBB12_3 +; VI-NEXT: .LBB12_2: ; %if +; VI-NEXT: s_xor_b64 s[2:3], s[4:5], s[6:7] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: .LBB12_3: ; %endif +; VI-NEXT: v_mov_b32_e32 v2, s0 +; 
VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; VI-NEXT: .LBB12_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: s_branch .LBB12_2 entry: %0 = icmp eq i64 %a, 0 br i1 %0, label %if, label %else @@ -172,27 +588,82 @@ endif: ret void } -; FUNC-LABEL: {{^}}scalar_xor_literal_i64: -; SI: s_load_dwordx2 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} -; SI-DAG: s_xor_b32 s[[RES_HI:[0-9]+]], s{{[0-9]+}}, 0xf237b -; SI-DAG: s_xor_b32 s[[RES_LO:[0-9]+]], s{{[0-9]+}}, 0x3039 -; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]] -; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]] define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { +; SI-LABEL: scalar_xor_literal_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_xor_b32 s5, s5, 0xf237b +; SI-NEXT: s_xor_b32 s4, s4, 0x3039 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: scalar_xor_literal_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_xor_b32 s3, s3, 0xf237b +; VI-NEXT: s_xor_b32 s2, s2, 0x3039 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm %or = xor i64 %a, 4261135838621753 store i64 %or, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}scalar_xor_literal_multi_use_i64: -; SI: s_load_dwordx4 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} -; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xf237b -; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039 -; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s[[[K_LO]]:[[K_HI]]] - -; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3039 -; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0xf237b define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, i64 %b) { +; SI-LABEL: scalar_xor_literal_multi_use_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x13 +; SI-NEXT: s_movk_i32 s8, 0x3039 +; SI-NEXT: s_mov_b32 s9, 0xf237b +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: s_add_u32 s0, s2, 0x3039 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_addc_u32 s1, s3, 0xf237b +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; VI-LABEL: scalar_xor_literal_multi_use_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_movk_i32 s2, 0x3039 +; VI-NEXT: s_mov_b32 s3, 0xf237b +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_add_u32 
s0, s6, 0x3039 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_addc_u32 s1, s7, 0xf237b +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm %or = xor i64 %a, 4261135838621753 store i64 %or, ptr addrspace(1) %out @@ -201,51 +672,146 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou ret void } -; FUNC-LABEL: {{^}}scalar_xor_inline_imm_i64: -; SI: s_load_dwordx2 s[[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} -; SI-NOT: xor_b32 -; SI: s_xor_b32 s[[VAL_LO]], s{{[0-9]+}}, 63 -; SI-NOT: xor_b32 -; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s{{[0-9]+}} -; SI-NOT: xor_b32 -; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s{{[0-9]+}} -; SI-NOT: xor_b32 -; SI: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]] define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { +; SI-LABEL: scalar_xor_inline_imm_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_xor_b32 s4, s4, 63 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: scalar_xor_inline_imm_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_xor_b32 s2, s2, 63 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm %or = xor i64 %a, 63 store i64 %or, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}scalar_xor_neg_inline_imm_i64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} -; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -8 define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { +; SI-LABEL: scalar_xor_neg_inline_imm_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -8 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: scalar_xor_neg_inline_imm_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -8 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm %or = xor i64 %a, -8 store i64 %or, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}vector_xor_i64_neg_inline_imm: -; SI: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], -; SI: v_xor_b32_e32 {{v[0-9]+}}, -8, v[[LO_VREG]] -; SI: v_xor_b32_e32 {{v[0-9]+}}, -1, {{.*}} -; SI: s_endpgm define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SI-LABEL: vector_xor_i64_neg_inline_imm: +; SI: ; %bb.0: +; SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v0, -8, v0 +; SI-NEXT: v_xor_b32_e32 v1, -1, v1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: vector_xor_i64_neg_inline_imm: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v0, -8, v0 +; VI-NEXT: v_xor_b32_e32 v1, -1, v1 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm %loada = load i64, ptr addrspace(1) %a, align 8 %or = xor i64 %loada, -8 store i64 %or, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}vector_xor_literal_i64: -; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], -; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]] -; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]] -; SI: s_endpgm define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SI-LABEL: vector_xor_literal_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v1, 0x146f, v1 +; SI-NEXT: v_xor_b32_e32 v0, 0xdf77987f, v0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: vector_xor_literal_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v1, 0x146f, v1 +; VI-NEXT: v_xor_b32_e32 v0, 0xdf77987f, v0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm %loada = load i64, ptr addrspace(1) %a, align 8 %or = xor i64 %loada, 22470723082367 store i64 %or, ptr addrspace(1) %out ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}}