From cdd6b18ff534917f2b1c2e795128dd410371990f Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 22 Jan 2024 16:05:29 +0000 Subject: [PATCH 1/5] Precommit tests --- .../lower-work-group-id-intrinsics-hsa.ll | 277 ++++++++++++++++++ .../lower-work-group-id-intrinsics-pal.ll | 188 ++++++++++++ .../AMDGPU/lower-work-group-id-intrinsics.ll | 128 -------- 3 files changed, 465 insertions(+), 128 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll create mode 100644 llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll new file mode 100644 index 0000000000000..af1c601ee972a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -0,0 +1,277 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s + +define amdgpu_kernel void @workgroup_ids_kernel() { +; GFX9-LABEL: workgroup_ids_kernel: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: workgroup_ids_kernel: +; GFX9ARCH-SDAG: ; %bb.0: ; %.entry +; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX9ARCH-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX9ARCH-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: workgroup_ids_kernel: +; GFX9ARCH-GISEL: ; %bb.0: ; %.entry +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9ARCH-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: workgroup_ids_kernel: +; GFX12-SDAG: ; %bb.0: ; %.entry +; GFX12-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: workgroup_ids_kernel: +; GFX12-GISEL: ; %bb.0: ; %.entry +; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm +.entry: + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + %idy = call i32 @llvm.amdgcn.workgroup.id.y() + %idz = call i32 @llvm.amdgcn.workgroup.id.z() + %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0 + %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1 + %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2 + call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_kernel void @caller() { +; GFX9-SDAG-LABEL: caller: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-SDAG-NEXT: s_mov_b32 s38, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-SDAG-NEXT: s_add_u32 s36, s36, s7 +; GFX9-SDAG-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-SDAG-NEXT: s_add_u32 s8, s2, 36 +; GFX9-SDAG-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-SDAG-NEXT: s_getpc_b64 s[2:3] +; GFX9-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 +; GFX9-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x0 +; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-SDAG-NEXT: s_mov_b32 s12, s6 +; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_swappc_b64 s[30:31], s[14:15] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: caller: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-GISEL-NEXT: s_mov_b32 s38, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-GISEL-NEXT: s_add_u32 s36, s36, s7 +; GFX9-GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-GISEL-NEXT: s_add_u32 s8, s2, 36 +; GFX9-GISEL-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-GISEL-NEXT: s_mov_b32 s12, s6 +; GFX9-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[14:15] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: caller: +; GFX9ARCH-SDAG: ; %bb.0: +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s38, -1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s36, s36, s6 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s37, s37, 0 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s2, 36 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s3, 0 +; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[2:3] +; GFX9ARCH-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 +; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s12, ttmp9 +; GFX9ARCH-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: caller: +; GFX9ARCH-GISEL: ; %bb.0: +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s38, -1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s36, s36, s6 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s2, 36 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s3, 0 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s12, ttmp9 +; GFX9ARCH-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: caller: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 +; GFX12-SDAG-NEXT: s_mov_b32 s12, ttmp9 +; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-SDAG-NEXT: s_mov_b32 s7, callee@abs32@hi +; GFX12-SDAG-NEXT: s_mov_b32 s6, callee@abs32@lo +; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: caller: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 +; GFX12-GISEL-NEXT: s_mov_b32 s12, ttmp9 +; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-GISEL-NEXT: s_mov_b32 s6, callee@abs32@lo +; GFX12-GISEL-NEXT: s_mov_b32 s7, callee@abs32@hi +; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-GISEL-NEXT: s_endpgm + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + call void @callee(i32 %idx) #0 + ret void +} + +declare void @callee(i32) #0 + +define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) { +; GFX9-LABEL: workgroup_ids_device_func: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s14 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9ARCH-LABEL: workgroup_ids_device_func: +; GFX9ARCH: ; %bb.0: +; GFX9ARCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9ARCH-NEXT: v_mov_b32_e32 v6, s12 +; GFX9ARCH-NEXT: global_store_dword v[0:1], v6, off +; GFX9ARCH-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-NEXT: v_mov_b32_e32 v0, s13 +; GFX9ARCH-NEXT: global_store_dword v[2:3], v0, off +; GFX9ARCH-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-NEXT: v_mov_b32_e32 v0, s14 +; GFX9ARCH-NEXT: global_store_dword v[4:5], v0, off +; GFX9ARCH-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: workgroup_ids_device_func: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s13 +; GFX12-NEXT: v_mov_b32_e32 v8, s14 +; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_store_b32 v[2:3], v7, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_store_b32 v[4:5], v8, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %id.x = call i32 @llvm.amdgcn.workgroup.id.x() + %id.y = call i32 @llvm.amdgcn.workgroup.id.y() + %id.z = call i32 @llvm.amdgcn.workgroup.id.z() + store volatile i32 %id.x, ptr addrspace(1) %outx + store volatile i32 %id.y, ptr addrspace(1) %outy + store volatile i32 %id.z, ptr addrspace(1) %outz + ret void +} + +declare i32 @llvm.amdgcn.workgroup.id.x() +declare i32 @llvm.amdgcn.workgroup.id.y() +declare i32 @llvm.amdgcn.workgroup.id.z() +declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) + +attributes #0 = { nounwind "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll new file mode 100644 index 0000000000000..473b85459d3d3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll @@ -0,0 +1,188 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s + +define amdgpu_cs void @_amdgpu_cs_main() { +; GFX9-LABEL: _amdgpu_cs_main: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: _amdgpu_cs_main: +; GFX9ARCH-SDAG: ; %bb.0: ; %.entry +; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX9ARCH-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX9ARCH-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: _amdgpu_cs_main: +; GFX9ARCH-GISEL: ; %bb.0: ; %.entry +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9ARCH-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: _amdgpu_cs_main: +; GFX12-SDAG: ; %bb.0: ; %.entry +; GFX12-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: _amdgpu_cs_main: +; GFX12-GISEL: ; %bb.0: ; %.entry +; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm +.entry: + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + %idy = call i32 @llvm.amdgcn.workgroup.id.y() + %idz = call i32 @llvm.amdgcn.workgroup.id.z() + %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0 + %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1 + %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2 + call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_cs void @caller() { +; GFX9-LABEL: caller: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX9-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: caller: +; GFX9ARCH-SDAG: ; %bb.0: +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s10, -1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s8, s0 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s9, 0 +; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[0:1] +; GFX9ARCH-SDAG-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: caller: +; GFX9ARCH-GISEL: ; %bb.0: +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s10, -1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s8, s0 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: caller: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi +; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo +; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: caller: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo +; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi +; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-GISEL-NEXT: s_endpgm + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + call amdgpu_gfx void @callee(i32 %idx) + ret void +} + +declare amdgpu_gfx void @callee(i32) + +define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) { +; GFX9-LABEL: workgroup_ids_gfx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9ARCH-LABEL: workgroup_ids_gfx: +; GFX9ARCH: ; %bb.0: +; GFX9ARCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9ARCH-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: workgroup_ids_gfx: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %id.x = call i32 @llvm.amdgcn.workgroup.id.x() + %id.y = call i32 @llvm.amdgcn.workgroup.id.y() + %id.z = call i32 @llvm.amdgcn.workgroup.id.z() + store volatile i32 %id.x, ptr addrspace(1) %outx + store volatile i32 %id.y, ptr addrspace(1) %outy + store volatile i32 %id.z, ptr addrspace(1) %outz + ret void +} + +declare i32 @llvm.amdgcn.workgroup.id.x() +declare i32 @llvm.amdgcn.workgroup.id.y() +declare i32 @llvm.amdgcn.workgroup.id.z() +declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX9-GISEL: {{.*}} +; GFX9-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll deleted file mode 100644 index 495b54758de04..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll +++ /dev/null @@ -1,128 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s - -define amdgpu_cs void @_amdgpu_cs_main() { -; GFX9-SDAG-LABEL: _amdgpu_cs_main: -; GFX9-SDAG: ; %bb.0: ; %.entry -; GFX9-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX9-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: _amdgpu_cs_main: -; GFX9-GISEL: ; %bb.0: ; %.entry -; GFX9-GISEL-NEXT: s_mov_b32 s0, ttmp9 -; GFX9-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX9-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: _amdgpu_cs_main: -; GFX12-SDAG: ; %bb.0: ; %.entry -; GFX12-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX12-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: _amdgpu_cs_main: -; GFX12-GISEL: ; %bb.0: ; %.entry -; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 -; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm -.entry: - %idx = call i32 @llvm.amdgcn.workgroup.id.x() - %idy = call i32 @llvm.amdgcn.workgroup.id.y() - %idz = call i32 @llvm.amdgcn.workgroup.id.z() - %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0 - %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1 - %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2 - call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0) - ret void -} - -define amdgpu_cs void @caller() { -; GFX9-SDAG-LABEL: caller: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_getpc_b64 s[8:9] -; GFX9-SDAG-NEXT: s_mov_b32 s8, s0 -; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 -; GFX9-SDAG-NEXT: s_mov_b32 s5, callee@abs32@hi -; GFX9-SDAG-NEXT: s_mov_b32 s4, callee@abs32@lo -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-SDAG-NEXT: s_mov_b32 s32, 0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_add_u32 s8, s8, s0 -; GFX9-SDAG-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[8:9] -; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX9-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: caller: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_getpc_b64 s[8:9] -; GFX9-GISEL-NEXT: s_mov_b32 s8, s0 -; GFX9-GISEL-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 -; GFX9-GISEL-NEXT: s_mov_b32 s4, callee@abs32@lo -; GFX9-GISEL-NEXT: s_mov_b32 s5, callee@abs32@hi -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-GISEL-NEXT: s_mov_b32 s32, 0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_add_u32 s8, s8, s0 -; GFX9-GISEL-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[8:9] -; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: caller: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi -; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo -; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 -; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: caller: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo -; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi -; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 -; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX12-GISEL-NEXT: s_endpgm - %idx = call i32 @llvm.amdgcn.workgroup.id.x() - call amdgpu_gfx void @callee(i32 %idx) - ret void -} - -declare amdgpu_gfx void @callee(i32) - -declare i32 @llvm.amdgcn.workgroup.id.x() -declare i32 @llvm.amdgcn.workgroup.id.y() -declare i32 @llvm.amdgcn.workgroup.id.z() -declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX12: {{.*}} -; GFX9: {{.*}} From c7d065554d04fb102e168b8f6b2ccb1c5f0f29b8 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 23 Jan 2024 10:18:03 +0000 Subject: [PATCH 2/5] Implement architected SGPR support directly in legalization/isel. --- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 36 ++++- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 36 ++++- .../lower-work-group-id-intrinsics-hsa.ll | 66 ++++++--- .../lower-work-group-id-intrinsics-pal.ll | 13 +- .../AMDGPU/workgroup-id-in-arch-sgprs.ll | 131 +++++++----------- 5 files changed, 170 insertions(+), 112 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 8e74d4c0e9459..b88d7534f3e26 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4178,10 +4178,42 @@ bool AMDGPULegalizerInfo::loadInputValue( Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { const SIMachineFunctionInfo *MFI = B.getMF().getInfo(); - const ArgDescriptor *Arg; + const ArgDescriptor *Arg = nullptr; const TargetRegisterClass *ArgRC; LLT ArgTy; - std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); + + const ArgDescriptor WorkGroupIDX = + ArgDescriptor::createRegister(AMDGPU::TTMP9); + // TODO: No need to mask GridY if GridZ is not valid. + const ArgDescriptor WorkGroupIDY = + ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFFu); + const ArgDescriptor WorkGroupIDZ = + ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); + if (ST.hasArchitectedSGPRs() && + AMDGPU::isCompute(B.getMF().getFunction().getCallingConv())) { + switch (ArgType) { + case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: + Arg = &WorkGroupIDX; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: + Arg = &WorkGroupIDY; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: + Arg = &WorkGroupIDZ; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + default: + break; + } + } + + if (!Arg) + std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); if (!Arg) { if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 073c8cc721173..2cc0fc1f54ddc 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2063,11 +2063,43 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT, AMDGPUFunctionArgInfo::PreloadedValue PVID) const { - const ArgDescriptor *Reg; + const ArgDescriptor *Reg = nullptr; const TargetRegisterClass *RC; LLT Ty; - std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID); + const ArgDescriptor WorkGroupIDX = + ArgDescriptor::createRegister(AMDGPU::TTMP9); + // TODO: No need to mask GridY if GridZ is not valid. + const ArgDescriptor WorkGroupIDY = + ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFFu); + const ArgDescriptor WorkGroupIDZ = + ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); + if (Subtarget->hasArchitectedSGPRs() && + AMDGPU::isCompute( + DAG.getMachineFunction().getFunction().getCallingConv())) { + switch (PVID) { + case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: + Reg = &WorkGroupIDX; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: + Reg = &WorkGroupIDY; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: + Reg = &WorkGroupIDZ; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + default: + break; + } + } + + if (!Reg) + std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID); if (!Reg) { if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) { // It's possible for a kernarg intrinsic call to appear in a kernel with diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll index af1c601ee972a..063cba73886b4 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -17,11 +17,11 @@ define amdgpu_kernel void @workgroup_ids_kernel() { ; ; GFX9ARCH-SDAG-LABEL: workgroup_ids_kernel: ; GFX9ARCH-SDAG: ; %bb.0: ; %.entry -; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 ; GFX9ARCH-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff ; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX9ARCH-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; GFX9ARCH-SDAG-NEXT: s_endpgm ; @@ -38,11 +38,10 @@ define amdgpu_kernel void @workgroup_ids_kernel() { ; ; GFX12-SDAG-LABEL: workgroup_ids_kernel: ; GFX12-SDAG: ; %bb.0: ; %.entry -; GFX12-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX12-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s1 ; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -230,19 +229,37 @@ define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX9ARCH-LABEL: workgroup_ids_device_func: -; GFX9ARCH: ; %bb.0: -; GFX9ARCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9ARCH-NEXT: v_mov_b32_e32 v6, s12 -; GFX9ARCH-NEXT: global_store_dword v[0:1], v6, off -; GFX9ARCH-NEXT: s_waitcnt vmcnt(0) -; GFX9ARCH-NEXT: v_mov_b32_e32 v0, s13 -; GFX9ARCH-NEXT: global_store_dword v[2:3], v0, off -; GFX9ARCH-NEXT: s_waitcnt vmcnt(0) -; GFX9ARCH-NEXT: v_mov_b32_e32 v0, s14 -; GFX9ARCH-NEXT: global_store_dword v[4:5], v0, off -; GFX9ARCH-NEXT: s_waitcnt vmcnt(0) -; GFX9ARCH-NEXT: s_setpc_b64 s[30:31] +; GFX9ARCH-SDAG-LABEL: workgroup_ids_device_func: +; GFX9ARCH-SDAG: ; %bb.0: +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v6, ttmp9 +; GFX9ARCH-SDAG-NEXT: s_and_b32 s4, ttmp7, 0xffff +; GFX9ARCH-SDAG-NEXT: global_store_dword v[0:1], v6, off +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s4, ttmp7, 16 +; GFX9ARCH-SDAG-NEXT: global_store_dword v[2:3], v0, off +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9ARCH-SDAG-NEXT: global_store_dword v[4:5], v0, off +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9ARCH-GISEL-LABEL: workgroup_ids_device_func: +; GFX9ARCH-GISEL: ; %bb.0: +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v6, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_and_b32 s4, ttmp7, 0xffff +; GFX9ARCH-GISEL-NEXT: s_lshr_b32 s5, ttmp7, 16 +; GFX9ARCH-GISEL-NEXT: global_store_dword v[0:1], v6, off +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9ARCH-GISEL-NEXT: global_store_dword v[2:3], v0, off +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9ARCH-GISEL-NEXT: global_store_dword v[4:5], v0, off +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: workgroup_ids_device_func: ; GFX12: ; %bb.0: @@ -251,8 +268,11 @@ define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s13 -; GFX12-NEXT: v_mov_b32_e32 v8, s14 +; GFX12-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0 +; GFX12-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-NEXT: v_mov_b32_e32 v8, s1 ; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_store_b32 v[2:3], v7, off scope:SCOPE_SYS @@ -275,3 +295,5 @@ declare i32 @llvm.amdgcn.workgroup.id.z() declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) attributes #0 = { nounwind "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX9ARCH: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll index 473b85459d3d3..cfff0a969da9e 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll @@ -14,11 +14,11 @@ define amdgpu_cs void @_amdgpu_cs_main() { ; ; GFX9ARCH-SDAG-LABEL: _amdgpu_cs_main: ; GFX9ARCH-SDAG: ; %bb.0: ; %.entry -; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 ; GFX9ARCH-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff ; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX9ARCH-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; GFX9ARCH-SDAG-NEXT: s_endpgm ; @@ -35,11 +35,10 @@ define amdgpu_cs void @_amdgpu_cs_main() { ; ; GFX12-SDAG-LABEL: _amdgpu_cs_main: ; GFX12-SDAG: ; %bb.0: ; %.entry -; GFX12-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX12-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s1 ; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll index 769e6b0964abd..c2ce6e169dc53 100644 --- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll @@ -5,43 +5,25 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { -; GFX9-SDAG-LABEL: workgroup_id_x: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm ; -; GFX9-GISEL-LABEL: workgroup_id_x: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: workgroup_id_x: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm +; GFX9-LABEL: workgroup_id_x: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: workgroup_id_x: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: workgroup_id_x: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %idx, ptr addrspace(1) %ptrx @@ -52,23 +34,25 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace ; GFX9-LABEL: workgroup_id_xy: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, ttmp9 +; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_and_b32 s4, ttmp7, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v1, ttmp7 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v2, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: workgroup_id_xy: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 -; GFX12-NEXT: v_mov_b32_e32 v2, ttmp7 +; GFX12-NEXT: s_and_b32 s4, ttmp7, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: global_store_b32 v0, v2, s[2:3] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: global_store_b32 v1, v2, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -81,37 +65,21 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace } define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry, ptr addrspace(1) %ptrz) { -; GFX9-SDAG-LABEL: workgroup_id_xyz: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[2:3] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[6:7] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: workgroup_id_xyz: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-GISEL-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: s_lshr_b32 s0, ttmp7, 16 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[6:7] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: workgroup_id_xyz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_and_b32 s6, ttmp7, 0xffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: workgroup_id_xyz: ; GFX12: ; %bb.0: @@ -119,15 +87,15 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX12-NEXT: s_and_b32 s2, ttmp7, 0xffff -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: s_lshr_b32 s3, ttmp7, 16 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX12-NEXT: global_store_b32 v0, v2, s[6:7] -; GFX12-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: global_store_b32 v1, v2, s[6:7] +; GFX12-NEXT: global_store_b32 v1, v3, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -144,3 +112,8 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac declare i32 @llvm.amdgcn.workgroup.id.x() declare i32 @llvm.amdgcn.workgroup.id.y() declare i32 @llvm.amdgcn.workgroup.id.z() +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX12-GISEL: {{.*}} +; GFX12-SDAG: {{.*}} +; GFX9-GISEL: {{.*}} +; GFX9-SDAG: {{.*}} From 13a22d1173e07366480bfd4e7dc020d1e64ce52d Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 23 Jan 2024 10:57:43 +0000 Subject: [PATCH 3/5] Remove architected SGPR support from argument handling --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 30 ++++++++--------- .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 32 ++++++------------- .../AMDGPU/indirect-call-known-callees.ll | 1 - .../lower-work-group-id-intrinsics-hsa.ll | 4 --- 4 files changed, 22 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 2cc0fc1f54ddc..2d7fd51b135be 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2528,28 +2528,24 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, } } - if (Info.hasWorkGroupIDX()) { - Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs); - if (!HasArchitectedSGPRs) + if (!HasArchitectedSGPRs) { + if (Info.hasWorkGroupIDX()) { + Register Reg = Info.addWorkGroupIDX(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } - CCInfo.AllocateReg(Reg); - } - - if (Info.hasWorkGroupIDY()) { - Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs); - if (!HasArchitectedSGPRs) + if (Info.hasWorkGroupIDY()) { + Register Reg = Info.addWorkGroupIDY(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } - CCInfo.AllocateReg(Reg); - } - - if (Info.hasWorkGroupIDZ()) { - Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs); - if (!HasArchitectedSGPRs) + if (Info.hasWorkGroupIDZ()) { + Register Reg = Info.addWorkGroupIDZ(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); - - CCInfo.AllocateReg(Reg); + CCInfo.AllocateReg(Reg); + } } if (Info.hasWorkGroupInfo()) { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index ecc31fbd9dd3d..71513e3ea98e4 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -744,35 +744,21 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, } // Add system SGPRs. - Register addWorkGroupIDX(bool HasArchitectedSGPRs) { - Register Reg = - HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP9 : getNextSystemSGPR(); - ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(Reg); - if (!HasArchitectedSGPRs) - NumSystemSGPRs += 1; - + Register addWorkGroupIDX() { + ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR()); + NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDX.getRegister(); } - Register addWorkGroupIDY(bool HasArchitectedSGPRs) { - Register Reg = - HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR(); - unsigned Mask = HasArchitectedSGPRs && hasWorkGroupIDZ() ? 0xffff : ~0u; - ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(Reg, Mask); - if (!HasArchitectedSGPRs) - NumSystemSGPRs += 1; - + Register addWorkGroupIDY() { + ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR()); + NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDY.getRegister(); } - Register addWorkGroupIDZ(bool HasArchitectedSGPRs) { - Register Reg = - HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR(); - unsigned Mask = HasArchitectedSGPRs ? 0xffff << 16 : ~0u; - ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(Reg, Mask); - if (!HasArchitectedSGPRs) - NumSystemSGPRs += 1; - + Register addWorkGroupIDZ() { + ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR()); + NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDZ.getRegister(); } diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll index 380a13ed16128..47110d9491887 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -55,7 +55,6 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0 ; GFX12-NEXT: v_mov_b32_e32 v31, v0 -; GFX12-NEXT: s_mov_b32 s12, ttmp9 ; GFX12-NEXT: s_mov_b64 s[8:9], 0 ; GFX12-NEXT: s_mov_b32 s32, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll index 063cba73886b4..afa914c8375f6 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -145,7 +145,6 @@ define amdgpu_kernel void @caller() { ; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9ARCH-SDAG-NEXT: s_mov_b32 s12, ttmp9 ; GFX9ARCH-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 @@ -173,7 +172,6 @@ define amdgpu_kernel void @caller() { ; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9ARCH-GISEL-NEXT: s_mov_b32 s12, ttmp9 ; GFX9ARCH-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -185,7 +183,6 @@ define amdgpu_kernel void @caller() { ; GFX12-SDAG-LABEL: caller: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 -; GFX12-SDAG-NEXT: s_mov_b32 s12, ttmp9 ; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX12-SDAG-NEXT: s_mov_b32 s7, callee@abs32@hi ; GFX12-SDAG-NEXT: s_mov_b32 s6, callee@abs32@lo @@ -198,7 +195,6 @@ define amdgpu_kernel void @caller() { ; GFX12-GISEL-LABEL: caller: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 -; GFX12-GISEL-NEXT: s_mov_b32 s12, ttmp9 ; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX12-GISEL-NEXT: s_mov_b32 s6, callee@abs32@lo ; GFX12-GISEL-NEXT: s_mov_b32 s7, callee@abs32@hi From b5cc9f2111308d59744c873aec13e6aa0069248d Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 23 Jan 2024 15:00:47 +0000 Subject: [PATCH 4/5] Optimize for workgroup ID Y in entry functions --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 9 +++++---- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 +++++----- .../AMDGPU/workgroup-id-in-arch-sgprs.ll | 18 ++++++++---------- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index b88d7534f3e26..9c7da02d1cad8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4182,15 +4182,16 @@ bool AMDGPULegalizerInfo::loadInputValue( const TargetRegisterClass *ArgRC; LLT ArgTy; + CallingConv::ID CC = B.getMF().getFunction().getCallingConv(); const ArgDescriptor WorkGroupIDX = ArgDescriptor::createRegister(AMDGPU::TTMP9); // TODO: No need to mask GridY if GridZ is not valid. - const ArgDescriptor WorkGroupIDY = - ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFFu); + const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( + AMDGPU::TTMP7, + AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu); const ArgDescriptor WorkGroupIDZ = ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); - if (ST.hasArchitectedSGPRs() && - AMDGPU::isCompute(B.getMF().getFunction().getCallingConv())) { + if (ST.hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) { switch (ArgType) { case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: Arg = &WorkGroupIDX; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 2d7fd51b135be..a86c49ca45a88 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2067,16 +2067,16 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG, const TargetRegisterClass *RC; LLT Ty; + CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv(); const ArgDescriptor WorkGroupIDX = ArgDescriptor::createRegister(AMDGPU::TTMP9); // TODO: No need to mask GridY if GridZ is not valid. - const ArgDescriptor WorkGroupIDY = - ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFFu); + const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( + AMDGPU::TTMP7, + AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu); const ArgDescriptor WorkGroupIDZ = ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); - if (Subtarget->hasArchitectedSGPRs() && - AMDGPU::isCompute( - DAG.getMachineFunction().getFunction().getCallingConv())) { + if (Subtarget->hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) { switch (PVID) { case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: Reg = &WorkGroupIDX; diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll index c2ce6e169dc53..40e4692a18ec7 100644 --- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll @@ -35,24 +35,22 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_and_b32 s4, ttmp7, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, ttmp7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: global_store_dword v1, v2, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: workgroup_id_xy: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX12-NEXT: s_and_b32 s4, ttmp7, 0xffff -; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 -; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, ttmp7 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: global_store_b32 v1, v2, s[2:3] +; GFX12-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm From 4e93c15e3eb29495edf5ede0fecf433632130b64 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 23 Jan 2024 17:06:09 +0000 Subject: [PATCH 5/5] Update TODO comments --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 4 +++- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 9c7da02d1cad8..fc02766a4b27a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4185,7 +4185,9 @@ bool AMDGPULegalizerInfo::loadInputValue( CallingConv::ID CC = B.getMF().getFunction().getCallingConv(); const ArgDescriptor WorkGroupIDX = ArgDescriptor::createRegister(AMDGPU::TTMP9); - // TODO: No need to mask GridY if GridZ is not valid. + // If GridZ is not programmed in an entry function then the hardware will set + // it to all zeros, so there is no need to mask the GridY value in the low + // order bits. const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( AMDGPU::TTMP7, AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a86c49ca45a88..b81ec6629d834 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2070,7 +2070,9 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG, CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv(); const ArgDescriptor WorkGroupIDX = ArgDescriptor::createRegister(AMDGPU::TTMP9); - // TODO: No need to mask GridY if GridZ is not valid. + // If GridZ is not programmed in an entry function then the hardware will set + // it to all zeros, so there is no need to mask the GridY value in the low + // order bits. const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( AMDGPU::TTMP7, AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);