From 18ff9f41de12aa4fea0a3cf0c4d40c88781b1d8d Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Sun, 27 Apr 2025 13:11:33 -0400 Subject: [PATCH] [NFC][AMDGPU] Auto generate check lines for some codegen tests Make preparation for #137488. --- llvm/test/CodeGen/AMDGPU/dag-divergence.ll | 87 +- llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll | 305 ++- .../test/CodeGen/AMDGPU/lds-misaligned-bug.ll | 930 ++++++++- .../CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll | 1694 ++++++++++++++-- .../CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll | 1736 +++++++++++++++-- llvm/test/CodeGen/AMDGPU/load-select-ptr.ll | 128 +- .../memory-legalizer-store-infinite-loop.ll | 25 +- .../AMDGPU/si-triv-disjoint-mem-access.ll | 594 ++++-- llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll | 466 ++++- .../InferAddressSpaces/AMDGPU/basic.ll | 181 +- .../AMDGPU/mem-intrinsics.ll | 119 +- 11 files changed, 5463 insertions(+), 802 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence.ll index 9f83393d88061..cdf4a88814dfc 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-divergence.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence.ll @@ -1,11 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; GCN-LABEL: {{^}}private_load_maybe_divergent: -; GCN: buffer_load_dword -; GCN-NOT: s_load_dword s -; GCN: flat_load_dword -; GCN-NOT: s_load_dword s define amdgpu_kernel void @private_load_maybe_divergent(ptr addrspace(4) %k, ptr %flat) { +; GCN-LABEL: private_load_maybe_divergent: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b64 s[22:23], s[2:3] +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] +; GCN-NEXT: s_add_u32 s20, s20, s17 +; GCN-NEXT: s_addc_u32 s21, s21, 0 +; GCN-NEXT: buffer_load_dword v0, v0, s[20:23], 0 offen glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GCN-NEXT: flat_load_dword v0, v[0:1] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_endpgm %load = load volatile i32, ptr addrspace(5) poison, align 4 %gep = getelementptr inbounds i32, ptr addrspace(4) %k, i32 %load %maybe.not.uniform.load = load i32, ptr addrspace(4) %gep, align 4 @@ -13,15 +31,27 @@ define amdgpu_kernel void @private_load_maybe_divergent(ptr addrspace(4) %k, ptr ret void } -; GCN-LABEL: {{^}}flat_load_maybe_divergent: -; GCN: s_load_dwordx4 -; GCN-NOT: s_load -; GCN: flat_load_dword -; GCN-NOT: s_load -; GCN: flat_load_dword -; GCN-NOT: s_load -; GCN: flat_store_dword define amdgpu_kernel void @flat_load_maybe_divergent(ptr addrspace(4) %k, ptr %flat) { +; GCN-LABEL: flat_load_maybe_divergent: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: flat_load_dword v0, v[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GCN-NEXT: flat_load_dword v0, v[0:1] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_endpgm %load = load i32, ptr %flat, align 4 %gep = getelementptr inbounds i32, ptr addrspace(4) %k, i32 %load %maybe.not.uniform.load = load i32, ptr addrspace(4) %gep, align 4 @@ -34,12 +64,33 @@ define amdgpu_kernel void @flat_load_maybe_divergent(ptr addrspace(4) %k, ptr %f ; last values are divergent due to the carry in glue (such that ; divergence needs to propagate through glue if there are any non-void ; outputs) -; GCN-LABEL: {{^}}wide_carry_divergence_error: -; GCN: v_sub_u32_e32 -; GCN: v_subb_u32_e32 -; GCN: v_subb_u32_e32 -; GCN: v_subb_u32_e32 define <2 x i128> @wide_carry_divergence_error(i128 %arg) { +; GCN-LABEL: wide_carry_divergence_error: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_ffbh_u32_e32 v0, v0 +; GCN-NEXT: v_ffbh_u32_e32 v4, v2 +; GCN-NEXT: v_add_u32_e64 v0, s[4:5], v0, 32 clamp +; GCN-NEXT: v_ffbh_u32_e32 v1, v1 +; GCN-NEXT: v_add_u32_e32 v4, vcc, 32, v4 +; GCN-NEXT: v_min3_u32 v0, v0, v1, 64 +; GCN-NEXT: v_add_u32_e32 v0, vcc, 64, v0 +; GCN-NEXT: v_ffbh_u32_e32 v5, v3 +; GCN-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc +; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GCN-NEXT: v_min_u32_e32 v4, v4, v5 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GCN-NEXT: v_sub_u32_e32 v0, vcc, 0, v0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_subb_u32_e32 v2, vcc, 0, v3, vcc +; GCN-NEXT: v_subb_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: s_setpc_b64 s[30:31] %i = call i128 @llvm.ctlz.i128(i128 %arg, i1 false) %i1 = sub i128 0, %i %i2 = insertelement <2 x i128> zeroinitializer, i128 %i1, i64 0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll index 54343fa820cba..1732dd0521e5f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll @@ -1,13 +1,40 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9_11 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9_11 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s -; GCN-LABEL: flat_inst_offset: -; GFX9_11: flat_load_{{dword|b32}} v{{[0-9]+}}, v[{{[0-9:]+}}] offset:4 -; GFX9_11: flat_store_{{dword|b32}} v[{{[0-9:]+}}], v{{[0-9]+}} offset:4 -; GFX10: flat_load_dword v{{[0-9]+}}, v[{{[0-9:]+}}]{{$}} -; GFX10: flat_store_dword v[{{[0-9:]+}}], v{{[0-9]+}}{{$}} define void @flat_inst_offset(ptr nocapture %p) { +; GFX9-LABEL: flat_inst_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dword v2, v[0:1] offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v2, 1, v2 +; GFX9-NEXT: flat_store_dword v[0:1], v2 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_inst_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: flat_load_dword v2, v[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v2 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 1, v2 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 offset:4 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds i32, ptr %p, i64 1 %load = load i32, ptr %gep, align 4 %inc = add nsw i32 %load, 1 @@ -15,10 +42,34 @@ define void @flat_inst_offset(ptr nocapture %p) { ret void } -; GCN-LABEL: global_inst_offset: -; GCN: global_load_{{dword|b32}} v{{[0-9]+}}, v[{{[0-9:]+}}], off offset:4 -; GCN: global_store_{{dword|b32}} v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:4 define void @global_inst_offset(ptr addrspace(1) nocapture %p) { +; GFX9-LABEL: global_inst_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v2, v[0:1], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v2, 1, v2 +; GFX9-NEXT: global_store_dword v[0:1], v2, off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_inst_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v2, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v2 +; GFX10-NEXT: global_store_dword v[0:1], v2, off offset:4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 1, v2 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off offset:4 +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds i32, ptr addrspace(1) %p, i64 1 %load = load i32, ptr addrspace(1) %gep, align 4 %inc = add nsw i32 %load, 1 @@ -26,10 +77,51 @@ define void @global_inst_offset(ptr addrspace(1) nocapture %p) { ret void } -; GCN-LABEL: load_i16_lo: -; GFX9_11: flat_load_{{short_d16|d16_b16}} v{{[0-9]+}}, v[{{[0-9:]+}}] offset:8{{$}} -; GFX10: flat_load_short_d16 v{{[0-9]+}}, v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @load_i16_lo(ptr %arg, ptr %out) { +; GFX9-LABEL: load_i16_lo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_short_d16 v2, v[0:1] offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v2, v2, v2 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: load_i16_lo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s0, s0, 8 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: flat_load_short_d16 v2, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v2, v2, v2 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: load_i16_lo: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_d16_b16 v2, v[0:1] offset:8 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v2, v2, v2 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm %gep = getelementptr inbounds i16, ptr %arg, i32 4 %ld = load i16, ptr %gep, align 2 %vec = insertelement <2 x i16> , i16 %ld, i32 0 @@ -38,10 +130,51 @@ define amdgpu_kernel void @load_i16_lo(ptr %arg, ptr %out) { ret void } -; GCN-LABEL: load_i16_hi: -; GFX9_11: flat_load_{{short_d16_hi|d16_hi_b16}} v{{[0-9]+}}, v[{{[0-9:]+}}] offset:8{{$}} -; GFX10: flat_load_short_d16_hi v{{[0-9]+}}, v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @load_i16_hi(ptr %arg, ptr %out) { +; GFX9-LABEL: load_i16_hi: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_short_d16_hi v2, v[0:1] offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v2, v2, v2 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: load_i16_hi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s0, s0, 8 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: flat_load_short_d16_hi v2, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v2, v2, v2 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: load_i16_hi: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_d16_hi_b16 v2, v[0:1] offset:8 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v2, v2, v2 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm %gep = getelementptr inbounds i16, ptr %arg, i32 4 %ld = load i16, ptr %gep, align 2 %vec = insertelement <2 x i16> , i16 %ld, i32 1 @@ -50,10 +183,51 @@ define amdgpu_kernel void @load_i16_hi(ptr %arg, ptr %out) { ret void } -; GCN-LABEL: load_half_lo: -; GFX9_11: flat_load_{{short_d16|d16_b16}} v{{[0-9]+}}, v[{{[0-9:]+}}] offset:8{{$}} -; GFX10: flat_load_short_d16 v{{[0-9]+}}, v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @load_half_lo(ptr %arg, ptr %out) { +; GFX9-LABEL: load_half_lo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_short_d16 v2, v[0:1] offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v2, v2, v2 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: load_half_lo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s0, s0, 8 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: flat_load_short_d16 v2, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v2, v2, v2 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: load_half_lo: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_d16_b16 v2, v[0:1] offset:8 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v2, v2, v2 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm %gep = getelementptr inbounds half, ptr %arg, i32 4 %ld = load half, ptr %gep, align 2 %vec = insertelement <2 x half> , half %ld, i32 0 @@ -62,10 +236,51 @@ define amdgpu_kernel void @load_half_lo(ptr %arg, ptr %out) { ret void } -; GCN-LABEL: load_half_hi: -; GFX9_11: flat_load_{{short_d16_hi|d16_hi_b16}} v{{[0-9]+}}, v[{{[0-9:]+}}] offset:8{{$}} -; GFX10: flat_load_short_d16_hi v{{[0-9]+}}, v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @load_half_hi(ptr %arg, ptr %out) { +; GFX9-LABEL: load_half_hi: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_short_d16_hi v2, v[0:1] offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v2, v2, v2 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: load_half_hi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s0, s0, 8 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: flat_load_short_d16_hi v2, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v2, v2, v2 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: load_half_hi: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_d16_hi_b16 v2, v[0:1] offset:8 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v2, v2, v2 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm %gep = getelementptr inbounds half, ptr %arg, i32 4 %ld = load half, ptr %gep, align 2 %vec = insertelement <2 x half> , half %ld, i32 1 @@ -74,10 +289,48 @@ define amdgpu_kernel void @load_half_hi(ptr %arg, ptr %out) { ret void } -; GCN-LABEL: load_float_lo: -; GFX9_11: flat_load_{{dword|b32}} v{{[0-9]+}}, v[{{[0-9:]+}}] offset:16{{$}} -; GFX10: flat_load_dword v{{[0-9]+}}, v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @load_float_lo(ptr %arg, ptr %out) { +; GFX9-LABEL: load_float_lo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dword v2, v[0:1] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v2, v2, v2 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: load_float_lo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s0, s0, 16 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: flat_load_dword v2, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_f32_e32 v2, v2, v2 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: load_float_lo: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:16 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v2 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm %gep = getelementptr inbounds float, ptr %arg, i32 4 %ld = load float, ptr %gep, align 4 %v = fadd float %ld, %ld diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll index 278ad63b0b76c..7ffc2a6987742 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll @@ -1,17 +1,68 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode -early-live-intervals < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=SPLIT %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefix=SPLIT %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefix=SPLIT %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefix=ALIGNED-GFX10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefix=UNALIGNED-GFX10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=ALIGNED-GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefix=ALIGNED-GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode -early-live-intervals < %s | FileCheck -check-prefix=ALIGNED-GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefix=UNALIGNED-GFX11 %s -; GCN-LABEL: test_local_misaligned_v2: -; GCN-DAG: ds_{{read2|load_2addr}}_b32 -; GCN-DAG: ds_{{write2|store_2addr}}_b32 define amdgpu_kernel void @test_local_misaligned_v2(ptr addrspace(3) %arg) { +; SPLIT-LABEL: test_local_misaligned_v2: +; SPLIT: ; %bb.0: ; %bb +; SPLIT-NEXT: s_load_dword s0, s[4:5], 0x24 +; SPLIT-NEXT: s_waitcnt lgkmcnt(0) +; SPLIT-NEXT: v_lshl_add_u32 v2, v0, 2, s0 +; SPLIT-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; SPLIT-NEXT: s_waitcnt lgkmcnt(0) +; SPLIT-NEXT: ds_write2_b32 v2, v1, v0 offset1:1 +; SPLIT-NEXT: s_endpgm +; +; ALIGNED-GFX10-LABEL: test_local_misaligned_v2: +; ALIGNED-GFX10: ; %bb.0: ; %bb +; ALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 +; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v2, v0, 2, s0 +; ALIGNED-GFX10-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX10-NEXT: ds_write2_b32 v2, v1, v0 offset1:1 +; ALIGNED-GFX10-NEXT: s_endpgm +; +; UNALIGNED-GFX10-LABEL: test_local_misaligned_v2: +; UNALIGNED-GFX10: ; %bb.0: ; %bb +; UNALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 +; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v2, v0, 2, s0 +; UNALIGNED-GFX10-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: ds_write2_b32 v2, v1, v0 offset1:1 +; UNALIGNED-GFX10-NEXT: s_endpgm +; +; ALIGNED-GFX11-LABEL: test_local_misaligned_v2: +; ALIGNED-GFX11: ; %bb.0: ; %bb +; ALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v2, v0, 2, s0 +; ALIGNED-GFX11-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: ds_store_2addr_b32 v2, v1, v0 offset1:1 +; ALIGNED-GFX11-NEXT: s_endpgm +; +; UNALIGNED-GFX11-LABEL: test_local_misaligned_v2: +; UNALIGNED-GFX11: ; %bb.0: ; %bb +; UNALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v2, v0, 2, s0 +; UNALIGNED-GFX11-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: ds_store_2addr_b32 v2, v1, v0 offset1:1 +; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid @@ -24,12 +75,75 @@ bb: ret void } -; GCN-LABEL: test_local_misaligned_v4: -; GCN-DAG: ds_{{read2|load_2addr}}_b32 -; GCN-DAG: ds_{{read2|load_2addr}}_b32 -; GCN-DAG: ds_{{write2|store_2addr}}_b32 -; GCN-DAG: ds_{{write2|store_2addr}}_b32 define amdgpu_kernel void @test_local_misaligned_v4(ptr addrspace(3) %arg) { +; SPLIT-LABEL: test_local_misaligned_v4: +; SPLIT: ; %bb.0: ; %bb +; SPLIT-NEXT: s_load_dword s0, s[4:5], 0x24 +; SPLIT-NEXT: s_waitcnt lgkmcnt(0) +; SPLIT-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; SPLIT-NEXT: ds_read2_b32 v[0:1], v4 offset0:2 offset1:3 +; SPLIT-NEXT: ds_read2_b32 v[2:3], v4 offset1:1 +; SPLIT-NEXT: s_waitcnt lgkmcnt(1) +; SPLIT-NEXT: ds_write2_b32 v4, v1, v0 offset1:1 +; SPLIT-NEXT: s_waitcnt lgkmcnt(1) +; SPLIT-NEXT: ds_write2_b32 v4, v3, v2 offset0:2 offset1:3 +; SPLIT-NEXT: s_endpgm +; +; ALIGNED-GFX10-LABEL: test_local_misaligned_v4: +; ALIGNED-GFX10: ; %bb.0: ; %bb +; ALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 +; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; ALIGNED-GFX10-NEXT: ds_read2_b32 v[0:1], v4 offset1:1 +; ALIGNED-GFX10-NEXT: ds_read2_b32 v[2:3], v4 offset0:2 offset1:3 +; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(1) +; ALIGNED-GFX10-NEXT: ds_write2_b32 v4, v1, v0 offset0:2 offset1:3 +; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(1) +; ALIGNED-GFX10-NEXT: ds_write2_b32 v4, v3, v2 offset1:1 +; ALIGNED-GFX10-NEXT: s_endpgm +; +; UNALIGNED-GFX10-LABEL: test_local_misaligned_v4: +; UNALIGNED-GFX10: ; %bb.0: ; %bb +; UNALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 +; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; UNALIGNED-GFX10-NEXT: ds_read2_b32 v[0:1], v4 offset0:2 offset1:3 +; UNALIGNED-GFX10-NEXT: ds_read2_b32 v[2:3], v4 offset1:1 +; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(1) +; UNALIGNED-GFX10-NEXT: ds_write2_b32 v4, v1, v0 offset1:1 +; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(1) +; UNALIGNED-GFX10-NEXT: ds_write2_b32 v4, v3, v2 offset0:2 offset1:3 +; UNALIGNED-GFX10-NEXT: s_endpgm +; +; ALIGNED-GFX11-LABEL: test_local_misaligned_v4: +; ALIGNED-GFX11: ; %bb.0: ; %bb +; ALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; ALIGNED-GFX11-NEXT: ds_load_2addr_b32 v[0:1], v4 offset1:1 +; ALIGNED-GFX11-NEXT: ds_load_2addr_b32 v[2:3], v4 offset0:2 offset1:3 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(1) +; ALIGNED-GFX11-NEXT: ds_store_2addr_b32 v4, v1, v0 offset0:2 offset1:3 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(1) +; ALIGNED-GFX11-NEXT: ds_store_2addr_b32 v4, v3, v2 offset1:1 +; ALIGNED-GFX11-NEXT: s_endpgm +; +; UNALIGNED-GFX11-LABEL: test_local_misaligned_v4: +; UNALIGNED-GFX11: ; %bb.0: ; %bb +; UNALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; UNALIGNED-GFX11-NEXT: ds_load_2addr_b32 v[0:1], v4 offset0:2 offset1:3 +; UNALIGNED-GFX11-NEXT: ds_load_2addr_b32 v[2:3], v4 offset1:1 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(1) +; UNALIGNED-GFX11-NEXT: ds_store_2addr_b32 v4, v1, v0 offset1:1 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(1) +; UNALIGNED-GFX11-NEXT: ds_store_2addr_b32 v4, v3, v2 offset0:2 offset1:3 +; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid @@ -46,12 +160,70 @@ bb: ret void } -; GCN-LABEL: test_local_misaligned_v3: -; GCN-DAG: ds_{{read2|load_2addr}}_b32 -; GCN-DAG: ds_{{read|load}}_b32 -; GCN-DAG: ds_{{write2|store_2addr}}_b32 -; GCN-DAG: ds_{{write|store}}_b32 define amdgpu_kernel void @test_local_misaligned_v3(ptr addrspace(3) %arg) { +; SPLIT-LABEL: test_local_misaligned_v3: +; SPLIT: ; %bb.0: ; %bb +; SPLIT-NEXT: s_load_dword s0, s[4:5], 0x24 +; SPLIT-NEXT: s_waitcnt lgkmcnt(0) +; SPLIT-NEXT: v_lshl_add_u32 v2, v0, 2, s0 +; SPLIT-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; SPLIT-NEXT: ds_read_b32 v3, v2 offset:8 +; SPLIT-NEXT: s_waitcnt lgkmcnt(0) +; SPLIT-NEXT: ds_write2_b32 v2, v3, v0 offset1:1 +; SPLIT-NEXT: ds_write_b32 v2, v1 offset:8 +; SPLIT-NEXT: s_endpgm +; +; ALIGNED-GFX10-LABEL: test_local_misaligned_v3: +; ALIGNED-GFX10: ; %bb.0: ; %bb +; ALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 +; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v2, v0, 2, s0 +; ALIGNED-GFX10-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; ALIGNED-GFX10-NEXT: ds_read_b32 v3, v2 offset:8 +; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX10-NEXT: ds_write2_b32 v2, v3, v0 offset1:1 +; ALIGNED-GFX10-NEXT: ds_write_b32 v2, v1 offset:8 +; ALIGNED-GFX10-NEXT: s_endpgm +; +; UNALIGNED-GFX10-LABEL: test_local_misaligned_v3: +; UNALIGNED-GFX10: ; %bb.0: ; %bb +; UNALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 +; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v2, v0, 2, s0 +; UNALIGNED-GFX10-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; UNALIGNED-GFX10-NEXT: ds_read_b32 v3, v2 offset:8 +; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: ds_write2_b32 v2, v3, v0 offset1:1 +; UNALIGNED-GFX10-NEXT: ds_write_b32 v2, v1 offset:8 +; UNALIGNED-GFX10-NEXT: s_endpgm +; +; ALIGNED-GFX11-LABEL: test_local_misaligned_v3: +; ALIGNED-GFX11: ; %bb.0: ; %bb +; ALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v2, v0, 2, s0 +; ALIGNED-GFX11-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 +; ALIGNED-GFX11-NEXT: ds_load_b32 v3, v2 offset:8 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: ds_store_2addr_b32 v2, v3, v0 offset1:1 +; ALIGNED-GFX11-NEXT: ds_store_b32 v2, v1 offset:8 +; ALIGNED-GFX11-NEXT: s_endpgm +; +; UNALIGNED-GFX11-LABEL: test_local_misaligned_v3: +; UNALIGNED-GFX11: ; %bb.0: ; %bb +; UNALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v2, v0, 2, s0 +; UNALIGNED-GFX11-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 +; UNALIGNED-GFX11-NEXT: ds_load_b32 v3, v2 offset:8 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: ds_store_2addr_b32 v2, v3, v0 offset1:1 +; UNALIGNED-GFX11-NEXT: ds_store_b32 v2, v1 offset:8 +; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid @@ -66,14 +238,82 @@ bb: ret void } -; GCN-LABEL: test_flat_misaligned_v2: -; VECT-DAG: flat_load_{{dwordx2|b64}} v -; VECT-DAG: flat_store_{{dwordx2|b64}} v -; SPLIT-DAG: flat_load_{{dword|b32}} v -; SPLIT-DAG: flat_load_{{dword|b32}} v -; SPLIT-DAG: flat_store_{{dword|b32}} v -; SPLIT-DAG: flat_store_{{dword|b32}} v define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { +; SPLIT-LABEL: test_flat_misaligned_v2: +; SPLIT: ; %bb.0: ; %bb +; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SPLIT-NEXT: s_waitcnt lgkmcnt(0) +; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0 +; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; SPLIT-NEXT: v_add_co_u32 v2, vcc_lo, v0, 4 +; SPLIT-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; SPLIT-NEXT: s_clause 0x1 +; SPLIT-NEXT: flat_load_dword v4, v[2:3] +; SPLIT-NEXT: flat_load_dword v5, v[0:1] +; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; SPLIT-NEXT: flat_store_dword v[0:1], v4 +; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; SPLIT-NEXT: flat_store_dword v[2:3], v5 +; SPLIT-NEXT: s_endpgm +; +; ALIGNED-GFX10-LABEL: test_flat_misaligned_v2: +; ALIGNED-GFX10: ; %bb.0: ; %bb +; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 +; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; ALIGNED-GFX10-NEXT: s_endpgm +; +; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v2: +; UNALIGNED-GFX10: ; %bb.0: ; %bb +; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; UNALIGNED-GFX10-NEXT: s_endpgm +; +; ALIGNED-GFX11-LABEL: test_flat_misaligned_v2: +; ALIGNED-GFX11: ; %bb.0: ; %bb +; ALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 +; ALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; ALIGNED-GFX11-NEXT: s_endpgm +; +; UNALIGNED-GFX11-LABEL: test_flat_misaligned_v2: +; UNALIGNED-GFX11: ; %bb.0: ; %bb +; UNALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 +; UNALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr %arg, i32 %lid @@ -86,18 +326,97 @@ bb: ret void } -; GCN-LABEL: test_flat_misaligned_v4: -; VECT-DAG: flat_load_{{dwordx4|b128}} v -; VECT-DAG: flat_store_{{dwordx4|b128}} v -; SPLIT-DAG: flat_load_{{dword|b32}} v -; SPLIT-DAG: flat_load_{{dword|b32}} v -; SPLIT-DAG: flat_load_{{dword|b32}} v -; SPLIT-DAG: flat_load_{{dword|b32}} v -; SPLIT-DAG: flat_store_{{dword|b32}} v -; SPLIT-DAG: flat_store_{{dword|b32}} v -; SPLIT-DAG: flat_store_{{dword|b32}} v -; SPLIT-DAG: flat_store_{{dword|b32}} v define amdgpu_kernel void @test_flat_misaligned_v4(ptr %arg) { +; SPLIT-LABEL: test_flat_misaligned_v4: +; SPLIT: ; %bb.0: ; %bb +; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SPLIT-NEXT: s_waitcnt lgkmcnt(0) +; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0 +; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; SPLIT-NEXT: v_add_co_u32 v2, vcc_lo, v0, 12 +; SPLIT-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; SPLIT-NEXT: v_add_co_u32 v4, vcc_lo, v0, 4 +; SPLIT-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; SPLIT-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8 +; SPLIT-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; SPLIT-NEXT: s_clause 0x3 +; SPLIT-NEXT: flat_load_dword v8, v[2:3] +; SPLIT-NEXT: flat_load_dword v9, v[4:5] +; SPLIT-NEXT: flat_load_dword v10, v[0:1] +; SPLIT-NEXT: flat_load_dword v11, v[6:7] +; SPLIT-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; SPLIT-NEXT: flat_store_dword v[6:7], v9 +; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) +; SPLIT-NEXT: flat_store_dword v[2:3], v10 +; SPLIT-NEXT: flat_store_dword v[0:1], v8 +; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; SPLIT-NEXT: flat_store_dword v[4:5], v11 +; SPLIT-NEXT: s_endpgm +; +; ALIGNED-GFX10-LABEL: test_flat_misaligned_v4: +; ALIGNED-GFX10: ; %bb.0: ; %bb +; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_add_co_u32 v7, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx4 v[0:3], v[7:8] +; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v5, v1 +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v6, v0 +; ALIGNED-GFX10-NEXT: flat_store_dwordx4 v[7:8], v[3:6] +; ALIGNED-GFX10-NEXT: s_endpgm +; +; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v4: +; UNALIGNED-GFX10: ; %bb.0: ; %bb +; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v7, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx4 v[0:3], v[7:8] +; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v5, v1 +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v6, v0 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx4 v[7:8], v[3:6] +; UNALIGNED-GFX10-NEXT: s_endpgm +; +; ALIGNED-GFX11-LABEL: test_flat_misaligned_v4: +; ALIGNED-GFX11: ; %bb.0: ; %bb +; ALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: v_add_co_u32 v7, s0, s0, v0 +; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b128 v[0:3], v[7:8] +; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v2 +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v6, v0 +; ALIGNED-GFX11-NEXT: flat_store_b128 v[7:8], v[3:6] +; ALIGNED-GFX11-NEXT: s_endpgm +; +; UNALIGNED-GFX11-LABEL: test_flat_misaligned_v4: +; UNALIGNED-GFX11: ; %bb.0: ; %bb +; UNALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v7, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b128 v[0:3], v[7:8] +; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v2 +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v6, v0 +; UNALIGNED-GFX11-NEXT: flat_store_b128 v[7:8], v[3:6] +; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr %arg, i32 %lid @@ -114,16 +433,89 @@ bb: ret void } -; GCN-LABEL: test_flat_misaligned_v3: -; VECT-DAG: flat_load_{{dwordx3|b96}} v -; VECT-DAG: flat_store_{{dwordx3|b96}} v -; SPLIT-DAG: flat_load_{{dword|b32}} v -; SPLIT-DAG: flat_load_{{dword|b32}} v -; SPLIT-DAG: flat_load_{{dword|b32}} v -; SPLIT-DAG: flat_store_{{dword|b32}} v -; SPLIT-DAG: flat_store_{{dword|b32}} v -; SPLIT-DAG: flat_store_{{dword|b32}} v define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { +; SPLIT-LABEL: test_flat_misaligned_v3: +; SPLIT: ; %bb.0: ; %bb +; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SPLIT-NEXT: s_waitcnt lgkmcnt(0) +; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0 +; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; SPLIT-NEXT: v_add_co_u32 v2, vcc_lo, v0, 4 +; SPLIT-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; SPLIT-NEXT: v_add_co_u32 v4, vcc_lo, v0, 8 +; SPLIT-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; SPLIT-NEXT: s_clause 0x2 +; SPLIT-NEXT: flat_load_dword v6, v[2:3] +; SPLIT-NEXT: flat_load_dword v7, v[4:5] +; SPLIT-NEXT: flat_load_dword v8, v[0:1] +; SPLIT-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; SPLIT-NEXT: flat_store_dword v[4:5], v6 +; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) +; SPLIT-NEXT: flat_store_dword v[0:1], v7 +; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) +; SPLIT-NEXT: flat_store_dword v[2:3], v8 +; SPLIT-NEXT: s_endpgm +; +; ALIGNED-GFX10-LABEL: test_flat_misaligned_v3: +; ALIGNED-GFX10: ; %bb.0: ; %bb +; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_add_co_u32 v5, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx3 v[0:2], v[5:6] +; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 +; ALIGNED-GFX10-NEXT: flat_store_dwordx3 v[5:6], v[2:4] +; ALIGNED-GFX10-NEXT: s_endpgm +; +; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v3: +; UNALIGNED-GFX10: ; %bb.0: ; %bb +; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v5, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx3 v[0:2], v[5:6] +; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx3 v[5:6], v[2:4] +; UNALIGNED-GFX10-NEXT: s_endpgm +; +; ALIGNED-GFX11-LABEL: test_flat_misaligned_v3: +; ALIGNED-GFX11: ; %bb.0: ; %bb +; ALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: v_add_co_u32 v5, s0, s0, v0 +; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b96 v[0:2], v[5:6] +; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; ALIGNED-GFX11-NEXT: flat_store_b96 v[5:6], v[2:4] +; ALIGNED-GFX11-NEXT: s_endpgm +; +; UNALIGNED-GFX11-LABEL: test_flat_misaligned_v3: +; UNALIGNED-GFX11: ; %bb.0: ; %bb +; UNALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v5, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b96 v[0:2], v[5:6] +; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; UNALIGNED-GFX11-NEXT: flat_store_b96 v[5:6], v[2:4] +; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr %arg, i32 %lid @@ -138,10 +530,65 @@ bb: ret void } -; GCN-LABEL: test_local_aligned_v2: -; GCN-DAG: ds_{{read|load}}_b64 -; GCN-DAG: ds_{{write|store}}_b64 define amdgpu_kernel void @test_local_aligned_v2(ptr addrspace(3) %arg) { +; SPLIT-LABEL: test_local_aligned_v2: +; SPLIT: ; %bb.0: ; %bb +; SPLIT-NEXT: s_load_dword s0, s[4:5], 0x24 +; SPLIT-NEXT: s_waitcnt lgkmcnt(0) +; SPLIT-NEXT: v_lshl_add_u32 v3, v0, 2, s0 +; SPLIT-NEXT: ds_read_b64 v[0:1], v3 +; SPLIT-NEXT: s_waitcnt lgkmcnt(0) +; SPLIT-NEXT: v_mov_b32_e32 v2, v0 +; SPLIT-NEXT: ds_write_b64 v3, v[1:2] +; SPLIT-NEXT: s_endpgm +; +; ALIGNED-GFX10-LABEL: test_local_aligned_v2: +; ALIGNED-GFX10: ; %bb.0: ; %bb +; ALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 +; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v3, v0, 2, s0 +; ALIGNED-GFX10-NEXT: ds_read_b64 v[0:1], v3 +; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX10-NEXT: ds_write_b64 v3, v[1:2] +; ALIGNED-GFX10-NEXT: s_endpgm +; +; UNALIGNED-GFX10-LABEL: test_local_aligned_v2: +; UNALIGNED-GFX10: ; %bb.0: ; %bb +; UNALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 +; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v3, v0, 2, s0 +; UNALIGNED-GFX10-NEXT: ds_read_b64 v[0:1], v3 +; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX10-NEXT: ds_write_b64 v3, v[1:2] +; UNALIGNED-GFX10-NEXT: s_endpgm +; +; ALIGNED-GFX11-LABEL: test_local_aligned_v2: +; ALIGNED-GFX11: ; %bb.0: ; %bb +; ALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v3, v0, 2, s0 +; ALIGNED-GFX11-NEXT: ds_load_b64 v[0:1], v3 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX11-NEXT: ds_store_b64 v3, v[1:2] +; ALIGNED-GFX11-NEXT: s_endpgm +; +; UNALIGNED-GFX11-LABEL: test_local_aligned_v2: +; UNALIGNED-GFX11: ; %bb.0: ; %bb +; UNALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v3, v0, 2, s0 +; UNALIGNED-GFX11-NEXT: ds_load_b64 v[0:1], v3 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX11-NEXT: ds_store_b64 v3, v[1:2] +; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid @@ -154,10 +601,68 @@ bb: ret void } -; GCN-LABEL: test_local_aligned_v3: -; GCN-DAG: ds_{{read|load}}_b96 -; GCN-DAG: ds_{{write|store}}_b96 define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) { +; SPLIT-LABEL: test_local_aligned_v3: +; SPLIT: ; %bb.0: ; %bb +; SPLIT-NEXT: s_load_dword s0, s[4:5], 0x24 +; SPLIT-NEXT: s_waitcnt lgkmcnt(0) +; SPLIT-NEXT: v_lshl_add_u32 v5, v0, 2, s0 +; SPLIT-NEXT: ds_read_b96 v[0:2], v5 +; SPLIT-NEXT: s_waitcnt lgkmcnt(0) +; SPLIT-NEXT: v_mov_b32_e32 v3, v0 +; SPLIT-NEXT: v_mov_b32_e32 v4, v1 +; SPLIT-NEXT: ds_write_b96 v5, v[2:4] +; SPLIT-NEXT: s_endpgm +; +; ALIGNED-GFX10-LABEL: test_local_aligned_v3: +; ALIGNED-GFX10: ; %bb.0: ; %bb +; ALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 +; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0 +; ALIGNED-GFX10-NEXT: ds_read_b96 v[0:2], v5 +; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 +; ALIGNED-GFX10-NEXT: ds_write_b96 v5, v[2:4] +; ALIGNED-GFX10-NEXT: s_endpgm +; +; UNALIGNED-GFX10-LABEL: test_local_aligned_v3: +; UNALIGNED-GFX10: ; %bb.0: ; %bb +; UNALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 +; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0 +; UNALIGNED-GFX10-NEXT: ds_read_b96 v[0:2], v5 +; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 +; UNALIGNED-GFX10-NEXT: ds_write_b96 v5, v[2:4] +; UNALIGNED-GFX10-NEXT: s_endpgm +; +; ALIGNED-GFX11-LABEL: test_local_aligned_v3: +; ALIGNED-GFX11: ; %bb.0: ; %bb +; ALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0 +; ALIGNED-GFX11-NEXT: ds_load_b96 v[0:2], v5 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; ALIGNED-GFX11-NEXT: ds_store_b96 v5, v[2:4] +; ALIGNED-GFX11-NEXT: s_endpgm +; +; UNALIGNED-GFX11-LABEL: test_local_aligned_v3: +; UNALIGNED-GFX11: ; %bb.0: ; %bb +; UNALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0 +; UNALIGNED-GFX11-NEXT: ds_load_b96 v[0:2], v5 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; UNALIGNED-GFX11-NEXT: ds_store_b96 v5, v[2:4] +; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid @@ -172,10 +677,77 @@ bb: ret void } -; GCN-LABEL: test_flat_aligned_v2: -; GCN-DAG: flat_load_{{dwordx2|b64}} v -; GCN-DAG: flat_store_{{dwordx2|b64}} v define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { +; SPLIT-LABEL: test_flat_aligned_v2: +; SPLIT: ; %bb.0: ; %bb +; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SPLIT-NEXT: s_waitcnt lgkmcnt(0) +; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0 +; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; SPLIT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SPLIT-NEXT: v_mov_b32_e32 v4, v2 +; SPLIT-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; SPLIT-NEXT: s_endpgm +; +; ALIGNED-GFX10-LABEL: test_flat_aligned_v2: +; ALIGNED-GFX10: ; %bb.0: ; %bb +; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 +; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; ALIGNED-GFX10-NEXT: s_endpgm +; +; UNALIGNED-GFX10-LABEL: test_flat_aligned_v2: +; UNALIGNED-GFX10: ; %bb.0: ; %bb +; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; UNALIGNED-GFX10-NEXT: s_endpgm +; +; ALIGNED-GFX11-LABEL: test_flat_aligned_v2: +; ALIGNED-GFX11: ; %bb.0: ; %bb +; ALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 +; ALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; ALIGNED-GFX11-NEXT: s_endpgm +; +; UNALIGNED-GFX11-LABEL: test_flat_aligned_v2: +; UNALIGNED-GFX11: ; %bb.0: ; %bb +; UNALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 +; UNALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr %arg, i32 %lid @@ -188,10 +760,85 @@ bb: ret void } -; GCN-LABEL: test_flat_aligned_v4: -; GCN-DAG: flat_load_{{dwordx4|b128}} v -; GCN-DAG: flat_store_{{dwordx4|b128}} v define amdgpu_kernel void @test_flat_aligned_v4(ptr %arg) { +; SPLIT-LABEL: test_flat_aligned_v4: +; SPLIT: ; %bb.0: ; %bb +; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SPLIT-NEXT: s_waitcnt lgkmcnt(0) +; SPLIT-NEXT: v_add_co_u32 v7, s0, s0, v0 +; SPLIT-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0 +; SPLIT-NEXT: flat_load_dwordx4 v[0:3], v[7:8] +; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SPLIT-NEXT: v_mov_b32_e32 v4, v2 +; SPLIT-NEXT: v_mov_b32_e32 v5, v1 +; SPLIT-NEXT: v_mov_b32_e32 v6, v0 +; SPLIT-NEXT: flat_store_dwordx4 v[7:8], v[3:6] +; SPLIT-NEXT: s_endpgm +; +; ALIGNED-GFX10-LABEL: test_flat_aligned_v4: +; ALIGNED-GFX10: ; %bb.0: ; %bb +; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_add_co_u32 v7, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx4 v[0:3], v[7:8] +; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v5, v1 +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v6, v0 +; ALIGNED-GFX10-NEXT: flat_store_dwordx4 v[7:8], v[3:6] +; ALIGNED-GFX10-NEXT: s_endpgm +; +; UNALIGNED-GFX10-LABEL: test_flat_aligned_v4: +; UNALIGNED-GFX10: ; %bb.0: ; %bb +; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v7, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx4 v[0:3], v[7:8] +; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v5, v1 +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v6, v0 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx4 v[7:8], v[3:6] +; UNALIGNED-GFX10-NEXT: s_endpgm +; +; ALIGNED-GFX11-LABEL: test_flat_aligned_v4: +; ALIGNED-GFX11: ; %bb.0: ; %bb +; ALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: v_add_co_u32 v7, s0, s0, v0 +; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b128 v[0:3], v[7:8] +; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v2 +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v6, v0 +; ALIGNED-GFX11-NEXT: flat_store_b128 v[7:8], v[3:6] +; ALIGNED-GFX11-NEXT: s_endpgm +; +; UNALIGNED-GFX11-LABEL: test_flat_aligned_v4: +; UNALIGNED-GFX11: ; %bb.0: ; %bb +; UNALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v7, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b128 v[0:3], v[7:8] +; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v2 +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v6, v0 +; UNALIGNED-GFX11-NEXT: flat_store_b128 v[7:8], v[3:6] +; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr %arg, i32 %lid @@ -208,12 +855,71 @@ bb: ret void } -; GCN-LABEL: test_local_v4_aligned8: -; ALIGNED-DAG: ds_{{read2|load_2addr}}_b64 -; ALIGNED-DAG: ds_{{write2|store_2addr}}_b64 -; UNALIGNED-DAG: ds_{{read2|load_2addr}}_b64 -; UNALIGNED-DAG: ds_{{write2|store_2addr}}_b64 define amdgpu_kernel void @test_local_v4_aligned8(ptr addrspace(3) %arg) { +; SPLIT-LABEL: test_local_v4_aligned8: +; SPLIT: ; %bb.0: ; %bb +; SPLIT-NEXT: s_load_dword s0, s[4:5], 0x24 +; SPLIT-NEXT: s_waitcnt lgkmcnt(0) +; SPLIT-NEXT: v_lshl_add_u32 v6, v0, 2, s0 +; SPLIT-NEXT: ds_read2_b64 v[0:3], v6 offset1:1 +; SPLIT-NEXT: s_waitcnt lgkmcnt(0) +; SPLIT-NEXT: v_mov_b32_e32 v4, v1 +; SPLIT-NEXT: v_mov_b32_e32 v5, v0 +; SPLIT-NEXT: v_mov_b32_e32 v1, v3 +; SPLIT-NEXT: ds_write2_b64 v6, v[1:2], v[4:5] offset1:1 +; SPLIT-NEXT: s_endpgm +; +; ALIGNED-GFX10-LABEL: test_local_v4_aligned8: +; ALIGNED-GFX10: ; %bb.0: ; %bb +; ALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 +; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0 +; ALIGNED-GFX10-NEXT: ds_read2_b64 v[0:3], v5 offset1:1 +; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX10-NEXT: ds_write2_b64 v5, v[3:4], v[1:2] offset1:1 +; ALIGNED-GFX10-NEXT: s_endpgm +; +; UNALIGNED-GFX10-LABEL: test_local_v4_aligned8: +; UNALIGNED-GFX10: ; %bb.0: ; %bb +; UNALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 +; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0 +; UNALIGNED-GFX10-NEXT: ds_read2_b64 v[0:3], v5 offset1:1 +; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX10-NEXT: ds_write2_b64 v5, v[3:4], v[1:2] offset1:1 +; UNALIGNED-GFX10-NEXT: s_endpgm +; +; ALIGNED-GFX11-LABEL: test_local_v4_aligned8: +; ALIGNED-GFX11: ; %bb.0: ; %bb +; ALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0 +; ALIGNED-GFX11-NEXT: ds_load_2addr_b64 v[0:3], v5 offset1:1 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX11-NEXT: ds_store_2addr_b64 v5, v[3:4], v[1:2] offset1:1 +; ALIGNED-GFX11-NEXT: s_endpgm +; +; UNALIGNED-GFX11-LABEL: test_local_v4_aligned8: +; UNALIGNED-GFX11: ; %bb.0: ; %bb +; UNALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0 +; UNALIGNED-GFX11-NEXT: ds_load_2addr_b64 v[0:3], v5 offset1:1 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX11-NEXT: ds_store_2addr_b64 v5, v[3:4], v[1:2] offset1:1 +; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid @@ -230,14 +936,92 @@ bb: ret void } -; GCN-LABEL: test_flat_v4_aligned8: -; VECT-DAG: flat_load_{{dwordx4|b128}} v -; VECT-DAG: flat_store_{{dwordx4|b128}} v -; SPLIT-DAG: flat_load_{{dwordx2|b64}} v -; SPLIT-DAG: flat_load_{{dwordx2|b64}} v -; SPLIT-DAG: flat_store_{{dwordx2|b64}} v -; SPLIT-DAG: flat_store_{{dwordx2|b64}} v define amdgpu_kernel void @test_flat_v4_aligned8(ptr %arg) { +; SPLIT-LABEL: test_flat_v4_aligned8: +; SPLIT: ; %bb.0: ; %bb +; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SPLIT-NEXT: s_waitcnt lgkmcnt(0) +; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0 +; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; SPLIT-NEXT: v_add_co_u32 v2, vcc_lo, v0, 8 +; SPLIT-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; SPLIT-NEXT: s_clause 0x1 +; SPLIT-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; SPLIT-NEXT: flat_load_dwordx2 v[6:7], v[2:3] +; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; SPLIT-NEXT: v_mov_b32_e32 v8, v5 +; SPLIT-NEXT: v_mov_b32_e32 v9, v4 +; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SPLIT-NEXT: v_mov_b32_e32 v4, v7 +; SPLIT-NEXT: v_mov_b32_e32 v5, v6 +; SPLIT-NEXT: flat_store_dwordx2 v[2:3], v[8:9] +; SPLIT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; SPLIT-NEXT: s_endpgm +; +; ALIGNED-GFX10-LABEL: test_flat_v4_aligned8: +; ALIGNED-GFX10: ; %bb.0: ; %bb +; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_add_co_u32 v7, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx4 v[0:3], v[7:8] +; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v5, v1 +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v6, v0 +; ALIGNED-GFX10-NEXT: flat_store_dwordx4 v[7:8], v[3:6] +; ALIGNED-GFX10-NEXT: s_endpgm +; +; UNALIGNED-GFX10-LABEL: test_flat_v4_aligned8: +; UNALIGNED-GFX10: ; %bb.0: ; %bb +; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v7, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx4 v[0:3], v[7:8] +; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v5, v1 +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v6, v0 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx4 v[7:8], v[3:6] +; UNALIGNED-GFX10-NEXT: s_endpgm +; +; ALIGNED-GFX11-LABEL: test_flat_v4_aligned8: +; ALIGNED-GFX11: ; %bb.0: ; %bb +; ALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: v_add_co_u32 v7, s0, s0, v0 +; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b128 v[0:3], v[7:8] +; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v2 +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v6, v0 +; ALIGNED-GFX11-NEXT: flat_store_b128 v[7:8], v[3:6] +; ALIGNED-GFX11-NEXT: s_endpgm +; +; UNALIGNED-GFX11-LABEL: test_flat_v4_aligned8: +; UNALIGNED-GFX11: ; %bb.0: ; %bb +; UNALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v7, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b128 v[0:3], v[7:8] +; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v2 +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v6, v0 +; UNALIGNED-GFX11-NEXT: flat_store_b128 v[7:8], v[3:6] +; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr %arg, i32 %lid diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll index 75d2f156bdd2c..a0db4ea8bc12a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll @@ -1,6 +1,7 @@ -; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,CIVI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s declare i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) nocapture, i32, i32, i32, i1) #2 declare i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1) #2 @@ -12,108 +13,416 @@ declare i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr nocapture, i64, i32, i32, i1) #2 declare i32 @llvm.amdgcn.workitem.id.x() #1 -; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32: -; CIVI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 - -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 { +; CI-LABEL: lds_atomic_dec_ret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: ds_dec_rtn_u32 v0, v1, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_ret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: ds_dec_rtn_u32 v0, v1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_ret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: ds_dec_rtn_u32 v0, v1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32_offset: -; CIVI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 - -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16 define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 { +; CI-LABEL: lds_atomic_dec_ret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_ret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_ret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}lds_atomic_dec_noret_i32: -; CIVI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 - -; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]], -; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; GCN: ds_dec_u32 [[VPTR]], [[DATA]] define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) nounwind { +; CI-LABEL: lds_atomic_dec_noret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: ds_dec_u32 v1, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_noret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: ds_dec_u32 v1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_noret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_dec_u32 v1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}lds_atomic_dec_noret_i32_offset: -; CIVI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 - -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; GCN: ds_dec_u32 v{{[0-9]+}}, [[K]] offset:16 define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr) nounwind { +; CI-LABEL: lds_atomic_dec_noret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: ds_dec_u32 v1, v0 offset:16 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_noret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: ds_dec_u32 v1, v0 offset:16 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_noret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_dec_u32 v1, v0 offset:16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %gep, i32 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}global_atomic_dec_ret_i32: -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CIVI: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GFX9: global_atomic_dec v{{[0-9]+}}, [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +; CI-LABEL: global_atomic_dec_ret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: buffer_atomic_dec v0, off, s[8:11], 0 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_ret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: buffer_atomic_dec v0, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_ret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v1, v0, v1, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}global_atomic_dec_ret_i32_offset: -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CIVI: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}} - -; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GFX9: global_atomic_dec v{{[0-9]+}}, [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}} define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +; CI-LABEL: global_atomic_dec_ret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: buffer_atomic_dec v0, off, s[8:11], 0 offset:16 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_ret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: buffer_atomic_dec v0, off, s[8:11], 0 offset:16 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_ret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v1, v0, v1, s[2:3] offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}global_atomic_dec_noret_i32: -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CIVI: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} - -; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GFX9: global_atomic_dec [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) nounwind { +; CI-LABEL: global_atomic_dec_noret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_atomic_dec v0, off, s[0:3], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_noret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_atomic_dec v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_noret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr, i32 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}global_atomic_dec_noret_i32_offset: -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CIVI: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} - -; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GFX9: global_atomic_dec [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %ptr) nounwind { +; CI-LABEL: global_atomic_dec_noret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_atomic_dec v0, off, s[0:3], 0 offset:16 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_noret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_atomic_dec v0, off, s[0:3], 0 offset:16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_noret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}global_atomic_dec_ret_i32_offset_addr64: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}} -; VI: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +; CI-LABEL: global_atomic_dec_ret_i32_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[8:9], s[2:3] +; CI-NEXT: s_mov_b64 s[10:11], s[6:7] +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: buffer_atomic_dec v2, v[0:1], s[8:11], 0 addr64 offset:20 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_mov_b64 s[4:5], s[0:1] +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_ret_i32_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v3, 42 +; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_ret_i32_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v1, v0, v1, s[2:3] offset:20 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id @@ -123,11 +432,47 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ret void } -; GCN-LABEL: {{^}}global_atomic_dec_noret_i32_offset_addr64: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}} -; VI: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #0 { +; CI-LABEL: global_atomic_dec_noret_i32_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_atomic_dec v2, v[0:1], s[0:3], 0 addr64 offset:20 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_noret_i32_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_noret_i32_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5 @@ -135,49 +480,254 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ret void } -; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #0 { +; CI-LABEL: flat_atomic_dec_ret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_ret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_ret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr %out ret void } -; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32_offset: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CIVI: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -; GFX9: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:16 glc{{$}} define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #0 { +; CI-LABEL: flat_atomic_dec_ret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s2, s2, 16 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_ret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_ret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr %out ret void } -; GCN-LABEL: {{^}}flat_atomic_dec_noret_i32: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) nounwind { +; CI-LABEL: flat_atomic_dec_noret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_atomic_dec v[0:1], v2 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_noret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_noret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_atomic_dec v[0:1], v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}flat_atomic_dec_noret_i32_offset: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CIVI: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -; GFX9: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:16{{$}} define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) nounwind { +; CI-LABEL: flat_atomic_dec_noret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 16 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_atomic_dec v[0:1], v2 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_noret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_noret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_atomic_dec v[0:1], v2 offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32_offset_addr64: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CIVI: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -; GFX9: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:20 glc{{$}} define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %ptr) #0 { +; CI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s3 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; CI-NEXT: v_mov_b32_e32 v3, 42 +; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_store_dword v[0:1], v3 +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v3, 42 +; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_dec v3, v[0:1], v3 offset:20 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_store_dword v[0:1], v3 +; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr %ptr, i32 %id %out.gep = getelementptr i32, ptr %out, i32 %id @@ -187,11 +737,52 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ret void } -; GCN-LABEL: {{^}}flat_atomic_dec_noret_i32_offset_addr64: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CIVI: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -; GFX9: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:20{{$}} define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #0 { +; CI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_atomic_dec v[0:1], v2 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_dec v[0:1], v2 offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr %ptr, i32 %id %gep = getelementptr i32, ptr %gep.tid, i32 5 @@ -199,54 +790,269 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #0 ret void } -; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #0 { +; CI-LABEL: flat_atomic_dec_ret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_ret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_ret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr %out ret void } -; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64_offset: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} -; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:32 glc{{$}} define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #0 { +; CI-LABEL: flat_atomic_dec_ret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s2, s2, 32 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_ret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s2, 32 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_ret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr %out ret void } -; GCN-LABEL: {{^}}flat_atomic_dec_noret_i64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}} define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) nounwind { +; CI-LABEL: flat_atomic_dec_noret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_noret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_noret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}flat_atomic_dec_noret_i64_offset: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}} -; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:32{{$}} define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) nounwind { +; CI-LABEL: flat_atomic_dec_noret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 32 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_noret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_noret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64_offset_addr64: -; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} -; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:40 glc{{$}} define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %ptr) #0 { +; CI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s3 +; CI-NEXT: v_add_i32_e32 v2, vcc, s2, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_ret_i64_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v0, vcc +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id %out.gep = getelementptr i64, ptr %out, i32 %id @@ -256,12 +1062,55 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ret void } -; GCN-LABEL: {{^}}flat_atomic_dec_noret_i64_offset_addr64: -; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}} -; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:40{{$}} define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #0 { +; CI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: flat_atomic_dec_x2 v[3:4], v[1:2] offset:40 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id %gep = getelementptr i64, ptr %gep.tid, i32 5 @@ -271,13 +1120,64 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #0 @lds0 = addrspace(3) global [512 x i32] poison -; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0: -; CIVI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 - -; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #0 { +; CI-LABEL: atomic_dec_shl_base_lds_0: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; CI-NEXT: v_mov_b32_e32 v2, 9 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_dec_rtn_u32 v1, v1, v2 offset:8 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: s_mov_b32 s0, s2 +; CI-NEXT: s_mov_b32 s1, s3 +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: atomic_dec_shl_base_lds_0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: v_mov_b32_e32 v2, 9 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_dec_rtn_u32 v1, v1, v2 offset:8 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s2 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_dec_shl_base_lds_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 9 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_dec_rtn_u32 v1, v1, v2 offset:8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v1, s[0:1] +; GFX9-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0 @@ -287,114 +1187,443 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ret void } -; GCN-LABEL: {{^}}lds_atomic_dec_ret_i64: -; CIVI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 - -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v[[[KLO]]:[[KHI]]]{{$}} define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 { +; CI-LABEL: lds_atomic_dec_ret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_ret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_ret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}lds_atomic_dec_ret_i64_offset: -; CIVI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 - -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v[[[KLO]]:[[KHI]]] offset:32 define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 { +; CI-LABEL: lds_atomic_dec_ret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_ret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_ret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}lds_atomic_dec_noret_i64: -; CIVI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 - -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: ds_dec_u64 v{{[0-9]+}}, v[[[KLO]]:[[KHI]]]{{$}} define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) nounwind { +; CI-LABEL: lds_atomic_dec_noret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: ds_dec_u64 v2, v[0:1] +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_noret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: ds_dec_u64 v2, v[0:1] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_noret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: ds_dec_u64 v2, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %ptr, i64 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}lds_atomic_dec_noret_i64_offset: -; CIVI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 - -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: ds_dec_u64 v{{[0-9]+}}, v[[[KLO]]:[[KHI]]] offset:32{{$}} define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr) nounwind { +; CI-LABEL: lds_atomic_dec_noret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: ds_dec_u64 v2, v[0:1] offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_noret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: ds_dec_u64 v2, v[0:1] offset:32 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_noret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: ds_dec_u64 v2, v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %gep, i64 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}global_atomic_dec_ret_i64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; CIVI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} - -; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +; CI-LABEL: global_atomic_dec_ret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[8:11], 0 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_ret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_ret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; CIVI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}} -; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +; CI-LABEL: global_atomic_dec_ret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[8:11], 0 offset:32 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_ret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[8:11], 0 offset:32 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_ret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}global_atomic_dec_noret_i64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; CIVI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GFX9: global_atomic_dec_x2 v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) nounwind { +; CI-LABEL: global_atomic_dec_noret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_noret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_noret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %ptr, i64 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; CIVI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}} -; GFX9: global_atomic_dec_x2 v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %ptr) nounwind { +; CI-LABEL: global_atomic_dec_noret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[0:3], 0 offset:32 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_noret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[0:3], 0 offset:32 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_noret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset_addr64: -; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} -; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} -; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +; CI-LABEL: global_atomic_dec_ret_i64_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[8:9], s[2:3] +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_mov_b64 s[10:11], s[6:7] +; CI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[8:11], 0 addr64 offset:40 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_mov_b64 s[4:5], s[0:1] +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_ret_i64_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_ret_i64_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id @@ -404,13 +1633,50 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ret void } -; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset_addr64: -; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} -; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} -; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]]{{$}} define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #0 { +; CI-LABEL: global_atomic_dec_noret_i64_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:40 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_noret_i64_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_noret_i64_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v0, v[1:2], s[0:1] offset:40 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5 @@ -420,13 +1686,67 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa @lds1 = addrspace(3) global [512 x i64] poison, align 8 -; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0_i64: -; CIVI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 - -; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}} -; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16 define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #0 { +; CI-LABEL: atomic_dec_shl_base_lds_0_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v1, 9 +; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; CI-NEXT: v_mov_b32_e32 v2, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: s_mov_b32 s0, s2 +; CI-NEXT: s_mov_b32 s1, s3 +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: atomic_dec_shl_base_lds_0_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v1, 9 +; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s2 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_dec_shl_base_lds_0_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 9 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 +; GFX9-NEXT: global_store_dword v3, v0, s[2:3] +; GFX9-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] +; GFX9-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll index b28405f4ff113..36b9ddac8ef41 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll @@ -1,6 +1,7 @@ -; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s declare i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) nocapture, i32, i32, i32, i1) #2 declare i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1) #2 @@ -12,101 +13,416 @@ declare i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr nocapture, i64, i32, i32, i1) #2 declare i32 @llvm.amdgcn.workitem.id.x() #1 -; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32: -; CIVI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 - -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 { +; CI-LABEL: lds_atomic_inc_ret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: ds_inc_rtn_u32 v0, v1, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_inc_ret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: ds_inc_rtn_u32 v0, v1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_inc_ret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32_offset: -; CIVI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 - -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16 define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 { +; CI-LABEL: lds_atomic_inc_ret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_inc_ret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_inc_ret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}lds_atomic_inc_noret_i32: -; CIVI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 - -; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]], -; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; GCN: ds_inc_u32 [[VPTR]], [[DATA]] define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) nounwind { +; CI-LABEL: lds_atomic_inc_noret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: ds_inc_u32 v1, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_inc_noret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: ds_inc_u32 v1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_inc_noret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_inc_u32 v1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}lds_atomic_inc_noret_i32_offset: -; CIVI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 - -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; GCN: ds_inc_u32 v{{[0-9]+}}, [[K]] offset:16 define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr) nounwind { +; CI-LABEL: lds_atomic_inc_noret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: ds_inc_u32 v1, v0 offset:16 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_inc_noret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: ds_inc_u32 v1, v0 offset:16 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_inc_noret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_inc_u32 v1, v0 offset:16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %gep, i32 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}global_atomic_inc_ret_i32: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -; GFX9: global_atomic_inc v{{[0-9]+}}, v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +; CI-LABEL: global_atomic_inc_ret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: buffer_atomic_inc v0, off, s[8:11], 0 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_ret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: buffer_atomic_inc v0, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_ret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}global_atomic_inc_ret_i32_offset: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}} -; GFX9: global_atomic_inc v{{[0-9]+}}, v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}} define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +; CI-LABEL: global_atomic_inc_ret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: buffer_atomic_inc v0, off, s[8:11], 0 offset:16 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_ret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: buffer_atomic_inc v0, off, s[8:11], 0 offset:16 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_ret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[2:3] offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}global_atomic_inc_noret_i32: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GFX9: global_atomic_inc v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) nounwind { +; CI-LABEL: global_atomic_inc_noret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_atomic_inc v0, off, s[0:3], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_noret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_atomic_inc v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_noret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr, i32 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}global_atomic_inc_noret_i32_offset: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} -; GFX9: global_atomic_inc v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %ptr) nounwind { +; CI-LABEL: global_atomic_inc_noret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_atomic_inc v0, off, s[0:3], 0 offset:16 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_noret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_atomic_inc v0, off, s[0:3], 0 offset:16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_noret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}global_atomic_inc_ret_i32_offset_addr64: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}} -; VI: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +; CI-LABEL: global_atomic_inc_ret_i32_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[8:9], s[2:3] +; CI-NEXT: s_mov_b64 s[10:11], s[6:7] +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: buffer_atomic_inc v2, v[0:1], s[8:11], 0 addr64 offset:20 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_mov_b64 s[4:5], s[0:1] +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_ret_i32_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v3, 42 +; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_ret_i32_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[2:3] offset:20 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id @@ -116,11 +432,47 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ret void } -; GCN-LABEL: {{^}}global_atomic_inc_noret_i32_offset_addr64: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}} -; VI: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #0 { +; CI-LABEL: global_atomic_inc_noret_i32_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_atomic_inc v2, v[0:1], s[0:3], 0 addr64 offset:20 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_noret_i32_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5 @@ -130,10 +482,64 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa @lds0 = addrspace(3) global [512 x i32] poison, align 4 -; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32: -; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #0 { +; CI-LABEL: atomic_inc_shl_base_lds_0_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; CI-NEXT: v_mov_b32_e32 v2, 9 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: s_mov_b32 s0, s2 +; CI-NEXT: s_mov_b32 s1, s3 +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: atomic_inc_shl_base_lds_0_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: v_mov_b32_e32 v2, 9 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s2 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_inc_shl_base_lds_0_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 9 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v1, s[0:1] +; GFX9-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0 @@ -143,102 +549,443 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ret void } -; GCN-LABEL: {{^}}lds_atomic_inc_ret_i64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v[[[KLO]]:[[KHI]]]{{$}} define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 { +; CI-LABEL: lds_atomic_inc_ret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_inc_ret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_inc_ret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}lds_atomic_inc_ret_i64_offset: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v[[[KLO]]:[[KHI]]] offset:32 define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 { +; CI-LABEL: lds_atomic_inc_ret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_inc_ret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_inc_ret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}lds_atomic_inc_noret_i64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: ds_inc_u64 v{{[0-9]+}}, v[[[KLO]]:[[KHI]]]{{$}} define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) nounwind { +; CI-LABEL: lds_atomic_inc_noret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: ds_inc_u64 v2, v[0:1] +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_inc_noret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: ds_inc_u64 v2, v[0:1] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_inc_noret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: ds_inc_u64 v2, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %ptr, i64 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}lds_atomic_inc_noret_i64_offset: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: ds_inc_u64 v{{[0-9]+}}, v[[[KLO]]:[[KHI]]] offset:32{{$}} define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr) nounwind { +; CI-LABEL: lds_atomic_inc_noret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: ds_inc_u64 v2, v[0:1] offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_inc_noret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: ds_inc_u64 v2, v[0:1] offset:32 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_inc_noret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: ds_inc_u64 v2, v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %gep, i64 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}global_atomic_inc_ret_i64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; CIVI: buffer_atomic_inc_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +; CI-LABEL: global_atomic_inc_ret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[8:11], 0 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_ret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_ret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset: -; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; CIVI: buffer_atomic_inc_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}} -; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +; CI-LABEL: global_atomic_inc_ret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[8:11], 0 offset:32 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_ret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[8:11], 0 offset:32 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_ret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}global_atomic_inc_noret_i64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; CIVI: buffer_atomic_inc_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} - -; GFX9: global_atomic_inc_x2 v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) nounwind { +; CI-LABEL: global_atomic_inc_noret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_noret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_noret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %ptr, i64 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; CIVI: buffer_atomic_inc_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}} -; GFX9: global_atomic_inc_x2 v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %ptr) nounwind { +; CI-LABEL: global_atomic_inc_noret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[0:3], 0 offset:32 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_noret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[0:3], 0 offset:32 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_noret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset_addr64: -; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} -; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CI: buffer_atomic_inc_x2 v[[[KLO]]:[[KHI]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} -; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +; CI-LABEL: global_atomic_inc_ret_i64_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[8:9], s[2:3] +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_mov_b64 s[10:11], s[6:7] +; CI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[8:11], 0 addr64 offset:40 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_mov_b64 s[4:5], s[0:1] +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_ret_i64_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_ret_i64_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id @@ -248,13 +995,50 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ret void } -; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset_addr64: -; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} -; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CI: buffer_atomic_inc_x2 v[[[KLO]]:[[KHI]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} -; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]]{{$}} define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #0 { +; CI-LABEL: global_atomic_inc_noret_i64_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:40 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_noret_i64_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_noret_i64_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v0, v[1:2], s[0:1] offset:40 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5 @@ -262,49 +1046,254 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ret void } -; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #0 { +; CI-LABEL: flat_atomic_inc_ret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_inc_ret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_inc_ret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr %out ret void } -; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32_offset: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CIVI: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -; GFX9: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:16 glc{{$}} define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #0 { +; CI-LABEL: flat_atomic_inc_ret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s2, s2, 16 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_inc_ret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_inc_ret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr %out ret void } -; GCN-LABEL: {{^}}flat_atomic_inc_noret_i32: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) nounwind { +; CI-LABEL: flat_atomic_inc_noret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_atomic_inc v[0:1], v2 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_inc_noret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_inc_noret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_atomic_inc v[0:1], v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}flat_atomic_inc_noret_i32_offset: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CIVI: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -; GFX9: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:16{{$}} define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) nounwind { +; CI-LABEL: flat_atomic_inc_noret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 16 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_atomic_inc v[0:1], v2 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_inc_noret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_inc_noret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_atomic_inc v[0:1], v2 offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32_offset_addr64: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CIVI: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -; GFX9: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:20 glc{{$}} define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %ptr) #0 { +; CI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s3 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; CI-NEXT: v_mov_b32_e32 v3, 42 +; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_store_dword v[0:1], v3 +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v3, 42 +; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_inc v3, v[0:1], v3 offset:20 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_store_dword v[0:1], v3 +; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr %ptr, i32 %id %out.gep = getelementptr i32, ptr %out, i32 %id @@ -314,11 +1303,52 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ret void } -; GCN-LABEL: {{^}}flat_atomic_inc_noret_i32_offset_addr64: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; CIVI: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -; GFX9: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:20{{$}} define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #0 { +; CI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_atomic_inc v[0:1], v2 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_inc v[0:1], v2 offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr %ptr, i32 %id %gep = getelementptr i32, ptr %gep.tid, i32 5 @@ -328,10 +1358,67 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #0 @lds1 = addrspace(3) global [512 x i64] poison, align 8 -; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64: -; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}} -; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #0 { +; CI-LABEL: atomic_inc_shl_base_lds_0_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v1, 9 +; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; CI-NEXT: v_mov_b32_e32 v2, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: s_mov_b32 s0, s2 +; CI-NEXT: s_mov_b32 s1, s3 +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: atomic_inc_shl_base_lds_0_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v1, 9 +; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s2 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 9 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 +; GFX9-NEXT: global_store_dword v3, v0, s[2:3] +; GFX9-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] +; GFX9-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0 @@ -341,54 +1428,269 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ret void } -; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #0 { +; CI-LABEL: flat_atomic_inc_ret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_inc_ret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_inc_ret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr %out ret void } -; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64_offset: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} -; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:32 glc{{$}} define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #0 { +; CI-LABEL: flat_atomic_inc_ret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s2, s2, 32 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_inc_ret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s2, 32 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_inc_ret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr %out ret void } -; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}} define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) nounwind { +; CI-LABEL: flat_atomic_inc_noret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_inc_noret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_inc_noret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64_offset: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}} -; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:32{{$}} define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) nounwind { +; CI-LABEL: flat_atomic_inc_noret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 32 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_inc_noret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_inc_noret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) ret void } -; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64_offset_addr64: -; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} -; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:40 glc{{$}} define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %ptr) #0 { +; CI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s3 +; CI-NEXT: v_add_i32_e32 v2, vcc, s2, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v0, vcc +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id %out.gep = getelementptr i64, ptr %out, i32 %id @@ -398,12 +1700,55 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ret void } -; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64_offset_addr64: -; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}} -; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:40{{$}} define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #0 { +; CI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: flat_atomic_inc_x2 v[3:4], v[1:2] offset:40 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id %gep = getelementptr i64, ptr %gep.tid, i32 5 @@ -411,11 +1756,70 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #0 ret void } -; GCN-LABEL: {{^}}nocse_lds_atomic_inc_ret_i32: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 -; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] -; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(3) %ptr) #0 { +; CI-LABEL: nocse_lds_atomic_inc_ret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dword s8, s[4:5], 0xd +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s8 +; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_inc_rtn_u32 v0, v1, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: s_mov_b32 s0, s2 +; CI-NEXT: s_mov_b32 s1, s3 +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: nocse_lds_atomic_inc_ret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_inc_rtn_u32 v0, v1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s2 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: nocse_lds_atomic_inc_ret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: ds_inc_rtn_u32 v2, v1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v1, v2, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: s_endpgm %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false) %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll index 1a6fa3c518ca7..9e518589ac5b3 100644 --- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll @@ -1,24 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; Combine on select c, (load x), (load y) -> load (select c, x, y) ; drops MachinePointerInfo, so it can't be relied on for correctness. -; GCN-LABEL: {{^}}select_ptr_crash_i64_flat: -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 - -; GCN: s_cmp_eq_u32 -; GCN: s_cselect_b32 -; GCN: s_cselect_b32 - -; GCN-NOT: load_dword -; GCN: flat_load_dword -; GCN: flat_load_dword -; GCN-NOT: load_dword - -; GCN: flat_store_dwordx2 define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], ptr %ptr0, [8 x i32], ptr %ptr1, [8 x i32], ptr addrspace(1) %ptr2) { +; GCN-LABEL: select_ptr_crash_i64_flat: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s6, s[8:9], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x28 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x50 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x78 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s6, 0 +; GCN-NEXT: s_cselect_b32 s0, s0, s2 +; GCN-NEXT: s_cselect_b32 s1, s1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: s_add_u32 s0, s0, 4 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: flat_load_dword v0, v[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: flat_load_dword v1, v[1:2] +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN-NEXT: s_endpgm %tmp2 = icmp eq i32 %tmp, 0 %tmp3 = load i64, ptr %ptr0, align 8 %tmp4 = load i64, ptr %ptr1, align 8 @@ -30,16 +42,29 @@ define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], ptr %p ; The transform currently doesn't happen for non-addrspace 0, but it ; should. -; GCN-LABEL: {{^}}select_ptr_crash_i64_global: -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} -; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} -; GCN: s_cselect_b32 -; GCN: s_cselect_b32 -; GCN: flat_store_dwordx2 define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], ptr addrspace(1) %ptr0, [8 x i32], ptr addrspace(1) %ptr1, [8 x i32], ptr addrspace(1) %ptr2) { +; GCN-LABEL: select_ptr_crash_i64_global: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x28 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x50 +; GCN-NEXT: s_load_dword s6, s[8:9], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x78 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GCN-NEXT: s_cmp_eq_u32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cselect_b32 s1, s1, s3 +; GCN-NEXT: s_cselect_b32 s0, s0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN-NEXT: s_endpgm %tmp2 = icmp eq i32 %tmp, 0 %tmp3 = load i64, ptr addrspace(1) %ptr0, align 8 %tmp4 = load i64, ptr addrspace(1) %ptr1, align 8 @@ -48,13 +73,29 @@ define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], ptr ret void } -; GCN-LABEL: {{^}}select_ptr_crash_i64_local: -; GCN: ds_read_b64 -; GCN: ds_read_b64 -; GCN: v_cndmask_b32 -; GCN: v_cndmask_b32 -; GCN: flat_store_dwordx2 define amdgpu_kernel void @select_ptr_crash_i64_local(i32 %tmp, ptr addrspace(3) %ptr0, ptr addrspace(3) %ptr1, ptr addrspace(1) %ptr2) { +; GCN-LABEL: select_ptr_crash_i64_local: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: ds_read_b64 v[0:1], v0 +; GCN-NEXT: ds_read_b64 v[2:3], v2 +; GCN-NEXT: s_cmp_eq_u32 s0, 0 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN-NEXT: s_endpgm %tmp2 = icmp eq i32 %tmp, 0 %tmp3 = load i64, ptr addrspace(3) %ptr0, align 8 %tmp4 = load i64, ptr addrspace(3) %ptr1, align 8 @@ -66,12 +107,29 @@ define amdgpu_kernel void @select_ptr_crash_i64_local(i32 %tmp, ptr addrspace(3) ; The transform will break addressing mode matching, so unclear it ; would be good to do -; GCN-LABEL: {{^}}select_ptr_crash_i64_local_offsets: -; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:128 -; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:512 -; GCN: v_cndmask_b32 -; GCN: v_cndmask_b32 define amdgpu_kernel void @select_ptr_crash_i64_local_offsets(i32 %tmp, ptr addrspace(3) %ptr0, ptr addrspace(3) %ptr1, ptr addrspace(1) %ptr2) { +; GCN-LABEL: select_ptr_crash_i64_local_offsets: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128 +; GCN-NEXT: ds_read_b64 v[2:3], v2 offset:512 +; GCN-NEXT: s_cmp_eq_u32 s0, 0 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN-NEXT: s_endpgm %tmp2 = icmp eq i32 %tmp, 0 %gep0 = getelementptr inbounds i64, ptr addrspace(3) %ptr0, i64 16 %gep1 = getelementptr inbounds i64, ptr addrspace(3) %ptr1, i64 64 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll index db1399cc74dc6..a476a5830ffad 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll @@ -6,12 +6,27 @@ ; ( MergeConsecutiveStores() ) and breaking the resulting ST8 ; apart ( LegalizeStoreOps() ). -target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5" - -; GCN-LABEL: {{^}}_Z6brokenPd: -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} define amdgpu_kernel void @_Z6brokenPd(ptr %arg) { +; GCN-LABEL: _Z6brokenPd: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_add_u32 s4, s4, 4 +; GCN-NEXT: s_addc_u32 s5, s5, 0 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: flat_store_dword v[0:1], v3 +; GCN-NEXT: s_endpgm bb: %tmp = alloca double, align 8, addrspace(5) %tmp1 = alloca double, align 8, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index db7ab57d80ed9..61da875cf2f28 100644 --- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -1,5 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefix=CI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s %struct.lds = type { [64 x ptr], [16 x i8] } @stored_lds_struct = addrspace(3) global %struct.lds poison, align 16 @@ -7,11 +8,41 @@ @stored_constant_ptr = addrspace(3) global ptr addrspace(4) poison, align 8 @stored_global_ptr = addrspace(3) global ptr addrspace(1) poison, align 8 -; GCN-LABEL: {{^}}no_reorder_flat_load_local_store_local_load: -; GCN: flat_load_dwordx4 -; GCN: ds_write_b128 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:512 -; GCN: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:129 offset1:130 define amdgpu_kernel void @no_reorder_flat_load_local_store_local_load(ptr addrspace(3) %out, ptr %fptr) #0 { +; CI-LABEL: no_reorder_flat_load_local_store_local_load: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; CI-NEXT: v_mov_b32_e32 v4, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: s_load_dword s0, s[4:5], 0x9 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: ds_write_b128 v4, v[0:3] offset:512 +; CI-NEXT: ds_read2_b32 v[0:1], v4 offset0:129 offset1:130 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: no_reorder_flat_load_local_store_local_load: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b128 v4, v[0:3] offset:512 +; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset0:129 offset1:130 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 +; GFX9-NEXT: s_endpgm %ptr1 = getelementptr %struct.lds, ptr addrspace(3) @stored_lds_struct, i32 0, i32 1 %ptr2 = getelementptr %struct.lds, ptr addrspace(3) @stored_lds_struct, i32 0, i32 1, i32 4 call void @llvm.memcpy.p3.p0(ptr addrspace(3) align 16 %ptr1, ptr align 8 %fptr, i64 16, i1 false) @@ -20,14 +51,43 @@ define amdgpu_kernel void @no_reorder_flat_load_local_store_local_load(ptr addrs ret void } -; GCN-LABEL: {{^}}reorder_local_load_global_store_local_load: -; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3 -; CI: buffer_store_dword - -; GFX9: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3 -; GFX9: global_store_dword -; GFX9: global_store_dword define amdgpu_kernel void @reorder_local_load_global_store_local_load(ptr addrspace(1) %out, ptr addrspace(1) %gptr) #0 { +; CI-LABEL: reorder_local_load_global_store_local_load: +; CI: ; %bb.0: +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b32 v0, v0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_mov_b32_e32 v2, 0x63 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_read2_b32 v[0:1], v0 offset0:1 offset1:3 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: s_mov_b32 s0, s2 +; CI-NEXT: s_mov_b32 s1, s3 +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: reorder_local_load_global_store_local_load: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: ds_read_b32 v0, v2 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x63 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset0:1 offset1:3 +; GFX9-NEXT: global_store_dword v2, v3, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: s_endpgm %ptr0 = load ptr addrspace(3), ptr addrspace(3) @stored_lds_ptr, align 4 %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 1 @@ -43,15 +103,47 @@ define amdgpu_kernel void @reorder_local_load_global_store_local_load(ptr addrsp ret void } -; GCN-LABEL: {{^}}no_reorder_local_load_volatile_global_store_local_load: -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 -; CI: buffer_store_dword -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 - -; GFX9: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 -; GFX9: global_store_dword -; GFX9: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 define amdgpu_kernel void @no_reorder_local_load_volatile_global_store_local_load(ptr addrspace(1) %out, ptr addrspace(1) %gptr) #0 { +; CI-LABEL: no_reorder_local_load_volatile_global_store_local_load: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b32 v0, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: v_mov_b32_e32 v2, 0x63 +; CI-NEXT: ds_read_b32 v1, v0 offset:4 +; CI-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: ds_read_b32 v0, v0 offset:12 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: no_reorder_local_load_volatile_global_store_local_load: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: ds_read_b32 v1, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x63 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v3, v1 offset:4 +; GFX9-NEXT: global_store_dword v0, v2, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ds_read_b32 v1, v1 offset:12 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %ptr0 = load ptr addrspace(3), ptr addrspace(3) @stored_lds_ptr, align 4 %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 1 @@ -67,17 +159,49 @@ define amdgpu_kernel void @no_reorder_local_load_volatile_global_store_local_loa ret void } -; GCN-LABEL: {{^}}no_reorder_barrier_local_load_global_store_local_load: -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 -; CI: buffer_store_dword - -; GFX9-DAG: global_store_dword -; GFX9-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 -; GFX9: s_barrier -; GFX9-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 -; GFX9-DAG: global_store_dword define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load(ptr addrspace(1) %out, ptr addrspace(1) %gptr) #0 { +; CI-LABEL: no_reorder_barrier_local_load_global_store_local_load: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b32 v0, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: v_mov_b32_e32 v2, 0x63 +; CI-NEXT: ds_read_b32 v1, v0 offset:4 +; CI-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_barrier +; CI-NEXT: ds_read_b32 v0, v0 offset:12 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: no_reorder_barrier_local_load_global_store_local_load: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: ds_read_b32 v1, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x63 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v3, v1 offset:4 +; GFX9-NEXT: global_store_dword v0, v2, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_barrier +; GFX9-NEXT: ds_read_b32 v1, v1 offset:12 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %ptr0 = load ptr addrspace(3), ptr addrspace(3) @stored_lds_ptr, align 4 %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 1 @@ -94,21 +218,51 @@ define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load ret void } -; GCN-LABEL: {{^}}reorder_constant_load_global_store_constant_load: -; GCN-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} -; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}} - -; CI: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x1 -; CI: buffer_store_dword -; CI: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x3 - -; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x4 -; GFX9: global_store_dword -; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0xc - -; CI: buffer_store_dword -; GFX9: global_store_dword define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(ptr addrspace(1) %out, ptr addrspace(1) %gptr) #0 { +; CI-LABEL: reorder_constant_load_global_store_constant_load: +; CI: ; %bb.0: +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: ds_read_b64 v[0:1], v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: v_readfirstlane_b32 s2, v0 +; CI-NEXT: v_readfirstlane_b32 s3, v1 +; CI-NEXT: v_mov_b32_e32 v0, 0x63 +; CI-NEXT: s_load_dword s12, s[2:3], 0x1 +; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; CI-NEXT: s_load_dword s2, s[2:3], 0x3 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s0, s12, s2 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: reorder_constant_load_global_store_constant_load: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: ds_read_b64 v[0:1], v2 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x63 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: v_readfirstlane_b32 s5, v1 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x4 +; GFX9-NEXT: global_store_dword v2, v3, s[2:3] +; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s2, s6, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: s_endpgm %ptr0 = load ptr addrspace(4), ptr addrspace(3) @stored_constant_ptr, align 8 %ptr1 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 1 @@ -124,20 +278,49 @@ define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(ptr ret void } -; GCN-LABEL: {{^}}reorder_constant_load_local_store_constant_load: -; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} -; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}} - -; CI-DAG: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x1 -; CI-DAG: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x3 - -; GFX9-DAG: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x4 -; GFX9-DAG: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0xc - -; GCN-DAG: ds_write_b32 -; CI: buffer_store_dword -; GFX9: global_store_dword define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(ptr addrspace(1) %out, ptr addrspace(3) %lptr) #0 { +; CI-LABEL: reorder_constant_load_local_store_constant_load: +; CI: ; %bb.0: +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b64 v[0:1], v0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: s_load_dword s6, s[4:5], 0xb +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_readfirstlane_b32 s4, v0 +; CI-NEXT: v_readfirstlane_b32 s5, v1 +; CI-NEXT: s_load_dword s7, s[4:5], 0x1 +; CI-NEXT: s_load_dword s4, s[4:5], 0x3 +; CI-NEXT: v_mov_b32_e32 v0, 0x63 +; CI-NEXT: v_mov_b32_e32 v1, s6 +; CI-NEXT: ds_write_b32 v1, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s4, s7, s4 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: reorder_constant_load_local_store_constant_load: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: ds_read_b64 v[0:1], v2 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x4 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0xc +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s0, s7, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: s_endpgm %ptr0 = load ptr addrspace(4), ptr addrspace(3) @stored_constant_ptr, align 8 %ptr1 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 1 @@ -153,14 +336,42 @@ define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(ptr a ret void } -; GCN-LABEL: {{^}}reorder_smrd_load_local_store_smrd_load: -; GCN: s_load_dword -; GCN: s_load_dword -; GCN: s_load_dword -; GCN: ds_write_b32 -; CI: buffer_store_dword -; GFX9: global_store_dword define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(ptr addrspace(1) %out, ptr addrspace(3) noalias %lptr, ptr addrspace(4) %ptr0) #0 { +; CI-LABEL: reorder_smrd_load_local_store_smrd_load: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: s_load_dword s8, s[4:5], 0xb +; CI-NEXT: v_mov_b32_e32 v0, 0x63 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: v_mov_b32_e32 v1, s8 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: ds_write_b32 v1, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s4, s4, s5 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: reorder_smrd_load_local_store_smrd_load: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: ds_write_b32 v2, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s0, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm %ptr1 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 1 %ptr2 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 2 @@ -174,16 +385,46 @@ define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(ptr addrspace ret void } -; GCN-LABEL: {{^}}reorder_global_load_local_store_global_load: -; CI: buffer_load_dword -; CI: buffer_load_dword -; CI: ds_write_b32 -; CI: buffer_store_dword - -; GFX9: global_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 -; GFX9: global_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 -; GFX9: ds_write_b32 define amdgpu_kernel void @reorder_global_load_local_store_global_load(ptr addrspace(1) %out, ptr addrspace(3) %lptr, ptr addrspace(1) %ptr0) #0 { +; CI-LABEL: reorder_global_load_local_store_global_load: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 +; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12 +; CI-NEXT: s_load_dword s0, s[4:5], 0xb +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 0x63 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: ds_write_b32 v3, v2 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: reorder_global_load_local_store_global_load: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x63 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:4 +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:12 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: ds_write_b32 v4, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %ptr1 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i64 1 %ptr2 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i64 3 @@ -197,14 +438,44 @@ define amdgpu_kernel void @reorder_global_load_local_store_global_load(ptr addrs ret void } -; GCN-LABEL: {{^}}reorder_local_offsets: -; GCN: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102 -; GCN-DAG: ds_write2_b32 {{v[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:3 offset1:100 -; GCN-DAG: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408 -; CI: buffer_store_dword -; GFX9: global_store_dword -; GCN: s_endpgm define amdgpu_kernel void @reorder_local_offsets(ptr addrspace(1) nocapture %out, ptr addrspace(1) noalias nocapture readnone %gptr, ptr addrspace(3) noalias nocapture %ptr0) #0 { +; CI-LABEL: reorder_local_offsets: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s6, s[4:5], 0xd +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_mov_b32_e32 v2, 0x7b +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s6 +; CI-NEXT: ds_read2_b32 v[0:1], v3 offset0:100 offset1:102 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: ds_write2_b32 v3, v2, v2 offset0:3 offset1:100 +; CI-NEXT: v_mov_b32_e32 v2, 0x315 +; CI-NEXT: ds_write_b32 v3, v2 offset:408 +; CI-NEXT: s_waitcnt lgkmcnt(2) +; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 0x7b, v0 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: reorder_local_offsets: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x7b +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: ds_read2_b32 v[0:1], v3 offset0:100 offset1:102 +; GFX9-NEXT: ds_write2_b32 v3, v4, v4 offset0:3 offset1:100 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x315 +; GFX9-NEXT: ds_write_b32 v3, v4 offset:408 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x7b, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: s_endpgm %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 3 %ptr2 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 100 %ptr3 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 102 @@ -222,23 +493,49 @@ define amdgpu_kernel void @reorder_local_offsets(ptr addrspace(1) nocapture %out ret void } -; GCN-LABEL: {{^}}reorder_global_offsets: -; CI-DAG: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 -; CI-DAG: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408 -; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 -; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 -; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408 -; CI: buffer_store_dword -; CI: s_endpgm - -; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:400 -; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:408 -; GFX9-DAG: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 -; GFX9-DAG: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:400 -; GFX9-DAG: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:408 -; GFX9: global_store_dword -; GFX9: s_endpgm define amdgpu_kernel void @reorder_global_offsets(ptr addrspace(1) nocapture %out, ptr addrspace(1) noalias nocapture readnone %gptr, ptr addrspace(1) noalias nocapture %ptr0) #0 { +; CI-LABEL: reorder_global_offsets: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:400 +; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:408 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 0x7b +; CI-NEXT: v_mov_b32_e32 v3, 0x315 +; CI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:12 +; CI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:400 +; CI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:408 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 0x7b, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: reorder_global_offsets: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7b +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:400 +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:408 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_store_dword v0, v3, s[0:1] offset:12 +; GFX9-NEXT: global_store_dword v0, v3, s[0:1] offset:400 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x315 +; GFX9-NEXT: global_store_dword v0, v3, s[0:1] offset:408 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7b, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm %ptr1 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 3 %ptr2 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 100 %ptr3 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 102 @@ -256,32 +553,50 @@ define amdgpu_kernel void @reorder_global_offsets(ptr addrspace(1) nocapture %ou ret void } -; GCN-LABEL: {{^}}reorder_global_offsets_addr64_soffset0: -; CI: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; CI-NEXT: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:28{{$}} -; CI-NEXT: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:44{{$}} - -; CI: v_mov_b32 -; CI: v_mov_b32 - -; CI-DAG: v_add_i32 -; CI-DAG: v_add_i32 - -; CI-DAG: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CI-DAG: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}} -; CI-DAG: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}} -; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}} - -; GFX9: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 -; GFX9: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:28 -; GFX9: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:44 - -; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:20 -; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:36 -; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:52 - define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(ptr addrspace(1) noalias nocapture %ptr.base) #0 { +; CI-LABEL: reorder_global_offsets_addr64_soffset0: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 offset:12 +; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:28 +; CI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 offset:44 +; CI-NEXT: v_mov_b32_e32 v5, 0x315 +; CI-NEXT: v_mov_b32_e32 v6, 0x7b +; CI-NEXT: buffer_store_dword v5, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dword v6, v[0:1], s[0:3], 0 addr64 offset:20 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:36 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:52 +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: reorder_global_offsets_addr64_soffset0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x315 +; GFX9-NEXT: v_mov_b32_e32 v5, 0x7b +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:12 +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:28 +; GFX9-NEXT: global_load_dword v3, v0, s[0:1] offset:44 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_store_dword v0, v4, s[0:1] +; GFX9-NEXT: global_store_dword v0, v5, s[0:1] offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:52 +; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 @@ -305,10 +620,39 @@ define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(ptr addrspace( ret void } -; GCN-LABEL: {{^}}reorder_local_load_tbuffer_store_local_load: -; GCN: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:2 -; GCN: tbuffer_store_format define amdgpu_vs void @reorder_local_load_tbuffer_store_local_load(ptr addrspace(1) %out, i32 %a1, i32 %vaddr) #0 { +; CI-LABEL: reorder_local_load_tbuffer_store_local_load: +; CI: ; %bb.0: +; CI-NEXT: v_mov_b32_e32 v4, stored_lds_ptr@abs32@lo +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b32 v4, v4 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s0, s2 +; CI-NEXT: s_mov_b32 s1, s2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_read2_b32 v[4:5], v4 offset0:1 offset1:2 +; CI-NEXT: v_add_i32_e32 v3, vcc, 32, v3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: tbuffer_store_format_xyzw v[2:5], v3, s[0:3], 0 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_SNORM_OGL] idxen glc slc +; CI-NEXT: s_nop 0 +; CI-NEXT: v_add_i32_e32 v2, vcc, v4, v5 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: reorder_local_load_tbuffer_store_local_load: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v4, stored_lds_ptr@abs32@lo +; GFX9-NEXT: ds_read_b32 v4, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 32, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset0:1 offset1:2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: tbuffer_store_format_xyzw v[2:5], v3, s[0:3], 0 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_RESERVED_6] idxen glc slc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_e32 v2, v4, v5 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm %ptr0 = load ptr addrspace(3), ptr addrspace(3) @stored_lds_ptr, align 4 %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll index cfb2d66df8a71..a376262e6d539 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll @@ -1,15 +1,43 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx802 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9PLUS,GFX8_9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-back-off-barrier -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX9PLUS %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-back-off-barrier -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX9PLUS %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx802 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-back-off-barrier -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-back-off-barrier -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; GCN-LABEL: barrier_vmcnt_global: -; GFX8: flat_load_dword -; GFX9PLUS: global_load_{{dword|b32}} -; GFX8: s_waitcnt vmcnt(0){{$}} -; GFX9PLUS: s_waitcnt vmcnt(0){{$}} -; GCN-NEXT: s_barrier define amdgpu_kernel void @barrier_vmcnt_global(ptr addrspace(1) %arg) { +; GFX8-LABEL: barrier_vmcnt_global: +; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_barrier +; GFX8-NEXT: flat_store_dword v[0:1], v4 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: barrier_vmcnt_global: +; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v2, v1, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_barrier +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = zext i32 %tmp to i64 @@ -26,14 +54,46 @@ bb: ret void } -; GCN-LABEL: barrier_vscnt_global: -; GFX8: flat_store_dword -; GFX9PLUS: global_store_{{dword|b32}} -; GFX8: s_waitcnt vmcnt(0){{$}} -; GFX9: s_waitcnt vmcnt(0){{$}} -; GFX10PLUS: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: s_barrier define amdgpu_kernel void @barrier_vscnt_global(ptr addrspace(1) %arg) { +; GFX8-LABEL: barrier_vscnt_global: +; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc +; GFX8-NEXT: flat_store_dword v[2:3], v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v3, 1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_barrier +; GFX8-NEXT: flat_store_dword v[0:1], v3 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: barrier_vscnt_global: +; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX9-NEXT: global_store_dword v[2:3], v1, off +; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v3, 1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_barrier +; GFX9-NEXT: global_store_dword v[0:1], v3, off +; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = zext i32 %tmp to i64 @@ -52,14 +112,50 @@ bb: ret void } -; GCN-LABEL: barrier_vmcnt_vscnt_global: -; GFX8: flat_load_dword -; GFX9PLUS: global_load_{{dword|b32}} -; GFX8: s_waitcnt vmcnt(0){{$}} -; GFX9PLUS: s_waitcnt vmcnt(0){{$}} -; GFX10PLUS: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: s_barrier define amdgpu_kernel void @barrier_vmcnt_vscnt_global(ptr addrspace(1) %arg) { +; GFX8-LABEL: barrier_vmcnt_vscnt_global: +; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc +; GFX8-NEXT: flat_store_dword v[2:3], v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_barrier +; GFX8-NEXT: flat_store_dword v[0:1], v3 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: barrier_vmcnt_vscnt_global: +; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX9-NEXT: global_store_dword v[2:3], v1, off +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_barrier +; GFX9-NEXT: global_store_dword v[0:1], v3, off +; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = zext i32 %tmp to i64 @@ -80,11 +176,42 @@ bb: ret void } -; GCN-LABEL: barrier_vmcnt_flat: -; GCN: flat_load_{{dword|b32}} -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NEXT: s_barrier define amdgpu_kernel void @barrier_vmcnt_flat(ptr %arg) { +; GFX8-LABEL: barrier_vmcnt_flat: +; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_barrier +; GFX8-NEXT: flat_store_dword v[0:1], v4 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: barrier_vmcnt_flat: +; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc +; GFX9-NEXT: flat_load_dword v4, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_barrier +; GFX9-NEXT: flat_store_dword v[0:1], v4 +; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = zext i32 %tmp to i64 @@ -101,13 +228,46 @@ bb: ret void } -; GCN-LABEL: barrier_vscnt_flat: -; GCN: flat_store_{{dword|b32}} -; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10PLUS: s_waitcnt lgkmcnt(0){{$}} -; GFX10PLUS: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: s_barrier define amdgpu_kernel void @barrier_vscnt_flat(ptr %arg) { +; GFX8-LABEL: barrier_vscnt_flat: +; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc +; GFX8-NEXT: flat_store_dword v[2:3], v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v3, 1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_barrier +; GFX8-NEXT: flat_store_dword v[0:1], v3 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: barrier_vscnt_flat: +; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX9-NEXT: flat_store_dword v[2:3], v1 +; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v3, 1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_barrier +; GFX9-NEXT: flat_store_dword v[0:1], v3 +; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = zext i32 %tmp to i64 @@ -126,12 +286,52 @@ bb: ret void } -; GCN-LABEL: barrier_vmcnt_vscnt_flat: -; GCN: flat_load_{{dword|b32}} -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10PLUS: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: s_barrier define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(ptr %arg) { +; GFX8-LABEL: barrier_vmcnt_vscnt_flat: +; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc +; GFX8-NEXT: flat_store_dword v[2:3], v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_barrier +; GFX8-NEXT: flat_store_dword v[0:1], v3 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: barrier_vmcnt_vscnt_flat: +; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX9-NEXT: flat_store_dword v[2:3], v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dword v3, v[2:3] +; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_barrier +; GFX9-NEXT: flat_store_dword v[0:1], v3 +; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = zext i32 %tmp to i64 @@ -152,14 +352,54 @@ bb: ret void } -; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup: -; GCN: flat_load_{{dword|b32}} -; GFX8_9: s_waitcnt lgkmcnt(0){{$}} -; GFX8_9: s_waitcnt vmcnt(0){{$}} -; GFX10PLUS: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10PLUS: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: s_barrier define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) { +; GFX8-LABEL: barrier_vmcnt_vscnt_flat_workgroup: +; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc +; GFX8-NEXT: flat_store_dword v[2:3], v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_barrier +; GFX8-NEXT: flat_store_dword v[0:1], v3 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: barrier_vmcnt_vscnt_flat_workgroup: +; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX9-NEXT: flat_store_dword v[2:3], v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dword v3, v[2:3] +; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_barrier +; GFX9-NEXT: flat_store_dword v[0:1], v3 +; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = zext i32 %tmp to i64 @@ -180,13 +420,38 @@ bb: ret void } -; GCN-LABEL: load_vmcnt_global: -; GFX8: flat_load_dword -; GFX9PLUS: global_load_{{dword|b32}} -; GFX8: s_waitcnt vmcnt(0){{$}} -; GFX9PLUS: s_waitcnt vmcnt(0){{$}} -; GCN-NEXT: {{global|flat}}_store_{{dword|b32}} define amdgpu_kernel void @load_vmcnt_global(ptr addrspace(1) %arg) { +; GFX8-LABEL: load_vmcnt_global: +; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dword v[0:1], v4 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: load_vmcnt_global: +; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v2, v1, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = zext i32 %tmp to i64 @@ -200,12 +465,40 @@ bb: ret void } -; GCN-LABEL: load_vmcnt_flat: -; GCN: flat_load_{{dword|b32}} -; GCN-NOT: vscnt -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NEXT: {{global|flat}}_store_{{dword|b32}} define amdgpu_kernel void @load_vmcnt_flat(ptr %arg) { +; GFX8-LABEL: load_vmcnt_flat: +; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_store_dword v[0:1], v4 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: load_vmcnt_flat: +; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc +; GFX9-NEXT: flat_load_dword v4, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_store_dword v[0:1], v4 +; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = zext i32 %tmp to i64 @@ -219,39 +512,64 @@ bb: ret void } -; GCN-LABEL: store_vscnt_private: -; GCN: {{buffer|scratch}}_store_{{dword|b32}} -; GFX8_9: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @store_vscnt_private(ptr addrspace(5) %p) { +; GFX8-LABEL: store_vscnt_private: +; GFX8: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: store_vscnt_private: +; GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] store i32 0, ptr addrspace(5) %p ret void } -; GCN-LABEL: store_vscnt_global: -; GFX8: flat_store_dword -; GFX9PLUS: global_store_{{dword|b32}} -; GFX8_9: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @store_vscnt_global(ptr addrspace(1) %p) { +; GFX8-LABEL: store_vscnt_global: +; GFX8: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: store_vscnt_global: +; GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] store i32 0, ptr addrspace(1) %p ret void } -; GCN-LABEL: store_vscnt_flat: -; GCN: flat_store_{{dword|b32}} -; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10PLUS: s_waitcnt lgkmcnt(0){{$}} -; GCN-NEXT: s_setpc_b64 define void @store_vscnt_flat(ptr %p) { +; GFX8-LABEL: store_vscnt_flat: +; GFX8: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: store_vscnt_flat: +; GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] store i32 0, ptr %p ret void } -; GCN-LABEL: function_prologue: -; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0){{$}} -; GCN-NEXT: s_setpc_b64 define void @function_prologue() { +; GCN-LABEL: function_prologue: +; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] ret void } diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll index eb39684a98b5f..99fe986cf6378 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll @@ -1,80 +1,100 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name _ --version 5 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces %s | FileCheck %s ; Trivial optimization of generic addressing -; CHECK-LABEL: @load_global_from_flat( -; CHECK-NEXT: %tmp0 = addrspacecast ptr %generic_scalar to ptr addrspace(1) -; CHECK-NEXT: %tmp1 = load float, ptr addrspace(1) %tmp0 -; CHECK-NEXT: ret float %tmp1 define float @load_global_from_flat(ptr %generic_scalar) #0 { +; CHECK-LABEL: define float @load_global_from_flat( +; CHECK-SAME: ptr [[GENERIC_SCALAR:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[_TMP0:%.*]] = addrspacecast ptr [[GENERIC_SCALAR]] to ptr addrspace(1) +; CHECK-NEXT: [[_TMP1:%.*]] = load float, ptr addrspace(1) [[_TMP0]], align 4 +; CHECK-NEXT: ret float [[_TMP1]] +; %tmp0 = addrspacecast ptr %generic_scalar to ptr addrspace(1) %tmp1 = load float, ptr addrspace(1) %tmp0 ret float %tmp1 } -; CHECK-LABEL: @load_constant_from_flat( -; CHECK-NEXT: %tmp0 = addrspacecast ptr %generic_scalar to ptr addrspace(4) -; CHECK-NEXT: %tmp1 = load float, ptr addrspace(4) %tmp0 -; CHECK-NEXT: ret float %tmp1 define float @load_constant_from_flat(ptr %generic_scalar) #0 { +; CHECK-LABEL: define float @load_constant_from_flat( +; CHECK-SAME: ptr [[GENERIC_SCALAR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[_TMP0:%.*]] = addrspacecast ptr [[GENERIC_SCALAR]] to ptr addrspace(4) +; CHECK-NEXT: [[_TMP1:%.*]] = load float, ptr addrspace(4) [[_TMP0]], align 4 +; CHECK-NEXT: ret float [[_TMP1]] +; %tmp0 = addrspacecast ptr %generic_scalar to ptr addrspace(4) %tmp1 = load float, ptr addrspace(4) %tmp0 ret float %tmp1 } -; CHECK-LABEL: @load_group_from_flat( -; CHECK-NEXT: %tmp0 = addrspacecast ptr %generic_scalar to ptr addrspace(3) -; CHECK-NEXT: %tmp1 = load float, ptr addrspace(3) %tmp0 -; CHECK-NEXT: ret float %tmp1 define float @load_group_from_flat(ptr %generic_scalar) #0 { +; CHECK-LABEL: define float @load_group_from_flat( +; CHECK-SAME: ptr [[GENERIC_SCALAR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[_TMP0:%.*]] = addrspacecast ptr [[GENERIC_SCALAR]] to ptr addrspace(3) +; CHECK-NEXT: [[_TMP1:%.*]] = load float, ptr addrspace(3) [[_TMP0]], align 4 +; CHECK-NEXT: ret float [[_TMP1]] +; %tmp0 = addrspacecast ptr %generic_scalar to ptr addrspace(3) %tmp1 = load float, ptr addrspace(3) %tmp0 ret float %tmp1 } -; CHECK-LABEL: @load_private_from_flat( -; CHECK-NEXT: %tmp0 = addrspacecast ptr %generic_scalar to ptr addrspace(5) -; CHECK-NEXT: %tmp1 = load float, ptr addrspace(5) %tmp0 -; CHECK-NEXT: ret float %tmp1 define float @load_private_from_flat(ptr %generic_scalar) #0 { +; CHECK-LABEL: define float @load_private_from_flat( +; CHECK-SAME: ptr [[GENERIC_SCALAR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[_TMP0:%.*]] = addrspacecast ptr [[GENERIC_SCALAR]] to ptr addrspace(5) +; CHECK-NEXT: [[_TMP1:%.*]] = load float, ptr addrspace(5) [[_TMP0]], align 4 +; CHECK-NEXT: ret float [[_TMP1]] +; %tmp0 = addrspacecast ptr %generic_scalar to ptr addrspace(5) %tmp1 = load float, ptr addrspace(5) %tmp0 ret float %tmp1 } -; CHECK-LABEL: @store_global_from_flat( -; CHECK-NEXT: %tmp0 = addrspacecast ptr %generic_scalar to ptr addrspace(1) -; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(1) %tmp0 define amdgpu_kernel void @store_global_from_flat(ptr %generic_scalar) #0 { +; CHECK-LABEL: define amdgpu_kernel void @store_global_from_flat( +; CHECK-SAME: ptr [[GENERIC_SCALAR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[_TMP0:%.*]] = addrspacecast ptr [[GENERIC_SCALAR]] to ptr addrspace(1) +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(1) [[_TMP0]], align 4 +; CHECK-NEXT: ret void +; %tmp0 = addrspacecast ptr %generic_scalar to ptr addrspace(1) store float 0.0, ptr addrspace(1) %tmp0 ret void } -; CHECK-LABEL: @store_group_from_flat( -; CHECK-NEXT: %tmp0 = addrspacecast ptr %generic_scalar to ptr addrspace(3) -; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(3) %tmp0 define amdgpu_kernel void @store_group_from_flat(ptr %generic_scalar) #0 { +; CHECK-LABEL: define amdgpu_kernel void @store_group_from_flat( +; CHECK-SAME: ptr [[GENERIC_SCALAR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[_TMP0:%.*]] = addrspacecast ptr [[GENERIC_SCALAR]] to ptr addrspace(3) +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(3) [[_TMP0]], align 4 +; CHECK-NEXT: ret void +; %tmp0 = addrspacecast ptr %generic_scalar to ptr addrspace(3) store float 0.0, ptr addrspace(3) %tmp0 ret void } -; CHECK-LABEL: @store_private_from_flat( -; CHECK-NEXT: %tmp0 = addrspacecast ptr %generic_scalar to ptr addrspace(5) -; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) %tmp0 define amdgpu_kernel void @store_private_from_flat(ptr %generic_scalar) #0 { +; CHECK-LABEL: define amdgpu_kernel void @store_private_from_flat( +; CHECK-SAME: ptr [[GENERIC_SCALAR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[_TMP0:%.*]] = addrspacecast ptr [[GENERIC_SCALAR]] to ptr addrspace(5) +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[_TMP0]], align 4 +; CHECK-NEXT: ret void +; %tmp0 = addrspacecast ptr %generic_scalar to ptr addrspace(5) store float 0.0, ptr addrspace(5) %tmp0 ret void } ; optimized to global load/store. -; CHECK-LABEL: @load_store_global( -; CHECK-NEXT: %val = load i32, ptr addrspace(1) %input, align 4 -; CHECK-NEXT: store i32 %val, ptr addrspace(1) %output, align 4 -; CHECK-NEXT: ret void define amdgpu_kernel void @load_store_global(ptr addrspace(1) nocapture %input, ptr addrspace(1) nocapture %output) #0 { +; CHECK-LABEL: define amdgpu_kernel void @load_store_global( +; CHECK-SAME: ptr addrspace(1) captures(none) [[INPUT:%.*]], ptr addrspace(1) captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[INPUT]], align 4 +; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[OUTPUT]], align 4 +; CHECK-NEXT: ret void +; %tmp0 = addrspacecast ptr addrspace(1) %input to ptr %tmp1 = addrspacecast ptr addrspace(1) %output to ptr %val = load i32, ptr %tmp0, align 4 @@ -83,11 +103,13 @@ define amdgpu_kernel void @load_store_global(ptr addrspace(1) nocapture %input, } ; Optimized to group load/store. -; CHECK-LABEL: @load_store_group( -; CHECK-NEXT: %val = load i32, ptr addrspace(3) %input, align 4 -; CHECK-NEXT: store i32 %val, ptr addrspace(3) %output, align 4 -; CHECK-NEXT: ret void define amdgpu_kernel void @load_store_group(ptr addrspace(3) nocapture %input, ptr addrspace(3) nocapture %output) #0 { +; CHECK-LABEL: define amdgpu_kernel void @load_store_group( +; CHECK-SAME: ptr addrspace(3) captures(none) [[INPUT:%.*]], ptr addrspace(3) captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(3) [[INPUT]], align 4 +; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(3) [[OUTPUT]], align 4 +; CHECK-NEXT: ret void +; %tmp0 = addrspacecast ptr addrspace(3) %input to ptr %tmp1 = addrspacecast ptr addrspace(3) %output to ptr %val = load i32, ptr %tmp0, align 4 @@ -96,11 +118,13 @@ define amdgpu_kernel void @load_store_group(ptr addrspace(3) nocapture %input, p } ; Optimized to private load/store. -; CHECK-LABEL: @load_store_private( -; CHECK-NEXT: %val = load i32, ptr addrspace(5) %input, align 4 -; CHECK-NEXT: store i32 %val, ptr addrspace(5) %output, align 4 -; CHECK-NEXT: ret void define amdgpu_kernel void @load_store_private(ptr addrspace(5) nocapture %input, ptr addrspace(5) nocapture %output) #0 { +; CHECK-LABEL: define amdgpu_kernel void @load_store_private( +; CHECK-SAME: ptr addrspace(5) captures(none) [[INPUT:%.*]], ptr addrspace(5) captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(5) [[INPUT]], align 4 +; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(5) [[OUTPUT]], align 4 +; CHECK-NEXT: ret void +; %tmp0 = addrspacecast ptr addrspace(5) %input to ptr %tmp1 = addrspacecast ptr addrspace(5) %output to ptr %val = load i32, ptr %tmp0, align 4 @@ -109,72 +133,97 @@ define amdgpu_kernel void @load_store_private(ptr addrspace(5) nocapture %input, } ; No optimization. flat load/store. -; CHECK-LABEL: @load_store_flat( -; CHECK-NEXT: %val = load i32, ptr %input, align 4 -; CHECK-NEXT: store i32 %val, ptr %output, align 4 -; CHECK-NEXT: ret void define amdgpu_kernel void @load_store_flat(ptr nocapture %input, ptr nocapture %output) #0 { +; CHECK-LABEL: define amdgpu_kernel void @load_store_flat( +; CHECK-SAME: ptr captures(none) [[INPUT:%.*]], ptr captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[INPUT]], align 4 +; CHECK-NEXT: store i32 [[VAL]], ptr [[OUTPUT]], align 4 +; CHECK-NEXT: ret void +; %val = load i32, ptr %input, align 4 store i32 %val, ptr %output, align 4 ret void } -; CHECK-LABEL: @store_addrspacecast_ptr_value( -; CHECK: %cast = addrspacecast ptr addrspace(1) %input to ptr -; CHECK-NEXT: store ptr %cast, ptr addrspace(1) %output, align 4 define amdgpu_kernel void @store_addrspacecast_ptr_value(ptr addrspace(1) nocapture %input, ptr addrspace(1) nocapture %output) #0 { +; CHECK-LABEL: define amdgpu_kernel void @store_addrspacecast_ptr_value( +; CHECK-SAME: ptr addrspace(1) captures(none) [[INPUT:%.*]], ptr addrspace(1) captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(1) [[INPUT]] to ptr +; CHECK-NEXT: store ptr [[CAST]], ptr addrspace(1) [[OUTPUT]], align 4 +; CHECK-NEXT: ret void +; %cast = addrspacecast ptr addrspace(1) %input to ptr store ptr %cast, ptr addrspace(1) %output, align 4 ret void } -; CHECK-LABEL: @atomicrmw_add_global_to_flat( -; CHECK-NEXT: %ret = atomicrmw add ptr addrspace(1) %global.ptr, i32 %y seq_cst define i32 @atomicrmw_add_global_to_flat(ptr addrspace(1) %global.ptr, i32 %y) #0 { +; CHECK-LABEL: define i32 @atomicrmw_add_global_to_flat( +; CHECK-SAME: ptr addrspace(1) [[GLOBAL_PTR:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = atomicrmw add ptr addrspace(1) [[GLOBAL_PTR]], i32 [[Y]] seq_cst, align 4 +; CHECK-NEXT: ret i32 [[RET]] +; %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr %ret = atomicrmw add ptr %cast, i32 %y seq_cst ret i32 %ret } -; CHECK-LABEL: @atomicrmw_add_group_to_flat( -; CHECK-NEXT: %ret = atomicrmw add ptr addrspace(3) %group.ptr, i32 %y seq_cst define i32 @atomicrmw_add_group_to_flat(ptr addrspace(3) %group.ptr, i32 %y) #0 { +; CHECK-LABEL: define i32 @atomicrmw_add_group_to_flat( +; CHECK-SAME: ptr addrspace(3) [[GROUP_PTR:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = atomicrmw add ptr addrspace(3) [[GROUP_PTR]], i32 [[Y]] seq_cst, align 4 +; CHECK-NEXT: ret i32 [[RET]] +; %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr %ret = atomicrmw add ptr %cast, i32 %y seq_cst ret i32 %ret } -; CHECK-LABEL: @cmpxchg_global_to_flat( -; CHECK: %ret = cmpxchg ptr addrspace(1) %global.ptr, i32 %cmp, i32 %val seq_cst monotonic define { i32, i1 } @cmpxchg_global_to_flat(ptr addrspace(1) %global.ptr, i32 %cmp, i32 %val) #0 { +; CHECK-LABEL: define { i32, i1 } @cmpxchg_global_to_flat( +; CHECK-SAME: ptr addrspace(1) [[GLOBAL_PTR:%.*]], i32 [[CMP:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = cmpxchg ptr addrspace(1) [[GLOBAL_PTR]], i32 [[CMP]], i32 [[VAL]] seq_cst monotonic, align 4 +; CHECK-NEXT: ret { i32, i1 } [[RET]] +; %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr %ret = cmpxchg ptr %cast, i32 %cmp, i32 %val seq_cst monotonic ret { i32, i1 } %ret } -; CHECK-LABEL: @cmpxchg_group_to_flat( -; CHECK: %ret = cmpxchg ptr addrspace(3) %group.ptr, i32 %cmp, i32 %val seq_cst monotonic define { i32, i1 } @cmpxchg_group_to_flat(ptr addrspace(3) %group.ptr, i32 %cmp, i32 %val) #0 { +; CHECK-LABEL: define { i32, i1 } @cmpxchg_group_to_flat( +; CHECK-SAME: ptr addrspace(3) [[GROUP_PTR:%.*]], i32 [[CMP:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = cmpxchg ptr addrspace(3) [[GROUP_PTR]], i32 [[CMP]], i32 [[VAL]] seq_cst monotonic, align 4 +; CHECK-NEXT: ret { i32, i1 } [[RET]] +; %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr %ret = cmpxchg ptr %cast, i32 %cmp, i32 %val seq_cst monotonic ret { i32, i1 } %ret } ; Not pointer operand -; CHECK-LABEL: @cmpxchg_group_to_flat_wrong_operand( -; CHECK: %cast.cmp = addrspacecast ptr addrspace(3) %cmp.ptr to ptr -; CHECK: %ret = cmpxchg ptr addrspace(3) %cas.ptr, ptr %cast.cmp, ptr %val seq_cst monotonic define { ptr, i1 } @cmpxchg_group_to_flat_wrong_operand(ptr addrspace(3) %cas.ptr, ptr addrspace(3) %cmp.ptr, ptr %val) #0 { +; CHECK-LABEL: define { ptr, i1 } @cmpxchg_group_to_flat_wrong_operand( +; CHECK-SAME: ptr addrspace(3) [[CAS_PTR:%.*]], ptr addrspace(3) [[CMP_PTR:%.*]], ptr [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[CAST_CMP:%.*]] = addrspacecast ptr addrspace(3) [[CMP_PTR]] to ptr +; CHECK-NEXT: [[RET:%.*]] = cmpxchg ptr addrspace(3) [[CAS_PTR]], ptr [[CAST_CMP]], ptr [[VAL]] seq_cst monotonic, align 8 +; CHECK-NEXT: ret { ptr, i1 } [[RET]] +; %cast.cmp = addrspacecast ptr addrspace(3) %cmp.ptr to ptr %ret = cmpxchg ptr addrspace(3) %cas.ptr, ptr %cast.cmp, ptr %val seq_cst monotonic ret { ptr, i1 } %ret } ; Null pointer in local addr space -; CHECK-LABEL: @local_nullptr -; CHECK: icmp ne ptr addrspace(3) %a, addrspacecast (ptr addrspace(5) null to ptr addrspace(3)) -; CHECK-NOT: ptr addrspace(3) null define void @local_nullptr(ptr addrspace(1) nocapture %results, ptr addrspace(3) %a) { +; CHECK-LABEL: define void @local_nullptr( +; CHECK-SAME: ptr addrspace(1) captures(none) [[RESULTS:%.*]], ptr addrspace(3) [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne ptr addrspace(3) [[A]], addrspacecast (ptr addrspace(5) null to ptr addrspace(3)) +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32 +; CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[RESULTS]], align 4 +; CHECK-NEXT: ret void +; entry: %tobool = icmp ne ptr addrspace(3) %a, addrspacecast (ptr addrspace(5) null to ptr addrspace(3)) %conv = zext i1 %tobool to i32 @@ -182,18 +231,23 @@ entry: ret void } -; CHECK-LABEL: @atomicrmw_add_global_to_flat_preserve_amdgpu_md( -; CHECK-NEXT: %ret = atomicrmw add ptr addrspace(1) %global.ptr, i32 %y seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 define i32 @atomicrmw_add_global_to_flat_preserve_amdgpu_md(ptr addrspace(1) %global.ptr, i32 %y) #0 { +; CHECK-LABEL: define i32 @atomicrmw_add_global_to_flat_preserve_amdgpu_md( +; CHECK-SAME: ptr addrspace(1) [[GLOBAL_PTR:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = atomicrmw add ptr addrspace(1) [[GLOBAL_PTR]], i32 [[Y]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]], !amdgpu.no.remote.memory [[META0]] +; CHECK-NEXT: ret i32 [[RET]] +; %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr %ret = atomicrmw add ptr %cast, i32 %y seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret i32 %ret } ; Make sure there's no assert -; CHECK-LABEL: @try_infer_getelementptr_constant_null( -; CHECK-NEXT: %ce = getelementptr i8, ptr getelementptr inbounds (i8, ptr null, i64 8), i64 0 define ptr @try_infer_getelementptr_constant_null() { +; CHECK-LABEL: define ptr @try_infer_getelementptr_constant_null() { +; CHECK-NEXT: [[CE:%.*]] = getelementptr i8, ptr getelementptr inbounds (i8, ptr null, i64 8), i64 0 +; CHECK-NEXT: ret ptr [[CE]] +; %ce = getelementptr i8, ptr getelementptr inbounds (i8, ptr null, i64 8), i64 0 ret ptr %ce } @@ -201,3 +255,6 @@ define ptr @try_infer_getelementptr_constant_null() { attributes #0 = { nounwind } !0 = !{} +;. +; CHECK: [[META0]] = !{} +;. diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll index 50b0e7a0f5471..48becdeba1c6a 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll @@ -1,107 +1,147 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces %s | FileCheck %s -; CHECK-LABEL: @memset_group_to_flat( -; CHECK: call void @llvm.memset.p3.i64(ptr addrspace(3) align 4 %group.ptr, i8 4, i64 32, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 define amdgpu_kernel void @memset_group_to_flat(ptr addrspace(3) %group.ptr, i32 %y) #0 { +; CHECK-LABEL: define amdgpu_kernel void @memset_group_to_flat( +; CHECK-SAME: ptr addrspace(3) [[GROUP_PTR:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void @llvm.memset.p3.i64(ptr addrspace(3) align 4 [[GROUP_PTR]], i8 4, i64 32, i1 false), !tbaa [[TBAA0:![0-9]+]], !alias.scope [[META3:![0-9]+]], !noalias [[META6:![0-9]+]] +; CHECK-NEXT: ret void +; %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr call void @llvm.memset.p0.i64(ptr align 4 %cast, i8 4, i64 32, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 ret void } -; CHECK-LABEL: @memset_global_to_flat( -; CHECK: call void @llvm.memset.p1.i64(ptr addrspace(1) align 4 %global.ptr, i8 4, i64 32, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 define amdgpu_kernel void @memset_global_to_flat(ptr addrspace(1) %global.ptr, i32 %y) #0 { +; CHECK-LABEL: define amdgpu_kernel void @memset_global_to_flat( +; CHECK-SAME: ptr addrspace(1) [[GLOBAL_PTR:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) align 4 [[GLOBAL_PTR]], i8 4, i64 32, i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: ret void +; %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr call void @llvm.memset.p0.i64(ptr align 4 %cast, i8 4, i64 32, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 ret void } -; CHECK-LABEL: @memset_group_to_flat_no_md( -; CHECK: call void @llvm.memset.p3.i64(ptr addrspace(3) align 4 %group.ptr, i8 4, i64 %size, i1 false){{$}} define amdgpu_kernel void @memset_group_to_flat_no_md(ptr addrspace(3) %group.ptr, i64 %size) #0 { +; CHECK-LABEL: define amdgpu_kernel void @memset_group_to_flat_no_md( +; CHECK-SAME: ptr addrspace(3) [[GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.memset.p3.i64(ptr addrspace(3) align 4 [[GROUP_PTR]], i8 4, i64 [[SIZE]], i1 false) +; CHECK-NEXT: ret void +; %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr call void @llvm.memset.p0.i64(ptr align 4 %cast, i8 4, i64 %size, i1 false) ret void } -; CHECK-LABEL: @memset_global_to_flat_no_md( -; CHECK: call void @llvm.memset.p1.i64(ptr addrspace(1) align 4 %global.ptr, i8 4, i64 %size, i1 false){{$}} define amdgpu_kernel void @memset_global_to_flat_no_md(ptr addrspace(1) %global.ptr, i64 %size) #0 { +; CHECK-LABEL: define amdgpu_kernel void @memset_global_to_flat_no_md( +; CHECK-SAME: ptr addrspace(1) [[GLOBAL_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) align 4 [[GLOBAL_PTR]], i8 4, i64 [[SIZE]], i1 false) +; CHECK-NEXT: ret void +; %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr call void @llvm.memset.p0.i64(ptr align 4 %cast, i8 4, i64 %size, i1 false) ret void } -; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group( -; CHECK: call void @llvm.memcpy.p0.p3.i64(ptr align 4 %dest, ptr addrspace(3) align 4 %src.group.ptr, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 { +; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group( +; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: ret void +; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %cast.src, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 ret void } -; CHECK-LABEL: @memcpy_inline_flat_to_flat_replace_src_with_group( -; CHECK: call void @llvm.memcpy.inline.p0.p3.i64(ptr align 4 %dest, ptr addrspace(3) align 4 %src.group.ptr, i64 42, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_src_with_group(ptr %dest, ptr addrspace(3) %src.group.ptr) #0 { +; CHECK-LABEL: define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_src_with_group( +; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.memcpy.inline.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 42, i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: ret void +; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr call void @llvm.memcpy.inline.p0.p0.i64(ptr align 4 %dest, ptr align 4 %cast.src, i64 42, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 ret void } -; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_with_group( -; CHECK: call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) align 4 %dest.group.ptr, ptr align 4 %src.ptr, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group(ptr addrspace(3) %dest.group.ptr, ptr %src.ptr, i64 %size) #0 { +; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group( +; CHECK-SAME: ptr addrspace(3) [[DEST_GROUP_PTR:%.*]], ptr [[SRC_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) align 4 [[DEST_GROUP_PTR]], ptr align 4 [[SRC_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: ret void +; %cast.dest = addrspacecast ptr addrspace(3) %dest.group.ptr to ptr call void @llvm.memcpy.p0.p0.i64(ptr align 4 %cast.dest, ptr align 4 %src.ptr, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 ret void } -; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_src_with_group( -; CHECK: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 4 %src.group.ptr, ptr addrspace(3) align 4 %src.group.ptr, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_src_with_group(ptr addrspace(3) %dest.group.ptr, ptr addrspace(3) %src.group.ptr, i64 %size) #0 { +; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_src_with_group( +; CHECK-SAME: ptr addrspace(3) [[DEST_GROUP_PTR:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: ret void +; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr %cast.dest = addrspacecast ptr addrspace(3) %src.group.ptr to ptr call void @llvm.memcpy.p0.p0.i64(ptr align 4 %cast.dest, ptr align 4 %cast.src, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 ret void } -; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_group_src_global( -; CHECK: call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) align 4 %dest.group.ptr, ptr addrspace(1) align 4 %src.global.ptr, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_group_src_global(ptr addrspace(3) %dest.group.ptr, ptr addrspace(1) %src.global.ptr, i64 %size) #0 { +; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_group_src_global( +; CHECK-SAME: ptr addrspace(3) [[DEST_GROUP_PTR:%.*]], ptr addrspace(1) [[SRC_GLOBAL_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) align 4 [[DEST_GROUP_PTR]], ptr addrspace(1) align 4 [[SRC_GLOBAL_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: ret void +; %cast.src = addrspacecast ptr addrspace(1) %src.global.ptr to ptr %cast.dest = addrspacecast ptr addrspace(3) %dest.group.ptr to ptr call void @llvm.memcpy.p0.p0.i64(ptr align 4 %cast.dest, ptr align 4 %cast.src, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 ret void } -; CHECK-LABEL: @memcpy_group_to_flat_replace_dest_global( -; CHECK: call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %dest.global.ptr, ptr addrspace(3) align 4 %src.group.ptr, i32 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 define amdgpu_kernel void @memcpy_group_to_flat_replace_dest_global(ptr addrspace(1) %dest.global.ptr, ptr addrspace(3) %src.group.ptr, i32 %size) #0 { +; CHECK-LABEL: define amdgpu_kernel void @memcpy_group_to_flat_replace_dest_global( +; CHECK-SAME: ptr addrspace(1) [[DEST_GLOBAL_PTR:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i32 [[SIZE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 [[DEST_GLOBAL_PTR]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i32 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: ret void +; %cast.dest = addrspacecast ptr addrspace(1) %dest.global.ptr to ptr call void @llvm.memcpy.p0.p3.i32(ptr align 4 %cast.dest, ptr addrspace(3) align 4 %src.group.ptr, i32 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 ret void } -; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct( -; CHECK: call void @llvm.memcpy.p0.p3.i64(ptr align 4 %dest, ptr addrspace(3) align 4 %src.group.ptr, i64 %size, i1 false), !tbaa.struct !8 define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 { +; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct( +; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa.struct [[TBAA_STRUCT8:![0-9]+]] +; CHECK-NEXT: ret void +; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %cast.src, i64 %size, i1 false), !tbaa.struct !8 ret void } -; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_no_md( -; CHECK: call void @llvm.memcpy.p0.p3.i64(ptr align 4 %dest, ptr addrspace(3) align 4 %src.group.ptr, i64 %size, i1 false){{$}} define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_no_md(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 { +; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_no_md( +; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false) +; CHECK-NEXT: ret void +; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %cast.src, i64 %size, i1 false) ret void } -; CHECK-LABEL: @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md( -; CHECK: call void @llvm.memcpy.p0.p3.i64(ptr align 4 %dest0, ptr addrspace(3) align 4 %src.group.ptr, i64 %size, i1 false){{$}} -; CHECK: call void @llvm.memcpy.p0.p3.i64(ptr align 4 %dest1, ptr addrspace(3) align 4 %src.group.ptr, i64 %size, i1 false){{$}} define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(ptr %dest0, ptr %dest1, ptr addrspace(3) %src.group.ptr, i64 %size) #0 { +; CHECK-LABEL: define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md( +; CHECK-SAME: ptr [[DEST0:%.*]], ptr [[DEST1:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST0]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false) +; CHECK-NEXT: ret void +; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest0, ptr align 4 %cast.src, i64 %size, i1 false) call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest1, ptr align 4 %cast.src, i64 %size, i1 false) @@ -109,16 +149,22 @@ define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_n } ; Check for iterator problems if the pointer has 2 uses in the same call -; CHECK-LABEL: @memcpy_group_flat_to_flat_self( -; CHECK: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 4 %group.ptr, ptr addrspace(3) align 4 %group.ptr, i64 32, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 define amdgpu_kernel void @memcpy_group_flat_to_flat_self(ptr addrspace(3) %group.ptr) #0 { +; CHECK-LABEL: define amdgpu_kernel void @memcpy_group_flat_to_flat_self( +; CHECK-SAME: ptr addrspace(3) [[GROUP_PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 4 [[GROUP_PTR]], ptr addrspace(3) align 4 [[GROUP_PTR]], i64 32, i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: ret void +; %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr call void @llvm.memcpy.p0.p0.i64(ptr align 4 %cast, ptr align 4 %cast, i64 32, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 ret void } -; CHECK-LABEL: @memmove_flat_to_flat_replace_src_with_group( -; CHECK: call void @llvm.memmove.p0.p3.i64(ptr align 4 %dest, ptr addrspace(3) align 4 %src.group.ptr, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 { +; CHECK-LABEL: define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group( +; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.memmove.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: ret void +; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr call void @llvm.memmove.p0.p0.i64(ptr align 4 %dest, ptr align 4 %cast.src, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !6 ret void @@ -142,3 +188,14 @@ attributes #1 = { argmemonly nounwind } !6 = !{!7} !7 = distinct !{!7, !5, !"some scope 2"} !8 = !{i64 0, i64 8, null} +;. +; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"A", [[META2:![0-9]+]]} +; CHECK: [[META2]] = !{!"tbaa root"} +; CHECK: [[META3]] = !{[[META4:![0-9]+]]} +; CHECK: [[META4]] = distinct !{[[META4]], [[META5:![0-9]+]], !"some scope 1"} +; CHECK: [[META5]] = distinct !{[[META5]], !"some domain"} +; CHECK: [[META6]] = !{[[META7:![0-9]+]]} +; CHECK: [[META7]] = distinct !{[[META7]], [[META5]], !"some scope 2"} +; CHECK: [[TBAA_STRUCT8]] = !{i64 0, i64 8, null} +;.