diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 496548382d528..94dd45f1333b0 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1425,19 +1425,23 @@ defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; @@ -1532,12 +1536,14 @@ defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; @@ -1545,6 +1551,8 @@ defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 6f9c88e617617..4946129c65a95 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -859,19 +859,22 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, - MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8}, + MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128, + MVT::i8}, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, - {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16, - MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16, - MVT::i16, MVT::i8, MVT::i128}, + {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16, + MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16, + MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16, + MVT::i16, MVT::bf16, MVT::i8, MVT::i128}, Custom); setOperationAction(ISD::INTRINSIC_VOID, - {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16, - MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16, - MVT::i8, MVT::i128}, + {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16, + MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16, + MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, + MVT::f16, MVT::i16, MVT::i8, MVT::i128}, Custom); setOperationAction(ISD::STACKSAVE, MVT::Other, Custom); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll new file mode 100644 index 0000000000000..3c800d0369e70 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll @@ -0,0 +1,183 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | 
FileCheck --check-prefixes=GFX11 %s + +define bfloat @raw_ptr_buffer_load_bf16(ptr addrspace(8) inreg %rsrc) { +; GFX7-LABEL: raw_ptr_buffer_load_bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: raw_ptr_buffer_load_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: raw_ptr_buffer_load_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: raw_ptr_buffer_load_bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: raw_ptr_buffer_load_bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val = call bfloat @llvm.amdgcn.raw.ptr.buffer.load.bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) + ret bfloat %val +} + +define <2 x bfloat> @raw_ptr_buffer_load_v2bf16(ptr addrspace(8) inreg %rsrc) { +; GFX7-LABEL: raw_ptr_buffer_load_v2bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: raw_ptr_buffer_load_v2bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: raw_ptr_buffer_load_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: raw_ptr_buffer_load_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: raw_ptr_buffer_load_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v2bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) + ret <2 x bfloat> %val +} + +define <4 x bfloat> @raw_ptr_buffer_load_v4bf16(ptr addrspace(8) inreg %rsrc) { +; GFX7-LABEL: raw_ptr_buffer_load_v4bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: raw_ptr_buffer_load_v4bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: raw_ptr_buffer_load_v4bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: 
raw_ptr_buffer_load_v4bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: raw_ptr_buffer_load_v4bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val = call <4 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v4bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) + ret <4 x bfloat> %val +} + +; FIXME +; define <6 x bfloat> @raw_ptr_buffer_load_v6bf16(ptr addrspace(8) inreg %rsrc) { +; %val = call <6 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v6bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) +; ret <6 x bfloat> %val +; } + +define <8 x bfloat> @raw_ptr_buffer_load_v8bf16(ptr addrspace(8) inreg %rsrc) { +; GFX7-LABEL: raw_ptr_buffer_load_v8bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: raw_ptr_buffer_load_v8bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: raw_ptr_buffer_load_v8bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX10-LABEL: raw_ptr_buffer_load_v8bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: raw_ptr_buffer_load_v8bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val = call <8 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v8bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) + ret <8 x bfloat> %val +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll index 07a5b511f2dc8..3e3371091ef72 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll @@ -944,7 +944,7 @@ main_body: define amdgpu_ps void @raw_ptr_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) { ; PREGFX10-LABEL: raw_ptr_buffer_load_v4f16: -; PREGFX10: ; %bb.0: ; %main_body +; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0 ; PREGFX10-NEXT: s_mov_b32 m0, -1 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) @@ -952,24 +952,49 @@ define amdgpu_ps void @raw_ptr_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, p ; PREGFX10-NEXT: s_endpgm ; ; GFX10-LABEL: raw_ptr_buffer_load_v4f16: -; GFX10: ; %bb.0: ; %main_body +; GFX10: ; %bb.0: ; GFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b64 v0, v[1:2] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: raw_ptr_buffer_load_v4f16: -; GFX11: ; %bb.0: ; %main_body +; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ds_store_b64 v0, v[1:2] ; GFX11-NEXT: s_endpgm -main_body: %val = call <4 x half> 
@llvm.amdgcn.raw.ptr.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) store <4 x half> %val, ptr addrspace(3) %ptr ret void } +; FIXME +; define amdgpu_ps void @raw_ptr_buffer_load_v6f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) { +; %val = call <6 x half> @llvm.amdgcn.raw.ptr.buffer.load.v6f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) +; store <6 x half> %val, ptr addrspace(3) %ptr +; ret void +; } + +define amdgpu_ps void @raw_ptr_buffer_load_v8f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) { +; GFX10-LABEL: raw_ptr_buffer_load_v8f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: buffer_load_dwordx4 v[1:4], off, s[0:3], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ds_write_b128 v0, v[1:4] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: raw_ptr_buffer_load_v8f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ds_store_b128 v0, v[1:4] +; GFX11-NEXT: s_endpgm + %val = call <8 x half> @llvm.amdgcn.raw.ptr.buffer.load.v8f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) + store <8 x half> %val, ptr addrspace(3) %ptr + ret void +} + define amdgpu_ps void @raw_ptr_buffer_load_v2i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) { ; PREGFX10-LABEL: raw_ptr_buffer_load_v2i16: ; PREGFX10: ; %bb.0: ; %main_body @@ -1000,7 +1025,7 @@ main_body: define amdgpu_ps void @raw_ptr_buffer_load_v4i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) { ; PREGFX10-LABEL: raw_ptr_buffer_load_v4i16: -; PREGFX10: ; %bb.0: ; %main_body +; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0 ; PREGFX10-NEXT: s_mov_b32 m0, -1 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) @@ -1008,24 +1033,49 @@ define amdgpu_ps void @raw_ptr_buffer_load_v4i16(ptr addrspace(8) inreg %rsrc, p ; PREGFX10-NEXT: s_endpgm ; ; GFX10-LABEL: raw_ptr_buffer_load_v4i16: -; GFX10: ; %bb.0: ; %main_body +; GFX10: ; %bb.0: ; GFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b64 v0, v[1:2] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: raw_ptr_buffer_load_v4i16: -; GFX11: ; %bb.0: ; %main_body +; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ds_store_b64 v0, v[1:2] ; GFX11-NEXT: s_endpgm -main_body: %val = call <4 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v4i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) store <4 x i16> %val, ptr addrspace(3) %ptr ret void } +; FIXME +; define amdgpu_ps void @raw_ptr_buffer_load_v6i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) { +; %val = call <6 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v6i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) +; store <6 x i16> %val, ptr addrspace(3) %ptr +; ret void +; } + +define amdgpu_ps void @raw_ptr_buffer_load_v8i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) { +; GFX10-LABEL: raw_ptr_buffer_load_v8i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: buffer_load_dwordx4 v[1:4], off, s[0:3], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ds_write_b128 v0, v[1:4] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: raw_ptr_buffer_load_v8i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ds_store_b128 v0, v[1:4] +; GFX11-NEXT: s_endpgm + %val = call <8 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v8i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) + store <8 x i16> %val, ptr addrspace(3) %ptr + ret void +} + define amdgpu_ps void @raw_ptr_buffer_load_x1_offset_merged(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-LABEL: raw_ptr_buffer_load_x1_offset_merged: ; PREGFX10: ; %bb.0: ; %main_body diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll new file mode 100644 index 0000000000000..f7f3742a90633 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll @@ -0,0 +1,139 @@ +; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11 %s + +; FIXME +; define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat %data, i32 %offset) { +; call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) +; ret void +; } + +define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bfloat> %data, i32 %offset) { +; GFX7-LABEL: buffer_store_v2bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: buffer_store_v2bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: buffer_store_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: buffer_store_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.store.v2bf16(<2 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) + ret void +} + +define amdgpu_ps void 
@buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bfloat> %data, i32 %offset) #0 { +; GFX7-LABEL: buffer_store_v4bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: buffer_store_v4bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: buffer_store_v4bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: buffer_store_v4bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_v4bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 offen +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) + ret void +} + +; FIXME +; define amdgpu_ps void @buffer_store_v6bf16(ptr addrspace(8) inreg %rsrc, <6 x bfloat> %data, i32 %offset) #0 { +; call void @llvm.amdgcn.raw.ptr.buffer.store.v6bf16(<6 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) +; ret void +; } + +define amdgpu_ps void @buffer_store_v8bf16(ptr addrspace(8) inreg %rsrc, <8 x bfloat> %data, i32 %offset) #0 { +; GFX7-LABEL: buffer_store_v8bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; 
GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v8, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: buffer_store_v8bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: buffer_store_v8bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: buffer_store_v8bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_v8bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 offen +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.store.v8bf16(<8 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll index b66cccd0b7e8a..8dd9b4ab61d4f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll @@ -246,6 +246,31 @@ main_body: ret void } +;CHECK-LABEL: {{^}}buffer_store_v8f16: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_v8f16(ptr addrspace(8) inreg %rsrc, <8 x half> %data, i32 %offset) #0 { +main_body: + call void @llvm.amdgcn.raw.ptr.buffer.store.v8f16(<8 x half> 
%data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_v2bf16: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bfloat> %data, i32 %offset) { + call void @llvm.amdgcn.raw.ptr.buffer.store.v2bf16(<2 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_v4bf16: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bfloat> %data, i32 %offset) #0 { + call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) + ret void +} + ;CHECK-LABEL: {{^}}raw_ptr_buffer_store_i16: ;CHECK-NEXT: %bb. ;CHECK-NOT: v0 @@ -276,6 +301,22 @@ main_body: ret void } +; FIXME: +; define amdgpu_ps void @buffer_store_v6i16(ptr addrspace(8) inreg %rsrc, <6 x i16> %data, i32 %offset) #0 { +; main_body: +; call void @llvm.amdgcn.raw.ptr.buffer.store.v6i16(<6 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) +; ret void +; } + +;CHECK-LABEL: {{^}}buffer_store_v8i16: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_v8i16(ptr addrspace(8) inreg %rsrc, <8 x i16> %data, i32 %offset) #0 { +main_body: + call void @llvm.amdgcn.raw.ptr.buffer.store.v8i16(<8 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) + ret void +} + ;CHECK-LABEL: {{^}}raw_ptr_buffer_store_x1_offset_merged: ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4