-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Add support for v_permlane16_swap_b32
on gfx1250
#149518
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-mc @llvm/pr-subscribers-clang Author: Shilei Tian (shiltian) ChangesCo-authored-by: Mekhanoshin, Stanislav <[email protected]> Patch is 21.43 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149518.diff 8 Files Affected:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index d42e51d04ab9d..4c3f308a6cf75 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -5,6 +5,7 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
typedef unsigned int uint;
+typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
typedef half __attribute__((ext_vector_type(2))) half2;
// CHECK-LABEL: @test_setprio_inc_wg(
@@ -368,6 +369,52 @@ void test_cvt_pk_f16_bf8(global half2* out, short a)
out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a);
}
+// CHECK-LABEL: @test_permlane16_swap(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
+// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
+// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
+// CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
+// CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
+// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
+// CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
+// CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
+// CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
+// CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
+// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
+// CHECK-NEXT: ret void
+//
+void test_permlane16_swap(global uint2* out, uint old, uint src) {
+ *out = __builtin_amdgcn_permlane16_swap(old, src, false, false);
+ *out = __builtin_amdgcn_permlane16_swap(old, src, true, false);
+ *out = __builtin_amdgcn_permlane16_swap(old, src, false, true);
+}
+
// CHECK-LABEL: @test_cvt_f32_fp8_e5m3(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 3ee90857b34b8..80eb5d8b7d571 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1080,6 +1080,13 @@ multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250<
VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_fake16", asmName>;
}
+multiclass VOP1_Real_OpSelIsDPP_gfx1250<bits<9> op> : VOP1_Real_e32<GFX1250Gen, op> {
+ defvar ps = !cast<VOP_Pseudo>(NAME#"_e64");
+ def _e64_gfx1250 :
+ VOP3_Real_Gen<ps, GFX1250Gen>,
+ VOP3OpSelIsDPP_gfx12<{0, 1, 1, op{6-0}}, ps.Pfl>;
+}
+
defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX12Not12_50Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX1250Gen, 0x06c, "V_CVT_F32_FP8_gfx1250", "v_cvt_f32_fp8">;
@@ -1147,6 +1154,7 @@ defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>;
defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>;
defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>;
+defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>;
defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>;
defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 2b91ea7386be4..a25ebdf3e5f6d 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -331,10 +331,19 @@ class VOP3OpSel_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
// Special case for v_permlane16_swap_b32/v_permlane32_swap_b32
// op_sel[0]/op_sel[1] are treated as bound_ctrl and fi dpp operands.
-class VOP3OpSelIsDPP_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
+class VOP3OpSelIsDPP_base {
bits<1> fi;
bits<1> bound_ctrl;
+}
+
+class VOP3OpSelIsDPP_gfx9 <bits<10> op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_vi <op, P> {
+ // OPSEL[0] specifies FI
+ let Inst{11} = fi;
+ // OPSEL[1] specifies BOUND_CTRL
+ let Inst{12} = bound_ctrl;
+}
+class VOP3OpSelIsDPP_gfx12 <bits<10> op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_gfx11_gfx12 <op, P> {
// OPSEL[0] specifies FI
let Inst{11} = fi;
// OPSEL[1] specifies BOUND_CTRL
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
index 814086685880d..ed6a02b62ae9a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s
; RUN: not llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s
@@ -17,6 +19,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv(i32 %vdst_old, i32 %src0_old) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vv:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -29,6 +43,22 @@ define { i32, i32 } @v_permlane16_swap_b32_vi(i32 %vdst_old) {
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vi:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vi:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -41,6 +71,22 @@ define { i32, i32 } @v_permlane16_swap_b32_vl(i32 %vdst_old) {
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vl:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v1, 0xc1d1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vl:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0xc1d1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -54,6 +100,23 @@ define { i32, i32 } @v_permlane16_swap_b32_iv(i32 %src0_old) {
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_iv:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-NEXT: v_mov_b32_e32 v0, 1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_iv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 1, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -67,6 +130,23 @@ define { i32, i32 } @v_permlane16_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_ss:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v0, s0
+; GFX950-NEXT: v_mov_b32_e32 v1, s1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_ss:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -80,6 +160,23 @@ define { i32, i32 } @v_permlane16_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_sv:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-NEXT: v_mov_b32_e32 v0, s0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_sv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -92,6 +189,22 @@ define { i32, i32 } @v_permlane16_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vs:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v1, s0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -102,6 +215,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv_fi(i32 %vdst_old, i32 %src0_old) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_permlane16_swap_b32_e64 v0, v1 fi:1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vv_fi:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_permlane16_swap_b32_e64 v0, v1 fi:1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vv_fi:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_permlane16_swap_b32_e64 v0, v1 fi:1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 true, i1 false)
ret { i32, i32 } %v
}
@@ -112,6 +237,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv_bc(i32 %vdst_old, i32 %src0_old) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vv_bc:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vv_bc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 true)
ret { i32, i32 } %v
}
@@ -122,6 +259,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv_fi_bc(i32 %vdst_old, i32 %src0_old
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1 fi:1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vv_fi_bc:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1 fi:1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vv_fi_bc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1 fi:1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 true, i1 true)
ret { i32, i32 } %v
}
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s
index 5f310a9954ad0..f2cf3d58fb0cf 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s
@@ -627,3 +627,9 @@ v_cvt_f32_fp8_e32 v1, 3
v_cvt_f32_fp8_e32 v1, v3
// GFX1250: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e]
+
+v_permlane16_swap_b32 v1, v2
+// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e]
+
+v_permlane16_swap_b32_e32 v1, v2
+// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
index aa2e028f661e1..b1c4dc62edd6d 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
@@ -663,3 +663,9 @@ v_cvt_f32_fp8_e32 v1, 3
v_cvt_f32_fp8_e32 v1, v3
// GFX1250: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e]
+
+v_permlane16_swap_b32 v1, v2
+// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e]
+
+v_permlane16_swap_b32_e32 v1, v2
+// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s
index 9c6a9127d82e4..6b45930a53d73 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s
@@ -720,3 +720,24 @@ v_cvt_pk_f16_fp8 v1, v150 op_sel:[1]
v_cvt_pk_f16_fp8 v1, s2 op_sel:[1]
// GFX1250: v_cvt_pk_f16_fp8 v1, s2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x00,0x00,0x00]
+
+v_permlane16_swap_b32_e64 v1, v2
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 bound_ctrl:0
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 fi:0
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 bound_ctrl:1
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 ; encoding: [0x01,0x10,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 fi:1
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 fi:1 ; encoding: [0x01,0x08,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 bound_ctrl:1 fi:1
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 ; encoding: [0x01,0x18,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 ; encoding: [0x01,0x18,0xc9,0xd5,0x02,0x01,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12...
[truncated]
|
@llvm/pr-subscribers-backend-amdgpu Author: Shilei Tian (shiltian) ChangesCo-authored-by: Mekhanoshin, Stanislav <[email protected]> Patch is 21.43 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149518.diff 8 Files Affected:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index d42e51d04ab9d..4c3f308a6cf75 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -5,6 +5,7 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
typedef unsigned int uint;
+typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
typedef half __attribute__((ext_vector_type(2))) half2;
// CHECK-LABEL: @test_setprio_inc_wg(
@@ -368,6 +369,52 @@ void test_cvt_pk_f16_bf8(global half2* out, short a)
out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a);
}
+// CHECK-LABEL: @test_permlane16_swap(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
+// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
+// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
+// CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
+// CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
+// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
+// CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
+// CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
+// CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
+// CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
+// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
+// CHECK-NEXT: ret void
+//
+void test_permlane16_swap(global uint2* out, uint old, uint src) {
+ *out = __builtin_amdgcn_permlane16_swap(old, src, false, false);
+ *out = __builtin_amdgcn_permlane16_swap(old, src, true, false);
+ *out = __builtin_amdgcn_permlane16_swap(old, src, false, true);
+}
+
// CHECK-LABEL: @test_cvt_f32_fp8_e5m3(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 3ee90857b34b8..80eb5d8b7d571 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1080,6 +1080,13 @@ multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250<
VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_fake16", asmName>;
}
+multiclass VOP1_Real_OpSelIsDPP_gfx1250<bits<9> op> : VOP1_Real_e32<GFX1250Gen, op> {
+ defvar ps = !cast<VOP_Pseudo>(NAME#"_e64");
+ def _e64_gfx1250 :
+ VOP3_Real_Gen<ps, GFX1250Gen>,
+ VOP3OpSelIsDPP_gfx12<{0, 1, 1, op{6-0}}, ps.Pfl>;
+}
+
defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX12Not12_50Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX1250Gen, 0x06c, "V_CVT_F32_FP8_gfx1250", "v_cvt_f32_fp8">;
@@ -1147,6 +1154,7 @@ defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>;
defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>;
defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>;
+defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>;
defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>;
defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 2b91ea7386be4..a25ebdf3e5f6d 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -331,10 +331,19 @@ class VOP3OpSel_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
// Special case for v_permlane16_swap_b32/v_permlane32_swap_b32
// op_sel[0]/op_sel[1] are treated as bound_ctrl and fi dpp operands.
-class VOP3OpSelIsDPP_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
+class VOP3OpSelIsDPP_base {
bits<1> fi;
bits<1> bound_ctrl;
+}
+
+class VOP3OpSelIsDPP_gfx9 <bits<10> op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_vi <op, P> {
+ // OPSEL[0] specifies FI
+ let Inst{11} = fi;
+ // OPSEL[1] specifies BOUND_CTRL
+ let Inst{12} = bound_ctrl;
+}
+class VOP3OpSelIsDPP_gfx12 <bits<10> op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_gfx11_gfx12 <op, P> {
// OPSEL[0] specifies FI
let Inst{11} = fi;
// OPSEL[1] specifies BOUND_CTRL
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
index 814086685880d..ed6a02b62ae9a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s
; RUN: not llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s
@@ -17,6 +19,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv(i32 %vdst_old, i32 %src0_old) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vv:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -29,6 +43,22 @@ define { i32, i32 } @v_permlane16_swap_b32_vi(i32 %vdst_old) {
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vi:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vi:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -41,6 +71,22 @@ define { i32, i32 } @v_permlane16_swap_b32_vl(i32 %vdst_old) {
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vl:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v1, 0xc1d1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vl:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0xc1d1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -54,6 +100,23 @@ define { i32, i32 } @v_permlane16_swap_b32_iv(i32 %src0_old) {
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_iv:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-NEXT: v_mov_b32_e32 v0, 1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_iv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 1, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -67,6 +130,23 @@ define { i32, i32 } @v_permlane16_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_ss:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v0, s0
+; GFX950-NEXT: v_mov_b32_e32 v1, s1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_ss:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -80,6 +160,23 @@ define { i32, i32 } @v_permlane16_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_sv:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-NEXT: v_mov_b32_e32 v0, s0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_sv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -92,6 +189,22 @@ define { i32, i32 } @v_permlane16_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vs:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v1, s0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -102,6 +215,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv_fi(i32 %vdst_old, i32 %src0_old) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_permlane16_swap_b32_e64 v0, v1 fi:1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vv_fi:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_permlane16_swap_b32_e64 v0, v1 fi:1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vv_fi:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_permlane16_swap_b32_e64 v0, v1 fi:1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 true, i1 false)
ret { i32, i32 } %v
}
@@ -112,6 +237,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv_bc(i32 %vdst_old, i32 %src0_old) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vv_bc:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vv_bc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 true)
ret { i32, i32 } %v
}
@@ -122,6 +259,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv_fi_bc(i32 %vdst_old, i32 %src0_old
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1 fi:1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vv_fi_bc:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1 fi:1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vv_fi_bc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1 fi:1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 true, i1 true)
ret { i32, i32 } %v
}
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s
index 5f310a9954ad0..f2cf3d58fb0cf 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s
@@ -627,3 +627,9 @@ v_cvt_f32_fp8_e32 v1, 3
v_cvt_f32_fp8_e32 v1, v3
// GFX1250: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e]
+
+v_permlane16_swap_b32 v1, v2
+// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e]
+
+v_permlane16_swap_b32_e32 v1, v2
+// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
index aa2e028f661e1..b1c4dc62edd6d 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
@@ -663,3 +663,9 @@ v_cvt_f32_fp8_e32 v1, 3
v_cvt_f32_fp8_e32 v1, v3
// GFX1250: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e]
+
+v_permlane16_swap_b32 v1, v2
+// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e]
+
+v_permlane16_swap_b32_e32 v1, v2
+// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s
index 9c6a9127d82e4..6b45930a53d73 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s
@@ -720,3 +720,24 @@ v_cvt_pk_f16_fp8 v1, v150 op_sel:[1]
v_cvt_pk_f16_fp8 v1, s2 op_sel:[1]
// GFX1250: v_cvt_pk_f16_fp8 v1, s2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x00,0x00,0x00]
+
+v_permlane16_swap_b32_e64 v1, v2
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 bound_ctrl:0
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 fi:0
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 bound_ctrl:1
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 ; encoding: [0x01,0x10,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 fi:1
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 fi:1 ; encoding: [0x01,0x08,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 bound_ctrl:1 fi:1
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 ; encoding: [0x01,0x18,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 ; encoding: [0x01,0x18,0xc9,0xd5,0x02,0x01,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12...
[truncated]
|
345d4d9
to
33991ac
Compare
Merge activity
|
e6d5fd1
to
f7c11e6
Compare
Co-authored-by: Mekhanoshin, Stanislav <[email protected]>
f7c11e6
to
d58f960
Compare
Co-authored-by: Mekhanoshin, Stanislav [email protected]