Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit b3995aa

Browse files
authored Nov 19, 2024
[AMDGPU] Decrease default NSA threshold from 3 to 2 (llvm#116624)
In graphics shaders, it is better overall to use the NSA encoding for IMAGE instructions, because the benefit of less constrained register allocation outweighs the cost of the larger encoding. In particular, the NSA form often avoids the need for extra V_MOV_B32 instructions between IMAGE instructions, which can allow the IMAGE instructions to be claused. Note that in GFX12 there is no longer a bit in the encoding to choose between NSA and non-NSA forms, so this only affects GFX10 and GFX11.
1 parent 21fc36b commit b3995aa

21 files changed

+230
-301
lines changed
 

‎llvm/lib/Target/AMDGPU/GCNSubtarget.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,7 @@ static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
5454
static cl::opt<unsigned>
5555
NSAThreshold("amdgpu-nsa-threshold",
5656
cl::desc("Number of addresses from which to enable MIMG NSA."),
57-
cl::init(3), cl::Hidden);
57+
cl::init(2), cl::Hidden);
5858

5959
GCNSubtarget::~GCNSubtarget() = default;
6060

‎llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll

Lines changed: 11 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -31,42 +31,39 @@ define void @main(<19 x i32> %arg) {
3131
; GFX10-LABEL: main:
3232
; GFX10: ; %bb.0: ; %bb
3333
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34-
; GFX10-NEXT: s_mov_b32 s4, 0
3534
; GFX10-NEXT: v_mov_b32_e32 v1, 0
3635
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
37-
; GFX10-NEXT: s_mov_b32 s10, s4
38-
; GFX10-NEXT: s_mov_b32 s11, s4
39-
; GFX10-NEXT: v_mov_b32_e32 v4, s10
36+
; GFX10-NEXT: s_mov_b32 s4, 0
37+
; GFX10-NEXT: s_mov_b32 s5, s4
4038
; GFX10-NEXT: v_mov_b32_e32 v2, v1
4139
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
4240
; GFX10-NEXT: v_mov_b32_e32 v3, v1
43-
; GFX10-NEXT: v_mov_b32_e32 v5, s11
44-
; GFX10-NEXT: s_mov_b32 s5, s4
4541
; GFX10-NEXT: s_mov_b32 s6, s4
4642
; GFX10-NEXT: s_mov_b32 s7, s4
4743
; GFX10-NEXT: s_mov_b32 s8, s4
4844
; GFX10-NEXT: s_mov_b32 s9, s4
49-
; GFX10-NEXT: image_store v[0:3], v[4:5], s[4:11] dim:SQ_RSRC_IMG_2D unorm
45+
; GFX10-NEXT: s_mov_b32 s10, s4
46+
; GFX10-NEXT: s_mov_b32 s11, s4
47+
; GFX10-NEXT: image_store v[0:3], [v1, v1], s[4:11] dim:SQ_RSRC_IMG_2D unorm
5048
; GFX10-NEXT: s_setpc_b64 s[30:31]
5149
;
5250
; GFX11-LABEL: main:
5351
; GFX11: ; %bb.0: ; %bb
5452
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
55-
; GFX11-NEXT: s_mov_b32 s0, 0
53+
; GFX11-NEXT: v_mov_b32_e32 v1, 0
5654
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
57-
; GFX11-NEXT: s_mov_b32 s6, s0
58-
; GFX11-NEXT: s_mov_b32 s7, s0
59-
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s6
60-
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
61-
; GFX11-NEXT: v_mov_b32_e32 v5, s7
55+
; GFX11-NEXT: s_mov_b32 s0, 0
6256
; GFX11-NEXT: s_mov_b32 s1, s0
6357
; GFX11-NEXT: v_mov_b32_e32 v2, v1
58+
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
6459
; GFX11-NEXT: v_mov_b32_e32 v3, v1
6560
; GFX11-NEXT: s_mov_b32 s2, s0
6661
; GFX11-NEXT: s_mov_b32 s3, s0
6762
; GFX11-NEXT: s_mov_b32 s4, s0
6863
; GFX11-NEXT: s_mov_b32 s5, s0
69-
; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dim:SQ_RSRC_IMG_2D unorm
64+
; GFX11-NEXT: s_mov_b32 s6, s0
65+
; GFX11-NEXT: s_mov_b32 s7, s0
66+
; GFX11-NEXT: image_store v[0:3], [v1, v1], s[0:7] dim:SQ_RSRC_IMG_2D unorm
7067
; GFX11-NEXT: s_setpc_b64 s[30:31]
7168
bb:
7269
%i = bitcast <19 x i32> %arg to <38 x i16>

‎llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -55,13 +55,11 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
5555
; CHECK-NEXT: v_writelane_b32 v16, s5, 1
5656
; CHECK-NEXT: v_writelane_b32 v16, s6, 2
5757
; CHECK-NEXT: v_writelane_b32 v16, s7, 3
58-
; CHECK-NEXT: s_mov_b32 s6, 0
59-
; CHECK-NEXT: s_mov_b32 s4, s6
60-
; CHECK-NEXT: s_mov_b32 s5, s6
58+
; CHECK-NEXT: s_mov_b32 s4, 0
59+
; CHECK-NEXT: v_mov_b32_e32 v0, s4
60+
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
6161
; CHECK-NEXT: v_mov_b32_e32 v0, s4
62-
; CHECK-NEXT: v_mov_b32_e32 v1, s5
6362
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
64-
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
6563
; CHECK-NEXT: s_mov_b32 s4, exec_lo
6664
; CHECK-NEXT: v_writelane_b32 v16, s4, 4
6765
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
@@ -154,10 +152,10 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
154152
; CHECK-NEXT: v_readlane_b32 s17, v16, 1
155153
; CHECK-NEXT: v_readlane_b32 s18, v16, 2
156154
; CHECK-NEXT: v_readlane_b32 s19, v16, 3
157-
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
158-
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
155+
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
156+
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
159157
; CHECK-NEXT: s_waitcnt vmcnt(0)
160-
; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
158+
; CHECK-NEXT: image_sample v0, [v0, v1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
161159
; CHECK-NEXT: s_waitcnt vmcnt(0)
162160
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
163161
; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s4

‎llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll

Lines changed: 7 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1074,8 +1074,7 @@ define amdgpu_ps float @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i16 %s,
10741074
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
10751075
; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
10761076
; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
1077-
; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
1078-
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.3d), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
1077+
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.3d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
10791078
; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
10801079
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
10811080
;
@@ -1163,8 +1162,7 @@ define amdgpu_ps float @atomic_add_cube(<8 x i32> inreg %rsrc, i32 %data, i16 %s
11631162
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
11641163
; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
11651164
; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
1166-
; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
1167-
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.cube), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
1165+
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.cube), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
11681166
; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
11691167
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
11701168
;
@@ -1327,8 +1325,7 @@ define amdgpu_ps float @atomic_add_2darray(<8 x i32> inreg %rsrc, i32 %data, i16
13271325
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
13281326
; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
13291327
; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
1330-
; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
1331-
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darray), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
1328+
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darray), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
13321329
; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
13331330
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
13341331
;
@@ -1416,8 +1413,7 @@ define amdgpu_ps float @atomic_add_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i16
14161413
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
14171414
; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
14181415
; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
1419-
; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
1420-
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2dmsaa), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
1416+
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2dmsaa), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
14211417
; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
14221418
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
14231419
;
@@ -1507,8 +1503,7 @@ define amdgpu_ps float @atomic_add_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data,
15071503
; GFX10NSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
15081504
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
15091505
; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
1510-
; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
1511-
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darraymsaa), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
1506+
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darraymsaa), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
15121507
; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
15131508
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
15141509
;
@@ -1754,8 +1749,7 @@ define amdgpu_ps float @atomic_cmpswap_3d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
17541749
; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
17551750
; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
17561751
; GFX10NSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
1757-
; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
1758-
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
1752+
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
17591753
; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
17601754
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
17611755
;
@@ -1851,8 +1845,7 @@ define amdgpu_ps float @atomic_cmpswap_2darraymsaa(<8 x i32> inreg %rsrc, i32 %c
18511845
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
18521846
; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
18531847
; GFX10NSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
1854-
; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
1855-
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
1848+
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
18561849
; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
18571850
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
18581851
;

‎llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll

Lines changed: 22 additions & 44 deletions
Large diffs are not rendered by default.

‎llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll

Lines changed: 28 additions & 56 deletions
Large diffs are not rendered by default.

‎llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll

Lines changed: 4 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -87,8 +87,7 @@ define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ha
8787
; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
8888
; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
8989
; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
90-
; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
91-
; GFX10-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (s16), addrspace 8)
90+
; GFX10-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (s16), addrspace 8)
9291
; GFX10-NEXT: S_ENDPGM 0
9392
;
9493
; GFX12-LABEL: name: image_store_f16
@@ -198,8 +197,7 @@ define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
198197
; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
199198
; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
200199
; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
201-
; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
202-
; GFX10-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[COPY10]](<2 x s16>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<2 x s16>), addrspace 8)
200+
; GFX10-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[COPY10]](<2 x s16>), 3, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<2 x s16>), addrspace 8)
203201
; GFX10-NEXT: S_ENDPGM 0
204202
;
205203
; GFX12-LABEL: name: image_store_v2f16
@@ -330,8 +328,7 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
330328
; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
331329
; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
332330
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
333-
; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
334-
; GFX10-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
331+
; GFX10-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 7, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
335332
; GFX10-NEXT: S_ENDPGM 0
336333
;
337334
; GFX12-LABEL: name: image_store_v3f16
@@ -452,8 +449,7 @@ define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
452449
; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
453450
; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
454451
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
455-
; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
456-
; GFX10-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<4 x s16>), addrspace 8)
452+
; GFX10-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 15, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<4 x s16>), addrspace 8)
457453
; GFX10-NEXT: S_ENDPGM 0
458454
;
459455
; GFX12-LABEL: name: image_store_v4f16

‎llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll

Lines changed: 16 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -706,12 +706,12 @@ define amdgpu_ps float @atomic_add_i32_3d(<8 x i32> inreg %rsrc, i32 %data, i16
706706
; GFX10-NEXT: s_mov_b32 s1, s3
707707
; GFX10-NEXT: s_mov_b32 s2, s4
708708
; GFX10-NEXT: s_mov_b32 s3, s5
709-
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
709+
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
710710
; GFX10-NEXT: s_mov_b32 s4, s6
711711
; GFX10-NEXT: s_mov_b32 s5, s7
712712
; GFX10-NEXT: s_mov_b32 s6, s8
713713
; GFX10-NEXT: s_mov_b32 s7, s9
714-
; GFX10-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm glc a16
714+
; GFX10-NEXT: image_atomic_add v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm glc a16
715715
; GFX10-NEXT: s_waitcnt vmcnt(0)
716716
; GFX10-NEXT: ; return to shader part epilog
717717
;
@@ -760,12 +760,12 @@ define amdgpu_ps float @atomic_add_i32_cube(<8 x i32> inreg %rsrc, i32 %data, i1
760760
; GFX10-NEXT: s_mov_b32 s1, s3
761761
; GFX10-NEXT: s_mov_b32 s2, s4
762762
; GFX10-NEXT: s_mov_b32 s3, s5
763-
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
763+
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
764764
; GFX10-NEXT: s_mov_b32 s4, s6
765765
; GFX10-NEXT: s_mov_b32 s5, s7
766766
; GFX10-NEXT: s_mov_b32 s6, s8
767767
; GFX10-NEXT: s_mov_b32 s7, s9
768-
; GFX10-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm glc a16
768+
; GFX10-NEXT: image_atomic_add v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm glc a16
769769
; GFX10-NEXT: s_waitcnt vmcnt(0)
770770
; GFX10-NEXT: ; return to shader part epilog
771771
;
@@ -868,12 +868,12 @@ define amdgpu_ps float @atomic_add_i32_2darray(<8 x i32> inreg %rsrc, i32 %data,
868868
; GFX10-NEXT: s_mov_b32 s1, s3
869869
; GFX10-NEXT: s_mov_b32 s2, s4
870870
; GFX10-NEXT: s_mov_b32 s3, s5
871-
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
871+
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
872872
; GFX10-NEXT: s_mov_b32 s4, s6
873873
; GFX10-NEXT: s_mov_b32 s5, s7
874874
; GFX10-NEXT: s_mov_b32 s6, s8
875875
; GFX10-NEXT: s_mov_b32 s7, s9
876-
; GFX10-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16
876+
; GFX10-NEXT: image_atomic_add v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16
877877
; GFX10-NEXT: s_waitcnt vmcnt(0)
878878
; GFX10-NEXT: ; return to shader part epilog
879879
;
@@ -922,12 +922,12 @@ define amdgpu_ps float @atomic_add_i32_2dmsaa(<8 x i32> inreg %rsrc, i32 %data,
922922
; GFX10-NEXT: s_mov_b32 s1, s3
923923
; GFX10-NEXT: s_mov_b32 s2, s4
924924
; GFX10-NEXT: s_mov_b32 s3, s5
925-
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
925+
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
926926
; GFX10-NEXT: s_mov_b32 s4, s6
927927
; GFX10-NEXT: s_mov_b32 s5, s7
928928
; GFX10-NEXT: s_mov_b32 s6, s8
929929
; GFX10-NEXT: s_mov_b32 s7, s9
930-
; GFX10-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16
930+
; GFX10-NEXT: image_atomic_add v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16
931931
; GFX10-NEXT: s_waitcnt vmcnt(0)
932932
; GFX10-NEXT: ; return to shader part epilog
933933
;
@@ -1762,12 +1762,12 @@ define amdgpu_ps <2 x float> @atomic_add_i64_3d(<8 x i32> inreg %rsrc, i64 %data
17621762
; GFX10-NEXT: s_mov_b32 s1, s3
17631763
; GFX10-NEXT: s_mov_b32 s2, s4
17641764
; GFX10-NEXT: s_mov_b32 s3, s5
1765-
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
1765+
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
17661766
; GFX10-NEXT: s_mov_b32 s4, s6
17671767
; GFX10-NEXT: s_mov_b32 s5, s7
17681768
; GFX10-NEXT: s_mov_b32 s6, s8
17691769
; GFX10-NEXT: s_mov_b32 s7, s9
1770-
; GFX10-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm glc a16
1770+
; GFX10-NEXT: image_atomic_add v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm glc a16
17711771
; GFX10-NEXT: s_waitcnt vmcnt(0)
17721772
; GFX10-NEXT: ; return to shader part epilog
17731773
;
@@ -1816,12 +1816,12 @@ define amdgpu_ps <2 x float> @atomic_add_i64_cube(<8 x i32> inreg %rsrc, i64 %da
18161816
; GFX10-NEXT: s_mov_b32 s1, s3
18171817
; GFX10-NEXT: s_mov_b32 s2, s4
18181818
; GFX10-NEXT: s_mov_b32 s3, s5
1819-
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
1819+
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
18201820
; GFX10-NEXT: s_mov_b32 s4, s6
18211821
; GFX10-NEXT: s_mov_b32 s5, s7
18221822
; GFX10-NEXT: s_mov_b32 s6, s8
18231823
; GFX10-NEXT: s_mov_b32 s7, s9
1824-
; GFX10-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_CUBE unorm glc a16
1824+
; GFX10-NEXT: image_atomic_add v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_CUBE unorm glc a16
18251825
; GFX10-NEXT: s_waitcnt vmcnt(0)
18261826
; GFX10-NEXT: ; return to shader part epilog
18271827
;
@@ -1924,12 +1924,12 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darray(<8 x i32> inreg %rsrc, i64
19241924
; GFX10-NEXT: s_mov_b32 s1, s3
19251925
; GFX10-NEXT: s_mov_b32 s2, s4
19261926
; GFX10-NEXT: s_mov_b32 s3, s5
1927-
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
1927+
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
19281928
; GFX10-NEXT: s_mov_b32 s4, s6
19291929
; GFX10-NEXT: s_mov_b32 s5, s7
19301930
; GFX10-NEXT: s_mov_b32 s6, s8
19311931
; GFX10-NEXT: s_mov_b32 s7, s9
1932-
; GFX10-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16
1932+
; GFX10-NEXT: image_atomic_add v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16
19331933
; GFX10-NEXT: s_waitcnt vmcnt(0)
19341934
; GFX10-NEXT: ; return to shader part epilog
19351935
;
@@ -1978,12 +1978,12 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2dmsaa(<8 x i32> inreg %rsrc, i64 %
19781978
; GFX10-NEXT: s_mov_b32 s1, s3
19791979
; GFX10-NEXT: s_mov_b32 s2, s4
19801980
; GFX10-NEXT: s_mov_b32 s3, s5
1981-
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
1981+
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
19821982
; GFX10-NEXT: s_mov_b32 s4, s6
19831983
; GFX10-NEXT: s_mov_b32 s5, s7
19841984
; GFX10-NEXT: s_mov_b32 s6, s8
19851985
; GFX10-NEXT: s_mov_b32 s7, s9
1986-
; GFX10-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16
1986+
; GFX10-NEXT: image_atomic_add v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16
19871987
; GFX10-NEXT: s_waitcnt vmcnt(0)
19881988
; GFX10-NEXT: ; return to shader part epilog
19891989
;

‎llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -119,9 +119,9 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
119119
; GFX10NSA-NEXT: s_mov_b32 s9, s11
120120
; GFX10NSA-NEXT: s_mov_b32 s10, s12
121121
; GFX10NSA-NEXT: s_mov_b32 s11, s13
122-
; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0
122+
; GFX10NSA-NEXT: v_lshl_or_b32 v0, v1, 16, v0
123123
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
124-
; GFX10NSA-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
124+
; GFX10NSA-NEXT: image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
125125
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
126126
; GFX10NSA-NEXT: ; return to shader part epilog
127127
;
@@ -193,9 +193,9 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
193193
; GFX10NSA-NEXT: s_mov_b32 s9, s11
194194
; GFX10NSA-NEXT: s_mov_b32 s10, s12
195195
; GFX10NSA-NEXT: s_mov_b32 s11, s13
196-
; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0
196+
; GFX10NSA-NEXT: v_lshl_or_b32 v0, v1, 16, v0
197197
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
198-
; GFX10NSA-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
198+
; GFX10NSA-NEXT: image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
199199
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
200200
; GFX10NSA-NEXT: ; return to shader part epilog
201201
;
@@ -341,9 +341,9 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
341341
; GFX10NSA-NEXT: s_mov_b32 s9, s11
342342
; GFX10NSA-NEXT: s_mov_b32 s10, s12
343343
; GFX10NSA-NEXT: s_mov_b32 s11, s13
344-
; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0
344+
; GFX10NSA-NEXT: v_lshl_or_b32 v0, v1, 16, v0
345345
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
346-
; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
346+
; GFX10NSA-NEXT: image_gather4_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
347347
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
348348
; GFX10NSA-NEXT: ; return to shader part epilog
349349
;
@@ -778,7 +778,7 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
778778
; GFX10NSA-NEXT: s_mov_b32 s1, s3
779779
; GFX10NSA-NEXT: s_mov_b32 s2, s4
780780
; GFX10NSA-NEXT: s_mov_b32 s3, s5
781-
; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0
781+
; GFX10NSA-NEXT: v_lshl_or_b32 v0, v1, 16, v0
782782
; GFX10NSA-NEXT: s_mov_b32 s4, s6
783783
; GFX10NSA-NEXT: s_mov_b32 s5, s7
784784
; GFX10NSA-NEXT: s_mov_b32 s6, s8
@@ -787,7 +787,7 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
787787
; GFX10NSA-NEXT: s_mov_b32 s9, s11
788788
; GFX10NSA-NEXT: s_mov_b32 s10, s12
789789
; GFX10NSA-NEXT: s_mov_b32 s11, s13
790-
; GFX10NSA-NEXT: image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
790+
; GFX10NSA-NEXT: image_gather4_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
791791
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
792792
; GFX10NSA-NEXT: ; return to shader part epilog
793793
;

‎llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll

Lines changed: 54 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -28,12 +28,12 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s,
2828
; GFX10PLUS-NEXT: s_mov_b32 s1, s3
2929
; GFX10PLUS-NEXT: s_mov_b32 s2, s4
3030
; GFX10PLUS-NEXT: s_mov_b32 s3, s5
31-
; GFX10PLUS-NEXT: v_lshl_or_b32 v1, v1, 16, v0
31+
; GFX10PLUS-NEXT: v_lshl_or_b32 v0, v1, 16, v0
3232
; GFX10PLUS-NEXT: s_mov_b32 s4, s6
3333
; GFX10PLUS-NEXT: s_mov_b32 s5, s7
3434
; GFX10PLUS-NEXT: s_mov_b32 s6, s8
3535
; GFX10PLUS-NEXT: s_mov_b32 s7, s9
36-
; GFX10PLUS-NEXT: image_load v[0:3], v[1:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
36+
; GFX10PLUS-NEXT: image_load v[0:3], [v0, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
3737
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
3838
; GFX10PLUS-NEXT: ; return to shader part epilog
3939
;
@@ -88,55 +88,56 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr
8888
;
8989
; GFX10-LABEL: load_3d_v4f32_xyzw_tfe:
9090
; GFX10: ; %bb.0:
91-
; GFX10-NEXT: v_mov_b32_e32 v7, 0
91+
; GFX10-NEXT: v_mov_b32_e32 v6, 0
9292
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
93-
; GFX10-NEXT: v_mov_b32_e32 v6, v2
93+
; GFX10-NEXT: v_mov_b32_e32 v12, v2
9494
; GFX10-NEXT: s_mov_b32 s0, s2
9595
; GFX10-NEXT: s_mov_b32 s1, s3
96-
; GFX10-NEXT: v_mov_b32_e32 v8, v7
97-
; GFX10-NEXT: v_mov_b32_e32 v9, v7
98-
; GFX10-NEXT: v_mov_b32_e32 v10, v7
99-
; GFX10-NEXT: v_mov_b32_e32 v11, v7
100-
; GFX10-NEXT: v_lshl_or_b32 v5, v1, 16, v0
96+
; GFX10-NEXT: v_mov_b32_e32 v7, v6
97+
; GFX10-NEXT: v_mov_b32_e32 v8, v6
98+
; GFX10-NEXT: v_mov_b32_e32 v9, v6
99+
; GFX10-NEXT: v_mov_b32_e32 v10, v6
100+
; GFX10-NEXT: v_lshl_or_b32 v11, v1, 16, v0
101101
; GFX10-NEXT: s_mov_b32 s2, s4
102102
; GFX10-NEXT: s_mov_b32 s3, s5
103103
; GFX10-NEXT: s_mov_b32 s4, s6
104104
; GFX10-NEXT: s_mov_b32 s5, s7
105105
; GFX10-NEXT: s_mov_b32 s6, s8
106106
; GFX10-NEXT: s_mov_b32 s7, s9
107-
; GFX10-NEXT: v_mov_b32_e32 v0, v7
108-
; GFX10-NEXT: v_mov_b32_e32 v1, v8
109-
; GFX10-NEXT: v_mov_b32_e32 v2, v9
110-
; GFX10-NEXT: v_mov_b32_e32 v3, v10
111-
; GFX10-NEXT: v_mov_b32_e32 v4, v11
112-
; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
107+
; GFX10-NEXT: v_mov_b32_e32 v0, v6
108+
; GFX10-NEXT: v_mov_b32_e32 v1, v7
109+
; GFX10-NEXT: v_mov_b32_e32 v2, v8
110+
; GFX10-NEXT: v_mov_b32_e32 v3, v9
111+
; GFX10-NEXT: v_mov_b32_e32 v4, v10
112+
; GFX10-NEXT: image_load v[0:4], v[11:12], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
113113
; GFX10-NEXT: s_waitcnt vmcnt(0)
114-
; GFX10-NEXT: global_store_dword v7, v4, s[10:11]
114+
; GFX10-NEXT: global_store_dword v6, v4, s[10:11]
115115
; GFX10-NEXT: ; return to shader part epilog
116116
;
117117
; GFX11-LABEL: load_3d_v4f32_xyzw_tfe:
118118
; GFX11: ; %bb.0:
119-
; GFX11-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, 0
119+
; GFX11-NEXT: v_mov_b32_e32 v6, 0
120120
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
121121
; GFX11-NEXT: s_mov_b32 s0, s2
122122
; GFX11-NEXT: s_mov_b32 s1, s3
123123
; GFX11-NEXT: s_mov_b32 s2, s4
124-
; GFX11-NEXT: v_mov_b32_e32 v9, v7
125-
; GFX11-NEXT: v_mov_b32_e32 v8, v7
126-
; GFX11-NEXT: v_mov_b32_e32 v10, v7
127-
; GFX11-NEXT: v_mov_b32_e32 v11, v7
128-
; GFX11-NEXT: v_lshl_or_b32 v5, v1, 16, v0
124+
; GFX11-NEXT: v_mov_b32_e32 v7, v6
125+
; GFX11-NEXT: v_mov_b32_e32 v8, v6
126+
; GFX11-NEXT: v_mov_b32_e32 v9, v6
127+
; GFX11-NEXT: v_mov_b32_e32 v10, v6
128+
; GFX11-NEXT: v_mov_b32_e32 v12, v2
129+
; GFX11-NEXT: v_lshl_or_b32 v11, v1, 16, v0
129130
; GFX11-NEXT: s_mov_b32 s3, s5
130131
; GFX11-NEXT: s_mov_b32 s4, s6
131132
; GFX11-NEXT: s_mov_b32 s5, s7
132133
; GFX11-NEXT: s_mov_b32 s6, s8
133134
; GFX11-NEXT: s_mov_b32 s7, s9
134-
; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
135-
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
136-
; GFX11-NEXT: v_mov_b32_e32 v4, v11
137-
; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
135+
; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
136+
; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
137+
; GFX11-NEXT: v_mov_b32_e32 v4, v10
138+
; GFX11-NEXT: image_load v[0:4], v[11:12], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
138139
; GFX11-NEXT: s_waitcnt vmcnt(0)
139-
; GFX11-NEXT: global_store_b32 v7, v4, s[10:11]
140+
; GFX11-NEXT: global_store_b32 v6, v4, s[10:11]
140141
; GFX11-NEXT: ; return to shader part epilog
141142
;
142143
; GFX12-LABEL: load_3d_v4f32_xyzw_tfe:
@@ -200,55 +201,56 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
200201
;
201202
; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
202203
; GFX10: ; %bb.0:
203-
; GFX10-NEXT: v_mov_b32_e32 v7, 0
204+
; GFX10-NEXT: v_mov_b32_e32 v6, 0
204205
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
205-
; GFX10-NEXT: v_mov_b32_e32 v6, v2
206+
; GFX10-NEXT: v_mov_b32_e32 v12, v2
206207
; GFX10-NEXT: s_mov_b32 s0, s2
207208
; GFX10-NEXT: s_mov_b32 s1, s3
208-
; GFX10-NEXT: v_mov_b32_e32 v8, v7
209-
; GFX10-NEXT: v_mov_b32_e32 v9, v7
210-
; GFX10-NEXT: v_mov_b32_e32 v10, v7
211-
; GFX10-NEXT: v_mov_b32_e32 v11, v7
212-
; GFX10-NEXT: v_lshl_or_b32 v5, v1, 16, v0
209+
; GFX10-NEXT: v_mov_b32_e32 v7, v6
210+
; GFX10-NEXT: v_mov_b32_e32 v8, v6
211+
; GFX10-NEXT: v_mov_b32_e32 v9, v6
212+
; GFX10-NEXT: v_mov_b32_e32 v10, v6
213+
; GFX10-NEXT: v_lshl_or_b32 v11, v1, 16, v0
213214
; GFX10-NEXT: s_mov_b32 s2, s4
214215
; GFX10-NEXT: s_mov_b32 s3, s5
215216
; GFX10-NEXT: s_mov_b32 s4, s6
216217
; GFX10-NEXT: s_mov_b32 s5, s7
217218
; GFX10-NEXT: s_mov_b32 s6, s8
218219
; GFX10-NEXT: s_mov_b32 s7, s9
219-
; GFX10-NEXT: v_mov_b32_e32 v0, v7
220-
; GFX10-NEXT: v_mov_b32_e32 v1, v8
221-
; GFX10-NEXT: v_mov_b32_e32 v2, v9
222-
; GFX10-NEXT: v_mov_b32_e32 v3, v10
223-
; GFX10-NEXT: v_mov_b32_e32 v4, v11
224-
; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe
220+
; GFX10-NEXT: v_mov_b32_e32 v0, v6
221+
; GFX10-NEXT: v_mov_b32_e32 v1, v7
222+
; GFX10-NEXT: v_mov_b32_e32 v2, v8
223+
; GFX10-NEXT: v_mov_b32_e32 v3, v9
224+
; GFX10-NEXT: v_mov_b32_e32 v4, v10
225+
; GFX10-NEXT: image_load v[0:4], v[11:12], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe
225226
; GFX10-NEXT: s_waitcnt vmcnt(0)
226-
; GFX10-NEXT: global_store_dword v7, v4, s[10:11]
227+
; GFX10-NEXT: global_store_dword v6, v4, s[10:11]
227228
; GFX10-NEXT: ; return to shader part epilog
228229
;
229230
; GFX11-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
230231
; GFX11: ; %bb.0:
231-
; GFX11-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, 0
232+
; GFX11-NEXT: v_mov_b32_e32 v6, 0
232233
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
233234
; GFX11-NEXT: s_mov_b32 s0, s2
234235
; GFX11-NEXT: s_mov_b32 s1, s3
235236
; GFX11-NEXT: s_mov_b32 s2, s4
236-
; GFX11-NEXT: v_mov_b32_e32 v9, v7
237-
; GFX11-NEXT: v_mov_b32_e32 v8, v7
238-
; GFX11-NEXT: v_mov_b32_e32 v10, v7
239-
; GFX11-NEXT: v_mov_b32_e32 v11, v7
240-
; GFX11-NEXT: v_lshl_or_b32 v5, v1, 16, v0
237+
; GFX11-NEXT: v_mov_b32_e32 v7, v6
238+
; GFX11-NEXT: v_mov_b32_e32 v8, v6
239+
; GFX11-NEXT: v_mov_b32_e32 v9, v6
240+
; GFX11-NEXT: v_mov_b32_e32 v10, v6
241+
; GFX11-NEXT: v_mov_b32_e32 v12, v2
242+
; GFX11-NEXT: v_lshl_or_b32 v11, v1, 16, v0
241243
; GFX11-NEXT: s_mov_b32 s3, s5
242244
; GFX11-NEXT: s_mov_b32 s4, s6
243245
; GFX11-NEXT: s_mov_b32 s5, s7
244246
; GFX11-NEXT: s_mov_b32 s6, s8
245247
; GFX11-NEXT: s_mov_b32 s7, s9
246-
; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
247-
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
248-
; GFX11-NEXT: v_mov_b32_e32 v4, v11
249-
; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe
248+
; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
249+
; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
250+
; GFX11-NEXT: v_mov_b32_e32 v4, v10
251+
; GFX11-NEXT: image_load v[0:4], v[11:12], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe
250252
; GFX11-NEXT: s_waitcnt vmcnt(0)
251-
; GFX11-NEXT: global_store_b32 v7, v4, s[10:11]
253+
; GFX11-NEXT: global_store_b32 v6, v4, s[10:11]
252254
; GFX11-NEXT: ; return to shader part epilog
253255
;
254256
; GFX12-LABEL: load_3d_v4f32_xyzw_tfe_lwe:

‎llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,11 @@ define float @test() {
66
; GFX10: bb.0.bb:
77
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
88
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_256 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3, [[S_MOV_B32_]], %subreg.sub4, [[S_MOV_B32_]], %subreg.sub5, [[S_MOV_B32_]], %subreg.sub6, [[S_MOV_B32_]], %subreg.sub7
9-
; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
10-
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
11-
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
12-
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
13-
; GFX10-NEXT: [[IMAGE_LOAD_V2_V2_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_LOAD_V2_V2_gfx10 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], 3, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8)
14-
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_gfx10_]].sub1
15-
; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_gfx10_]].sub0
9+
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
10+
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
11+
; GFX10-NEXT: [[IMAGE_LOAD_V2_V2_nsa_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_LOAD_V2_V2_nsa_gfx10 [[COPY]], [[COPY1]], killed [[REG_SEQUENCE]], 3, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8)
12+
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub1
13+
; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub0
1614
; GFX10-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY2]], 0, killed [[COPY3]], 0, 0, implicit $mode, implicit $exec
1715
; GFX10-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
1816
; GFX10-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_1]]

‎llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -63,17 +63,16 @@ define void @issue92561(ptr addrspace(1) %arg) {
6363
; SDAG-NEXT: image_sample_c_lz v0, [v1, v1, v0, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
6464
; SDAG-NEXT: image_sample_c_lz v3, [v1, v1, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
6565
; SDAG-NEXT: image_sample_c_lz v2, [v1, v2, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
66-
; SDAG-NEXT: v_mov_b32_e32 v4, v1
6766
; SDAG-NEXT: s_waitcnt vmcnt(2)
6867
; SDAG-NEXT: v_add_f32_e32 v0, v9, v0
6968
; SDAG-NEXT: s_waitcnt vmcnt(0)
7069
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
7170
; SDAG-NEXT: v_add_f32_e32 v0, v2, v0
7271
; SDAG-NEXT: v_mov_b32_e32 v2, v1
73-
; SDAG-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_mov_b32 v3, v1
72+
; SDAG-NEXT: v_add_f32_e32 v0, v3, v0
7473
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
7574
; SDAG-NEXT: v_mul_f32_e32 v0, 0x3e800000, v0
76-
; SDAG-NEXT: image_store v[0:2], v[3:4], s[0:7] dim:SQ_RSRC_IMG_2D unorm
75+
; SDAG-NEXT: image_store v[0:2], [v1, v1], s[0:7] dim:SQ_RSRC_IMG_2D unorm
7776
; SDAG-NEXT: s_setpc_b64 s[30:31]
7877
;
7978
; GISEL-LABEL: issue92561:
@@ -131,18 +130,16 @@ define void @issue92561(ptr addrspace(1) %arg) {
131130
; GISEL-NEXT: image_sample_c_lz v0, [v2, v2, v0, v2], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
132131
; GISEL-NEXT: image_sample_c_lz v3, [v2, v3, v2, v2], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
133132
; GISEL-NEXT: image_sample_c_lz v4, [v2, v2, v2, v2], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
134-
; GISEL-NEXT: s_mov_b32 s21, s20
135133
; GISEL-NEXT: s_waitcnt vmcnt(2)
136134
; GISEL-NEXT: v_add_f32_e32 v0, v1, v0
137135
; GISEL-NEXT: s_waitcnt vmcnt(1)
138136
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
139137
; GISEL-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_mov_b32 v3, v2
140138
; GISEL-NEXT: s_waitcnt vmcnt(0)
141139
; GISEL-NEXT: v_add_f32_e32 v0, v4, v0
142-
; GISEL-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21
143-
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
140+
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
144141
; GISEL-NEXT: v_mul_f32_e32 v1, 0x3e800000, v0
145-
; GISEL-NEXT: image_store v[1:3], v[4:5], s[4:11] dim:SQ_RSRC_IMG_2D unorm
142+
; GISEL-NEXT: image_store v[1:3], [v2, v2], s[4:11] dim:SQ_RSRC_IMG_2D unorm
146143
; GISEL-NEXT: s_setpc_b64 s[30:31]
147144
bb:
148145
%descriptor = load <8 x i32>, ptr addrspace(1) %arg, align 32

‎llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,9 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
5656
; GFX10: ; %bb.0: ; %main_body
5757
; GFX10-NEXT: s_mov_b32 s12, exec_lo
5858
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
59-
; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
59+
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
6060
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
61-
; GFX10-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
61+
; GFX10-NEXT: image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
6262
; GFX10-NEXT: s_waitcnt vmcnt(0)
6363
; GFX10-NEXT: ; return to shader part epilog
6464
;
@@ -92,9 +92,9 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
9292
; GFX10: ; %bb.0: ; %main_body
9393
; GFX10-NEXT: s_mov_b32 s12, exec_lo
9494
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
95-
; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
95+
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
9696
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
97-
; GFX10-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
97+
; GFX10-NEXT: image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
9898
; GFX10-NEXT: s_waitcnt vmcnt(0)
9999
; GFX10-NEXT: ; return to shader part epilog
100100
;
@@ -164,9 +164,9 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
164164
; GFX10: ; %bb.0: ; %main_body
165165
; GFX10-NEXT: s_mov_b32 s12, exec_lo
166166
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
167-
; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
167+
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
168168
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
169-
; GFX10-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
169+
; GFX10-NEXT: image_gather4_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
170170
; GFX10-NEXT: s_waitcnt vmcnt(0)
171171
; GFX10-NEXT: ; return to shader part epilog
172172
;
@@ -382,8 +382,8 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
382382
;
383383
; GFX10-LABEL: gather4_l_2d:
384384
; GFX10: ; %bb.0: ; %main_body
385-
; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
386-
; GFX10-NEXT: image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
385+
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
386+
; GFX10-NEXT: image_gather4_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
387387
; GFX10-NEXT: s_waitcnt vmcnt(0)
388388
; GFX10-NEXT: ; return to shader part epilog
389389
;

‎llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -283,8 +283,8 @@ main_body:
283283
define amdgpu_ps <4 x float> @load_2dmsaa_a16(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %fragid) {
284284
; GFX11-LABEL: load_2dmsaa_a16:
285285
; GFX11: ; %bb.0: ; %main_body
286-
; GFX11-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
287-
; GFX11-NEXT: image_msaa_load v[0:3], v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x98,0x01,0x61,0xf0,0x01,0x00,0x00,0x00]
286+
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
287+
; GFX11-NEXT: image_msaa_load v[0:3], [v0, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x99,0x01,0x61,0xf0,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
288288
; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
289289
; GFX11-NEXT: ; return to shader part epilog
290290
;

‎llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX10-NONSA %s
1+
; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX10-NONSA %s
22
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-nsa-threshold=32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX10-NONSA %s
33
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-nsa-threshold=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T2 %s
4-
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX1010-NSA %s
5-
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX1030-NSA %s
6-
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX11-NONSA %s
4+
; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX1010-NSA %s
5+
; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX1030-NSA %s
6+
; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX11-NONSA %s
77
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX11-NONSA %s
88
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T2 %s
9-
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX11-NSA %s
9+
; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX11-NSA %s
1010

1111
; The default NSA threshold is now 2 addresses; the threshold-3 runs above pass -amdgpu-nsa-threshold=3 explicitly
1212
; GCN-LABEL: {{^}}sample_2d:

‎llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -107,19 +107,19 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
107107
; GFX10: ; %bb.0: ; %main_body
108108
; GFX10-NEXT: s_mov_b32 s12, exec_lo
109109
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
110-
; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
110+
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
111111
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
112-
; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
112+
; GFX10-NEXT: image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
113113
; GFX10-NEXT: s_waitcnt vmcnt(0)
114114
; GFX10-NEXT: ; return to shader part epilog
115115
;
116116
; GFX11-LABEL: sample_3d:
117117
; GFX11: ; %bb.0: ; %main_body
118118
; GFX11-NEXT: s_mov_b32 s12, exec_lo
119119
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
120-
; GFX11-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
120+
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
121121
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12
122-
; GFX11-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
122+
; GFX11-NEXT: image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
123123
; GFX11-NEXT: s_waitcnt vmcnt(0)
124124
; GFX11-NEXT: ; return to shader part epilog
125125
;
@@ -153,19 +153,19 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg
153153
; GFX10: ; %bb.0: ; %main_body
154154
; GFX10-NEXT: s_mov_b32 s12, exec_lo
155155
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
156-
; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
156+
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
157157
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
158-
; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
158+
; GFX10-NEXT: image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
159159
; GFX10-NEXT: s_waitcnt vmcnt(0)
160160
; GFX10-NEXT: ; return to shader part epilog
161161
;
162162
; GFX11-LABEL: sample_cube:
163163
; GFX11: ; %bb.0: ; %main_body
164164
; GFX11-NEXT: s_mov_b32 s12, exec_lo
165165
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
166-
; GFX11-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
166+
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
167167
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12
168-
; GFX11-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
168+
; GFX11-NEXT: image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
169169
; GFX11-NEXT: s_waitcnt vmcnt(0)
170170
; GFX11-NEXT: ; return to shader part epilog
171171
;
@@ -245,19 +245,19 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in
245245
; GFX10: ; %bb.0: ; %main_body
246246
; GFX10-NEXT: s_mov_b32 s12, exec_lo
247247
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
248-
; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
248+
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
249249
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
250-
; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
250+
; GFX10-NEXT: image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
251251
; GFX10-NEXT: s_waitcnt vmcnt(0)
252252
; GFX10-NEXT: ; return to shader part epilog
253253
;
254254
; GFX11-LABEL: sample_2darray:
255255
; GFX11: ; %bb.0: ; %main_body
256256
; GFX11-NEXT: s_mov_b32 s12, exec_lo
257257
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
258-
; GFX11-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
258+
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
259259
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12
260-
; GFX11-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
260+
; GFX11-NEXT: image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
261261
; GFX11-NEXT: s_waitcnt vmcnt(0)
262262
; GFX11-NEXT: ; return to shader part epilog
263263
;
@@ -424,19 +424,19 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
424424
; GFX10: ; %bb.0: ; %main_body
425425
; GFX10-NEXT: s_mov_b32 s12, exec_lo
426426
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
427-
; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
427+
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
428428
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
429-
; GFX10-NEXT: image_sample_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
429+
; GFX10-NEXT: image_sample_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
430430
; GFX10-NEXT: s_waitcnt vmcnt(0)
431431
; GFX10-NEXT: ; return to shader part epilog
432432
;
433433
; GFX11-LABEL: sample_cl_2d:
434434
; GFX11: ; %bb.0: ; %main_body
435435
; GFX11-NEXT: s_mov_b32 s12, exec_lo
436436
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
437-
; GFX11-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
437+
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
438438
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12
439-
; GFX11-NEXT: image_sample_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
439+
; GFX11-NEXT: image_sample_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
440440
; GFX11-NEXT: s_waitcnt vmcnt(0)
441441
; GFX11-NEXT: ; return to shader part epilog
442442
;
@@ -1304,15 +1304,15 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
13041304
;
13051305
; GFX10-LABEL: sample_l_2d:
13061306
; GFX10: ; %bb.0: ; %main_body
1307-
; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
1308-
; GFX10-NEXT: image_sample_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
1307+
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
1308+
; GFX10-NEXT: image_sample_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
13091309
; GFX10-NEXT: s_waitcnt vmcnt(0)
13101310
; GFX10-NEXT: ; return to shader part epilog
13111311
;
13121312
; GFX11-LABEL: sample_l_2d:
13131313
; GFX11: ; %bb.0: ; %main_body
1314-
; GFX11-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
1315-
; GFX11-NEXT: image_sample_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
1314+
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
1315+
; GFX11-NEXT: image_sample_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
13161316
; GFX11-NEXT: s_waitcnt vmcnt(0)
13171317
; GFX11-NEXT: ; return to shader part epilog
13181318
;

‎llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -110,28 +110,29 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32>
110110
; GFX10-NEXT: s_mov_b32 s14, exec_lo
111111
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
112112
; GFX10-NEXT: v_mov_b32_e32 v4, 0
113+
; GFX10-NEXT: v_mov_b32_e32 v3, v1
114+
; GFX10-NEXT: v_mov_b32_e32 v2, v0
113115
; GFX10-NEXT: v_mov_b32_e32 v5, v4
114-
; GFX10-NEXT: v_mov_b32_e32 v2, v4
115-
; GFX10-NEXT: v_mov_b32_e32 v3, v5
116+
; GFX10-NEXT: v_mov_b32_e32 v0, v4
117+
; GFX10-NEXT: v_mov_b32_e32 v1, v5
116118
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14
117-
; GFX10-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
119+
; GFX10-NEXT: image_sample v[0:1], v[2:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
118120
; GFX10-NEXT: s_waitcnt vmcnt(0)
119-
; GFX10-NEXT: v_mov_b32_e32 v0, v2
120-
; GFX10-NEXT: global_store_dword v4, v3, s[12:13]
121+
; GFX10-NEXT: global_store_dword v4, v1, s[12:13]
121122
; GFX10-NEXT: ; return to shader part epilog
122123
;
123124
; GFX11-LABEL: image_sample_2d_f16_tfe:
124125
; GFX11: ; %bb.0: ; %main_body
125126
; GFX11-NEXT: s_mov_b32 s14, exec_lo
126127
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
127-
; GFX11-NEXT: v_mov_b32_e32 v4, 0
128+
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, v1
129+
; GFX11-NEXT: v_mov_b32_e32 v2, v0
128130
; GFX11-NEXT: v_mov_b32_e32 v5, v4
129-
; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5
131+
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
130132
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14
131-
; GFX11-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
133+
; GFX11-NEXT: image_sample v[0:1], v[2:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
132134
; GFX11-NEXT: s_waitcnt vmcnt(0)
133-
; GFX11-NEXT: v_mov_b32_e32 v0, v2
134-
; GFX11-NEXT: global_store_b32 v4, v3, s[12:13]
135+
; GFX11-NEXT: global_store_b32 v4, v1, s[12:13]
135136
; GFX11-NEXT: ; return to shader part epilog
136137
;
137138
; GFX12-LABEL: image_sample_2d_f16_tfe:

‎llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -25,47 +25,44 @@ define amdgpu_ps float @vimage_move_to_valu(<8 x i32> %rsrc) {
2525
; GFX11-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
2626
; GFX11-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
2727
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3, [[COPY3]], %subreg.sub4, [[COPY2]], %subreg.sub5, [[COPY1]], %subreg.sub6, [[COPY]], %subreg.sub7
28-
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
29-
; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
30-
; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
31-
; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
32-
; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
28+
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
29+
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
3330
; GFX11-NEXT: {{ $}}
3431
; GFX11-NEXT: bb.1:
3532
; GFX11-NEXT: successors: %bb.2(0x80000000)
3633
; GFX11-NEXT: {{ $}}
3734
; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
3835
; GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec
39-
; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
40-
; GFX11-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec
36+
; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
37+
; GFX11-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec
4138
; GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec
4239
; GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec
43-
; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
44-
; GFX11-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec
40+
; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
41+
; GFX11-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec
4542
; GFX11-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
4643
; GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub4, implicit $exec
4744
; GFX11-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub5, implicit $exec
48-
; GFX11-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_4]], %subreg.sub0, [[V_READFIRSTLANE_B32_5]], %subreg.sub1
49-
; GFX11-NEXT: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE]].sub4_sub5, implicit $exec
45+
; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_4]], %subreg.sub0, [[V_READFIRSTLANE_B32_5]], %subreg.sub1
46+
; GFX11-NEXT: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub4_sub5, implicit $exec
5047
; GFX11-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U64_e64_2]], implicit-def $scc
5148
; GFX11-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub6, implicit $exec
5249
; GFX11-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub7, implicit $exec
53-
; GFX11-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_6]], %subreg.sub0, [[V_READFIRSTLANE_B32_7]], %subreg.sub1
54-
; GFX11-NEXT: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE]].sub6_sub7, implicit $exec
50+
; GFX11-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_6]], %subreg.sub0, [[V_READFIRSTLANE_B32_7]], %subreg.sub1
51+
; GFX11-NEXT: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE]].sub6_sub7, implicit $exec
5552
; GFX11-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_1]], [[V_CMP_EQ_U64_e64_3]], implicit-def $scc
56-
; GFX11-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_256 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3, [[V_READFIRSTLANE_B32_4]], %subreg.sub4, [[V_READFIRSTLANE_B32_5]], %subreg.sub5, [[V_READFIRSTLANE_B32_6]], %subreg.sub6, [[V_READFIRSTLANE_B32_7]], %subreg.sub7
53+
; GFX11-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_256 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3, [[V_READFIRSTLANE_B32_4]], %subreg.sub4, [[V_READFIRSTLANE_B32_5]], %subreg.sub5, [[V_READFIRSTLANE_B32_6]], %subreg.sub6, [[V_READFIRSTLANE_B32_7]], %subreg.sub7
5754
; GFX11-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_2]], implicit-def $exec, implicit-def $scc, implicit $exec
5855
; GFX11-NEXT: {{ $}}
5956
; GFX11-NEXT: bb.2:
6057
; GFX11-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
6158
; GFX11-NEXT: {{ $}}
62-
; GFX11-NEXT: [[IMAGE_LOAD_V1_V2_gfx11_:%[0-9]+]]:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 [[REG_SEQUENCE1]], killed [[REG_SEQUENCE6]], 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
59+
; GFX11-NEXT: [[IMAGE_LOAD_V1_V2_nsa_gfx11_:%[0-9]+]]:vgpr_32 = IMAGE_LOAD_V1_V2_nsa_gfx11 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE5]], 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
6360
; GFX11-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
6461
; GFX11-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
6562
; GFX11-NEXT: {{ $}}
6663
; GFX11-NEXT: bb.3:
67-
; GFX11-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_1]]
68-
; GFX11-NEXT: $vgpr0 = COPY [[IMAGE_LOAD_V1_V2_gfx11_]]
64+
; GFX11-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
65+
; GFX11-NEXT: $vgpr0 = COPY [[IMAGE_LOAD_V1_V2_nsa_gfx11_]]
6966
; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0
7067
;
7168
; GFX12-LABEL: name: vimage_move_to_valu

‎llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
357357
; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec
358358
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec
359359
; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
360-
; CHECK-NEXT: IMAGE_STORE_V4_V2_gfx10 [[V_CNDMASK_B32_e64_]], undef %556:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
360+
; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %556:vgpr_32, undef %558:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
361361
; CHECK-NEXT: S_ENDPGM 0
362362
.expVert:
363363
%0 = extractelement <31 x i32> %userData, i64 2

‎llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -635,7 +635,7 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe
635635
; SI-NEXT: bb.5:
636636
; SI-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000)
637637
; SI-NEXT: {{ $}}
638-
; SI-NEXT: [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %29:vreg_64, [[REG_SEQUENCE5]], killed [[REG_SEQUENCE8]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
638+
; SI-NEXT: [[IMAGE_SAMPLE_V1_V2_nsa_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_nsa_gfx10 undef %29:vgpr_32, undef %31:vgpr_32, [[REG_SEQUENCE5]], killed [[REG_SEQUENCE8]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
639639
; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc
640640
; SI-NEXT: SI_WATERFALL_LOOP %bb.4, implicit $exec
641641
; SI-NEXT: {{ $}}
@@ -648,7 +648,7 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe
648648
; SI-NEXT: {{ $}}
649649
; SI-NEXT: bb.7:
650650
; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]]
651-
; SI-NEXT: GLOBAL_STORE_DWORD undef %32:vreg_64, killed [[IMAGE_SAMPLE_V1_V2_gfx10_]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
651+
; SI-NEXT: GLOBAL_STORE_DWORD undef %34:vreg_64, killed [[IMAGE_SAMPLE_V1_V2_nsa_gfx10_]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
652652
; SI-NEXT: S_ENDPGM 0
653653
entry:
654654
%0 = tail call i32 @llvm.amdgcn.workitem.id.x()

‎llvm/test/CodeGen/AMDGPU/wqm.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2205,8 +2205,8 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
22052205
; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1
22062206
; GFX10-W32-NEXT: s_cbranch_scc0 .LBB39_2
22072207
; GFX10-W32-NEXT: ; %bb.1: ; %else
2208-
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
22092208
; GFX10-W32-NEXT: v_mov_b32_e32 v1, 1
2209+
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
22102210
; GFX10-W32-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D
22112211
; GFX10-W32-NEXT: s_cbranch_execz .LBB39_3
22122212
; GFX10-W32-NEXT: s_branch .LBB39_4

0 commit comments

Comments
 (0)
Please sign in to comment.