|
12 | 12 | ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
|
13 | 13 | ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
|
14 | 14 | ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
|
| 15 | +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s |
15 | 16 |
|
16 | 17 | define amdgpu_kernel void @private_nontemporal_load_0(
|
17 | 18 | ; GFX6-LABEL: private_nontemporal_load_0:
|
@@ -201,6 +202,17 @@ define amdgpu_kernel void @private_nontemporal_load_0(
|
201 | 202 | ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
|
202 | 203 | ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
|
203 | 204 | ; GFX12-CU-NEXT: s_endpgm
|
| 205 | +; |
| 206 | +; GFX1250-LABEL: private_nontemporal_load_0: |
| 207 | +; GFX1250: ; %bb.0: ; %entry |
| 208 | +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 |
| 209 | +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 |
| 210 | +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 |
| 211 | +; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| 212 | +; GFX1250-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT |
| 213 | +; GFX1250-NEXT: s_wait_loadcnt 0x0 |
| 214 | +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] |
| 215 | +; GFX1250-NEXT: s_endpgm |
204 | 216 | ptr addrspace(5) %in, ptr addrspace(1) %out) {
|
205 | 217 | entry:
|
206 | 218 | %val = load i32, ptr addrspace(5) %in, align 4, !nontemporal !0
|
@@ -450,6 +462,20 @@ define amdgpu_kernel void @private_nontemporal_load_1(
|
450 | 462 | ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
|
451 | 463 | ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
|
452 | 464 | ; GFX12-CU-NEXT: s_endpgm
|
| 465 | +; |
| 466 | +; GFX1250-LABEL: private_nontemporal_load_1: |
| 467 | +; GFX1250: ; %bb.0: ; %entry |
| 468 | +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 |
| 469 | +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 |
| 470 | +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 |
| 471 | +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 |
| 472 | +; GFX1250-NEXT: s_mov_b32 s3, 0x3ff |
| 473 | +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3 |
| 474 | +; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| 475 | +; GFX1250-NEXT: scratch_load_b32 v1, v1, s2 scale_offset th:TH_LOAD_NT |
| 476 | +; GFX1250-NEXT: s_wait_loadcnt 0x0 |
| 477 | +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] |
| 478 | +; GFX1250-NEXT: s_endpgm |
453 | 479 | ptr addrspace(5) %in, ptr addrspace(1) %out) {
|
454 | 480 | entry:
|
455 | 481 | %tid = call i32 @llvm.amdgcn.workitem.id.x()
|
@@ -627,6 +653,17 @@ define amdgpu_kernel void @private_nontemporal_store_0(
|
627 | 653 | ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
|
628 | 654 | ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT
|
629 | 655 | ; GFX12-CU-NEXT: s_endpgm
|
| 656 | +; |
| 657 | +; GFX1250-LABEL: private_nontemporal_store_0: |
| 658 | +; GFX1250: ; %bb.0: ; %entry |
| 659 | +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 |
| 660 | +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 |
| 661 | +; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| 662 | +; GFX1250-NEXT: s_load_b32 s1, s[2:3], 0x0 |
| 663 | +; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| 664 | +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 |
| 665 | +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT |
| 666 | +; GFX1250-NEXT: s_endpgm |
630 | 667 | ptr addrspace(1) %in, ptr addrspace(5) %out) {
|
631 | 668 | entry:
|
632 | 669 | %val = load i32, ptr addrspace(1) %in, align 4
|
@@ -846,6 +883,20 @@ define amdgpu_kernel void @private_nontemporal_store_1(
|
846 | 883 | ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
|
847 | 884 | ; GFX12-CU-NEXT: scratch_store_b32 v1, v0, s0 th:TH_STORE_NT
|
848 | 885 | ; GFX12-CU-NEXT: s_endpgm
|
| 886 | +; |
| 887 | +; GFX1250-LABEL: private_nontemporal_store_1: |
| 888 | +; GFX1250: ; %bb.0: ; %entry |
| 889 | +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 |
| 890 | +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 |
| 891 | +; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| 892 | +; GFX1250-NEXT: s_load_b32 s1, s[2:3], 0x0 |
| 893 | +; GFX1250-NEXT: s_wait_xcnt 0x0 |
| 894 | +; GFX1250-NEXT: s_mov_b32 s2, 0x3ff |
| 895 | +; GFX1250-NEXT: v_and_b32_e64 v1, v0, s2 |
| 896 | +; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| 897 | +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 |
| 898 | +; GFX1250-NEXT: scratch_store_b32 v1, v0, s0 scale_offset th:TH_STORE_NT |
| 899 | +; GFX1250-NEXT: s_endpgm |
849 | 900 | ptr addrspace(1) %in, ptr addrspace(5) %out) {
|
850 | 901 | entry:
|
851 | 902 | %tid = call i32 @llvm.amdgcn.workitem.id.x()
|
@@ -1047,6 +1098,17 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
|
1047 | 1098 | ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
|
1048 | 1099 | ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
|
1049 | 1100 | ; GFX12-CU-NEXT: s_endpgm
|
| 1101 | +; |
| 1102 | +; GFX1250-LABEL: private_nontemporal_volatile_load: |
| 1103 | +; GFX1250: ; %bb.0: ; %entry |
| 1104 | +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 |
| 1105 | +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 |
| 1106 | +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 |
| 1107 | +; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| 1108 | +; GFX1250-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS |
| 1109 | +; GFX1250-NEXT: s_wait_loadcnt 0x0 |
| 1110 | +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] |
| 1111 | +; GFX1250-NEXT: s_endpgm |
1050 | 1112 | ptr addrspace(5) %in, ptr addrspace(1) %out) {
|
1051 | 1113 | entry:
|
1052 | 1114 | %val = load volatile i32, ptr addrspace(5) %in, align 4, !nontemporal !0
|
|
0 commit comments