From d21307f84fe70a1cf7153d7d2d83dc16b52950eb Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 26 May 2025 19:22:31 +0100 Subject: [PATCH 1/3] Precommit tests --- .../rvv/fixed-vectors-deinterleave-load.ll | 191 +++++++++++++++- .../rvv/fixed-vectors-interleave-store.ll | 148 +++++++++++++ .../RISCV/rvv/vector-deinterleave-load.ll | 127 ++++++++++- .../RISCV/rvv/vector-interleave-store.ll | 207 +++++++++++++++++- .../RISCV/interleaved-accesses.ll | 166 +++++++++++++- 5 files changed, 826 insertions(+), 13 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll index 31529b1783651..6b6c64b54956b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll @@ -277,6 +277,55 @@ define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3(ptr %p define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor4(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor4: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v10, v8, 24 +; CHECK-NEXT: vslidedown.vi v12, v8, 16 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 8 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v12, v10, a0 +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv.v.v v9, v12 +; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vlseg4e8.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %vec = load <32 x i8>, ptr %p + %d0 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.vector.deinterleave4(<32 x i8> %vec) + %t0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 0 + %t1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 1 + %t2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 2 + %t3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 3 + %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0 + %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1 + %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2 + %res3 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res2, <8 x i8> %t3, 3 + ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3 +} + +; TODO: Remove once recursive deinterleaving support is removed +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor4_recursive(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_factor4_recursive: +; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vlseg4e8.v v8, (a0) ; CHECK-NEXT: ret @@ -319,6 +368,67 @@ define { <8 x i8>, <8 
x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res4 } +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor6(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_factor6: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; CHECK-NEXT: li a1, 48 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: li a0, 40 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: srli a2, a2, 1 +; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v12, v8, a0 +; CHECK-NEXT: add a0, a2, a2 +; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v14, v8, 24 +; CHECK-NEXT: vslidedown.vi v20, v8, 16 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v13, v8, 8 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v20, v14, a2 +; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v13, a2 +; CHECK-NEXT: vslideup.vx v16, v12, a2 +; CHECK-NEXT: vmv1r.v v9, v20 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv2r.v v10, v16 +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vlseg6e8.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %vec = load <48 x i8>, ptr %p + %d0 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.vector.deinterleave6(<48 x i8> %vec) + %t0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 0 + %t1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 1 + %t2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 2 + %t3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 3 + %t4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 4 + %t5 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 5 + %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0 + %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1 + %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2 + %res3 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res2, <8 x i8> %t3, 3 + %res4 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t4, 4 + %res5 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res4, <8 x i8> %t5, 5 + ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res5 +} + define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor7(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor7: ; CHECK: # %bb.0: @@ -339,14 +449,89 @@ define { <8 x i8>, <8 x i8>, <8 x 
i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2 %res3 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res2, <8 x i8> %t3, 3 %res4 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t4, 4 - %res5 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t5, 5 - %res6 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t6, 6 + %res5 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res4, <8 x i8> %t5, 5 + %res6 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res5, <8 x i8> %t6, 6 ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res6 } -define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave_load_factor8(ptr %ptr) { +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor8(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor8: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: li a0, 56 +; CHECK-NEXT: li a1, 48 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: li a0, 40 +; CHECK-NEXT: vslidedown.vx v12, v8, a1 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: srli a2, a2, 1 +; CHECK-NEXT: vslidedown.vx v20, v8, a0 +; CHECK-NEXT: add a0, a2, a2 +; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v14, v8, 24 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v12, v16, a2 +; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v22, v8, 16 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v22, v14, a2 +; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a1 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 8 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a2 +; CHECK-NEXT: vslideup.vx v16, v20, a2 +; CHECK-NEXT: vmv1r.v v9, v22 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v17, v12 +; CHECK-NEXT: vmv2r.v v10, v16 +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vlseg8e8.v v8, (a0) +; CHECK-NEXT: vmv1r.v v15, v14 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %vec = load <64 x i8>, ptr %p + %d0 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.vector.deinterleave8(<64 x i8> %vec) + %t0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 0 + %t1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, 
<8 x i8>, <8 x i8>, <8 x i8> } %d0, 1 + %t2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 2 + %t3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 3 + %t4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 4 + %t5 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 5 + %t6 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 6 + %t7 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 7 + %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0 + %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1 + %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2 + %res3 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res2, <8 x i8> %t3, 3 + %res4 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t4, 4 + %res5 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res4, <8 x i8> %t5, 5 + %res6 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res5, <8 x i8> %t6, 6 + %res7 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res6, <8 x i8> %t6, 7 + ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res7 +} + +; TODO: Remove once recursive deinterleaving support is removed +define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave_load_factor8_recursive(ptr %ptr) { +; CHECK-LABEL: vector_deinterleave_load_factor8_recursive: +; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vlseg8e32.v v8, (a0) ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll index 8244db45a7ef2..6497675bd56f1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll @@ -195,6 +195,45 @@ define void @vector_interleave_store_factor3(<4 x i32> %a, <4 x i32> %b, <4 x i3 define void @vector_interleave_store_factor4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor4: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: add a3, a1, a2 +; CHECK-NEXT: add a4, a3, a2 +; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg4e32.v v8, (a1) +; CHECK-NEXT: add a2, a4, a2 +; CHECK-NEXT: vl1re32.v v8, (a4) +; CHECK-NEXT: vl1re32.v v10, (a2) +; CHECK-NEXT: vl1re32.v v12, (a1) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; 
CHECK-NEXT: vl1re32.v v10, (a3) +; CHECK-NEXT: vslideup.vi v12, v10, 4 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vslideup.vi v12, v8, 8 +; CHECK-NEXT: vse32.v v12, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %v = call <16 x i32> @llvm.vector.interleave4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) + store <16 x i32> %v, ptr %p + ret void +} + +; TODO: Remove once recursive interleaving support is removed +define void @vector_interleave_store_factor4_recursive(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, ptr %p) { +; CHECK-LABEL: vector_interleave_store_factor4_recursive: +; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vsseg4e32.v v8, (a0) ; CHECK-NEXT: ret @@ -216,6 +255,60 @@ define void @vector_interleave_store_factor5(<4 x i32> %a, <4 x i32> %b, <4 x i3 ret void } +define void @vector_interleave_store_factor6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, ptr %p) { +; CHECK-LABEL: vector_interleave_store_factor6: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x06, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 6 * vlenb +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: add a4, a1, a2 +; CHECK-NEXT: add a5, a4, a2 +; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg6e32.v v8, (a1) +; CHECK-NEXT: vl1re32.v v12, (a5) +; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: vl1re32.v v10, (a5) +; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: vl1re32.v v14, (a4) +; CHECK-NEXT: vl1re32.v v8, (a1) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v12, v10, 4 +; CHECK-NEXT: vslideup.vi v8, v14, 4 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vslideup.vi v8, v12, 8 +; CHECK-NEXT: vl1re32.v v16, (a5) +; CHECK-NEXT: add a2, a5, a2 +; CHECK-NEXT: vl1re32.v v12, (a2) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v16, v12, 4 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vslideup.vi v8, v16, 16 +; CHECK-NEXT: vsetivli zero, 24, e32, m8, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %v = call <24 x i32> @llvm.vector.interleave6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f) + store <24 x i32> %v, ptr %p + ret void +} + define void @vector_interleave_store_factor7(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor7: ; CHECK: # %bb.0: @@ -230,6 +323,61 @@ define void @vector_interleave_store_factor7(<4 x i32> %a, <4 x i32> %b, <4 x i3 define void @vector_interleave_store_factor8(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x 
i32> %g, <4 x i32> %h, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor8: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: add a2, a1, a3 +; CHECK-NEXT: add a4, a2, a3 +; CHECK-NEXT: add a5, a4, a3 +; CHECK-NEXT: add a6, a5, a3 +; CHECK-NEXT: add a7, a6, a3 +; CHECK-NEXT: add t0, a7, a3 +; CHECK-NEXT: vsetvli t1, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg8e32.v v8, (a1) +; CHECK-NEXT: add a3, t0, a3 +; CHECK-NEXT: vl1re32.v v10, (a3) +; CHECK-NEXT: vl1re32.v v12, (t0) +; CHECK-NEXT: vl1re32.v v14, (a7) +; CHECK-NEXT: vl1re32.v v8, (a6) +; CHECK-NEXT: vl1re32.v v18, (a5) +; CHECK-NEXT: vl1re32.v v20, (a4) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v12, v10, 4 +; CHECK-NEXT: vslideup.vi v8, v14, 4 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vslideup.vi v8, v12, 8 +; CHECK-NEXT: vl1re32.v v16, (a1) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v20, v18, 4 +; CHECK-NEXT: vl1re32.v v12, (a2) +; CHECK-NEXT: vslideup.vi v16, v12, 4 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vslideup.vi v16, v20, 8 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vslideup.vi v16, v8, 16 +; CHECK-NEXT: vse32.v v16, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %v = call <32 x i32> @llvm.vector.interleave8(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h) + store <32 x i32> %v, ptr %p + ret void +} + +; TODO: Remove once recursive interleaving support is removed +define void @vector_interleave_store_factor8_recursive(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h, ptr %p) { +; CHECK-LABEL: vector_interleave_store_factor8_recursive: +; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vsseg8e32.v v8, (a0) ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index 0483bbbd35b39..b15ccdedda8f1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -364,6 +364,41 @@ define { , , } @vector_deint define { , , , } @vector_deinterleave_load_factor4(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor4: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; CHECK-NEXT: vl4r.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vlseg4e8.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; 
CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %vec = load , ptr %p + %d0 = call { , , , } @llvm.vector.deinterleave4( %vec) + %t0 = extractvalue { , , , } %d0, 0 + %t1 = extractvalue { , , , } %d0, 1 + %t2 = extractvalue { , , , } %d0, 2 + %t3 = extractvalue { , , , } %d0, 3 + %res0 = insertvalue { , , , } poison, %t0, 0 + %res1 = insertvalue { , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , } %res2, %t3, 3 + ret { , , , } %res3 +} + +; TODO: Remove once recursive deinterleaving support is removed +define { , , , } @vector_deinterleave_load_factor4_recursive(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_factor4_recursive: +; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vlseg4e8.v v8, (a0) ; CHECK-NEXT: ret @@ -406,6 +441,49 @@ define { , , , , , , , } %res4 } +define { , , , , , } @vector_deinterleave_load_factor6(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_factor6: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 1 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub a1, a1, a2 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vlseg6e8.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %vec = load , ptr %p + %d0 = call { , , , , , } @llvm.vector.deinterleave6( %vec) + %t0 = extractvalue { , , , , , } %d0, 0 + %t1 = extractvalue { , , , , , } %d0, 1 + %t2 = extractvalue { , , , , , } %d0, 2 + %t3 = extractvalue { , , , , , } %d0, 3 + %t4 = extractvalue { , , , , , } %d0, 4 + %t5 = extractvalue { , , , , , } %d0, 5 + %res0 = insertvalue { , , , , , } poison, %t0, 0 + %res1 = insertvalue { , , , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , , , } %res2, %t3, 3 + %res4 = insertvalue { , , , , , } %res3, %t4, 4 + %res5 = insertvalue { , , , , , } %res4, %t5, 5 + ret { , , , , , } %res5 +} + define { , , , , , , } @vector_deinterleave_load_factor7(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor7: ; CHECK: # %bb.0: @@ -426,14 +504,57 @@ define { , , , , , , , , , } %res1, %t2, 2 %res3 = insertvalue { , , , , , , } %res2, %t3, 3 %res4 = insertvalue { , , , , , , } %res3, %t4, 4 - %res5 = insertvalue { , , , , , , } %res3, %t5, 5 - %res6 = insertvalue { , , , , , , } %res3, %t6, 6 + %res5 = insertvalue { , , , , , , } %res4, %t5, 5 + %res6 = insertvalue { , , , , , , } %res5, %t6, 6 ret { , , , , , , } %res6 } -define {, , , , , , , } @vector_deinterleave_load_factor8(ptr %ptr) { +define { , , , , , , , } @vector_deinterleave_load_factor8(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor8: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 
8 * vlenb +; CHECK-NEXT: vl8r.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vlseg8e8.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %vec = load , ptr %p + %d0 = call { , , , , , , , } @llvm.vector.deinterleave8( %vec) + %t0 = extractvalue { , , , , , , , } %d0, 0 + %t1 = extractvalue { , , , , , , , } %d0, 1 + %t2 = extractvalue { , , , , , , , } %d0, 2 + %t3 = extractvalue { , , , , , , , } %d0, 3 + %t4 = extractvalue { , , , , , , , } %d0, 4 + %t5 = extractvalue { , , , , , , , } %d0, 5 + %t6 = extractvalue { , , , , , , , } %d0, 6 + %t7 = extractvalue { , , , , , , , } %d0, 7 + %res0 = insertvalue { , , , , , , , } poison, %t0, 0 + %res1 = insertvalue { , , , , , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , , , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , , , , , } %res2, %t3, 3 + %res4 = insertvalue { , , , , , , , } %res3, %t4, 4 + %res5 = insertvalue { , , , , , , , } %res4, %t5, 5 + %res6 = insertvalue { , , , , , , , } %res5, %t6, 6 + %res7 = insertvalue { , , , , , , , } %res6, %t7, 7 + ret { , , , , , , , } %res7 +} + +; TODO: Remove once recursive deinterleaving support is removed +define {, , , , , , , } @vector_deinterleave_load_factor8_recursive(ptr %ptr) { +; CHECK-LABEL: vector_deinterleave_load_factor8_recursive: +; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vlseg8e32.v v8, (a0) ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll index 4332ca411d91b..26d387baa5e22 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll @@ -250,9 +250,43 @@ define void @vector_interleave_store_factor3( %a, %a, %b, %c, %d, ptr %p) { +define void @vector_interleave_store_factor4( %a, %b, %c, %d, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor4: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: add a3, a1, a2 +; CHECK-NEXT: add a4, a3, a2 +; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg4e32.v v8, (a1) +; CHECK-NEXT: vl1re32.v v10, (a4) +; CHECK-NEXT: add a2, a4, a2 +; CHECK-NEXT: vl1re32.v v11, (a2) +; CHECK-NEXT: vl1re32.v v8, (a1) +; CHECK-NEXT: vl1re32.v v9, (a3) +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %v = call @llvm.vector.interleave4( %a, %b, %c, %d) + store %v, ptr %p + ret void +} + +; TODO: Remove once recursive interleaving support is removed +define void @vector_interleave_store_factor4_recursive( %a, %b, %c, %d, ptr %p) { +; CHECK-LABEL: vector_interleave_store_factor4_recursive: +; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; CHECK-NEXT: vsseg4e32.v v8, (a0) ; CHECK-NEXT: ret @@ -274,6 +308,135 @@ define void 
@vector_interleave_store_factor5( %a, %a, %b, %c, %d, %e, %f, ptr %p) { +; RV32-LABEL: vector_interleave_store_factor6: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: add a2, a2, a1 +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 14 * vlenb +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: add a3, a1, a2 +; RV32-NEXT: add a4, a3, a2 +; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma +; RV32-NEXT: vsseg6e32.v v8, (a1) +; RV32-NEXT: vl1re32.v v10, (a4) +; RV32-NEXT: add a4, a4, a2 +; RV32-NEXT: vl1re32.v v11, (a4) +; RV32-NEXT: add a4, a4, a2 +; RV32-NEXT: vl1re32.v v8, (a1) +; RV32-NEXT: vl1re32.v v9, (a3) +; RV32-NEXT: vl1re32.v v12, (a4) +; RV32-NEXT: add a4, a4, a2 +; RV32-NEXT: vl1re32.v v13, (a4) +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: mv s0, a0 +; RV32-NEXT: srli a0, a2, 3 +; RV32-NEXT: li a1, 12 +; RV32-NEXT: call __mulsi3 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vse32.v v8, (s0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: add a1, a1, a0 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 32 +; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_store_factor6: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 1 +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, a1, 1 +; RV64-NEXT: add a2, a2, a1 +; RV64-NEXT: slli a1, a1, 1 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 14 * vlenb +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: add a3, a1, a2 +; RV64-NEXT: add a4, a3, a2 +; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma +; RV64-NEXT: vsseg6e32.v v8, (a1) +; RV64-NEXT: vl1re32.v v10, (a4) +; RV64-NEXT: add a4, a4, a2 +; RV64-NEXT: vl1re32.v v11, (a4) +; RV64-NEXT: add a4, a4, a2 +; RV64-NEXT: vl1re32.v v8, (a1) +; RV64-NEXT: vl1re32.v v9, (a3) +; RV64-NEXT: vl1re32.v v12, (a4) +; RV64-NEXT: add a4, a4, a2 +; RV64-NEXT: vl1re32.v v13, (a4) +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV64-NEXT: mv s0, a0 +; 
RV64-NEXT: srli a0, a2, 3 +; RV64-NEXT: li a1, 12 +; RV64-NEXT: call __muldi3 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-NEXT: vse32.v v8, (s0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 1 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: slli a0, a0, 1 +; RV64-NEXT: add a1, a1, a0 +; RV64-NEXT: slli a0, a0, 1 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 32 +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: .cfi_restore s0 +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret + %v = call @llvm.vector.interleave6( %a, %b, %c, %d, %e, %f) + store %v, ptr %p + ret void +} + define void @vector_interleave_store_factor7( %a, %b, %c, %d, %e, %f, %g, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor7: ; CHECK: # %bb.0: @@ -288,6 +451,48 @@ define void @vector_interleave_store_factor7( %a, %a, %b, %c, %d, %e, %f, %g, %h, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor8: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: add a3, a1, a2 +; CHECK-NEXT: add a4, a3, a2 +; CHECK-NEXT: add a5, a4, a2 +; CHECK-NEXT: add a6, a5, a2 +; CHECK-NEXT: add a7, a6, a2 +; CHECK-NEXT: add t0, a7, a2 +; CHECK-NEXT: vsetvli t1, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg8e32.v v8, (a1) +; CHECK-NEXT: vl1re32.v v14, (t0) +; CHECK-NEXT: add a2, t0, a2 +; CHECK-NEXT: vl1re32.v v15, (a2) +; CHECK-NEXT: vl1re32.v v12, (a6) +; CHECK-NEXT: vl1re32.v v13, (a7) +; CHECK-NEXT: vl1re32.v v10, (a4) +; CHECK-NEXT: vl1re32.v v11, (a5) +; CHECK-NEXT: vl1re32.v v8, (a1) +; CHECK-NEXT: vl1re32.v v9, (a3) +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %v = call @llvm.vector.interleave8( %a, %b, %c, %d, %e, %f, %g, %h) + store %v, ptr %p + ret void +} + +; TODO: Remove once recursive interleaving support is removed +define void @vector_interleave_store_factor8_recursive( %a, %b, %c, %d, %e, %f, %g, %h, ptr %p) { +; CHECK-LABEL: vector_interleave_store_factor8_recursive: +; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vsseg8e32.v v8, (a0) ; CHECK-NEXT: ret diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll index 0a20e03d0dff1..ebdeb70538d4a 100644 --- a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll @@ -135,6 +135,35 @@ define void @load_factor4(ptr %ptr) { define void @load_factor4_vscale(ptr %ptr) { ; RV32-LABEL: @load_factor4_vscale( +; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load , ptr [[PTR:%.*]], align 64 +; RV32-NEXT: [[TMP9:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[INTERLEAVED_VEC]]) +; RV32-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP9]], 0 +; RV32-NEXT: 
[[TMP11:%.*]] = extractvalue { , , , } [[TMP9]], 1 +; RV32-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[TMP9]], 2 +; RV32-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP9]], 3 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor4_vscale( +; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load , ptr [[PTR:%.*]], align 64 +; RV64-NEXT: [[TMP9:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[INTERLEAVED_VEC]]) +; RV64-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP9]], 0 +; RV64-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP9]], 1 +; RV64-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[TMP9]], 2 +; RV64-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP9]], 3 +; RV64-NEXT: ret void +; + %interleaved.vec = load , ptr %ptr + %v = call { , , , } @llvm.vector.deinterleave4.nxv16i32( %interleaved.vec) + %t0 = extractvalue { , , , } %v, 0 + %t1 = extractvalue { , , , } %v, 1 + %t2 = extractvalue { , , , } %v, 2 + %t3 = extractvalue { , , , } %v, 3 + ret void +} + +; TODO: Remove once recursive deinterleaving support is removed +define void @load_factor4_vscale_recursive(ptr %ptr) { +; RV32-LABEL: @load_factor4_vscale_recursive( ; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t.p0.i32(target("riscv.vector.tuple", , 4) poison, ptr [[PTR:%.*]], i32 -1, i32 5) ; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 0) ; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , } poison, [[TMP2]], 0 @@ -150,7 +179,7 @@ define void @load_factor4_vscale(ptr %ptr) { ; RV32-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP9]], 3 ; RV32-NEXT: ret void ; -; RV64-LABEL: @load_factor4_vscale( +; RV64-LABEL: @load_factor4_vscale_recursive( ; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t.p0.i64(target("riscv.vector.tuple", , 4) poison, ptr [[PTR:%.*]], i64 -1, i64 5) ; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 0) ; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , } poison, [[TMP2]], 0 @@ -287,6 +316,40 @@ define void @load_factor6(ptr %ptr) { ret void } +define void @load_factor6_vscale(ptr %ptr) { +; RV32-LABEL: @load_factor6_vscale( +; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load , ptr [[PTR:%.*]], align 64 +; RV32-NEXT: [[TMP13:%.*]] = call { , , , , , } @llvm.vector.deinterleave6.nxv12i32( [[INTERLEAVED_VEC]]) +; RV32-NEXT: [[TMP14:%.*]] = extractvalue { , , , , , } [[TMP13]], 0 +; RV32-NEXT: [[TMP15:%.*]] = extractvalue { , , , , , } [[TMP13]], 1 +; RV32-NEXT: [[TMP16:%.*]] = extractvalue { , , , , , } [[TMP13]], 2 +; RV32-NEXT: [[TMP17:%.*]] = extractvalue { , , , , , } [[TMP13]], 3 +; RV32-NEXT: [[TMP18:%.*]] = extractvalue { , , , , , } [[TMP13]], 4 +; RV32-NEXT: [[TMP19:%.*]] = extractvalue { , , , , , } [[TMP13]], 5 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor6_vscale( +; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load , ptr [[PTR:%.*]], align 64 +; RV64-NEXT: [[TMP13:%.*]] = call { , , , , , } @llvm.vector.deinterleave6.nxv12i32( [[INTERLEAVED_VEC]]) +; RV64-NEXT: [[TMP14:%.*]] = extractvalue { , , , , , } [[TMP13]], 0 +; RV64-NEXT: [[TMP15:%.*]] = extractvalue { , , , , , } [[TMP13]], 1 +; RV64-NEXT: [[TMP16:%.*]] = extractvalue { , , , , , } [[TMP13]], 2 +; RV64-NEXT: [[TMP17:%.*]] = extractvalue { , , , , , } [[TMP13]], 3 +; RV64-NEXT: [[TMP18:%.*]] = extractvalue { , , , , , } 
[[TMP13]], 4 +; RV64-NEXT: [[TMP19:%.*]] = extractvalue { , , , , , } [[TMP13]], 5 +; RV64-NEXT: ret void +; + %interleaved.vec = load , ptr %ptr + %v = call { , , , , , } @llvm.vector.deinterleave6.nxv12i32( %interleaved.vec) + %t0 = extractvalue { , , , , , } %v, 0 + %t1 = extractvalue { , , , , , } %v, 1 + %t2 = extractvalue { , , , , , } %v, 2 + %t3 = extractvalue { , , , , , } %v, 3 + %t4 = extractvalue { , , , , , } %v, 4 + %t5 = extractvalue { , , , , , } %v, 5 + ret void +} + define void @load_factor7(ptr %ptr) { ; RV32-LABEL: @load_factor7( ; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg7.load.mask.v4i32.p0.i32(ptr [[PTR:%.*]], <4 x i1> splat (i1 true), i32 4) @@ -423,6 +486,47 @@ define void @load_factor8(ptr %ptr) { define void @load_factor8_vscale(ptr %ptr) { ; RV32-LABEL: @load_factor8_vscale( +; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load , ptr [[PTR:%.*]], align 64 +; RV32-NEXT: [[TMP17:%.*]] = call { , , , , , , , } @llvm.vector.deinterleave8.nxv16i32( [[INTERLEAVED_VEC]]) +; RV32-NEXT: [[TMP18:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 0 +; RV32-NEXT: [[TMP19:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 1 +; RV32-NEXT: [[TMP20:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 2 +; RV32-NEXT: [[TMP21:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 3 +; RV32-NEXT: [[TMP22:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 4 +; RV32-NEXT: [[TMP23:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 5 +; RV32-NEXT: [[TMP24:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 6 +; RV32-NEXT: [[TMP25:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 7 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor8_vscale( +; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load , ptr [[PTR:%.*]], align 64 +; RV64-NEXT: [[TMP17:%.*]] = call { , , , , , , , } @llvm.vector.deinterleave8.nxv16i32( [[INTERLEAVED_VEC]]) +; RV64-NEXT: [[TMP18:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 0 +; RV64-NEXT: [[TMP19:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 1 +; RV64-NEXT: [[TMP20:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 2 +; RV64-NEXT: [[TMP21:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 3 +; RV64-NEXT: [[TMP22:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 4 +; RV64-NEXT: [[TMP23:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 5 +; RV64-NEXT: [[TMP24:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 6 +; RV64-NEXT: [[TMP25:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 7 +; RV64-NEXT: ret void +; + %interleaved.vec = load , ptr %ptr + %v = call { , , , , , , , } @llvm.vector.deinterleave8.nxv16i32( %interleaved.vec) + %t0 = extractvalue { , , , , , , , } %v, 0 + %t1 = extractvalue { , , , , , , , } %v, 1 + %t2 = extractvalue { , , , , , , , } %v, 2 + %t3 = extractvalue { , , , , , , , } %v, 3 + %t4 = extractvalue { , , , , , , , } %v, 4 + %t5 = extractvalue { , , , , , , , } %v, 5 + %t6 = extractvalue { , , , , , , , } %v, 6 + %t7 = extractvalue { , , , , , , , } %v, 7 + ret void +} + +; TODO: Remove once recursive deinterleaving support is removed +define void @load_factor8_vscale_recursive(ptr %ptr) { +; RV32-LABEL: @load_factor8_vscale_recursive( ; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t.p0.i32(target("riscv.vector.tuple", , 8) poison, ptr [[PTR:%.*]], i32 -1, i32 5) ; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 0) ; 
RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , , , } poison, [[TMP2]], 0 @@ -450,7 +554,7 @@ define void @load_factor8_vscale(ptr %ptr) { ; RV32-NEXT: [[TMP25:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 7 ; RV32-NEXT: ret void ; -; RV64-LABEL: @load_factor8_vscale( +; RV64-LABEL: @load_factor8_vscale_recursive( ; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", , 8) poison, ptr [[PTR:%.*]], i64 -1, i64 5) ; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 0) ; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , , , } poison, [[TMP2]], 0 @@ -616,8 +720,25 @@ define void @store_factor4(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2 ret void } -define void @store_factor4_vscale(ptr %ptr, %v0, %v1) { +define void @store_factor4_vscale(ptr %ptr, %v0, %v1, %v2, %v3) { ; RV32-LABEL: @store_factor4_vscale( +; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv32i8( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]]) +; RV32-NEXT: store [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 32 +; RV32-NEXT: ret void +; +; RV64-LABEL: @store_factor4_vscale( +; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv32i8( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]]) +; RV64-NEXT: store [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 32 +; RV64-NEXT: ret void +; + %interleaved.vec = call @llvm.vector.interleave4.nxv8i8( %v0, %v1, %v2, %v3) + store %interleaved.vec, ptr %ptr + ret void +} + +; TODO: Remove once recursive interleaving support is removed +define void @store_factor4_vscale_recursive(ptr %ptr, %v0, %v1) { +; RV32-LABEL: @store_factor4_vscale_recursive( ; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) poison, [[V0:%.*]], i32 0) ; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP1]], [[V0]], i32 1) ; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP2]], [[V1:%.*]], i32 2) @@ -625,7 +746,7 @@ define void @store_factor4_vscale(ptr %ptr, %v0, , 4) [[TMP4]], ptr [[PTR:%.*]], i32 -1, i32 3) ; RV32-NEXT: ret void ; -; RV64-LABEL: @store_factor4_vscale( +; RV64-LABEL: @store_factor4_vscale_recursive( ; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) poison, [[V0:%.*]], i32 0) ; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP1]], [[V0]], i32 1) ; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP2]], [[V1:%.*]], i32 2) @@ -736,6 +857,22 @@ define void @store_factor4_wide(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32 ret void } +define void @store_factor6_vscale(ptr %ptr, %v0, %v1, %v2, %v3, %v4, %v5) { +; RV32-LABEL: @store_factor6_vscale( +; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave6.nxv48i8( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], [[V4:%.*]], [[V5:%.*]]) +; 
RV32-NEXT: store [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 64 +; RV32-NEXT: ret void +; +; RV64-LABEL: @store_factor6_vscale( +; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave6.nxv48i8( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], [[V4:%.*]], [[V5:%.*]]) +; RV64-NEXT: store [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 64 +; RV64-NEXT: ret void +; + %interleaved.vec = call @llvm.vector.interleave6.nxv8i8( %v0, %v1, %v2, %v3, %v4, %v5) + store %interleaved.vec, ptr %ptr + ret void +} + define void @store_factor7_vscale(ptr %ptr, %v0, %v1, %v2, %v3, %v4, %v5, %v6) { ; RV32-LABEL: @store_factor7_vscale( ; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) poison, [[V0:%.*]], i32 0) @@ -764,8 +901,25 @@ define void @store_factor7_vscale(ptr %ptr, %v0, %v0, %v1, %v2, %v3) { +define void @store_factor8_vscale(ptr %ptr, %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) { ; RV32-LABEL: @store_factor8_vscale( +; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave8.nxv64i8( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], [[V4:%.*]], [[V5:%.*]], [[V6:%.*]], [[V7:%.*]]) +; RV32-NEXT: store [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 64 +; RV32-NEXT: ret void +; +; RV64-LABEL: @store_factor8_vscale( +; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave8.nxv64i8( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], [[V4:%.*]], [[V5:%.*]], [[V6:%.*]], [[V7:%.*]]) +; RV64-NEXT: store [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 64 +; RV64-NEXT: ret void +; + %interleaved.vec = call @llvm.vector.interleave8.nxv8i8( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) + store %interleaved.vec, ptr %ptr + ret void +} + +; TODO: Remove once recursive interleaving support is removed +define void @store_factor8_vscale_recursive(ptr %ptr, %v0, %v1, %v2, %v3) { +; RV32-LABEL: @store_factor8_vscale_recursive( ; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) poison, [[V0:%.*]], i32 0) ; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP1]], [[V2:%.*]], i32 1) ; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP2]], [[V0]], i32 2) @@ -777,7 +931,7 @@ define void @store_factor8_vscale(ptr %ptr, %v0, , 8) [[TMP8]], ptr [[PTR:%.*]], i32 -1, i32 3) ; RV32-NEXT: ret void ; -; RV64-LABEL: @store_factor8_vscale( +; RV64-LABEL: @store_factor8_vscale_recursive( ; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) poison, [[V0:%.*]], i32 0) ; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP1]], [[V2:%.*]], i32 1) ; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP2]], [[V0]], i32 2) From 9e9c45da341de180b8fc3358b5bd6ed125c4a099 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 26 May 2025 17:07:10 +0100 Subject: [PATCH 2/3] [IA] Add support for [de]interleave{4,6,8} --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 46 
++++- .../rvv/fixed-vectors-deinterleave-load.ll | 119 +---------- .../rvv/fixed-vectors-interleave-store.ll | 125 +----------- .../RISCV/rvv/vector-deinterleave-load.ll | 50 ----- .../RISCV/rvv/vector-interleave-store.ll | 188 +----------------- .../RISCV/interleaved-accesses.ll | 144 +++++++++++--- 6 files changed, 175 insertions(+), 497 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 960c7956e0011..b684885b87a43 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -579,12 +579,21 @@ static unsigned getIntrinsicFactor(const IntrinsicInst *II) { case Intrinsic::vector_deinterleave3: case Intrinsic::vector_interleave3: return 3; + case Intrinsic::vector_deinterleave4: + case Intrinsic::vector_interleave4: + return 4; case Intrinsic::vector_deinterleave5: case Intrinsic::vector_interleave5: return 5; + case Intrinsic::vector_deinterleave6: + case Intrinsic::vector_interleave6: + return 6; case Intrinsic::vector_deinterleave7: case Intrinsic::vector_interleave7: return 7; + case Intrinsic::vector_deinterleave8: + case Intrinsic::vector_interleave8: + return 8; default: llvm_unreachable("Unexpected intrinsic"); } @@ -605,10 +614,9 @@ static unsigned getIntrinsicFactor(const IntrinsicInst *II) { // to reorder them by interleaving these values. static void interleaveLeafValues(MutableArrayRef SubLeaves) { unsigned NumLeaves = SubLeaves.size(); - if (NumLeaves == 2 || !isPowerOf2_64(NumLeaves)) - return; - assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1); + if (NumLeaves == 2) + return; const unsigned HalfLeaves = NumLeaves / 2; // Visit the sub-trees. @@ -629,8 +637,11 @@ getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl &Operands, SmallVectorImpl &DeadInsts) { assert(II->getIntrinsicID() == Intrinsic::vector_interleave2 || II->getIntrinsicID() == Intrinsic::vector_interleave3 || + II->getIntrinsicID() == Intrinsic::vector_interleave4 || II->getIntrinsicID() == Intrinsic::vector_interleave5 || - II->getIntrinsicID() == Intrinsic::vector_interleave7); + II->getIntrinsicID() == Intrinsic::vector_interleave6 || + II->getIntrinsicID() == Intrinsic::vector_interleave7 || + II->getIntrinsicID() == Intrinsic::vector_interleave8); // Visit with BFS SmallVector Queue; @@ -660,13 +671,17 @@ getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl &Operands, } const unsigned Factor = Operands.size(); - // Currently we only recognize factors of 3, 5, 7, and powers of 2. + // Currently we only recognize factors 2...8 and other powers of 2. // FIXME: should we assert here instead? 
if (Factor <= 1 || (!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II))) return false; - interleaveLeafValues(Operands); + // Recursively interleaved factors need to have their values reordered + // TODO: Remove once the loop vectorizer no longer recursively interleaves + // factors 4 + 8 + if (isPowerOf2_32(Factor) && getIntrinsicFactor(II) == 2) + interleaveLeafValues(Operands); return true; } @@ -676,8 +691,11 @@ getVectorDeinterleaveFactor(IntrinsicInst *II, SmallVectorImpl &DeadInsts) { assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2 || II->getIntrinsicID() == Intrinsic::vector_deinterleave3 || + II->getIntrinsicID() == Intrinsic::vector_deinterleave4 || II->getIntrinsicID() == Intrinsic::vector_deinterleave5 || - II->getIntrinsicID() == Intrinsic::vector_deinterleave7); + II->getIntrinsicID() == Intrinsic::vector_deinterleave6 || + II->getIntrinsicID() == Intrinsic::vector_deinterleave7 || + II->getIntrinsicID() == Intrinsic::vector_deinterleave8); using namespace PatternMatch; if (!II->hasNUses(getIntrinsicFactor(II))) return false; @@ -737,13 +755,17 @@ getVectorDeinterleaveFactor(IntrinsicInst *II, } const unsigned Factor = Results.size(); - // Currently we only recognize factors of 3, 5, 7, and powers of 2. + // Currently we only recognize factors of 2...8 and other powers of 2. // FIXME: should we assert here instead? if (Factor <= 1 || (!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II))) return 0; - interleaveLeafValues(Results); + // Recursively interleaved factors need to have their values reordered + // TODO: Remove once the loop vectorizer no longer recursively interleaves + // factors 4 + 8 + if (isPowerOf2_32(Factor) && getIntrinsicFactor(II) == 2) + interleaveLeafValues(Results); return true; } @@ -907,14 +929,20 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) { switch (II->getIntrinsicID()) { case Intrinsic::vector_deinterleave2: case Intrinsic::vector_deinterleave3: + case Intrinsic::vector_deinterleave4: case Intrinsic::vector_deinterleave5: + case Intrinsic::vector_deinterleave6: case Intrinsic::vector_deinterleave7: + case Intrinsic::vector_deinterleave8: Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts); break; case Intrinsic::vector_interleave2: case Intrinsic::vector_interleave3: + case Intrinsic::vector_interleave4: case Intrinsic::vector_interleave5: + case Intrinsic::vector_interleave6: case Intrinsic::vector_interleave7: + case Intrinsic::vector_interleave8: Changed |= lowerInterleaveIntrinsic(II, DeadInsts); break; default: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll index 6b6c64b54956b..c2ae1ce491389 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll @@ -277,37 +277,8 @@ define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3(ptr %p define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor4(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor4: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: csrr a0, 
vlenb -; CHECK-NEXT: srli a0, a0, 1 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v8, 24 -; CHECK-NEXT: vslidedown.vi v12, v8, 16 -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 8 -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vx v12, v10, a0 -; CHECK-NEXT: vslideup.vx v8, v9, a0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vmv.v.v v9, v12 -; CHECK-NEXT: vs2r.v v8, (a0) -; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vlseg4e8.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vec = load <32 x i8>, ptr %p %d0 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.vector.deinterleave4(<32 x i8> %vec) @@ -371,46 +342,8 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor6(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor6: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; CHECK-NEXT: li a1, 48 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: li a0, 40 -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: srli a2, a2, 1 -; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v12, v8, a0 -; CHECK-NEXT: add a0, a2, a2 -; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v14, v8, 24 -; CHECK-NEXT: vslidedown.vi v20, v8, 16 -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v13, v8, 8 -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vx v20, v14, a2 -; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a1 -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vx v8, v13, a2 -; CHECK-NEXT: vslideup.vx v16, v12, a2 -; CHECK-NEXT: vmv1r.v v9, v20 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vmv2r.v v10, v16 -; CHECK-NEXT: vs4r.v v8, (a0) -; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vlseg6e8.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vec = load <48 x i8>, ptr %p %d0 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.vector.deinterleave6(<48 x i8> %vec) @@ -457,55 +390,9 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor8(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor8: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 
0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: li a0, 56 -; CHECK-NEXT: li a1, 48 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: li a0, 40 -; CHECK-NEXT: vslidedown.vx v12, v8, a1 -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: srli a2, a2, 1 -; CHECK-NEXT: vslidedown.vx v20, v8, a0 -; CHECK-NEXT: add a0, a2, a2 -; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v14, v8, 24 -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vx v12, v16, a2 -; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v22, v8, 16 -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vx v22, v14, a2 -; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a1 -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 8 -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vx v8, v9, a2 -; CHECK-NEXT: vslideup.vx v16, v20, a2 -; CHECK-NEXT: vmv1r.v v9, v22 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vmv1r.v v17, v12 -; CHECK-NEXT: vmv2r.v v10, v16 -; CHECK-NEXT: vs4r.v v8, (a0) -; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vlseg8e8.v v8, (a0) ; CHECK-NEXT: vmv1r.v v15, v14 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vec = load <64 x i8>, ptr %p %d0 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.vector.deinterleave8(<64 x i8> %vec) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll index 6497675bd56f1..c394e7aa2e3e8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll @@ -195,35 +195,8 @@ define void @vector_interleave_store_factor3(<4 x i32> %a, <4 x i32> %b, <4 x i3 define void @vector_interleave_store_factor4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor4: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: add a3, a1, a2 -; CHECK-NEXT: add a4, a3, a2 -; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma -; CHECK-NEXT: vsseg4e32.v v8, (a1) -; CHECK-NEXT: add a2, a4, a2 -; CHECK-NEXT: vl1re32.v v8, (a4) -; CHECK-NEXT: vl1re32.v v10, (a2) -; CHECK-NEXT: vl1re32.v v12, (a1) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 4 -; CHECK-NEXT: vl1re32.v v10, (a3) -; CHECK-NEXT: vslideup.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vslideup.vi v12, v8, 8 -; CHECK-NEXT: vse32.v v12, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, 
sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg4e32.v v8, (a0) ; CHECK-NEXT: ret %v = call <16 x i32> @llvm.vector.interleave4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) store <16 x i32> %v, ptr %p @@ -258,51 +231,8 @@ define void @vector_interleave_store_factor5(<4 x i32> %a, <4 x i32> %b, <4 x i3 define void @vector_interleave_store_factor6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor6: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x06, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 6 * vlenb -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: add a4, a1, a2 -; CHECK-NEXT: add a5, a4, a2 -; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, ma -; CHECK-NEXT: vsseg6e32.v v8, (a1) -; CHECK-NEXT: vl1re32.v v12, (a5) -; CHECK-NEXT: add a5, a5, a2 -; CHECK-NEXT: vl1re32.v v10, (a5) -; CHECK-NEXT: add a5, a5, a2 -; CHECK-NEXT: vl1re32.v v14, (a4) -; CHECK-NEXT: vl1re32.v v8, (a1) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v12, v10, 4 -; CHECK-NEXT: vslideup.vi v8, v14, 4 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 8 -; CHECK-NEXT: vl1re32.v v16, (a5) -; CHECK-NEXT: add a2, a5, a2 -; CHECK-NEXT: vl1re32.v v12, (a2) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v16, v12, 4 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vslideup.vi v8, v16, 16 -; CHECK-NEXT: vsetivli zero, 24, e32, m8, ta, ma -; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg6e32.v v8, (a0) ; CHECK-NEXT: ret %v = call <24 x i32> @llvm.vector.interleave6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f) store <24 x i32> %v, ptr %p @@ -323,51 +253,8 @@ define void @vector_interleave_store_factor7(<4 x i32> %a, <4 x i32> %b, <4 x i3 define void @vector_interleave_store_factor8(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor8: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: add a2, a1, a3 -; CHECK-NEXT: add a4, a2, a3 -; CHECK-NEXT: add a5, a4, a3 -; CHECK-NEXT: add a6, a5, a3 -; CHECK-NEXT: add a7, a6, a3 -; CHECK-NEXT: add t0, a7, a3 -; CHECK-NEXT: vsetvli t1, zero, e32, m1, ta, ma -; CHECK-NEXT: vsseg8e32.v v8, (a1) -; CHECK-NEXT: add a3, t0, a3 -; CHECK-NEXT: vl1re32.v v10, (a3) -; CHECK-NEXT: 
vl1re32.v v12, (t0) -; CHECK-NEXT: vl1re32.v v14, (a7) -; CHECK-NEXT: vl1re32.v v8, (a6) -; CHECK-NEXT: vl1re32.v v18, (a5) -; CHECK-NEXT: vl1re32.v v20, (a4) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v12, v10, 4 -; CHECK-NEXT: vslideup.vi v8, v14, 4 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 8 -; CHECK-NEXT: vl1re32.v v16, (a1) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v20, v18, 4 -; CHECK-NEXT: vl1re32.v v12, (a2) -; CHECK-NEXT: vslideup.vi v16, v12, 4 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vslideup.vi v16, v20, 8 -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vslideup.vi v16, v8, 16 -; CHECK-NEXT: vse32.v v16, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg8e32.v v8, (a0) ; CHECK-NEXT: ret %v = call <32 x i32> @llvm.vector.interleave8(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h) store <32 x i32> %v, ptr %p diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index b15ccdedda8f1..9344c52098684 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -364,23 +364,8 @@ define { , , } @vector_deint define { , , , } @vector_deinterleave_load_factor4(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor4: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; CHECK-NEXT: vl4r.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs4r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vlseg4e8.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vec = load , ptr %p %d0 = call { , , , } @llvm.vector.deinterleave4( %vec) @@ -444,28 +429,8 @@ define { , , , , , , , , } @vector_deinterleave_load_factor6(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor6: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 1 -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub a1, a1, a2 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vlseg6e8.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vec = load , ptr %p %d0 = 
call { , , , , , } @llvm.vector.deinterleave6( %vec) @@ -512,23 +477,8 @@ define { , , , , , , , , , , } @vector_deinterleave_load_factor8(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor8: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: vl8r.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vlseg8e8.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vec = load , ptr %p %d0 = call { , , , , , , , } @llvm.vector.deinterleave8( %vec) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll index 26d387baa5e22..3751967f18aa4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll @@ -253,30 +253,8 @@ define void @vector_interleave_store_factor3( %a, %a, %b, %c, %d, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor4: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: add a3, a1, a2 -; CHECK-NEXT: add a4, a3, a2 -; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma -; CHECK-NEXT: vsseg4e32.v v8, (a1) -; CHECK-NEXT: vl1re32.v v10, (a4) -; CHECK-NEXT: add a2, a4, a2 -; CHECK-NEXT: vl1re32.v v11, (a2) -; CHECK-NEXT: vl1re32.v v8, (a1) -; CHECK-NEXT: vl1re32.v v9, (a3) -; CHECK-NEXT: vs4r.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg4e32.v v8, (a0) ; CHECK-NEXT: ret %v = call @llvm.vector.interleave4( %a, %b, %c, %d) store %v, ptr %p @@ -309,129 +287,11 @@ define void @vector_interleave_store_factor5( %a, %a, %b, %c, %d, %e, %f, ptr %p) { -; RV32-LABEL: vector_interleave_store_factor6: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 1 -; RV32-NEXT: mv a2, a1 -; RV32-NEXT: slli a1, a1, 1 -; RV32-NEXT: add a2, a2, a1 -; RV32-NEXT: slli a1, a1, 1 -; RV32-NEXT: add a1, a1, a2 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 14 * vlenb -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: add a3, a1, a2 -; RV32-NEXT: add a4, a3, a2 -; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma -; RV32-NEXT: vsseg6e32.v 
v8, (a1) -; RV32-NEXT: vl1re32.v v10, (a4) -; RV32-NEXT: add a4, a4, a2 -; RV32-NEXT: vl1re32.v v11, (a4) -; RV32-NEXT: add a4, a4, a2 -; RV32-NEXT: vl1re32.v v8, (a1) -; RV32-NEXT: vl1re32.v v9, (a3) -; RV32-NEXT: vl1re32.v v12, (a4) -; RV32-NEXT: add a4, a4, a2 -; RV32-NEXT: vl1re32.v v13, (a4) -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; RV32-NEXT: mv s0, a0 -; RV32-NEXT: srli a0, a2, 3 -; RV32-NEXT: li a1, 12 -; RV32-NEXT: call __mulsi3 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vse32.v v8, (s0) -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 1 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: slli a0, a0, 1 -; RV32-NEXT: add a1, a1, a0 -; RV32-NEXT: slli a0, a0, 1 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 32 -; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore ra -; RV32-NEXT: .cfi_restore s0 -; RV32-NEXT: addi sp, sp, 32 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret -; -; RV64-LABEL: vector_interleave_store_factor6: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -32 -; RV64-NEXT: .cfi_def_cfa_offset 32 -; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 1 -; RV64-NEXT: mv a2, a1 -; RV64-NEXT: slli a1, a1, 1 -; RV64-NEXT: add a2, a2, a1 -; RV64-NEXT: slli a1, a1, 1 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 14 * vlenb -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: add a3, a1, a2 -; RV64-NEXT: add a4, a3, a2 -; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma -; RV64-NEXT: vsseg6e32.v v8, (a1) -; RV64-NEXT: vl1re32.v v10, (a4) -; RV64-NEXT: add a4, a4, a2 -; RV64-NEXT: vl1re32.v v11, (a4) -; RV64-NEXT: add a4, a4, a2 -; RV64-NEXT: vl1re32.v v8, (a1) -; RV64-NEXT: vl1re32.v v9, (a3) -; RV64-NEXT: vl1re32.v v12, (a4) -; RV64-NEXT: add a4, a4, a2 -; RV64-NEXT: vl1re32.v v13, (a4) -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; RV64-NEXT: mv s0, a0 -; RV64-NEXT: srli a0, a2, 3 -; RV64-NEXT: li a1, 12 -; RV64-NEXT: call __muldi3 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV64-NEXT: vse32.v v8, (s0) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 1 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: slli a0, a0, 1 -; RV64-NEXT: add a1, a1, a0 -; RV64-NEXT: slli a0, a0, 1 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: .cfi_def_cfa sp, 32 -; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; RV64-NEXT: .cfi_restore ra -; RV64-NEXT: .cfi_restore s0 -; RV64-NEXT: addi sp, sp, 32 -; RV64-NEXT: .cfi_def_cfa_offset 0 -; RV64-NEXT: ret +; CHECK-LABEL: vector_interleave_store_factor6: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg6e32.v v8, (a0) +; CHECK-NEXT: ret %v = call @llvm.vector.interleave6( %a, %b, %c, %d, %e, %f) store %v, ptr %p ret void 
@@ -451,38 +311,8 @@ define void @vector_interleave_store_factor7( %a, %a, %b, %c, %d, %e, %f, %g, %h, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor8: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: add a3, a1, a2 -; CHECK-NEXT: add a4, a3, a2 -; CHECK-NEXT: add a5, a4, a2 -; CHECK-NEXT: add a6, a5, a2 -; CHECK-NEXT: add a7, a6, a2 -; CHECK-NEXT: add t0, a7, a2 -; CHECK-NEXT: vsetvli t1, zero, e32, m1, ta, ma -; CHECK-NEXT: vsseg8e32.v v8, (a1) -; CHECK-NEXT: vl1re32.v v14, (t0) -; CHECK-NEXT: add a2, t0, a2 -; CHECK-NEXT: vl1re32.v v15, (a2) -; CHECK-NEXT: vl1re32.v v12, (a6) -; CHECK-NEXT: vl1re32.v v13, (a7) -; CHECK-NEXT: vl1re32.v v10, (a4) -; CHECK-NEXT: vl1re32.v v11, (a5) -; CHECK-NEXT: vl1re32.v v8, (a1) -; CHECK-NEXT: vl1re32.v v9, (a3) -; CHECK-NEXT: vs8r.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg8e32.v v8, (a0) ; CHECK-NEXT: ret %v = call @llvm.vector.interleave8( %a, %b, %c, %d, %e, %f, %g, %h) store %v, ptr %p diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll index ebdeb70538d4a..87b16d17aa5f0 100644 --- a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll @@ -135,8 +135,15 @@ define void @load_factor4(ptr %ptr) { define void @load_factor4_vscale(ptr %ptr) { ; RV32-LABEL: @load_factor4_vscale( -; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load , ptr [[PTR:%.*]], align 64 -; RV32-NEXT: [[TMP9:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[INTERLEAVED_VEC]]) +; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t.p0.i32(target("riscv.vector.tuple", , 4) poison, ptr [[PTR:%.*]], i32 -1, i32 5) +; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 0) +; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , } poison, [[TMP2]], 0 +; RV32-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 1) +; RV32-NEXT: [[TMP5:%.*]] = insertvalue { , , , } [[TMP3]], [[TMP4]], 1 +; RV32-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 2) +; RV32-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 2 +; RV32-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 3) +; RV32-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 3 ; RV32-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP9]], 0 ; RV32-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP9]], 1 ; RV32-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[TMP9]], 2 @@ -144,8 +151,15 @@ define void @load_factor4_vscale(ptr %ptr) { ; RV32-NEXT: ret void ; 
; RV64-LABEL: @load_factor4_vscale( -; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load , ptr [[PTR:%.*]], align 64 -; RV64-NEXT: [[TMP9:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[INTERLEAVED_VEC]]) +; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t.p0.i64(target("riscv.vector.tuple", , 4) poison, ptr [[PTR:%.*]], i64 -1, i64 5) +; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 0) +; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , } poison, [[TMP2]], 0 +; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 1) +; RV64-NEXT: [[TMP5:%.*]] = insertvalue { , , , } [[TMP3]], [[TMP4]], 1 +; RV64-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 2) +; RV64-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 2 +; RV64-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 3) +; RV64-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 3 ; RV64-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP9]], 0 ; RV64-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP9]], 1 ; RV64-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[TMP9]], 2 @@ -318,8 +332,19 @@ define void @load_factor6(ptr %ptr) { define void @load_factor6_vscale(ptr %ptr) { ; RV32-LABEL: @load_factor6_vscale( -; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load , ptr [[PTR:%.*]], align 64 -; RV32-NEXT: [[TMP13:%.*]] = call { , , , , , } @llvm.vector.deinterleave6.nxv12i32( [[INTERLEAVED_VEC]]) +; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t.p0.i32(target("riscv.vector.tuple", , 6) poison, ptr [[PTR:%.*]], i32 -1, i32 5) +; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", , 6) [[TMP1]], i32 0) +; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , } poison, [[TMP2]], 0 +; RV32-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", , 6) [[TMP1]], i32 1) +; RV32-NEXT: [[TMP5:%.*]] = insertvalue { , , , , , } [[TMP3]], [[TMP4]], 1 +; RV32-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", , 6) [[TMP1]], i32 2) +; RV32-NEXT: [[TMP7:%.*]] = insertvalue { , , , , , } [[TMP5]], [[TMP6]], 2 +; RV32-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", , 6) [[TMP1]], i32 3) +; RV32-NEXT: [[TMP9:%.*]] = insertvalue { , , , , , } [[TMP7]], [[TMP8]], 3 +; RV32-NEXT: [[TMP10:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", , 6) [[TMP1]], i32 4) +; RV32-NEXT: [[TMP11:%.*]] = insertvalue { , , , , , } [[TMP9]], [[TMP10]], 4 +; RV32-NEXT: [[TMP12:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", , 6) [[TMP1]], i32 5) +; RV32-NEXT: [[TMP13:%.*]] = insertvalue { , , , , , } [[TMP11]], [[TMP12]], 5 ; RV32-NEXT: [[TMP14:%.*]] = extractvalue { , , , , , } [[TMP13]], 0 ; RV32-NEXT: [[TMP15:%.*]] = extractvalue { , , , , , } [[TMP13]], 1 ; RV32-NEXT: [[TMP16:%.*]] = extractvalue { , 
, , , , } [[TMP13]], 2 @@ -329,8 +354,19 @@ define void @load_factor6_vscale(ptr %ptr) { ; RV32-NEXT: ret void ; ; RV64-LABEL: @load_factor6_vscale( -; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load , ptr [[PTR:%.*]], align 64 -; RV64-NEXT: [[TMP13:%.*]] = call { , , , , , } @llvm.vector.deinterleave6.nxv12i32( [[INTERLEAVED_VEC]]) +; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t.p0.i64(target("riscv.vector.tuple", , 6) poison, ptr [[PTR:%.*]], i64 -1, i64 5) +; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", , 6) [[TMP1]], i32 0) +; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , } poison, [[TMP2]], 0 +; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", , 6) [[TMP1]], i32 1) +; RV64-NEXT: [[TMP5:%.*]] = insertvalue { , , , , , } [[TMP3]], [[TMP4]], 1 +; RV64-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", , 6) [[TMP1]], i32 2) +; RV64-NEXT: [[TMP7:%.*]] = insertvalue { , , , , , } [[TMP5]], [[TMP6]], 2 +; RV64-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", , 6) [[TMP1]], i32 3) +; RV64-NEXT: [[TMP9:%.*]] = insertvalue { , , , , , } [[TMP7]], [[TMP8]], 3 +; RV64-NEXT: [[TMP10:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", , 6) [[TMP1]], i32 4) +; RV64-NEXT: [[TMP11:%.*]] = insertvalue { , , , , , } [[TMP9]], [[TMP10]], 4 +; RV64-NEXT: [[TMP12:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", , 6) [[TMP1]], i32 5) +; RV64-NEXT: [[TMP13:%.*]] = insertvalue { , , , , , } [[TMP11]], [[TMP12]], 5 ; RV64-NEXT: [[TMP14:%.*]] = extractvalue { , , , , , } [[TMP13]], 0 ; RV64-NEXT: [[TMP15:%.*]] = extractvalue { , , , , , } [[TMP13]], 1 ; RV64-NEXT: [[TMP16:%.*]] = extractvalue { , , , , , } [[TMP13]], 2 @@ -486,8 +522,23 @@ define void @load_factor8(ptr %ptr) { define void @load_factor8_vscale(ptr %ptr) { ; RV32-LABEL: @load_factor8_vscale( -; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load , ptr [[PTR:%.*]], align 64 -; RV32-NEXT: [[TMP17:%.*]] = call { , , , , , , , } @llvm.vector.deinterleave8.nxv16i32( [[INTERLEAVED_VEC]]) +; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t.p0.i32(target("riscv.vector.tuple", , 8) poison, ptr [[PTR:%.*]], i32 -1, i32 5) +; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 0) +; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , , , } poison, [[TMP2]], 0 +; RV32-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 1) +; RV32-NEXT: [[TMP5:%.*]] = insertvalue { , , , , , , , } [[TMP3]], [[TMP4]], 1 +; RV32-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 2) +; RV32-NEXT: [[TMP7:%.*]] = insertvalue { , , , , , , , } [[TMP5]], [[TMP6]], 2 +; RV32-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 3) +; RV32-NEXT: [[TMP9:%.*]] = insertvalue { , , , , , , , } [[TMP7]], [[TMP8]], 3 +; RV32-NEXT: 
[[TMP10:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 4) +; RV32-NEXT: [[TMP11:%.*]] = insertvalue { , , , , , , , } [[TMP9]], [[TMP10]], 4 +; RV32-NEXT: [[TMP12:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 5) +; RV32-NEXT: [[TMP13:%.*]] = insertvalue { , , , , , , , } [[TMP11]], [[TMP12]], 5 +; RV32-NEXT: [[TMP14:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 6) +; RV32-NEXT: [[TMP15:%.*]] = insertvalue { , , , , , , , } [[TMP13]], [[TMP14]], 6 +; RV32-NEXT: [[TMP16:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 7) +; RV32-NEXT: [[TMP17:%.*]] = insertvalue { , , , , , , , } [[TMP15]], [[TMP16]], 7 ; RV32-NEXT: [[TMP18:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 0 ; RV32-NEXT: [[TMP19:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 1 ; RV32-NEXT: [[TMP20:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 2 @@ -499,8 +550,23 @@ define void @load_factor8_vscale(ptr %ptr) { ; RV32-NEXT: ret void ; ; RV64-LABEL: @load_factor8_vscale( -; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load , ptr [[PTR:%.*]], align 64 -; RV64-NEXT: [[TMP17:%.*]] = call { , , , , , , , } @llvm.vector.deinterleave8.nxv16i32( [[INTERLEAVED_VEC]]) +; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", , 8) poison, ptr [[PTR:%.*]], i64 -1, i64 5) +; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 0) +; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , , , } poison, [[TMP2]], 0 +; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 1) +; RV64-NEXT: [[TMP5:%.*]] = insertvalue { , , , , , , , } [[TMP3]], [[TMP4]], 1 +; RV64-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 2) +; RV64-NEXT: [[TMP7:%.*]] = insertvalue { , , , , , , , } [[TMP5]], [[TMP6]], 2 +; RV64-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 3) +; RV64-NEXT: [[TMP9:%.*]] = insertvalue { , , , , , , , } [[TMP7]], [[TMP8]], 3 +; RV64-NEXT: [[TMP10:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 4) +; RV64-NEXT: [[TMP11:%.*]] = insertvalue { , , , , , , , } [[TMP9]], [[TMP10]], 4 +; RV64-NEXT: [[TMP12:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 5) +; RV64-NEXT: [[TMP13:%.*]] = insertvalue { , , , , , , , } [[TMP11]], [[TMP12]], 5 +; RV64-NEXT: [[TMP14:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 6) +; RV64-NEXT: [[TMP15:%.*]] = insertvalue { , , , , , , , } [[TMP13]], [[TMP14]], 6 +; RV64-NEXT: [[TMP16:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 7) +; RV64-NEXT: [[TMP17:%.*]] = insertvalue { , , , , , , , } [[TMP15]], [[TMP16]], 7 ; RV64-NEXT: [[TMP18:%.*]] = extractvalue { , , , , , 
, , } [[TMP17]], 0 ; RV64-NEXT: [[TMP19:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 1 ; RV64-NEXT: [[TMP20:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 2 @@ -722,13 +788,19 @@ define void @store_factor4(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2 define void @store_factor4_vscale(ptr %ptr, %v0, %v1, %v2, %v3) { ; RV32-LABEL: @store_factor4_vscale( -; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv32i8( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]]) -; RV32-NEXT: store [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 32 +; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) poison, [[V0:%.*]], i32 0) +; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP1]], [[V1:%.*]], i32 1) +; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP2]], [[V2:%.*]], i32 2) +; RV32-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP3]], [[V3:%.*]], i32 3) +; RV32-NEXT: call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i32(target("riscv.vector.tuple", , 4) [[TMP4]], ptr [[PTR:%.*]], i32 -1, i32 3) ; RV32-NEXT: ret void ; ; RV64-LABEL: @store_factor4_vscale( -; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv32i8( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]]) -; RV64-NEXT: store [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 32 +; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) poison, [[V0:%.*]], i32 0) +; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP1]], [[V1:%.*]], i32 1) +; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP2]], [[V2:%.*]], i32 2) +; RV64-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP3]], [[V3:%.*]], i32 3) +; RV64-NEXT: call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i64(target("riscv.vector.tuple", , 4) [[TMP4]], ptr [[PTR:%.*]], i64 -1, i64 3) ; RV64-NEXT: ret void ; %interleaved.vec = call @llvm.vector.interleave4.nxv8i8( %v0, %v1, %v2, %v3) @@ -859,13 +931,23 @@ define void @store_factor4_wide(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32 define void @store_factor6_vscale(ptr %ptr, %v0, %v1, %v2, %v3, %v4, %v5) { ; RV32-LABEL: @store_factor6_vscale( -; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave6.nxv48i8( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], [[V4:%.*]], [[V5:%.*]]) -; RV32-NEXT: store [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 64 +; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) poison, [[V0:%.*]], i32 0) +; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 6) 
@llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) [[TMP1]], [[V1:%.*]], i32 1) +; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) [[TMP2]], [[V2:%.*]], i32 2) +; RV32-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) [[TMP3]], [[V3:%.*]], i32 3) +; RV32-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) [[TMP4]], [[V4:%.*]], i32 4) +; RV32-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) [[TMP5]], [[V5:%.*]], i32 5) +; RV32-NEXT: call void @llvm.riscv.vsseg6.triscv.vector.tuple_nxv8i8_6t.p0.i32(target("riscv.vector.tuple", , 6) [[TMP6]], ptr [[PTR:%.*]], i32 -1, i32 3) ; RV32-NEXT: ret void ; ; RV64-LABEL: @store_factor6_vscale( -; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave6.nxv48i8( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], [[V4:%.*]], [[V5:%.*]]) -; RV64-NEXT: store [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 64 +; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) poison, [[V0:%.*]], i32 0) +; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) [[TMP1]], [[V1:%.*]], i32 1) +; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) [[TMP2]], [[V2:%.*]], i32 2) +; RV64-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) [[TMP3]], [[V3:%.*]], i32 3) +; RV64-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) [[TMP4]], [[V4:%.*]], i32 4) +; RV64-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) [[TMP5]], [[V5:%.*]], i32 5) +; RV64-NEXT: call void @llvm.riscv.vsseg6.triscv.vector.tuple_nxv8i8_6t.p0.i64(target("riscv.vector.tuple", , 6) [[TMP6]], ptr [[PTR:%.*]], i64 -1, i64 3) ; RV64-NEXT: ret void ; %interleaved.vec = call @llvm.vector.interleave6.nxv8i8( %v0, %v1, %v2, %v3, %v4, %v5) @@ -903,13 +985,27 @@ define void @store_factor7_vscale(ptr %ptr, %v0, %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) { ; RV32-LABEL: @store_factor8_vscale( -; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave8.nxv64i8( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], [[V4:%.*]], [[V5:%.*]], [[V6:%.*]], [[V7:%.*]]) -; RV32-NEXT: store [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 64 +; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) poison, [[V0:%.*]], i32 0) +; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP1]], [[V1:%.*]], i32 1) +; RV32-NEXT: 
[[TMP3:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP2]], [[V2:%.*]], i32 2) +; RV32-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP3]], [[V3:%.*]], i32 3) +; RV32-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP4]], [[V4:%.*]], i32 4) +; RV32-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP5]], [[V5:%.*]], i32 5) +; RV32-NEXT: [[TMP7:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP6]], [[V6:%.*]], i32 6) +; RV32-NEXT: [[TMP8:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP7]], [[V7:%.*]], i32 7) +; RV32-NEXT: call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i32(target("riscv.vector.tuple", , 8) [[TMP8]], ptr [[PTR:%.*]], i32 -1, i32 3) ; RV32-NEXT: ret void ; ; RV64-LABEL: @store_factor8_vscale( -; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave8.nxv64i8( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], [[V4:%.*]], [[V5:%.*]], [[V6:%.*]], [[V7:%.*]]) -; RV64-NEXT: store [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 64 +; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) poison, [[V0:%.*]], i32 0) +; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP1]], [[V1:%.*]], i32 1) +; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP2]], [[V2:%.*]], i32 2) +; RV64-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP3]], [[V3:%.*]], i32 3) +; RV64-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP4]], [[V4:%.*]], i32 4) +; RV64-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP5]], [[V5:%.*]], i32 5) +; RV64-NEXT: [[TMP7:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP6]], [[V6:%.*]], i32 6) +; RV64-NEXT: [[TMP8:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP7]], [[V7:%.*]], i32 7) +; RV64-NEXT: call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", , 8) [[TMP8]], ptr [[PTR:%.*]], i64 -1, i64 3) ; RV64-NEXT: ret void ; %interleaved.vec = call @llvm.vector.interleave8.nxv8i8( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) From 2ee6bd1d924648f6b7f16652f482a13018a2e252 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 28 May 2025 10:05:00 +0100 Subject: 
[PATCH 3/3] Extract helper functions

---
 llvm/lib/CodeGen/InterleavedAccessPass.cpp | 70 +++++++++++-----------
 1 file changed, 34 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index b684885b87a43..49f1504d244ed 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -571,6 +571,36 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
   return true;
 }
 
+static bool isInterleaveIntrinsic(Intrinsic::ID IID) {
+  switch (IID) {
+  case Intrinsic::vector_interleave2:
+  case Intrinsic::vector_interleave3:
+  case Intrinsic::vector_interleave4:
+  case Intrinsic::vector_interleave5:
+  case Intrinsic::vector_interleave6:
+  case Intrinsic::vector_interleave7:
+  case Intrinsic::vector_interleave8:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static bool isDeinterleaveIntrinsic(Intrinsic::ID IID) {
+  switch (IID) {
+  case Intrinsic::vector_deinterleave2:
+  case Intrinsic::vector_deinterleave3:
+  case Intrinsic::vector_deinterleave4:
+  case Intrinsic::vector_deinterleave5:
+  case Intrinsic::vector_deinterleave6:
+  case Intrinsic::vector_deinterleave7:
+  case Intrinsic::vector_deinterleave8:
+    return true;
+  default:
+    return false;
+  }
+}
+
 static unsigned getIntrinsicFactor(const IntrinsicInst *II) {
   switch (II->getIntrinsicID()) {
   case Intrinsic::vector_deinterleave2:
@@ -635,13 +665,7 @@ static void interleaveLeafValues(MutableArrayRef<Value *> SubLeaves) {
 static bool getVectorInterleaveFactor(IntrinsicInst *II,
                                       SmallVectorImpl<Value *> &Operands,
                                       SmallVectorImpl<Instruction *> &DeadInsts) {
-  assert(II->getIntrinsicID() == Intrinsic::vector_interleave2 ||
-         II->getIntrinsicID() == Intrinsic::vector_interleave3 ||
-         II->getIntrinsicID() == Intrinsic::vector_interleave4 ||
-         II->getIntrinsicID() == Intrinsic::vector_interleave5 ||
-         II->getIntrinsicID() == Intrinsic::vector_interleave6 ||
-         II->getIntrinsicID() == Intrinsic::vector_interleave7 ||
-         II->getIntrinsicID() == Intrinsic::vector_interleave8);
+  assert(isInterleaveIntrinsic(II->getIntrinsicID()));
 
   // Visit with BFS
   SmallVector<IntrinsicInst *> Queue;
@@ -689,13 +713,7 @@ static bool
 getVectorDeinterleaveFactor(IntrinsicInst *II,
                             SmallVectorImpl<Value *> &Results,
                             SmallVectorImpl<Instruction *> &DeadInsts) {
-  assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2 ||
-         II->getIntrinsicID() == Intrinsic::vector_deinterleave3 ||
-         II->getIntrinsicID() == Intrinsic::vector_deinterleave4 ||
-         II->getIntrinsicID() == Intrinsic::vector_deinterleave5 ||
-         II->getIntrinsicID() == Intrinsic::vector_deinterleave6 ||
-         II->getIntrinsicID() == Intrinsic::vector_deinterleave7 ||
-         II->getIntrinsicID() == Intrinsic::vector_deinterleave8);
+  assert(isDeinterleaveIntrinsic(II->getIntrinsicID()));
   using namespace PatternMatch;
   if (!II->hasNUses(getIntrinsicFactor(II)))
     return false;
@@ -924,30 +942,10 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
       Changed |= lowerInterleavedStore(&I, DeadInsts);
 
     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
-      // At present, we only have intrinsics to represent (de)interleaving
-      // with a factor of 2,3,5 and 7.
-      switch (II->getIntrinsicID()) {
-      case Intrinsic::vector_deinterleave2:
-      case Intrinsic::vector_deinterleave3:
-      case Intrinsic::vector_deinterleave4:
-      case Intrinsic::vector_deinterleave5:
-      case Intrinsic::vector_deinterleave6:
-      case Intrinsic::vector_deinterleave7:
-      case Intrinsic::vector_deinterleave8:
+      if (isDeinterleaveIntrinsic(II->getIntrinsicID()))
         Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
-        break;
-      case Intrinsic::vector_interleave2:
-      case Intrinsic::vector_interleave3:
-      case Intrinsic::vector_interleave4:
-      case Intrinsic::vector_interleave5:
-      case Intrinsic::vector_interleave6:
-      case Intrinsic::vector_interleave7:
-      case Intrinsic::vector_interleave8:
+      else if (isInterleaveIntrinsic(II->getIntrinsicID()))
         Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
-        break;
-      default:
-        break;
-      }
     }
   }
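
For readers who want the shape of the transformation without digging through the CHECK lines, here is a minimal LLVM IR sketch of the factor-4 interleaved store case, reconstructed from the store_factor4_vscale expectations above. The <vscale x 8 x i8> segment type is inferred from the tuple_nxv8i8_4t mangling in those CHECK lines, the VL operand of -1 stands for VLMAX, and the trailing operand 3 is log2 of SEW=8; the function name is invented for illustration.

define void @store_factor4_sketch(ptr %ptr, <vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1,
                                  <vscale x 8 x i8> %v2, <vscale x 8 x i8> %v3) {
  ; Input pattern: a factor-4 interleave whose only use is a plain wide store.
  %vec = call <vscale x 32 x i8> @llvm.vector.interleave4.nxv32i8(
             <vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1,
             <vscale x 8 x i8> %v2, <vscale x 8 x i8> %v3)
  store <vscale x 32 x i8> %vec, ptr %ptr
  ret void
}

; After the InterleavedAccess pass (RV64 shown), the interleave and the store are
; replaced by riscv.tuple.insert calls feeding a single segmented store:
;   %t0 = call target("riscv.vector.tuple", <vscale x 8 x i8>, 4)
;         @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(
;           target("riscv.vector.tuple", <vscale x 8 x i8>, 4) poison, <vscale x 8 x i8> %v0, i32 0)
;   ... same for %v1, %v2, %v3 at tuple indices 1..3 ...
;   call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i64(
;     target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %t3, ptr %ptr, i64 -1, i64 3)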