From 12e18154affece2c835dd83b1acf7b569efcc3f7 Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Mon, 9 Sep 2024 14:06:51 +0100
Subject: [PATCH 1/5] [Compiler-rt] Add AArch64 routines for __arm_agnostic("sme_za_state")

The specification of these routines can be found here:
  https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#sme-support-routines
---
Review notes (reviewer annotations, not part of the commit message):

What this patch does, as visible in the hunks below:
  * builtin-config-ix.cmake: the COMPILER_RT_HAS_AARCH64_SME probe now uses
    ".arch armv9-a+sme2" and additionally has to assemble "ldr zt0, [sp]".
  * sme-abi-assert.c: pins FEAT_SME2 == 57, matching the
    ".set FEAT_SME2_BIT, 57" added to sme-abi.S.
  * sme-abi.S: adds __arm_sme_state_size, __arm_sme_save and
    __arm_sme_restore.

Buffer layout implied by the offsets used in __arm_sme_save (x0 = PTR):
  PTR + 0    16 bytes of internal state, cleared with stp; bit 0 = VALID.
  PTR + 16   TPIDR2 block: buffer pointer at +0, rdsvl value (halfword) at
             +8, bytes 10-15 zeroed. x0 is advanced to PTR + 16 by the
             pre-indexed str and that value is written to TPIDR2_EL0.
  PTR + 32   ZT0 (str zt0) when the FEAT_SME2 bit is set, with the ZA buffer
             starting at PTR + 96; otherwise the ZA buffer starts here.

Things to look at in this revision:
  * __arm_sme_state_size: the comment "Size = HAS_FEAT_SME2 ? 32 : 96" is the
    reverse of what tst/csel computes (eq, i.e. bit clear, selects 32).
    Corrected in 2/5.
  * __arm_sme_state_size: the comment "PSTATE = 1" describes a test of SVCR
    bit 1 only. 2/5 rewords it and adds a TPIDR2_EL0 check.
  * __arm_sme_save: the stp that clears the internal state runs before the
    alignment check, so a misaligned PTR is written through before the
    branch to do_abort. Reordered in 3/5.
  * __arm_sme_save: one comment starts with '#' while every other comment in
    the added code uses '//'.
  * __arm_sme_save: x18 is used as a scratch register.
    NOTE(review): x18 is a platform-reserved register on some AArch64
    platforms - confirm that is acceptable for every target this file is
    built for.
  * __arm_sme_restore: ".variant_pcs __arm_sme_save" names the wrong symbol.
    Corrected in 3/5.
  * __arm_sme_restore: "smstart za" executes twice on the path that calls
    __arm_tpidr2_restore (once before the call, once at label 0).
  * The CPU feature word is reloaded with adrp/ldr several times per
    function. 3/5 keeps it in x17 instead.

 compiler-rt/cmake/builtin-config-ix.cmake     |   3 +-
 .../lib/builtins/aarch64/sme-abi-assert.c     |   1 +
 compiler-rt/lib/builtins/aarch64/sme-abi.S    | 159 +++++++++++++++++-
 3 files changed, 161 insertions(+), 2 deletions(-)

diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake
index 1f63e158409ca..706a1ff7eeb6d 100644
--- a/compiler-rt/cmake/builtin-config-ix.cmake
+++ b/compiler-rt/cmake/builtin-config-ix.cmake
@@ -43,8 +43,9 @@ asm(\"cas w0, w1, [x2]\");
 builtin_check_c_compiler_source(COMPILER_RT_HAS_AARCH64_SME
 "
 void foo(void) __arm_streaming_compatible {
-  asm(\".arch armv9-a+sme\");
+  asm(\".arch armv9-a+sme2\");
   asm(\"smstart\");
+  asm(\"ldr zt0, [sp]\");
 }
 ")
 
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi-assert.c b/compiler-rt/lib/builtins/aarch64/sme-abi-assert.c
index 4333353f8d2d1..37305ceb39c50 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi-assert.c
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi-assert.c
@@ -8,3 +8,4 @@
 #include "../cpu_model/AArch64CPUFeatures.inc"
 _Static_assert(FEAT_SVE == 30, "sme-abi.S assumes FEAT_SVE = 30");
 _Static_assert(FEAT_SME == 42, "sme-abi.S assumes FEAT_SME = 42");
+_Static_assert(FEAT_SME2 == 57, "sme-abi.S assumes FEAT_SME2 = 57");
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index 45bd221655fd6..90b3f1bf180ff 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -10,6 +10,8 @@
 
 .set FEAT_SVE_BIT, 30
 .set FEAT_SME_BIT, 42
+.set FEAT_SME2_BIT, 57
+.set FEAT_SME2_MASK, 1 << 57
 .set SVCR_PSTATE_SM_BIT, 0
 
 #if !defined(__APPLE__)
@@ -22,7 +24,7 @@
 #define CPU_FEATS_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_cpu_features)@pageoff
 #endif
 
-.arch armv9-a+sme
+.arch armv9-a+sme2
 
 // Utility function which calls a system's abort() routine. Because the function
 // is streaming-compatible it should disable streaming-SVE mode before calling
@@ -204,6 +206,161 @@ DEFINE_COMPILERRT_FUNCTION(__arm_get_current_vg)
   ret
 END_COMPILERRT_FUNCTION(__arm_get_current_vg)
 
+DEFINE_COMPILERRT_FUNCTION(__arm_sme_state_size)
+  .variant_pcs __arm_sme_state_size
+  BTI_C
+
+  // Test if SME is available and PSTATE = 1.
+  adrp x16, CPU_FEATS_SYMBOL
+  ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+  tbz x16, #FEAT_SME_BIT, 0f
+  mrs x16, SVCR
+  tbz x16, #1, 0f
+
+  // Size = HAS_FEAT_SME2 ? 32 : 96
+  adrp x16, CPU_FEATS_SYMBOL
+  ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+  tst x16, #FEAT_SME2_MASK
+  mov w17, #32
+  mov w16, #96
+  csel x16, x17, x16, eq
+
+  // Size = Size + (SVLB * SVLB)
+  rdsvl x17, #1
+  madd x0, x17, x17, x16
+  ret
+
+0:
+  // Default case, 16 bytes is minimum (to encode VALID bit, multiple of 16 bytes)
+  mov w0, #16
+  ret
+END_COMPILERRT_FUNCTION(__arm_sme_state_size)
+
+DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
+  .variant_pcs __arm_sme_save
+  BTI_C
+
+  // Clear internal state bits
+  stp xzr, xzr, [x0]
+
+  // If PTR is not 16-byte aligned, abort.
+  tst x0, #0xF
+  b.ne 3f
+
+  // If SME is not available, PSTATE.ZA = 0 or TPIDR2_EL0 != 0, return.
+  adrp x16, CPU_FEATS_SYMBOL
+  ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+  tbz x16, #FEAT_SME_BIT, 2f
+  mrs x16, SVCR
+  tbz x16, #1, 2f
+  mrs x16, TPIDR2_EL0
+  cbnz x16, 2f
+
+  # ZA or ZT0 need saving, we can now set internal VALID bit to 1
+  mov w16, #1
+  str x16, [x0]
+
+  adrp x16, CPU_FEATS_SYMBOL
+  ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+  tbz x16, #FEAT_SME2_BIT, 0f
+
+  // Store ZT0 and ZA
+  add x16, x0, #32
+  str zt0, [x16]
+  add x18, x0, #96
+  b 1f
+
+0:
+  // Has SME only
+  add x18, x0, #32
+
+1:
+  // Set up lazy-save (x18 = pointer to buffer)
+  rdsvl x17, #1
+  str x18, [x0, #16]!
+  strh w17, [x0, #8]
+  stur wzr, [x0, #10]
+  strh wzr, [x0, #14]
+  msr TPIDR2_EL0, x0
+  ret
+
+2:
+  // Do nothing
+  ret
+
+3:
+  b SYMBOL_NAME(do_abort)
+END_COMPILERRT_FUNCTION(__arm_sme_save)
+
+DEFINE_COMPILERRT_FUNCTION(__arm_sme_restore)
+  .cfi_startproc
+  .variant_pcs __arm_sme_save
+  BTI_C
+
+  stp x29, x30, [sp, #-16]!
+  .cfi_def_cfa_offset 16
+  mov x29, sp
+  .cfi_def_cfa w29, 16
+  .cfi_offset w30, -8
+  .cfi_offset w29, -16
+
+  // If PTR is not 16-byte aligned, abort.
+  tst x0, #0xF
+  b.ne 3f
+
+  // If the VALID bit is 0, return early.
+  ldr x16, [x0]
+  tbz x16, #0, 2f
+
+  // If SME is not available, abort.
+  adrp x16, CPU_FEATS_SYMBOL
+  ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+  tbz x16, #FEAT_SME_BIT, 3f
+
+  // If TPIDR2_EL0 != nullptr, no lazy-save was committed, try to reload zt0.
+  mrs x16, TPIDR2_EL0
+  cbnz x16, 0f
+
+  // If TPIDR2_EL0 == nullptr and PSTATE.ZA = 1 (<=> ZA state is 'active'),
+  // abort.
+  mrs x16, SVCR
+  tbnz x16, #1, 3f
+
+  // Restore za.
+  smstart za
+  mov x16, x0
+  add x0, x0, #16
+  bl __arm_tpidr2_restore
+  mov x0, x16
+  msr TPIDR2_EL0, xzr
+
+0:
+  smstart za
+
+1:
+  // Check if zt0 needs restoring.
+  adrp x16, CPU_FEATS_SYMBOL
+  ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+  tbz x16, #FEAT_SME2_BIT, 2f
+
+  // Restore zt0.
+  add x16, x0, #32
+  ldr zt0, [x16]
+
+2:
+  // Do nothing
+  .cfi_def_cfa wsp, 16
+  ldp x29, x30, [sp], #16
+  .cfi_def_cfa_offset 0
+  .cfi_restore w30
+  .cfi_restore w29
+  ret
+
+3:
+  b SYMBOL_NAME(do_abort)
+  .cfi_endproc
+END_COMPILERRT_FUNCTION(__arm_sme_restore)
+
 NO_EXEC_STACK_DIRECTIVE
 
 // GNU property note for BTI and PAC

From 88e7dc405d108daa5bb7511c0872837ae9a3f738 Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Mon, 16 Dec 2024 12:13:47 +0000
Subject: [PATCH 2/5] Address comments

---
Review notes (reviewer annotations, not part of the commit message):
  * Corrects the two comments in __arm_sme_state_size ("PSTATE = 1" and the
    inverted "32 : 96").
  * __arm_sme_state_size now also returns the 16-byte default when
    TPIDR2_EL0 is non-zero, so it tests the same three conditions that
    __arm_sme_save tests before saving anything.

 compiler-rt/lib/builtins/aarch64/sme-abi.S | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index 90b3f1bf180ff..61c2fbbd801ac 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -210,14 +210,16 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_state_size)
   .variant_pcs __arm_sme_state_size
   BTI_C
 
-  // Test if SME is available and PSTATE = 1.
+  // Test if SME is available and ZA state is 'active'.
   adrp x16, CPU_FEATS_SYMBOL
   ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
   tbz x16, #FEAT_SME_BIT, 0f
   mrs x16, SVCR
   tbz x16, #1, 0f
+  mrs x16, TPIDR2_EL0
+  cbnz x16, 0f
 
-  // Size = HAS_FEAT_SME2 ? 32 : 96
+  // Size = HAS_FEAT_SME2 ? 96 : 32
   adrp x16, CPU_FEATS_SYMBOL
   ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
   tst x16, #FEAT_SME2_MASK

From aaecb05f6bf926578c6f34efc67b79e23174c66c Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Tue, 17 Dec 2024 15:36:15 +0000
Subject: [PATCH 3/5] Address more comments

---
Review notes (reviewer annotations, not part of the commit message):
  * Adds the buffer layout diagram, keeps the CPU feature word in x17
    instead of reloading it, moves the alignment check in __arm_sme_save
    ahead of the stp, and fixes the .variant_pcs symbol in
    __arm_sme_restore.
  * __arm_sme_save: with the "ret" before label 2 removed, the save path
    falls through into label 2, so its "Do nothing" comment now describes
    only the early-exit case.
  * __arm_sme_save: the "b 1f" left directly in front of "1:" is redundant
    (removed in 5/5). The "add x16, x0, #32" kept as context recomputes the
    value already placed in x18 (removed in 4/5).
  * __arm_sme_restore: "cbz x16, 2f" tests the whole 64-bit word at [x0]
    rather than only bit 0. __arm_sme_save stores only 0 or 1 there, so the
    two forms agree for buffers written by it.
  * __arm_sme_restore: x0 (adjusted with add/sub #16) and x17 are live
    across "bl __arm_tpidr2_restore".
    NOTE(review): this relies on that routine preserving both registers -
    confirm against its definition, which is outside this patch.
  * __arm_sme_restore: "msr TPIDR2_EL0, xzr" moved below label 1, so it now
    also runs when TPIDR2_EL0 was non-zero on entry. "smstart za" still
    executes twice on the path that calls __arm_tpidr2_restore.

 compiler-rt/lib/builtins/aarch64/sme-abi.S | 78 ++++++++++++----------
 1 file changed, 43 insertions(+), 35 deletions(-)

diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index 61c2fbbd801ac..855099427e268 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -206,23 +206,41 @@ DEFINE_COMPILERRT_FUNCTION(__arm_get_current_vg)
   ret
 END_COMPILERRT_FUNCTION(__arm_get_current_vg)
 
+// The diagram below describes the layout used in the following routines:
+// * __arm_sme_state_size
+// * __arm_sme_save
+// * __arm_sme_restore
+//
+// +---------------------------------+
+// |             ...                 |
+// |           ZA buffer             |
+// |             ...                 |
+// +---------------------------------+ <- @96
+// |         ZT0 contents            |
+// +---------------------------------+ <- @32
+// | byte 15-10: zero (reserved)     |
+// | byte   9-8: num_za_save_slices  |           TPIDR2 block
+// | byte   7-0: za_save_buffer      |
+// +---------------------------------+ <- @16
+// | bit 127-1:  zero (reserved)     |           Internal state for __arm_sme_save/restore
+// | bit     0:  VALID               |
+// +---------------------------------+ <- @0
+
 DEFINE_COMPILERRT_FUNCTION(__arm_sme_state_size)
   .variant_pcs __arm_sme_state_size
   BTI_C
 
   // Test if SME is available and ZA state is 'active'.
-  adrp x16, CPU_FEATS_SYMBOL
-  ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
-  tbz x16, #FEAT_SME_BIT, 0f
+  adrp x17, CPU_FEATS_SYMBOL
+  ldr x17, [x17, CPU_FEATS_SYMBOL_OFFSET]
+  tbz x17, #FEAT_SME_BIT, 0f
   mrs x16, SVCR
   tbz x16, #1, 0f
   mrs x16, TPIDR2_EL0
   cbnz x16, 0f
 
   // Size = HAS_FEAT_SME2 ? 96 : 32
-  adrp x16, CPU_FEATS_SYMBOL
-  ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
-  tst x16, #FEAT_SME2_MASK
+  tst x17, #FEAT_SME2_MASK
   mov w17, #32
   mov w16, #96
   csel x16, x17, x16, eq
@@ -242,17 +260,17 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
   .variant_pcs __arm_sme_save
   BTI_C
 
-  // Clear internal state bits
-  stp xzr, xzr, [x0]
-
   // If PTR is not 16-byte aligned, abort.
   tst x0, #0xF
   b.ne 3f
 
+  // Clear internal state bits
+  stp xzr, xzr, [x0]
+
   // If SME is not available, PSTATE.ZA = 0 or TPIDR2_EL0 != 0, return.
-  adrp x16, CPU_FEATS_SYMBOL
-  ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
-  tbz x16, #FEAT_SME_BIT, 2f
+  adrp x17, CPU_FEATS_SYMBOL
+  ldr x17, [x17, CPU_FEATS_SYMBOL_OFFSET]
+  tbz x17, #FEAT_SME_BIT, 2f
   mrs x16, SVCR
   tbz x16, #1, 2f
   mrs x16, TPIDR2_EL0
@@ -262,20 +280,15 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
   mov w16, #1
   str x16, [x0]
 
-  adrp x16, CPU_FEATS_SYMBOL
-  ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
-  tbz x16, #FEAT_SME2_BIT, 0f
+  add x18, x0, #32
+  tbz x17, #FEAT_SME2_BIT, 1f
 
   // Store ZT0 and ZA
   add x16, x0, #32
   str zt0, [x16]
-  add x18, x0, #96
+  add x18, x18, #64
   b 1f
 
-0:
-  // Has SME only
-  add x18, x0, #32
-
 1:
   // Set up lazy-save (x18 = pointer to buffer)
   rdsvl x17, #1
@@ -284,7 +297,6 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
   stur wzr, [x0, #10]
   strh wzr, [x0, #14]
   msr TPIDR2_EL0, x0
-  ret
 
 2:
   // Do nothing
@@ -296,7 +308,7 @@ END_COMPILERRT_FUNCTION(__arm_sme_save)
 
 DEFINE_COMPILERRT_FUNCTION(__arm_sme_restore)
   .cfi_startproc
-  .variant_pcs __arm_sme_save
+  .variant_pcs __arm_sme_restore
   BTI_C
 
   stp x29, x30, [sp, #-16]!
@@ -312,16 +324,16 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_restore)
 
   // If the VALID bit is 0, return early.
   ldr x16, [x0]
-  tbz x16, #0, 2f
+  cbz x16, 2f
 
   // If SME is not available, abort.
-  adrp x16, CPU_FEATS_SYMBOL
-  ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
-  tbz x16, #FEAT_SME_BIT, 3f
+  adrp x17, CPU_FEATS_SYMBOL
+  ldr x17, [x17, CPU_FEATS_SYMBOL_OFFSET]
+  tbz x17, #FEAT_SME_BIT, 3f
 
   // If TPIDR2_EL0 != nullptr, no lazy-save was committed, try to reload zt0.
   mrs x16, TPIDR2_EL0
-  cbnz x16, 0f
+  cbnz x16, 1f
 
   // If TPIDR2_EL0 == nullptr and PSTATE.ZA = 1 (<=> ZA state is 'active'),
   // abort.
@@ -330,20 +342,16 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_restore)
 
   // Restore za.
   smstart za
-  mov x16, x0
   add x0, x0, #16
   bl __arm_tpidr2_restore
-  mov x0, x16
-  msr TPIDR2_EL0, xzr
+  sub x0, x0, #16
 
-0:
+1:
   smstart za
+  msr TPIDR2_EL0, xzr
 
-1:
   // Check if zt0 needs restoring.
-  adrp x16, CPU_FEATS_SYMBOL
-  ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
-  tbz x16, #FEAT_SME2_BIT, 2f
+  tbz x17, #FEAT_SME2_BIT, 2f
 
   // Restore zt0.
   add x16, x0, #32

From 7b50c7a3d740b3042c129fa3ef86ef9a40accdd8 Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Tue, 17 Dec 2024 15:52:35 +0000
Subject: [PATCH 4/5] Further simplification

---
Review notes (reviewer annotations, not part of the commit message):
  * __arm_sme_save: x18 already holds x0 + 32 at this point, so it is used
    directly as the ZT0 store address and the separate add into x16 is
    dropped. The comment is narrowed to "Store ZT0", which is all this
    sequence does.

 compiler-rt/lib/builtins/aarch64/sme-abi.S | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index 855099427e268..b70c7ee1fe991 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -283,9 +283,8 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
   add x18, x0, #32
   tbz x17, #FEAT_SME2_BIT, 1f
 
-  // Store ZT0 and ZA
-  add x16, x0, #32
-  str zt0, [x16]
+  // Store ZT0
+  str zt0, [x18]
   add x18, x18, #64
   b 1f
 
From c0dc2a4522b234f19f332ca10ab91dde6918da36 Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Fri, 20 Dec 2024 12:08:41 +0000
Subject: [PATCH 5/5] Further simplification

---
Review notes (reviewer annotations, not part of the commit message):
  * __arm_sme_save: drops the branch to the label that immediately follows
    it.
  * __arm_sme_save: replaces "stur wzr, #10" + "strh wzr, #14" with
    "strh wzr, #10" + "str wzr, #12". The same six bytes (10-15 of the
    TPIDR2 block) are zeroed.
  * Still open after this series: the '#'-style comment in __arm_sme_save,
    the use of x18 as scratch (see the note on 1/5), and the "Do nothing"
    comment on label 2 of __arm_sme_save.

 compiler-rt/lib/builtins/aarch64/sme-abi.S | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index b70c7ee1fe991..8dbbe061edb9b 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -286,15 +286,14 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
   // Store ZT0
   str zt0, [x18]
   add x18, x18, #64
-  b 1f
 
 1:
   // Set up lazy-save (x18 = pointer to buffer)
   rdsvl x17, #1
   str x18, [x0, #16]!
   strh w17, [x0, #8]
-  stur wzr, [x0, #10]
-  strh wzr, [x0, #14]
+  strh wzr, [x0, #10]
+  str wzr, [x0, #12]
   msr TPIDR2_EL0, x0
 
 2: