diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake
index 1f63e158409ca..706a1ff7eeb6d 100644
--- a/compiler-rt/cmake/builtin-config-ix.cmake
+++ b/compiler-rt/cmake/builtin-config-ix.cmake
@@ -43,8 +43,9 @@ asm(\"cas w0, w1, [x2]\");
 builtin_check_c_compiler_source(COMPILER_RT_HAS_AARCH64_SME
 "
 void foo(void) __arm_streaming_compatible {
-  asm(\".arch armv9-a+sme\");
+  asm(\".arch armv9-a+sme2\");
   asm(\"smstart\");
+  asm(\"ldr zt0, [sp]\");
 }
 ")
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi-assert.c b/compiler-rt/lib/builtins/aarch64/sme-abi-assert.c
index 4333353f8d2d1..37305ceb39c50 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi-assert.c
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi-assert.c
@@ -8,3 +8,4 @@
 #include "../cpu_model/AArch64CPUFeatures.inc"
 _Static_assert(FEAT_SVE == 30, "sme-abi.S assumes FEAT_SVE = 30");
 _Static_assert(FEAT_SME == 42, "sme-abi.S assumes FEAT_SME = 42");
+_Static_assert(FEAT_SME2 == 57, "sme-abi.S assumes FEAT_SME2 = 57");
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index 45bd221655fd6..8dbbe061edb9b 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -10,6 +10,8 @@

 .set FEAT_SVE_BIT, 30
 .set FEAT_SME_BIT, 42
+.set FEAT_SME2_BIT, 57
+.set FEAT_SME2_MASK, 1 << 57
 .set SVCR_PSTATE_SM_BIT, 0

 #if !defined(__APPLE__)
@@ -22,7 +24,7 @@
 #define CPU_FEATS_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_cpu_features)@pageoff
 #endif

-.arch armv9-a+sme
+.arch armv9-a+sme2

 // Utility function which calls a system's abort() routine. Because the function
 // is streaming-compatible it should disable streaming-SVE mode before calling
@@ -204,6 +206,169 @@ DEFINE_COMPILERRT_FUNCTION(__arm_get_current_vg)
   ret
 END_COMPILERRT_FUNCTION(__arm_get_current_vg)

+// The diagram below describes the layout used in the following routines:
+// * __arm_sme_state_size
+// * __arm_sme_save
+// * __arm_sme_restore
+//
+// +---------------------------------+
+// |              ...                |
+// |           ZA buffer             |
+// |              ...                |
+// +---------------------------------+ <- @96
+// |          ZT0 contents           |
+// +---------------------------------+ <- @32
+// | byte 15-10: zero (reserved)     |
+// | byte   9-8: num_za_save_slices  |  TPIDR2 block
+// | byte   7-0: za_save_buffer      |
+// +---------------------------------+ <- @16
+// | bit  127-1: zero (reserved)     |  Internal state for __arm_sme_save/restore
+// | bit      0: VALID               |
+// +---------------------------------+ <- @0
+
+DEFINE_COMPILERRT_FUNCTION(__arm_sme_state_size)
+  .variant_pcs __arm_sme_state_size
+  BTI_C
+
+  // Test if SME is available and ZA state is 'active'.
+  adrp x17, CPU_FEATS_SYMBOL
+  ldr x17, [x17, CPU_FEATS_SYMBOL_OFFSET]
+  tbz x17, #FEAT_SME_BIT, 0f
+  mrs x16, SVCR
+  tbz x16, #1, 0f
+  mrs x16, TPIDR2_EL0
+  cbnz x16, 0f
+
+  // Size = HAS_FEAT_SME2 ? 96 : 32
+  tst x17, #FEAT_SME2_MASK
+  mov w17, #32
+  mov w16, #96
+  csel x16, x17, x16, eq
+
+  // Size = Size + (SVLB * SVLB)
+  rdsvl x17, #1
+  madd x0, x17, x17, x16
+  ret
+
+0:
+  // Default case, 16 bytes is minimum (to encode VALID bit, multiple of 16 bytes)
+  mov w0, #16
+  ret
+END_COMPILERRT_FUNCTION(__arm_sme_state_size)
+
+DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
+  .variant_pcs __arm_sme_save
+  BTI_C
+
+  // If PTR is not 16-byte aligned, abort.
+  tst x0, #0xF
+  b.ne 3f
+
+  // Clear internal state bits
+  stp xzr, xzr, [x0]
+
+  // If SME is not available, PSTATE.ZA = 0 or TPIDR2_EL0 != 0, return.
+  adrp x17, CPU_FEATS_SYMBOL
+  ldr x17, [x17, CPU_FEATS_SYMBOL_OFFSET]
+  tbz x17, #FEAT_SME_BIT, 2f
+  mrs x16, SVCR
+  tbz x16, #1, 2f
+  mrs x16, TPIDR2_EL0
+  cbnz x16, 2f
+
+  // ZA or ZT0 need saving, we can now set internal VALID bit to 1
+  mov w16, #1
+  str x16, [x0]
+
+  add x18, x0, #32
+  tbz x17, #FEAT_SME2_BIT, 1f
+
+  // Store ZT0
+  str zt0, [x18]
+  add x18, x18, #64
+
+1:
+  // Set up lazy-save (x18 = pointer to buffer)
+  rdsvl x17, #1
+  str x18, [x0, #16]!
+  strh w17, [x0, #8]
+  strh wzr, [x0, #10]
+  str wzr, [x0, #12]
+  msr TPIDR2_EL0, x0
+
+2:
+  // Do nothing
+  ret
+
+3:
+  b SYMBOL_NAME(do_abort)
+END_COMPILERRT_FUNCTION(__arm_sme_save)
+
+DEFINE_COMPILERRT_FUNCTION(__arm_sme_restore)
+  .cfi_startproc
+  .variant_pcs __arm_sme_restore
+  BTI_C
+
+  stp x29, x30, [sp, #-16]!
+  .cfi_def_cfa_offset 16
+  mov x29, sp
+  .cfi_def_cfa w29, 16
+  .cfi_offset w30, -8
+  .cfi_offset w29, -16
+
+  // If PTR is not 16-byte aligned, abort.
+  tst x0, #0xF
+  b.ne 3f
+
+  // If the VALID bit is 0, return early.
+  ldr x16, [x0]
+  cbz x16, 2f
+
+  // If SME is not available, abort.
+  adrp x17, CPU_FEATS_SYMBOL
+  ldr x17, [x17, CPU_FEATS_SYMBOL_OFFSET]
+  tbz x17, #FEAT_SME_BIT, 3f
+
+  // If TPIDR2_EL0 != nullptr, no lazy-save was committed, try to reload zt0.
+  mrs x16, TPIDR2_EL0
+  cbnz x16, 1f
+
+  // If TPIDR2_EL0 == nullptr and PSTATE.ZA = 1 (<=> ZA state is 'active'),
+  // abort.
+  mrs x16, SVCR
+  tbnz x16, #1, 3f
+
+  // Restore za.
+  smstart za
+  add x0, x0, #16
+  bl __arm_tpidr2_restore
+  sub x0, x0, #16
+
+1:
+  smstart za
+  msr TPIDR2_EL0, xzr
+
+  // Check if zt0 needs restoring.
+  tbz x17, #FEAT_SME2_BIT, 2f
+
+  // Restore zt0.
+  add x16, x0, #32
+  ldr zt0, [x16]
+
+2:
+  // Do nothing
+  .cfi_def_cfa wsp, 16
+  ldp x29, x30, [sp], #16
+  .cfi_def_cfa_offset 0
+  .cfi_restore w30
+  .cfi_restore w29
+  ret
+
+3:
+  b SYMBOL_NAME(do_abort)
+  .cfi_endproc
+END_COMPILERRT_FUNCTION(__arm_sme_restore)
+
 NO_EXEC_STACK_DIRECTIVE

 // GNU property note for BTI and PAC
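
For context, the sketch below shows how the three new entry points might be combined by a caller: query the required size with __arm_sme_state_size, hand __arm_sme_save a 16-byte-aligned buffer of that size, run code that may clobber ZA/ZT0, then hand the same buffer to __arm_sme_restore. This is a minimal illustration, not part of the patch: call_with_za_preserved and fn are hypothetical names, the plain C prototypes below do not capture the variant PCS declared above, and in practice a compiler is expected to emit this sequence rather than user code calling the routines by hand.

#include <stdint.h>
#include <stdlib.h>

/* Illustrative prototypes only; the real entry points use a variant PCS
   (see .variant_pcs in the assembly), which plain C declarations do not
   express. */
uint64_t __arm_sme_state_size(void);
void __arm_sme_save(void *blk);
void __arm_sme_restore(void *blk);

/* Hypothetical wrapper: preserve ZA/ZT0 state around an arbitrary call. */
void call_with_za_preserved(void (*fn)(void)) {
  /* At least 16 bytes and always a multiple of 16. */
  uint64_t size = __arm_sme_state_size();

  /* The routines abort if the buffer is not 16-byte aligned. */
  void *blk = aligned_alloc(16, size);
  if (!blk)
    abort();

  /* Sets the VALID bit, saves ZT0 (with SME2) and arms a lazy-save of ZA. */
  __arm_sme_save(blk);

  /* May clobber ZA/ZT0; callees following the SME lazy-save ABI commit the
     save into the buffer before touching ZA. */
  fn();

  /* Reloads ZT0 and restores ZA from the buffer when needed. */
  __arm_sme_restore(blk);

  free(blk);
}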