diff --git a/src/coreclr/gc/vxsort/isa_detection.cpp b/src/coreclr/gc/vxsort/isa_detection.cpp index 2a60ea01207a9f..1dcb7913a86961 100644 --- a/src/coreclr/gc/vxsort/isa_detection.cpp +++ b/src/coreclr/gc/vxsort/isa_detection.cpp @@ -77,11 +77,11 @@ SupportedISA DetermineSupportedISA() // get xcr0 register DWORD64 xcr0 = _xgetbv(0); - // get OS XState info + // get OS XState info DWORD64 FeatureMask = GetEnabledXStateFeaturesHelper(); // get processor extended feature flag info - __cpuid(reg, 7); + __cpuidex(reg, 7, 0); // check if all of AVX2, AVX512F and AVX512DQ are supported by both processor and OS if ((reg[EBX] & (AVX2 | AVX512F | AVX512DQ)) == (AVX2 | AVX512F | AVX512DQ) && diff --git a/src/coreclr/nativeaot/Runtime/PalRedhawk.h b/src/coreclr/nativeaot/Runtime/PalRedhawk.h index 44ba7ea15b0ec8..986ad2ac2c89f9 100644 --- a/src/coreclr/nativeaot/Runtime/PalRedhawk.h +++ b/src/coreclr/nativeaot/Runtime/PalRedhawk.h @@ -795,6 +795,9 @@ REDHAWK_PALIMPORT int32_t __cdecl _stricmp(const char *string1, const char *stri #ifdef TARGET_UNIX // MSVC directly defines intrinsics for __cpuid and __cpuidex matching the below signatures // We define matching signatures for use on Unix platforms. +// +// IMPORTANT: Unlike MSVC, Unix does not explicitly zero ECX for __cpuid + REDHAWK_PALIMPORT void __cpuid(int cpuInfo[4], int function_id); REDHAWK_PALIMPORT void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id); #else diff --git a/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp b/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp index cfe2502987c675..f4105485715e56 100644 --- a/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp +++ b/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp @@ -1031,7 +1031,7 @@ static void ActivationHandler(int code, siginfo_t* siginfo, void* context) #endif )) { - // Make sure that errno is not modified + // Make sure that errno is not modified int savedErrNo = errno; g_pHijackCallback((NATIVE_CONTEXT*)context, NULL); errno = savedErrNo; @@ -1275,12 +1275,16 @@ extern "C" uint64_t PalGetCurrentThreadIdForLogging() } #if defined(HOST_X86) || defined(HOST_AMD64) +// MSVC directly defines intrinsics for __cpuid and __cpuidex matching the below signatures +// We define matching signatures for use on Unix platforms. +// +// IMPORTANT: Unlike MSVC, Unix does not explicitly zero ECX for __cpuid #if !__has_builtin(__cpuid) REDHAWK_PALEXPORT void __cpuid(int cpuInfo[4], int function_id) { // Based on the Clang implementation provided in cpuid.h: - // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h + // https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/cpuid.h __asm(" cpuid\n" \ : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ @@ -1293,7 +1297,7 @@ REDHAWK_PALEXPORT void __cpuid(int cpuInfo[4], int function_id) REDHAWK_PALEXPORT void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id) { // Based on the Clang implementation provided in cpuid.h: - // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h + // https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/cpuid.h __asm(" cpuid\n" \ : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ @@ -1314,8 +1318,26 @@ REDHAWK_PALEXPORT uint32_t REDHAWK_PALAPI xmmYmmStateSupport() return ((eax & 0x06) == 0x06) ? 
1 : 0;
 }
+#ifndef XSTATE_MASK_AVX512
+#define XSTATE_MASK_AVX512 (0xE0) /* 0b1110_0000 */
+#endif // XSTATE_MASK_AVX512
+
 REDHAWK_PALEXPORT uint32_t REDHAWK_PALAPI avx512StateSupport()
 {
+#if defined(TARGET_APPLE)
+    // macOS has specialized behavior where it reports AVX512 support but doesn't
+    // actually enable AVX512 until the first instruction is executed and does so
+    // on a per-thread basis. It does this by catching the faulting instruction and
+    // checking for the EVEX encoding. The kmov instructions, despite being part
+    // of the AVX512 instruction set, are VEX encoded and don't trigger the enablement.
+    //
+    // See https://github.com/apple/darwin-xnu/blob/main/osfmk/i386/fpu.c#L174
+
+    // TODO-AVX512: Enabling this for OSX requires ensuring threads explicitly trigger
+    // the AVX-512 enablement so that arbitrary usage doesn't cause downstream problems
+
+    return false;
+#else
     DWORD eax;
     __asm(" xgetbv\n" \
         : "=a"(eax) /*output in eax*/\
@@ -1324,6 +1346,7 @@ REDHAWK_PALEXPORT uint32_t REDHAWK_PALAPI avx512StateSupport()
         );
     // check OS has enabled XMM, YMM and ZMM state support
     return ((eax & 0xE6) == 0x0E6) ? 1 : 0;
+#endif
 }

 #endif // defined(HOST_X86) || defined(HOST_AMD64)
diff --git a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkMinWin.cpp b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkMinWin.cpp
index 1d2b9766b4fdc0..b110c8f38accec 100644
--- a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkMinWin.cpp
+++ b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkMinWin.cpp
@@ -365,17 +365,17 @@ REDHAWK_PALEXPORT CONTEXT* PalAllocateCompleteOSContext(_Out_ uint8_t** contextB
     }
 #endif //TARGET_X86

-    // Determine if the processor supports AVX so we could
+    // Determine if the processor supports AVX or AVX512 so we could
     // retrieve extended registers
     DWORD64 FeatureMask = GetEnabledXStateFeatures();
-    if ((FeatureMask & XSTATE_MASK_AVX) != 0)
+    if ((FeatureMask & (XSTATE_MASK_AVX | XSTATE_MASK_AVX512)) != 0)
     {
         context = context | CONTEXT_XSTATE;
     }

     // Retrieve contextSize by passing NULL for Buffer
     DWORD contextSize = 0;
-    ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | XSTATE_MASK_AVX;
+    ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | XSTATE_MASK_AVX | XSTATE_MASK_MPX | XSTATE_MASK_AVX512;
     // The initialize call should fail but return contextSize
     BOOL success = pfnInitializeContext2 ?
         pfnInitializeContext2(NULL, context, NULL, &contextSize, xStateCompactionMask) :
@@ -426,9 +426,9 @@ REDHAWK_PALEXPORT _Success_(return) bool REDHAWK_PALAPI PalGetCompleteThreadCont
 #if defined(TARGET_X86) || defined(TARGET_AMD64)
     // Make sure that AVX feature mask is set, if supported. This should not normally fail.
     // The system silently ignores any feature specified in the FeatureMask which is not enabled on the processor.
-    if (!SetXStateFeaturesMask(pCtx, XSTATE_MASK_AVX))
+    if (!SetXStateFeaturesMask(pCtx, XSTATE_MASK_AVX | XSTATE_MASK_AVX512))
     {
-        _ASSERTE(!"Could not apply XSTATE_MASK_AVX");
+        _ASSERTE(!"Could not apply XSTATE_MASK_AVX | XSTATE_MASK_AVX512");
         return FALSE;
     }
 #endif //defined(TARGET_X86) || defined(TARGET_AMD64)
diff --git a/src/coreclr/pal/inc/pal.h b/src/coreclr/pal/inc/pal.h
index 88dcd4e8d77f7a..37053c772215d8 100644
--- a/src/coreclr/pal/inc/pal.h
+++ b/src/coreclr/pal/inc/pal.h
@@ -1315,6 +1315,43 @@ QueueUserAPC(
     IN HANDLE hThread,
     IN ULONG_PTR dwData);

+#if defined(HOST_X86) || defined(HOST_AMD64)
+// MSVC directly defines intrinsics for __cpuid and __cpuidex matching the below signatures
+// We define matching signatures for use on Unix platforms.
+// +// IMPORTANT: Unlike MSVC, Unix does not explicitly zero ECX for __cpuid + +#if __has_builtin(__cpuid) +extern "C" void __cpuid(int cpuInfo[4], int function_id); +#else +inline void __cpuid(int cpuInfo[4], int function_id) +{ + // Based on the Clang implementation provided in cpuid.h: + // https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/cpuid.h + + __asm(" cpuid\n" \ + : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ + : "0"(function_id) + ); +} +#endif // __cpuid + +#if __has_builtin(__cpuidex) +extern "C" void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id); +#else +inline void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id) +{ + // Based on the Clang implementation provided in cpuid.h: + // https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/cpuid.h + + __asm(" cpuid\n" \ + : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ + : "0"(function_id), "2"(subFunction_id) + ); +} +#endif // __cpuidex +#endif // HOST_X86 || HOST_AMD64 + #ifdef HOST_X86 // @@ -1461,6 +1498,7 @@ typedef struct _KNONVOLATILE_CONTEXT_POINTERS { // #elif defined(HOST_AMD64) + // copied from winnt.h #define CONTEXT_AMD64 0x100000 @@ -1482,11 +1520,33 @@ typedef struct _KNONVOLATILE_CONTEXT_POINTERS { #define CONTEXT_EXCEPTION_REQUEST 0x40000000 #define CONTEXT_EXCEPTION_REPORTING 0x80000000 +#define XSTATE_GSSE (2) +#define XSTATE_AVX (XSTATE_GSSE) +#define XSTATE_AVX512_KMASK (5) +#define XSTATE_AVX512_ZMM_H (6) +#define XSTATE_AVX512_ZMM (7) + +#define XSTATE_MASK_GSSE (UI64(1) << (XSTATE_GSSE)) +#define XSTATE_MASK_AVX (XSTATE_MASK_GSSE) +#define XSTATE_MASK_AVX512 ((UI64(1) << (XSTATE_AVX512_KMASK)) | \ + (UI64(1) << (XSTATE_AVX512_ZMM_H)) | \ + (UI64(1) << (XSTATE_AVX512_ZMM))) + typedef struct DECLSPEC_ALIGN(16) _M128A { ULONGLONG Low; LONGLONG High; } M128A, *PM128A; +typedef struct DECLSPEC_ALIGN(16) _M256 { + M128A Low; + M128A High; +} M256, *PM256; + +typedef struct DECLSPEC_ALIGN(16) _M512 { + M256 Low; + M256 High; +} M512, *PM512; + typedef struct _XMM_SAVE_AREA32 { WORD ControlWord; WORD StatusWord; @@ -1623,6 +1683,82 @@ typedef struct DECLSPEC_ALIGN(16) _CONTEXT { DWORD64 LastBranchFromRip; DWORD64 LastExceptionToRip; DWORD64 LastExceptionFromRip; + + // XSTATE + DWORD64 XStateFeaturesMask; + DWORD64 XStateReserved0; + + // XSTATE_AVX + struct { + M128A Ymm0H; + M128A Ymm1H; + M128A Ymm2H; + M128A Ymm3H; + M128A Ymm4H; + M128A Ymm5H; + M128A Ymm6H; + M128A Ymm7H; + M128A Ymm8H; + M128A Ymm9H; + M128A Ymm10H; + M128A Ymm11H; + M128A Ymm12H; + M128A Ymm13H; + M128A Ymm14H; + M128A Ymm15H; + }; + + // XSTATE_AVX512_KMASK + struct { + DWORD64 KMask0; + DWORD64 KMask1; + DWORD64 KMask2; + DWORD64 KMask3; + DWORD64 KMask4; + DWORD64 KMask5; + DWORD64 KMask6; + DWORD64 KMask7; + }; + + // XSTATE_AVX512_ZMM_H + struct { + M256 Zmm0H; + M256 Zmm1H; + M256 Zmm2H; + M256 Zmm3H; + M256 Zmm4H; + M256 Zmm5H; + M256 Zmm6H; + M256 Zmm7H; + M256 Zmm8H; + M256 Zmm9H; + M256 Zmm10H; + M256 Zmm11H; + M256 Zmm12H; + M256 Zmm13H; + M256 Zmm14H; + M256 Zmm15H; + }; + + // XSTATE_AVX512_ZMM + struct { + M512 Zmm16; + M512 Zmm17; + M512 Zmm18; + M512 Zmm19; + M512 Zmm20; + M512 Zmm21; + M512 Zmm22; + M512 Zmm23; + M512 Zmm24; + M512 Zmm25; + M512 Zmm26; + M512 Zmm27; + M512 Zmm28; + M512 Zmm29; + M512 Zmm30; + M512 Zmm31; + }; } CONTEXT, *PCONTEXT, *LPCONTEXT; // diff --git a/src/coreclr/pal/src/arch/amd64/asmconstants.h b/src/coreclr/pal/src/arch/amd64/asmconstants.h index c23fb7043c77d5..d5a72cf6eda23a 100644 
--- a/src/coreclr/pal/src/arch/amd64/asmconstants.h +++ b/src/coreclr/pal/src/arch/amd64/asmconstants.h @@ -3,6 +3,18 @@ #ifdef HOST_64BIT +#define XSTATE_GSSE (2) +#define XSTATE_AVX (XSTATE_GSSE) +#define XSTATE_AVX512_KMASK (5) +#define XSTATE_AVX512_ZMM_H (6) +#define XSTATE_AVX512_ZMM (7) + +#define XSTATE_MASK_GSSE (1 << (XSTATE_GSSE)) +#define XSTATE_MASK_AVX (XSTATE_MASK_GSSE) +#define XSTATE_MASK_AVX512 ((1 << (XSTATE_AVX512_KMASK)) | \ + (1 << (XSTATE_AVX512_ZMM_H)) | \ + (1 << (XSTATE_AVX512_ZMM))) + // The arch bit is normally set in the flag constants below. Since this is already arch-specific code and the arch bit is not // relevant, the arch bit is excluded from the flag constants below for simpler tests. #define CONTEXT_AMD64 0x100000 @@ -17,7 +29,7 @@ #define CONTEXT_XSTATE 64 -#define CONTEXT_ContextFlags 6*8 +#define CONTEXT_ContextFlags (6*8) #define CONTEXT_SegCs CONTEXT_ContextFlags+8 #define CONTEXT_SegDs CONTEXT_SegCs+2 #define CONTEXT_SegEs CONTEXT_SegDs+2 @@ -49,8 +61,8 @@ #define CONTEXT_R15 CONTEXT_R14+8 #define CONTEXT_Rip CONTEXT_R15+8 #define CONTEXT_FltSave CONTEXT_Rip+8 -#define FLOATING_SAVE_AREA_SIZE 4*8+24*16+96 -#define CONTEXT_Xmm0 CONTEXT_FltSave+10*16 +#define FLOATING_SAVE_AREA_SIZE (4*8)+(24*16)+96 +#define CONTEXT_Xmm0 CONTEXT_FltSave+(10*16) #define CONTEXT_Xmm1 CONTEXT_Xmm0+16 #define CONTEXT_Xmm2 CONTEXT_Xmm1+16 #define CONTEXT_Xmm3 CONTEXT_Xmm2+16 @@ -67,13 +79,19 @@ #define CONTEXT_Xmm14 CONTEXT_Xmm13+16 #define CONTEXT_Xmm15 CONTEXT_Xmm14+16 #define CONTEXT_VectorRegister CONTEXT_FltSave+FLOATING_SAVE_AREA_SIZE -#define CONTEXT_VectorControl CONTEXT_VectorRegister+16*26 +#define CONTEXT_VectorControl CONTEXT_VectorRegister+(16*26) #define CONTEXT_DebugControl CONTEXT_VectorControl+8 #define CONTEXT_LastBranchToRip CONTEXT_DebugControl+8 #define CONTEXT_LastBranchFromRip CONTEXT_LastBranchToRip+8 #define CONTEXT_LastExceptionToRip CONTEXT_LastBranchFromRip+8 #define CONTEXT_LastExceptionFromRip CONTEXT_LastExceptionToRip+8 -#define CONTEXT_Size CONTEXT_LastExceptionFromRip+8 +#define CONTEXT_XStateFeaturesMask CONTEXT_LastExceptionFromRip+8 +#define CONTEXT_XStateReserved0 CONTEXT_XStateFeaturesMask+8 +#define CONTEXT_Ymm0H CONTEXT_XStateReserved0+8 +#define CONTEXT_KMask0 CONTEXT_Ymm0H+(16*16) +#define CONTEXT_Zmm0H CONTEXT_KMask0+(8*8) +#define CONTEXT_Zmm16 CONTEXT_Zmm0H+(32*16) +#define CONTEXT_Size CONTEXT_Zmm16+(64*16) #else // HOST_64BIT diff --git a/src/coreclr/pal/src/arch/amd64/context2.S b/src/coreclr/pal/src/arch/amd64/context2.S index c8688dd63c0946..e4ab8ac1b19c3c 100644 --- a/src/coreclr/pal/src/arch/amd64/context2.S +++ b/src/coreclr/pal/src/arch/amd64/context2.S @@ -107,23 +107,81 @@ LOCAL_LABEL(Done_Restore_CONTEXT_FLOATING_POINT): test BYTE PTR [rdi + CONTEXT_ContextFlags], CONTEXT_XSTATE je LOCAL_LABEL(Done_Restore_CONTEXT_XSTATE) - // Restore the extended state (for now, this is just the upper halves of YMM registers) - vinsertf128 ymm0, ymm0, xmmword ptr [rdi + (CONTEXT_VectorRegister + 0 * 16)], 1 - vinsertf128 ymm1, ymm1, xmmword ptr [rdi + (CONTEXT_VectorRegister + 1 * 16)], 1 - vinsertf128 ymm2, ymm2, xmmword ptr [rdi + (CONTEXT_VectorRegister + 2 * 16)], 1 - vinsertf128 ymm3, ymm3, xmmword ptr [rdi + (CONTEXT_VectorRegister + 3 * 16)], 1 - vinsertf128 ymm4, ymm4, xmmword ptr [rdi + (CONTEXT_VectorRegister + 4 * 16)], 1 - vinsertf128 ymm5, ymm5, xmmword ptr [rdi + (CONTEXT_VectorRegister + 5 * 16)], 1 - vinsertf128 ymm6, ymm6, xmmword ptr [rdi + (CONTEXT_VectorRegister + 6 * 16)], 1 - vinsertf128 ymm7, ymm7, xmmword 
ptr [rdi + (CONTEXT_VectorRegister + 7 * 16)], 1 - vinsertf128 ymm8, ymm8, xmmword ptr [rdi + (CONTEXT_VectorRegister + 8 * 16)], 1 - vinsertf128 ymm9, ymm9, xmmword ptr [rdi + (CONTEXT_VectorRegister + 9 * 16)], 1 - vinsertf128 ymm10, ymm10, xmmword ptr [rdi + (CONTEXT_VectorRegister + 10 * 16)], 1 - vinsertf128 ymm11, ymm11, xmmword ptr [rdi + (CONTEXT_VectorRegister + 11 * 16)], 1 - vinsertf128 ymm12, ymm12, xmmword ptr [rdi + (CONTEXT_VectorRegister + 12 * 16)], 1 - vinsertf128 ymm13, ymm13, xmmword ptr [rdi + (CONTEXT_VectorRegister + 13 * 16)], 1 - vinsertf128 ymm14, ymm14, xmmword ptr [rdi + (CONTEXT_VectorRegister + 14 * 16)], 1 - vinsertf128 ymm15, ymm15, xmmword ptr [rdi + (CONTEXT_VectorRegister + 15 * 16)], 1 + // Restore the YMM state + vinsertf128 ymm0, ymm0, xmmword ptr [rdi + (CONTEXT_Ymm0H + 0 * 16)], 1 + vinsertf128 ymm1, ymm1, xmmword ptr [rdi + (CONTEXT_Ymm0H + 1 * 16)], 1 + vinsertf128 ymm2, ymm2, xmmword ptr [rdi + (CONTEXT_Ymm0H + 2 * 16)], 1 + vinsertf128 ymm3, ymm3, xmmword ptr [rdi + (CONTEXT_Ymm0H + 3 * 16)], 1 + vinsertf128 ymm4, ymm4, xmmword ptr [rdi + (CONTEXT_Ymm0H + 4 * 16)], 1 + vinsertf128 ymm5, ymm5, xmmword ptr [rdi + (CONTEXT_Ymm0H + 5 * 16)], 1 + vinsertf128 ymm6, ymm6, xmmword ptr [rdi + (CONTEXT_Ymm0H + 6 * 16)], 1 + vinsertf128 ymm7, ymm7, xmmword ptr [rdi + (CONTEXT_Ymm0H + 7 * 16)], 1 + vinsertf128 ymm8, ymm8, xmmword ptr [rdi + (CONTEXT_Ymm0H + 8 * 16)], 1 + vinsertf128 ymm9, ymm9, xmmword ptr [rdi + (CONTEXT_Ymm0H + 9 * 16)], 1 + vinsertf128 ymm10, ymm10, xmmword ptr [rdi + (CONTEXT_Ymm0H + 10 * 16)], 1 + vinsertf128 ymm11, ymm11, xmmword ptr [rdi + (CONTEXT_Ymm0H + 11 * 16)], 1 + vinsertf128 ymm12, ymm12, xmmword ptr [rdi + (CONTEXT_Ymm0H + 12 * 16)], 1 + vinsertf128 ymm13, ymm13, xmmword ptr [rdi + (CONTEXT_Ymm0H + 13 * 16)], 1 + vinsertf128 ymm14, ymm14, xmmword ptr [rdi + (CONTEXT_Ymm0H + 14 * 16)], 1 + vinsertf128 ymm15, ymm15, xmmword ptr [rdi + (CONTEXT_Ymm0H + 15 * 16)], 1 + + test BYTE PTR [rdi + CONTEXT_XStateFeaturesMask], XSTATE_MASK_AVX512 + je LOCAL_LABEL(Done_Restore_CONTEXT_XSTATE) + + // MacOS has specialized behavior where it reports AVX512 support but doesnt + // actually enable AVX512 until the first instruction is executed and does so + // on a per thread basis. It does this by catching the faulting instruction and + // checking for the EVEX encoding. 
The kmov instructions, despite being part + // of the AVX512 instruction set are VEX encoded and dont trigger the enablement + // + // See https://github.com/apple/darwin-xnu/blob/main/osfmk/i386/fpu.c#L174 + + // Restore the ZMM_Hi256 state + vinsertf64x4 zmm0, zmm0, ymmword ptr [rdi + (CONTEXT_Zmm0H + 0 * 32)], 1 + vinsertf64x4 zmm1, zmm1, ymmword ptr [rdi + (CONTEXT_Zmm0H + 1 * 32)], 1 + vinsertf64x4 zmm2, zmm2, ymmword ptr [rdi + (CONTEXT_Zmm0H + 2 * 32)], 1 + vinsertf64x4 zmm3, zmm3, ymmword ptr [rdi + (CONTEXT_Zmm0H + 3 * 32)], 1 + vinsertf64x4 zmm4, zmm4, ymmword ptr [rdi + (CONTEXT_Zmm0H + 4 * 32)], 1 + vinsertf64x4 zmm5, zmm5, ymmword ptr [rdi + (CONTEXT_Zmm0H + 5 * 32)], 1 + vinsertf64x4 zmm6, zmm6, ymmword ptr [rdi + (CONTEXT_Zmm0H + 6 * 32)], 1 + vinsertf64x4 zmm7, zmm7, ymmword ptr [rdi + (CONTEXT_Zmm0H + 7 * 32)], 1 + vinsertf64x4 zmm8, zmm8, ymmword ptr [rdi + (CONTEXT_Zmm0H + 8 * 32)], 1 + vinsertf64x4 zmm9, zmm9, ymmword ptr [rdi + (CONTEXT_Zmm0H + 9 * 32)], 1 + vinsertf64x4 zmm10, zmm10, ymmword ptr [rdi + (CONTEXT_Zmm0H + 10 * 32)], 1 + vinsertf64x4 zmm11, zmm11, ymmword ptr [rdi + (CONTEXT_Zmm0H + 11 * 32)], 1 + vinsertf64x4 zmm12, zmm12, ymmword ptr [rdi + (CONTEXT_Zmm0H + 12 * 32)], 1 + vinsertf64x4 zmm13, zmm13, ymmword ptr [rdi + (CONTEXT_Zmm0H + 13 * 32)], 1 + vinsertf64x4 zmm14, zmm14, ymmword ptr [rdi + (CONTEXT_Zmm0H + 14 * 32)], 1 + vinsertf64x4 zmm15, zmm15, ymmword ptr [rdi + (CONTEXT_Zmm0H + 15 * 32)], 1 + + // Restore the Hi16_ZMM state + vmovups zmm16, zmmword ptr [rdi + (CONTEXT_Zmm16 + 0 * 64)] + vmovups zmm17, zmmword ptr [rdi + (CONTEXT_Zmm16 + 1 * 64)] + vmovups zmm18, zmmword ptr [rdi + (CONTEXT_Zmm16 + 2 * 64)] + vmovups zmm19, zmmword ptr [rdi + (CONTEXT_Zmm16 + 3 * 64)] + vmovups zmm20, zmmword ptr [rdi + (CONTEXT_Zmm16 + 4 * 64)] + vmovups zmm21, zmmword ptr [rdi + (CONTEXT_Zmm16 + 5 * 64)] + vmovups zmm22, zmmword ptr [rdi + (CONTEXT_Zmm16 + 6 * 64)] + vmovups zmm23, zmmword ptr [rdi + (CONTEXT_Zmm16 + 7 * 64)] + vmovups zmm24, zmmword ptr [rdi + (CONTEXT_Zmm16 + 8 * 64)] + vmovups zmm25, zmmword ptr [rdi + (CONTEXT_Zmm16 + 9 * 64)] + vmovups zmm26, zmmword ptr [rdi + (CONTEXT_Zmm16 + 10 * 64)] + vmovups zmm27, zmmword ptr [rdi + (CONTEXT_Zmm16 + 11 * 64)] + vmovups zmm28, zmmword ptr [rdi + (CONTEXT_Zmm16 + 12 * 64)] + vmovups zmm29, zmmword ptr [rdi + (CONTEXT_Zmm16 + 13 * 64)] + vmovups zmm30, zmmword ptr [rdi + (CONTEXT_Zmm16 + 14 * 64)] + vmovups zmm31, zmmword ptr [rdi + (CONTEXT_Zmm16 + 15 * 64)] + + // Restore the Opmask state + kmovq k0, qword ptr [rdi + (CONTEXT_KMask0 + 0 * 8)] + kmovq k1, qword ptr [rdi + (CONTEXT_KMask0 + 1 * 8)] + kmovq k2, qword ptr [rdi + (CONTEXT_KMask0 + 2 * 8)] + kmovq k3, qword ptr [rdi + (CONTEXT_KMask0 + 3 * 8)] + kmovq k4, qword ptr [rdi + (CONTEXT_KMask0 + 4 * 8)] + kmovq k5, qword ptr [rdi + (CONTEXT_KMask0 + 5 * 8)] + kmovq k6, qword ptr [rdi + (CONTEXT_KMask0 + 6 * 8)] + kmovq k7, qword ptr [rdi + (CONTEXT_KMask0 + 7 * 8)] + LOCAL_LABEL(Done_Restore_CONTEXT_XSTATE): test BYTE PTR [rdi + CONTEXT_ContextFlags], CONTEXT_CONTROL diff --git a/src/coreclr/pal/src/include/pal/context.h b/src/coreclr/pal/src/include/pal/context.h index c702ae272a76bd..2c860c2b7fac07 100644 --- a/src/coreclr/pal/src/include/pal/context.h +++ b/src/coreclr/pal/src/include/pal/context.h @@ -54,6 +54,10 @@ using asm_sigcontext::_xstate; #include #endif // !HAVE_MACH_EXCEPTIONS else +#if defined(XSTATE_SUPPORTED) || (defined(HOST_AMD64) && defined(HAVE_MACH_EXCEPTIONS)) +bool Xstate_IsAvx512Supported(); +#endif // XSTATE_SUPPORTED 
|| (HOST_AMD64 && HAVE_MACH_EXCEPTIONS) + #ifdef HOST_S390X #define MCREG_PSWMask(mc) ((mc).psw.mask) @@ -354,13 +358,55 @@ using asm_sigcontext::_xstate; #define FPSTATE_RESERVED padding #endif -// The mask for YMM registers presence flag stored in the xfeatures (formerly xstate_bv). On current Linuxes, this definition is -// only in internal headers, so we define it here. The xfeatures (formerly xstate_bv) is extracted from the processor xstate bit -// vector register, so the value is OS independent. -#ifndef XSTATE_YMM -#define XSTATE_YMM 4 +// Presence for various extended state registers is detected via the xfeatures (formerly xstate_bv) field. On some +// Linux distros, this definition is only in internal headers, so we define it here. The masks are extracted from +// the processor xstate bit vector register, so the value is OS independent. + +#ifndef XFEATURE_MASK_YMM +#define XFEATURE_MASK_YMM (1 << XSTATE_AVX) +#endif // XFEATURE_MASK_YMM + +#ifndef XFEATURE_MASK_OPMASK +#define XFEATURE_MASK_OPMASK (1 << XSTATE_AVX512_KMASK) +#endif // XFEATURE_MASK_OPMASK + +#ifndef XFEATURE_MASK_ZMM_Hi256 +#define XFEATURE_MASK_ZMM_Hi256 (1 << XSTATE_AVX512_ZMM_H) +#endif // XFEATURE_MASK_ZMM_Hi256 + +#ifndef XFEATURE_MASK_Hi16_ZMM +#define XFEATURE_MASK_Hi16_ZMM (1 << XSTATE_AVX512_ZMM) +#endif // XFEATURE_MASK_Hi16_ZMM + +#ifndef XFEATURE_MASK_AVX512 +#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK | XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM) +#endif // XFEATURE_MASK_AVX512 + +#if HAVE__FPX_SW_BYTES_WITH_XSTATE_BV +#define FPREG_FpxSwBytes_xfeatures(uc) FPREG_FpxSwBytes(uc)->xstate_bv +#else +#define FPREG_FpxSwBytes_xfeatures(uc) FPREG_FpxSwBytes(uc)->xfeatures #endif +// The internal _xstate struct is exposed as fpstate, xstate_hdr, ymmh. However, in reality this is +// fpstate, xstate_hdr, extended_state_area and "technically" we are supposed to be determining the +// offset and size of each XFEATURE_MASK_* via CPUID. The extended region always starts at offset +// 576 which is the same as the address of ymmh + +#define FPREG_Xstate_ExtendedStateArea_Offset offsetof(_xstate, ymmh) +#define FPREG_Xstate_ExtendedStateArea(uc) (reinterpret_cast(FPREG_Fpstate(uc)) + \ + FPREG_Xstate_ExtendedStateArea_Offset) + +struct Xstate_ExtendedFeature +{ + bool initialized; + uint32_t offset; + uint32_t size; +}; + +#define Xstate_ExtendedFeatures_Count (XSTATE_AVX512_ZMM + 1) +extern Xstate_ExtendedFeature Xstate_ExtendedFeatures[Xstate_ExtendedFeatures_Count]; + inline _fpx_sw_bytes *FPREG_FpxSwBytes(const ucontext_t *uc) { // Bytes 464..511 in the FXSAVE format are available for software to use for any purpose. 
In this case, they are used to @@ -378,7 +424,7 @@ inline UINT32 FPREG_ExtendedSize(const ucontext_t *uc) return FPREG_FpxSwBytes(uc)->extended_size; } -inline bool FPREG_HasYmmRegisters(const ucontext_t *uc) +inline bool FPREG_HasExtendedState(const ucontext_t *uc) { // See comments in /usr/include/x86_64-linux-gnu/asm/sigcontext.h for info on how to detect if extended state is present static_assert_no_msg(FP_XSTATE_MAGIC2_SIZE == sizeof(UINT32)); @@ -401,21 +447,100 @@ inline bool FPREG_HasYmmRegisters(const ucontext_t *uc) return false; } -#if HAVE__FPX_SW_BYTES_WITH_XSTATE_BV - return (FPREG_FpxSwBytes(uc)->xstate_bv & XSTATE_YMM) != 0; -#else - return (FPREG_FpxSwBytes(uc)->xfeatures & XSTATE_YMM) != 0; -#endif + return true; +} + +inline bool FPREG_HasYmmRegisters(const ucontext_t *uc) +{ + if (!FPREG_HasExtendedState(uc)) + { + return false; + } + + return (FPREG_FpxSwBytes_xfeatures(uc) & XFEATURE_MASK_YMM) == XFEATURE_MASK_YMM; +} + +inline void *FPREG_Xstate_ExtendedFeature(const ucontext_t *uc, uint32_t *featureSize, uint32_t featureIndex) +{ + _ASSERTE(featureSize != nullptr); + _ASSERTE(featureIndex < (sizeof(Xstate_ExtendedFeatures) / sizeof(Xstate_ExtendedFeature))); + _ASSERT(FPREG_Xstate_ExtendedStateArea_Offset == 576); + + Xstate_ExtendedFeature *extendedFeature = &Xstate_ExtendedFeatures[featureIndex]; + + if (!extendedFeature->initialized) + { + int cpuidInfo[4]; + + const int CPUID_EAX = 0; + const int CPUID_EBX = 1; + const int CPUID_ECX = 2; + const int CPUID_EDX = 3; + +#ifdef _DEBUG + // We should only be calling this function if we know the extended feature exists + + __cpuid(cpuidInfo, 0x00000000); + _ASSERTE(static_cast(cpuidInfo[CPUID_EAX]) >= 0x0D); + + __cpuidex(cpuidInfo, 0x0000000D, 0x00000000); + _ASSERTE((cpuidInfo[CPUID_EAX] & (1 << featureIndex)) != 0); +#endif // _DEBUG + + __cpuidex(cpuidInfo, 0x0000000D, static_cast(featureIndex)); + + _ASSERTE(static_cast(cpuidInfo[CPUID_EAX]) > 0); + _ASSERTE(static_cast(cpuidInfo[CPUID_EBX]) >= FPREG_Xstate_ExtendedStateArea_Offset); + + extendedFeature->size = static_cast(cpuidInfo[CPUID_EAX]); + extendedFeature->offset = static_cast(cpuidInfo[CPUID_EBX] - FPREG_Xstate_ExtendedStateArea_Offset); + + extendedFeature->initialized = true; + } + + *featureSize = extendedFeature->size; + return (FPREG_Xstate_ExtendedStateArea(uc) + extendedFeature->offset); +} + +inline void *FPREG_Xstate_Ymmh(const ucontext_t *uc, uint32_t *featureSize) +{ + _ASSERTE(FPREG_HasYmmRegisters(uc)); + return FPREG_Xstate_ExtendedFeature(uc, featureSize, XSTATE_AVX); } -inline void *FPREG_Xstate_Ymmh(const ucontext_t *uc) +inline bool FPREG_HasAvx512Registers(const ucontext_t *uc) { - static_assert_no_msg(sizeof(reinterpret_cast<_xstate *>(FPREG_Fpstate(uc))->ymmh.ymmh_space) == 16 * 16); + if (!FPREG_HasExtendedState(uc)) + { + return false; + } + + if ((FPREG_FpxSwBytes_xfeatures(uc) & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512) + { + return false; + } + _ASSERTE(FPREG_HasYmmRegisters(uc)); + return Xstate_IsAvx512Supported(); +} - return reinterpret_cast<_xstate *>(FPREG_Fpstate(uc))->ymmh.ymmh_space; +inline void *FPREG_Xstate_Opmask(const ucontext_t *uc, uint32_t *featureSize) +{ + _ASSERTE(FPREG_HasAvx512Registers(uc)); + return FPREG_Xstate_ExtendedFeature(uc, featureSize, XSTATE_AVX512_KMASK); +} + +inline void *FPREG_Xstate_ZmmHi256(const ucontext_t *uc, uint32_t *featureSize) +{ + _ASSERTE(FPREG_HasAvx512Registers(uc)); + return FPREG_Xstate_ExtendedFeature(uc, featureSize, XSTATE_AVX512_ZMM_H); } +inline void 
*FPREG_Xstate_Hi16Zmm(const ucontext_t *uc, uint32_t *featureSize)
+{
+    _ASSERTE(FPREG_HasAvx512Registers(uc));
+    return FPREG_Xstate_ExtendedFeature(uc, featureSize, XSTATE_AVX512_ZMM);
+}
 #endif // XSTATE_SUPPORTED

 /////////////////////
@@ -706,11 +831,48 @@ inline bool FPREG_HasYmmRegisters(const ucontext_t *uc)
 }

 static_assert_no_msg(offsetof(_STRUCT_X86_AVX_STATE64, __fpu_ymmh0) == offsetof(_STRUCT_X86_AVX512_STATE64, __fpu_ymmh0));
-inline void *FPREG_Xstate_Ymmh(const ucontext_t *uc)
+
+inline void *FPREG_Xstate_Ymmh(const ucontext_t *uc, uint32_t *featureSize)
 {
+    _ASSERTE(FPREG_HasYmmRegisters(uc));
+    _ASSERTE(featureSize != nullptr);
+
+    *featureSize = sizeof(_STRUCT_XMM_REG) * 16;
     return reinterpret_cast<void *>(&((_STRUCT_X86_AVX_STATE64&)FPSTATE(uc)).__fpu_ymmh0);
 }
+
+inline bool FPREG_HasAvx512Registers(const ucontext_t *uc)
+{
+    _ASSERTE((uc->uc_mcsize == sizeof(_STRUCT_MCONTEXT_AVX64)) || (uc->uc_mcsize == sizeof(_STRUCT_MCONTEXT_AVX512_64)));
+    return (uc->uc_mcsize == sizeof(_STRUCT_MCONTEXT_AVX512_64));
+}
+
+inline void *FPREG_Xstate_Opmask(const ucontext_t *uc, uint32_t *featureSize)
+{
+    _ASSERTE(FPREG_HasAvx512Registers(uc));
+    _ASSERTE(featureSize != nullptr);
+
+    *featureSize = sizeof(_STRUCT_OPMASK_REG) * 8;
+    return reinterpret_cast<void *>(&((_STRUCT_X86_AVX512_STATE64&)FPSTATE(uc)).__fpu_k0);
+}
+
+inline void *FPREG_Xstate_ZmmHi256(const ucontext_t *uc, uint32_t *featureSize)
+{
+    _ASSERTE(FPREG_HasAvx512Registers(uc));
+    _ASSERTE(featureSize != nullptr);
+
+    *featureSize = sizeof(_STRUCT_YMM_REG) * 16;
+    return reinterpret_cast<void *>(&((_STRUCT_X86_AVX512_STATE64&)FPSTATE(uc)).__fpu_zmmh0);
+}
+
+inline void *FPREG_Xstate_Hi16Zmm(const ucontext_t *uc, uint32_t *featureSize)
+{
+    _ASSERTE(FPREG_HasAvx512Registers(uc));
+    _ASSERTE(featureSize != nullptr);
+
+    *featureSize = sizeof(_STRUCT_ZMM_REG) * 16;
+    return reinterpret_cast<void *>(&((_STRUCT_X86_AVX512_STATE64&)FPSTATE(uc)).__fpu_zmm16);
+}
 #else //TARGET_OSX

 // For FreeBSD, as found in x86/ucontext.h
diff --git a/src/coreclr/pal/src/thread/context.cpp b/src/coreclr/pal/src/thread/context.cpp
index a17c6c077da3b7..0b0229548984c1 100644
--- a/src/coreclr/pal/src/thread/context.cpp
+++ b/src/coreclr/pal/src/thread/context.cpp
@@ -316,8 +316,81 @@ typedef int __ptrace_request;
     ASSIGN_CONTROL_REGS \
     ASSIGN_INTEGER_REGS \

+#if defined(XSTATE_SUPPORTED) || defined(HOST_AMD64) && defined(HAVE_MACH_EXCEPTIONS)
+bool Xstate_IsAvx512Supported()
+{
+#if defined(HAVE_MACH_EXCEPTIONS)
+    // macOS has specialized behavior where it reports AVX512 support but doesn't
+    // actually enable AVX512 until the first instruction is executed and does so
+    // on a per-thread basis. It does this by catching the faulting instruction and
+    // checking for the EVEX encoding. The kmov instructions, despite being part
+    // of the AVX512 instruction set, are VEX encoded and don't trigger the enablement.
+    //
+    // See https://github.com/apple/darwin-xnu/blob/main/osfmk/i386/fpu.c#L174
+
+    // TODO-AVX512: Enabling this for OSX requires ensuring threads explicitly trigger
+    // the AVX-512 enablement so that arbitrary usage doesn't cause downstream problems
+
+    return false;
+#else
+    static int Xstate_Avx512Supported = -1;
+
+    if (Xstate_Avx512Supported == -1)
+    {
+        int cpuidInfo[4];
+
+        const int CPUID_EAX = 0;
+        const int CPUID_EBX = 1;
+        const int CPUID_ECX = 2;
+        const int CPUID_EDX = 3;
+
+#ifdef _DEBUG
+        // We should only be calling this function if we know the extended feature exists
+        __cpuid(cpuidInfo, 0x00000000);
+        _ASSERTE(static_cast<uint32_t>(cpuidInfo[CPUID_EAX]) >= 0x0D);
+#endif // _DEBUG
+
+        __cpuidex(cpuidInfo, 0x0000000D, 0x00000000);
+
+        if ((cpuidInfo[CPUID_EAX] & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512)
+        {
+            // Knights Landing and Knights Mill shipped without all 5 of the "baseline"
+            // AVX-512 ISAs that are required by x86-64-v4. Specifically they do not include
+            // BW, DQ, or VL. RyuJIT currently requires all 5 ISAs to be present so we will
+            // only enable Avx512 context save/restore when all exist. This requires us to
+            // query which ISAs are actually supported to ensure they're all present.
+
+            __cpuidex(cpuidInfo, 0x00000007, 0x00000000);
+
+            const int requiredAvx512Flags = (1 << 16) |     // AVX512F
+                                            (1 << 17) |     // AVX512DQ
+                                            (1 << 28) |     // AVX512CD
+                                            (1 << 30) |     // AVX512BW
+                                            (1 << 31);      // AVX512VL
+
+            if ((cpuidInfo[CPUID_EBX] & requiredAvx512Flags) == requiredAvx512Flags)
+            {
+                Xstate_Avx512Supported = 1;
+            }
+        }
+
+        if (Xstate_Avx512Supported == -1)
+        {
+            Xstate_Avx512Supported = 0;
+        }
+    }
+
+    return Xstate_Avx512Supported == 1;
+#endif
+}
+#endif // XSTATE_SUPPORTED || defined(HOST_AMD64) && defined(HAVE_MACH_EXCEPTIONS)
+
 #if !HAVE_MACH_EXCEPTIONS
+#ifdef XSTATE_SUPPORTED
+Xstate_ExtendedFeature Xstate_ExtendedFeatures[Xstate_ExtendedFeatures_Count];
+#endif // XSTATE_SUPPORTED
+
 /*++
 Function:
   CONTEXT_GetRegisters
@@ -682,8 +755,34 @@ void CONTEXTToNativeContext(CONST CONTEXT *lpContext, native_context_t *native)
 #if defined(HOST_AMD64) && defined(XSTATE_SUPPORTED)
     if ((lpContext->ContextFlags & CONTEXT_XSTATE) == CONTEXT_XSTATE)
     {
-        _ASSERTE(FPREG_HasYmmRegisters(native));
-        memcpy_s(FPREG_Xstate_Ymmh(native), sizeof(M128A) * 16, lpContext->VectorRegister, sizeof(M128A) * 16);
+        if (FPREG_HasYmmRegisters(native))
+        {
+            _ASSERT((lpContext->XStateFeaturesMask & XSTATE_MASK_AVX) == XSTATE_MASK_AVX);
+
+            uint32_t size;
+            void *dest;
+
+            dest = FPREG_Xstate_Ymmh(native, &size);
+            _ASSERT(size == (sizeof(M128A) * 16));
+            memcpy_s(dest, sizeof(M128A) * 16, &lpContext->Ymm0H, sizeof(M128A) * 16);
+
+            if (FPREG_HasAvx512Registers(native))
+            {
+                _ASSERT((lpContext->XStateFeaturesMask & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512);
+
+                dest = FPREG_Xstate_Opmask(native, &size);
+                _ASSERT(size == (sizeof(DWORD64) * 8));
+                memcpy_s(dest, sizeof(DWORD64) * 8, &lpContext->KMask0, sizeof(DWORD64) * 8);
+
+                dest = FPREG_Xstate_ZmmHi256(native, &size);
+                _ASSERT(size == (sizeof(M256) * 16));
+                memcpy_s(dest, sizeof(M256) * 16, &lpContext->Zmm0H, sizeof(M256) * 16);
+
+                dest = FPREG_Xstate_Hi16Zmm(native, &size);
+                _ASSERT(size == (sizeof(M512) * 16));
+                memcpy_s(dest, sizeof(M512) * 16, &lpContext->Zmm16, sizeof(M512) * 16);
+            }
+        }
     }
 #endif //HOST_AMD64 && XSTATE_SUPPORTED
 }
@@ -853,7 +952,31 @@ void CONTEXTFromNativeContext(const native_context_t *native, LPCONTEXT lpContex
 #if XSTATE_SUPPORTED
     if (FPREG_HasYmmRegisters(native))
     {
-        memcpy_s(lpContext->VectorRegister, sizeof(M128A) * 16, FPREG_Xstate_Ymmh(native), sizeof(M128A) * 16);
+        uint32_t size;
+        void *src;
+
+        src = FPREG_Xstate_Ymmh(native, &size);
+        _ASSERT(size == (sizeof(M128A) * 16));
+        memcpy_s(&lpContext->Ymm0H, sizeof(M128A) * 16, src, sizeof(M128A) * 16);
+
+        lpContext->XStateFeaturesMask |= XSTATE_MASK_AVX;
+
+        if (FPREG_HasAvx512Registers(native))
+        {
+            src = FPREG_Xstate_Opmask(native, &size);
+            _ASSERT(size == (sizeof(DWORD64) * 8));
+            memcpy_s(&lpContext->KMask0, sizeof(DWORD64) * 8, src, sizeof(DWORD64) * 8);
+
+            src = FPREG_Xstate_ZmmHi256(native, &size);
+            _ASSERT(size == (sizeof(M256) * 16));
+            memcpy_s(&lpContext->Zmm0H, sizeof(M256) * 16, src, sizeof(M256) * 16);
+
+            src = FPREG_Xstate_Hi16Zmm(native, &size);
+            _ASSERT(size == (sizeof(M512) * 16));
+            memcpy_s(&lpContext->Zmm16, sizeof(M512) * 16, src, sizeof(M512) * 16);
+
+            lpContext->XStateFeaturesMask |= XSTATE_MASK_AVX512;
+        }
     }
     else
 #endif // XSTATE_SUPPORTED
@@ -1212,23 +1335,29 @@ CONTEXT_GetThreadContextFromPort(
         x86_avx512_state64_t State;

-        StateFlavor = x86_AVX_STATE64;
-        StateCount = sizeof(x86_avx_state64_t) / sizeof(natural_t);
+        StateFlavor = x86_AVX512_STATE64;
+        StateCount = sizeof(x86_avx512_state64_t) / sizeof(natural_t);
         MachRet = thread_get_state(Port, StateFlavor, (thread_state_t)&State, &StateCount);
+
         if (MachRet != KERN_SUCCESS)
         {
-            // The AVX state is not available, try to get the AVX512 state.
-            StateFlavor = x86_AVX512_STATE64;
-            StateCount = sizeof(x86_avx512_state64_t) / sizeof(natural_t);
+            // The AVX512 state is not available, try to get the AVX state.
+            lpContext->XStateFeaturesMask &= ~XSTATE_MASK_AVX512;
+
+            StateFlavor = x86_AVX_STATE64;
+            StateCount = sizeof(x86_avx_state64_t) / sizeof(natural_t);
             MachRet = thread_get_state(Port, StateFlavor, (thread_state_t)&State, &StateCount);
+
            if (MachRet != KERN_SUCCESS)
            {
-                // Neither the AVX nor the AVX512 state is not available, try to get at least the FLOAT state.
+                // Neither the AVX512 nor the AVX state is available, try to get at least the FLOAT state.
+                lpContext->XStateFeaturesMask &= ~XSTATE_MASK_AVX;
                 lpContext->ContextFlags &= ~(CONTEXT_XSTATE & CONTEXT_AREA_MASK);
                 StateFlavor = x86_FLOAT_STATE64;
                 StateCount = sizeof(x86_float_state64_t) / sizeof(natural_t);
                 MachRet = thread_get_state(Port, StateFlavor, (thread_state_t)&State, &StateCount);
+
                 if (MachRet != KERN_SUCCESS)
                 {
                     // We were unable to get any floating point state. This case was observed on OSX with AVX512 capable processors.
@@ -1306,18 +1435,41 @@ CONTEXT_GetThreadContextFromThreadState(
            }
            break;

-        case x86_AVX_STATE64:
         case x86_AVX512_STATE64:
+        {
+            if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK)
+            {
+                if (Xstate_IsAvx512Supported())
+                {
+                    x86_avx512_state64_t *pState = (x86_avx512_state64_t *)threadState;
+
+                    memcpy(&lpContext->KMask0, &pState->__fpu_k0, sizeof(_STRUCT_OPMASK_REG) * 8);
+                    memcpy(&lpContext->Zmm0H, &pState->__fpu_zmmh0, sizeof(_STRUCT_YMM_REG) * 16);
+                    memcpy(&lpContext->Zmm16, &pState->__fpu_zmm16, sizeof(_STRUCT_ZMM_REG) * 16);
+
+                    lpContext->XStateFeaturesMask |= XSTATE_MASK_AVX512;
+                }
+            }
+
+            // Intentional fall-through, the AVX512 states are supersets of the AVX state
+            FALLTHROUGH;
+        }
+
+        case x86_AVX_STATE64:
+        {
             if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK)
             {
                 x86_avx_state64_t *pState = (x86_avx_state64_t *)threadState;
-                memcpy(&lpContext->VectorRegister, &pState->__fpu_ymmh0, 16 * 16);
+                memcpy(&lpContext->Ymm0H, &pState->__fpu_ymmh0, sizeof(_STRUCT_XMM_REG) * 16);
+                lpContext->XStateFeaturesMask |= XSTATE_MASK_AVX;
             }

             // Intentional fall-through, the AVX states are supersets of the FLOAT state
             FALLTHROUGH;
+        }

         case x86_FLOAT_STATE64:
+        {
             if (lpContext->ContextFlags & CONTEXT_FLOATING_POINT & CONTEXT_AREA_MASK)
             {
                 x86_float_state64_t *pState = (x86_float_state64_t *)threadState;
@@ -1343,6 +1495,8 @@ CONTEXT_GetThreadContextFromThreadState(
                 memcpy(&lpContext->Xmm0, &pState->__fpu_xmm0, 16 * 16);
             }
             break;
+        }
+
         case x86_THREAD_STATE:
         {
             x86_thread_state_t *pState = (x86_thread_state_t *)threadState;
@@ -1453,21 +1607,33 @@ CONTEXT_SetThreadContextOnPort(
     if (lpContext->ContextFlags & CONTEXT_ALL_FLOATING & CONTEXT_AREA_MASK)
     {
-
 #ifdef HOST_AMD64
 #ifdef XSTATE_SUPPORTED
         // We're relying on the fact that the initial portion of
-        // x86_avx_state64_t is identical to x86_float_state64_t.
+        // x86_avx_state64_t is identical to x86_float_state64_t
+        // and x86_avx512_state64_t to x86_avx_state64_t.
         // Check a few fields to make sure the assumption is correct.
static_assert_no_msg(sizeof(x86_avx_state64_t) > sizeof(x86_float_state64_t)); + static_assert_no_msg(sizeof(x86_avx512_state64_t) > sizeof(x86_avx_state64_t)); static_assert_no_msg(offsetof(x86_avx_state64_t, __fpu_fcw) == offsetof(x86_float_state64_t, __fpu_fcw)); static_assert_no_msg(offsetof(x86_avx_state64_t, __fpu_xmm0) == offsetof(x86_float_state64_t, __fpu_xmm0)); + static_assert_no_msg(offsetof(x86_avx512_state64_t, __fpu_fcw) == offsetof(x86_float_state64_t, __fpu_fcw)); + static_assert_no_msg(offsetof(x86_avx512_state64_t, __fpu_xmm0) == offsetof(x86_float_state64_t, __fpu_xmm0)); - x86_avx_state64_t State; + x86_avx512_state64_t State; if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK) { - StateFlavor = x86_AVX_STATE64; - StateCount = sizeof(State) / sizeof(natural_t); + if ((lpContext->XStateFeaturesMask & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512) + { + StateFlavor = x86_AVX512_STATE64; + StateCount = sizeof(x86_avx512_state64_t) / sizeof(natural_t); + } + else + { + _ASSERT((lpContext->XStateFeaturesMask & XSTATE_MASK_AVX) == XSTATE_MASK_AVX); + StateFlavor = x86_AVX_STATE64; + StateCount = sizeof(x86_avx_state64_t) / sizeof(natural_t); + } } else { @@ -1520,7 +1686,15 @@ CONTEXT_SetThreadContextOnPort( #if defined(HOST_AMD64) && defined(XSTATE_SUPPORTED) if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK) { - memcpy(&State.__fpu_ymmh0, lpContext->VectorRegister, 16 * 16); + if ((lpContext->XStateFeaturesMask & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512) + { + memcpy(&State.__fpu_k0, &lpContext->KMask0, sizeof(_STRUCT_OPMASK_REG) * 8); + memcpy(&State.__fpu_zmmh0, &lpContext->Zmm0H, sizeof(_STRUCT_YMM_REG) * 16); + memcpy(&State.__fpu_zmm16, &lpContext->Zmm16, sizeof(_STRUCT_ZMM_REG) * 16); + } + + _ASSERT((lpContext->XStateFeaturesMask & XSTATE_MASK_AVX) == XSTATE_MASK_AVX); + memcpy(&State.__fpu_ymmh0, &lpContext->Ymm0H, sizeof(_STRUCT_XMM_REG) * 16); } #endif diff --git a/src/coreclr/vm/amd64/asmconstants.h b/src/coreclr/vm/amd64/asmconstants.h index cbd0eae0c79df4..22082547234275 100644 --- a/src/coreclr/vm/amd64/asmconstants.h +++ b/src/coreclr/vm/amd64/asmconstants.h @@ -285,7 +285,15 @@ ASMCONSTANTS_C_ASSERT(OFFSETOF__MethodDesc__m_wFlags == offsetof(MethodDesc, m_w ASMCONSTANTS_C_ASSERT(OFFSETOF__VASigCookie__pNDirectILStub == offsetof(VASigCookie, pNDirectILStub)); -#define SIZEOF__CONTEXT (8*6 + 4*2 + 2*6 + 4 + 8*6 + 8*16 + 8 + /*XMM_SAVE_AREA32*/(2*2 + 1*2 + 2 + 4 + 2*2 + 4 + 2*2 + 4*2 + 16*8 + 16*16 + 1*96) + 26*16 + 8 + 8*5) +#if defined(UNIX_AMD64_ABI) && !defined(HOST_WINDOWS) +// Expression is too complicated, is currently: +// (8*6 + 4*2 + 2*6 + 4 + 8*6 + 8*16 + 8 + /*XMM_SAVE_AREA32*/(2*2 + 1*2 + 2 + 4 + 2*2 + 4 + 2*2 + 4*2 + 16*8 + 16*16 + 1*96) + 26*16 + 8 + 8*5 + /*XSTATE*/ + 8 + 8 + /*XSTATE_AVX*/ 16*16 + /*XSTATE_AVX512_KMASK*/ 8*8 + /*XSTATE_AVX512_ZMM_H*/ 32*16 + /*XSTATE_AVX512_ZMM*/ 64*16) +#define SIZEOF__CONTEXT (3104) +#else +// Expression is too complicated, is currently: +// (8*6 + 4*2 + 2*6 + 4 + 8*6 + 8*16 + 8 + /*XMM_SAVE_AREA32*/(2*2 + 1*2 + 2 + 4 + 2*2 + 4 + 2*2 + 4*2 + 16*8 + 16*16 + 1*96) + 26*16 + 8 + 8*5) +#define SIZEOF__CONTEXT (1232) +#endif ASMCONSTANTS_C_ASSERT(SIZEOF__CONTEXT == sizeof(CONTEXT)); diff --git a/src/coreclr/vm/amd64/unixstubs.cpp b/src/coreclr/vm/amd64/unixstubs.cpp index 8fdcfd15a9b3eb..d5bb054c9be5b7 100644 --- a/src/coreclr/vm/amd64/unixstubs.cpp +++ b/src/coreclr/vm/amd64/unixstubs.cpp @@ -10,32 +10,6 @@ extern "C" PORTABILITY_ASSERT("Implement for PAL"); } -#if 
!__has_builtin(__cpuid)
-    void __cpuid(int cpuInfo[4], int function_id)
-    {
-        // Based on the Clang implementation provided in cpuid.h:
-        // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h
-
-        __asm(" cpuid\n" \
-            : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \
-            : "0"(function_id)
-            );
-    }
-#endif
-
-#if !__has_builtin(__cpuidex)
-    void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id)
-    {
-        // Based on the Clang implementation provided in cpuid.h:
-        // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h
-
-        __asm(" cpuid\n" \
-            : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \
-            : "0"(function_id), "2"(subFunction_id)
-            );
-    }
-#endif
-
     DWORD xmmYmmStateSupport()
     {
         DWORD eax;
@@ -48,8 +22,26 @@ extern "C"
         return ((eax & 0x06) == 0x06) ? 1 : 0;
     }

+#ifndef XSTATE_MASK_AVX512
+#define XSTATE_MASK_AVX512 (0xE0) /* 0b1110_0000 */
+#endif // XSTATE_MASK_AVX512
+
     DWORD avx512StateSupport()
     {
+#if defined(TARGET_OSX)
+        // macOS has specialized behavior where it reports AVX512 support but doesn't
+        // actually enable AVX512 until the first instruction is executed and does so
+        // on a per-thread basis. It does this by catching the faulting instruction and
+        // checking for the EVEX encoding. The kmov instructions, despite being part
+        // of the AVX512 instruction set, are VEX encoded and don't trigger the enablement.
+        //
+        // See https://github.com/apple/darwin-xnu/blob/main/osfmk/i386/fpu.c#L174
+
+        // TODO-AVX512: Enabling this for OSX requires ensuring threads explicitly trigger
+        // the AVX-512 enablement so that arbitrary usage doesn't cause downstream problems
+
+        return false;
+#else
         DWORD eax;
         __asm(" xgetbv\n" \
             : "=a"(eax) /*output in eax*/\
@@ -58,6 +50,7 @@ extern "C"
             );
         // check OS has enabled XMM, YMM and ZMM state support
         return ((eax & 0x0E6) == 0x0E6) ? 1 : 0;
+#endif
     }

     void STDMETHODCALLTYPE JIT_ProfilerEnterLeaveTailcallStub(UINT_PTR ProfilerHandle)
diff --git a/src/coreclr/vm/cgensys.h b/src/coreclr/vm/cgensys.h
index 75f266be916023..b7cc4c715a51b0 100644
--- a/src/coreclr/vm/cgensys.h
+++ b/src/coreclr/vm/cgensys.h
@@ -93,13 +93,6 @@ inline void GetSpecificCpuInfo(CORINFO_CPU * cpuInfo)
 #endif // !TARGET_X86

 #if (defined(TARGET_X86) || defined(TARGET_AMD64))
-#ifdef TARGET_UNIX
-// MSVC directly defines intrinsics for __cpuid and __cpuidex matching the below signatures
-// We define matching signatures for use on Unix platforms.
- -extern "C" void __cpuid(int cpuInfo[4], int function_id); -extern "C" void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id); -#endif // TARGET_UNIX extern "C" DWORD xmmYmmStateSupport(); extern "C" DWORD avx512StateSupport(); #endif diff --git a/src/coreclr/vm/i386/cgenx86.cpp b/src/coreclr/vm/i386/cgenx86.cpp index 020593b8735361..8d17ace05de965 100644 --- a/src/coreclr/vm/i386/cgenx86.cpp +++ b/src/coreclr/vm/i386/cgenx86.cpp @@ -1133,32 +1133,6 @@ extern "C" DWORD avx512StateSupport() #else // !TARGET_UNIX -#if !__has_builtin(__cpuid) -void __cpuid(int cpuInfo[4], int function_id) -{ - // Based on the Clang implementation provided in cpuid.h: - // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h - - __asm(" cpuid" - : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ - : "0"(function_id) - ); -} -#endif - -#if !__has_builtin(__cpuidex) -void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id) -{ - // Based on the Clang implementation provided in cpuid.h: - // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h - - __asm(" cpuid" - : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ - : "0"(function_id), "2"(subFunction_id) - ); -} -#endif - extern "C" DWORD xmmYmmStateSupport() { DWORD eax;