1
- import ctypes , subprocess , pathlib , tempfile
1
+ import ctypes , ctypes .util , struct , platform , tempfile , pathlib , subprocess
2
+ from mmap import mmap , PROT_READ , PROT_WRITE , PROT_EXEC , MAP_ANON , MAP_PRIVATE
3
+ from tinygrad .helpers import OSX , mv_address , cpu_time_execution , cpu_objdump
2
4
from tinygrad .device import Compiled , Compiler , MallocAllocator
3
- from tinygrad .helpers import cpu_time_execution , cpu_objdump
5
+ from tinygrad .runtime . support . elf import elf_loader , relocate
4
6
from tinygrad .renderer .cstyle import ClangRenderer
5
7
8
+ # NOTE: MAP_JIT was only added to Python's mmap module in 3.13, so it is defined manually here for older versions
9
+ MAP_JIT = 0x0800
10
+
11
+ # Used by ops_dsp.py
6
12
class ClangCompiler (Compiler ):
7
13
def __init__ (self , cachekey = "compile_clang" , args :list [str ]| None = None , objdump_tool = 'objdump' ):
8
14
self .args = ['-march=native' ] if args is None else args
@@ -18,15 +24,60 @@ def compile(self, src:str) -> bytes:
18
24
19
25
def disassemble (self , lib :bytes ): return cpu_objdump (lib , self .objdump_tool )
20
26
21
- class ClangProgram :
27
class ClangJITCompiler(Compiler):
  """Compiles C source with clang into a flat, already-relocated code image."""

  def __init__(self, cachekey="compile_clang_jit"):
    super().__init__(cachekey)

  def compile(self, src:str) -> bytes:
    """Compile `src` to an ELF object via clang, apply its relocations and return the raw image bytes."""
    machine = platform.machine()
    # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
    base_flags = ['-march=native', f'--target={machine}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
    # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm,
    # don't use it
    extra_flags = ['-ffixed-x18'] if machine == 'arm64' else []
    cmd = ['clang', '-c', '-x', 'c', *base_flags, *extra_flags, '-', '-o', '-']
    obj = subprocess.check_output(cmd, input=src.encode('utf-8'))
    image, _, relocs = elf_loader(obj)
    # This is needed because we have an object file, not a .so that has all internal references (like loads of constants from .rodata) resolved.
    for ploc, tgt, r_type, r_addend in relocs:
      word = slice(ploc, ploc + 4)
      (instr,) = struct.unpack("<I", image[word])
      patched = relocate(instr, ploc, tgt + r_addend, r_type)
      image[word] = struct.pack("<I", patched)
    return bytes(image)

  def disassemble(self, lib):
    """Print a capstone disassembly of `lib`, one instruction per line, with addresses starting at 0."""
    import capstone
    arch = platform.machine()
    if arch == 'x86_64':
      cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
    elif arch in ('aarch64', 'arm64'):
      cs = capstone.Cs(capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM)
    else:
      raise NotImplementedError(f"Capstone disassembly isn't supported for {arch}")
    for instr in cs.disasm(lib, 0):
      print(f"{instr.address:#08x}: {instr.mnemonic}\t{instr.op_str}")
50
+
51
+ # CPUProgram is a jit/shellcode program that can be just mmapped and jumped to
52
class CPUProgram:
  """A jit/shellcode program: the compiled image is copied into an executable anonymous mapping and called directly."""

  # Runtime library that provides __clear_cache (and pthread_jit_write_protect_np on macos).
  # ctypes.util.find_library returns None when it cannot locate the library (on linux it relies on external tools such as ldconfig/gcc),
  # and ctypes.CDLL(None) would then silently open the main program instead. Fall back to the well-known soname of libgcc_s on non-macos.
  helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'gcc_s') or (None if OSX else 'libgcc_s.so.1'))

  def __init__(self, name:str, lib:bytes):
    """Map `lib` into executable memory and build a callable entry point at the start of the mapping.

    `name` is accepted for interface compatibility with other program classes; it is not used here.
    """
    # On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
    # MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
    self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)

    if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
    self.mem.write(lib)
    if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)

    # __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
    # libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
    # it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
    # Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
    base = mv_address(self.mem)
    CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(base), ctypes.c_void_p(base + len(lib)))

    self.fxn = ctypes.CFUNCTYPE(None)(base)

  def __call__(self, *bufs, vals=(), wait=False):
    """Call the mapped code with `bufs` followed by `vals` as arguments; the result comes from cpu_time_execution (timing enabled by `wait`)."""
    args = list(bufs) + list(vals)
    # NOTE: replace this by --target={host's triple}-elf in clang args once we only support macos sequoia and later.
    # Apple relaxes abi requirement for stack arguments to always be at least 8 byte aligned on arm64
    # https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms
    # This hack is required because clang/llvm bug doesn't allow us to just use {host's triple}+'-elf' (relocation failures)
    # The bug was fixed in https://github.com/llvm/llvm-project/commit/454cc36630296262cdb6360b60f90a64a97f7f1a but was only backported to xcode 16+
    # Only arguments past the first 8 are widened: plain python ints among them are passed as explicit 64-bit values.
    if platform.machine() == "arm64" and OSX: args = args[:8] + [ctypes.c_int64(a) if isinstance(a, int) else a for a in args[8:]]
    return cpu_time_execution(lambda: self.fxn(*args), enable=wait)
30
81
31
82
class ClangDevice(Compiled):
  """Compiled device that renders with ClangRenderer, compiles with ClangJITCompiler and runs kernels through CPUProgram."""

  def __init__(self, device:str):
    renderer = ClangRenderer()
    compiler = ClangJITCompiler()
    super().__init__(device, MallocAllocator, renderer, compiler, CPUProgram)
0 commit comments