diff --git a/src/gc-debug.c b/src/gc-debug.c index 7d6ca8ece2ecf..744401a2a5e4f 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -200,19 +200,17 @@ static void gc_verify_track(jl_ptls_t ptls) { jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; do { - jl_gc_mark_sp_t sp; - gc_mark_sp_init(gc_cache, &sp); arraylist_push(&lostval_parents_done, lostval); jl_safe_printf("Now looking for %p =======\n", lostval); clear_mark(GC_CLEAN); - gc_mark_queue_all_roots(ptls, &sp); - gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); + gc_mark_queue_all_roots(ptls); + gc_mark_queue_finlist(gc_cache, &to_finalize, 0); for (int i = 0;i < jl_n_threads;i++) { jl_ptls_t ptls2 = jl_all_tls_states[i]; - gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); + gc_mark_queue_finlist(gc_cache, &ptls2->finalizers, 0); } - gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); - gc_mark_loop(ptls, sp); + gc_mark_queue_finlist(gc_cache, &finalizer_list_marked, 0); + gc_mark_loop(ptls); if (lostval_parents.len == 0) { jl_safe_printf("Could not find the missing link. We missed a toplevel root. This is odd.\n"); break; @@ -247,21 +245,19 @@ static void gc_verify_track(jl_ptls_t ptls) void gc_verify(jl_ptls_t ptls) { jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; - jl_gc_mark_sp_t sp; - gc_mark_sp_init(gc_cache, &sp); lostval = NULL; lostval_parents.len = 0; lostval_parents_done.len = 0; clear_mark(GC_CLEAN); gc_verifying = 1; - gc_mark_queue_all_roots(ptls, &sp); - gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); + gc_mark_queue_all_roots(ptls); + gc_mark_queue_finlist(gc_cache, &to_finalize, 0); for (int i = 0;i < jl_n_threads;i++) { jl_ptls_t ptls2 = jl_all_tls_states[i]; - gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); + gc_mark_queue_finlist(gc_cache, &ptls2->finalizers, 0); } - gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); - gc_mark_loop(ptls, sp); + gc_mark_queue_finlist(gc_cache, &finalizer_list_marked, 0); + gc_mark_loop(ptls); int clean_len = bits_save[GC_CLEAN].len; for(int i = 0; i < clean_len + bits_save[GC_OLD].len; i++) { jl_taggedvalue_t *v = (jl_taggedvalue_t*)bits_save[i >= clean_len ? GC_OLD : GC_CLEAN].items[i >= clean_len ? i - clean_len : i]; @@ -1271,136 +1267,137 @@ int gc_slot_to_arrayidx(void *obj, void *_slot) // Print a backtrace from the bottom (start) of the mark stack up to `sp` // `pc_offset` will be added to `sp` for convenience in the debugger. -NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_mark_sp_t sp, int pc_offset) -{ - jl_jmp_buf *old_buf = jl_get_safe_restore(); - jl_jmp_buf buf; - jl_set_safe_restore(&buf); - if (jl_setjmp(buf, 0) != 0) { - jl_safe_printf("\n!!! ERROR when unwinding gc mark loop -- ABORTING !!!\n"); - jl_set_safe_restore(old_buf); - return; - } - void **top = sp.pc + pc_offset; - jl_gc_mark_data_t *data_top = sp.data; - sp.data = ptls->gc_cache.data_stack; - sp.pc = ptls->gc_cache.pc_stack; - int isroot = 1; - while (sp.pc < top) { - void *pc = *sp.pc; - const char *prefix = isroot ? 
"r--" : " `-"; - isroot = 0; - if (pc == gc_mark_label_addrs[GC_MARK_L_marked_obj]) { - gc_mark_marked_obj_t *data = gc_repush_markdata(&sp, gc_mark_marked_obj_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: Root object: %p :: %p (bits: %d)\n of type ", - (void*)data, (void*)data->obj, (void*)data->tag, (int)data->bits); - jl_((void*)data->tag); - isroot = 1; - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_scan_only]) { - gc_mark_marked_obj_t *data = gc_repush_markdata(&sp, gc_mark_marked_obj_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: Queued root: %p :: %p (bits: %d)\n of type ", - (void*)data, (void*)data->obj, (void*)data->tag, (int)data->bits); - jl_((void*)data->tag); - isroot = 1; - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_finlist]) { - gc_mark_finlist_t *data = gc_repush_markdata(&sp, gc_mark_finlist_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: Finalizer list from %p to %p\n", - (void*)data, (void*)data->begin, (void*)data->end); - isroot = 1; - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_objarray]) { - gc_mark_objarray_t *data = gc_repush_markdata(&sp, gc_mark_objarray_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: %s Array in object %p :: %p -- [%p, %p)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (void*)data->begin, (void*)data->end); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_obj8]) { - gc_mark_obj8_t *data = gc_repush_markdata(&sp, gc_mark_obj8_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); - uint8_t *desc = (uint8_t*)jl_dt_layout_ptrs(vt->layout); - jl_safe_printf("%p: %s Object (8bit) %p :: %p -- [%d, %d)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (int)(data->begin - desc), (int)(data->end - desc)); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_obj16]) { - gc_mark_obj16_t *data = gc_repush_markdata(&sp, gc_mark_obj16_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); - uint16_t *desc = (uint16_t*)jl_dt_layout_ptrs(vt->layout); - jl_safe_printf("%p: %s Object (16bit) %p :: %p -- [%d, %d)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (int)(data->begin - desc), (int)(data->end - desc)); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_obj32]) { - gc_mark_obj32_t *data = gc_repush_markdata(&sp, gc_mark_obj32_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); - uint32_t *desc = (uint32_t*)jl_dt_layout_ptrs(vt->layout); - jl_safe_printf("%p: %s Object (32bit) %p :: %p -- [%d, %d)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (int)(data->begin - desc), 
(int)(data->end - desc)); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_stack]) { - gc_mark_stackframe_t *data = gc_repush_markdata(&sp, gc_mark_stackframe_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: %s Stack frame %p -- %d of %d (%s)\n", - (void*)data, prefix, (void*)data->s, (int)data->i, - (int)data->nroots >> 1, - (data->nroots & 1) ? "indirect" : "direct"); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_module_binding]) { - // module_binding - gc_mark_binding_t *data = gc_repush_markdata(&sp, gc_mark_binding_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: %s Module (bindings) %p (bits %d) -- [%p, %p)\n", - (void*)data, prefix, (void*)data->parent, (int)data->bits, - (void*)data->begin, (void*)data->end); - } - else { - jl_safe_printf("Unknown pc %p --- ABORTING !!!\n", pc); - break; - } - } - jl_set_safe_restore(old_buf); -} +// NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, int pc_offset) +// { +// jl_jmp_buf *old_buf = jl_get_safe_restore(); +// jl_jmp_buf buf; +// jl_set_safe_restore(&buf); +// if (jl_setjmp(buf, 0) != 0) { +// jl_safe_printf("\n!!! ERROR when unwinding gc mark loop -- ABORTING !!!\n"); +// jl_set_safe_restore(old_buf); +// return; +// } +// jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; +// void **top = sp.pc + pc_offset; +// jl_gc_mark_data_t *data_top = sp.data; +// sp.data = ptls->gc_cache.data_stack; +// sp.pc = ptls->gc_cache.pc_stack; +// int isroot = 1; +// while (sp.pc < top) { +// void *pc = *sp.pc; +// const char *prefix = isroot ? "r--" : " `-"; +// isroot = 0; +// if (pc == gc_mark_label_addrs[GC_MARK_L_marked_obj]) { +// gc_mark_marked_obj_t *data = gc_repush_markdata(gc_cache, &sp, gc_mark_marked_obj_t); +// if ((jl_gc_mark_data_t *)data > data_top) { +// jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); +// break; +// } +// jl_safe_printf("%p: Root object: %p :: %p (bits: %d)\n of type ", +// (void*)data, (void*)data->obj, (void*)data->tag, (int)data->bits); +// jl_((void*)data->tag); +// isroot = 1; +// } +// else if (pc == gc_mark_label_addrs[GC_MARK_L_scan_only]) { +// gc_mark_marked_obj_t *data = gc_repush_markdata(gc_cache, &sp, gc_mark_marked_obj_t); +// if ((jl_gc_mark_data_t *)data > data_top) { +// jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); +// break; +// } +// jl_safe_printf("%p: Queued root: %p :: %p (bits: %d)\n of type ", +// (void*)data, (void*)data->obj, (void*)data->tag, (int)data->bits); +// jl_((void*)data->tag); +// isroot = 1; +// } +// else if (pc == gc_mark_label_addrs[GC_MARK_L_finlist]) { +// gc_mark_finlist_t *data = gc_repush_markdata(gc_cache, &sp, gc_mark_finlist_t); +// if ((jl_gc_mark_data_t *)data > data_top) { +// jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); +// break; +// } +// jl_safe_printf("%p: Finalizer list from %p to %p\n", +// (void*)data, (void*)data->begin, (void*)data->end); +// isroot = 1; +// } +// else if (pc == gc_mark_label_addrs[GC_MARK_L_objarray]) { +// gc_mark_objarray_t *data = gc_repush_markdata(gc_cache, &sp, gc_mark_objarray_t); +// if ((jl_gc_mark_data_t *)data > data_top) { +// jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); +// break; +// } +// jl_safe_printf("%p: %s Array in object %p :: %p -- [%p, %p)\n of type ", +// (void*)data, prefix, (void*)data->parent, 
((void**)data->parent)[-1], +// (void*)data->begin, (void*)data->end); +// jl_(jl_typeof(data->parent)); +// } +// else if (pc == gc_mark_label_addrs[GC_MARK_L_obj8]) { +// gc_mark_obj8_t *data = gc_repush_markdata(gc_cache, &sp, gc_mark_obj8_t); +// if ((jl_gc_mark_data_t *)data > data_top) { +// jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); +// break; +// } +// jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); +// uint8_t *desc = (uint8_t*)jl_dt_layout_ptrs(vt->layout); +// jl_safe_printf("%p: %s Object (8bit) %p :: %p -- [%d, %d)\n of type ", +// (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], +// (int)(data->begin - desc), (int)(data->end - desc)); +// jl_(jl_typeof(data->parent)); +// } +// else if (pc == gc_mark_label_addrs[GC_MARK_L_obj16]) { +// gc_mark_obj16_t *data = gc_repush_markdata(gc_cache, &sp, gc_mark_obj16_t); +// if ((jl_gc_mark_data_t *)data > data_top) { +// jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); +// break; +// } +// jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); +// uint16_t *desc = (uint16_t*)jl_dt_layout_ptrs(vt->layout); +// jl_safe_printf("%p: %s Object (16bit) %p :: %p -- [%d, %d)\n of type ", +// (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], +// (int)(data->begin - desc), (int)(data->end - desc)); +// jl_(jl_typeof(data->parent)); +// } +// else if (pc == gc_mark_label_addrs[GC_MARK_L_obj32]) { +// gc_mark_obj32_t *data = gc_repush_markdata(gc_cache, &sp, gc_mark_obj32_t); +// if ((jl_gc_mark_data_t *)data > data_top) { +// jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); +// break; +// } +// jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); +// uint32_t *desc = (uint32_t*)jl_dt_layout_ptrs(vt->layout); +// jl_safe_printf("%p: %s Object (32bit) %p :: %p -- [%d, %d)\n of type ", +// (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], +// (int)(data->begin - desc), (int)(data->end - desc)); +// jl_(jl_typeof(data->parent)); +// } +// else if (pc == gc_mark_label_addrs[GC_MARK_L_stack]) { +// gc_mark_stackframe_t *data = gc_repush_markdata(gc_cache, &sp, gc_mark_stackframe_t); +// if ((jl_gc_mark_data_t *)data > data_top) { +// jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); +// break; +// } +// jl_safe_printf("%p: %s Stack frame %p -- %d of %d (%s)\n", +// (void*)data, prefix, (void*)data->s, (int)data->i, +// (int)data->nroots >> 1, +// (data->nroots & 1) ? 
"indirect" : "direct"); +// } +// else if (pc == gc_mark_label_addrs[GC_MARK_L_module_binding]) { +// // module_binding +// gc_mark_binding_t *data = gc_repush_markdata(gc_cache, &sp, gc_mark_binding_t); +// if ((jl_gc_mark_data_t *)data > data_top) { +// jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); +// break; +// } +// jl_safe_printf("%p: %s Module (bindings) %p (bits %d) -- [%p, %p)\n", +// (void*)data, prefix, (void*)data->parent, (int)data->bits, +// (void*)data->begin, (void*)data->end); +// } +// else { +// jl_safe_printf("Unknown pc %p --- ABORTING !!!\n", pc); +// break; +// } +// } +// jl_set_safe_restore(old_buf); +// } static int gc_logging_enabled = 0; diff --git a/src/gc.c b/src/gc.c index 6eb803e96d062..aa049025856d9 100644 --- a/src/gc.c +++ b/src/gc.c @@ -27,6 +27,8 @@ static jl_gc_callback_list_t *gc_cblist_post_gc; static jl_gc_callback_list_t *gc_cblist_notify_external_alloc; static jl_gc_callback_list_t *gc_cblist_notify_external_free; +extern _Atomic(int32_t) nworkers_marking; + #define gc_invoke_callbacks(ty, list, args) \ do { \ for (jl_gc_callback_list_t *cb = list; \ @@ -112,17 +114,6 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_external_free(jl_gc_cb_notify_external_fre jl_gc_deregister_callback(&gc_cblist_notify_external_free, (jl_gc_cb_func_t)cb); } -// Save/restore local mark stack to/from thread-local storage. - -STATIC_INLINE void export_gc_state(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) { - ptls->gc_mark_sp = *sp; -} - -STATIC_INLINE void import_gc_state(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) { - // Has the stack been reallocated in the meantime? - *sp = ptls->gc_mark_sp; -} - // Protect all access to `finalizer_list_marked` and `to_finalize`. // For accessing `ptls->finalizers`, the lock is needed if a thread // is going to realloc the buffer (of its own list) or accessing the @@ -1696,14 +1687,14 @@ STATIC_INLINE uintptr_t gc_read_stack(void *_addr, uintptr_t offset, return *(uintptr_t*)real_addr; } -JL_NORETURN NOINLINE void gc_assert_datatype_fail(jl_ptls_t ptls, jl_datatype_t *vt, - jl_gc_mark_sp_t sp) +JL_NORETURN NOINLINE void gc_assert_datatype_fail(jl_ptls_t ptls, jl_datatype_t *vt) { jl_safe_printf("GC error (probable corruption) :\n"); jl_gc_debug_print_status(); jl_(vt); jl_gc_debug_critical_error(); - gc_mark_loop_unwind(ptls, sp, 0); + // FIXME - gc-debugging infrastructure + // gc_mark_loop_unwind(ptls, 0); abort(); } @@ -1712,38 +1703,132 @@ JL_NORETURN NOINLINE void gc_assert_datatype_fail(jl_ptls_t ptls, jl_datatype_t // See the call to `gc_mark_loop` in init with a `NULL` `ptls`. void *gc_mark_label_addrs[_GC_MARK_L_MAX]; -// Double the local mark stack (both pc and data) -static void NOINLINE gc_mark_stack_resize(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp) JL_NOTSAFEPOINT +// Size information used for copying from the data-stack during work-stealing +size_t gc_mark_label_sizes[_GC_MARK_L_MAX]; + +// Pop a return address from the mark pc queue (i.e. 
decrease the pc queue bottom pointer)
+STATIC_INLINE void *gc_mark_deque_pop_pc(jl_gc_ws_queue_t *mark_queue)
+{
+    jl_gc_ws_bottom_t bottom = jl_atomic_load_relaxed(&mark_queue->bottom);
+    int b = bottom.pc_offset - 1;
+    jl_gc_ws_bottom_t bottom2 = {b, bottom.data_offset};
+    jl_gc_ws_array_t *array = jl_atomic_load_relaxed(&mark_queue->array);
+    jl_atomic_store_relaxed(&mark_queue->bottom, bottom2);
+    jl_fence();
+    jl_gc_ws_top_t top = jl_atomic_load_relaxed(&mark_queue->top);
+    void *pc;
+    if (b >= top.offset) {
+        pc = jl_atomic_load_relaxed((_Atomic(void *) *)&array->pc_start[b % array->size]);
+        if (__unlikely(b == top.offset)) {
+            jl_gc_ws_top_t top2 = {top.offset, top.version + 1};
+            if (!jl_atomic_cmpswap(&mark_queue->top, &top, top2)) {
+                pc = (void*)_GC_MARK_L_MAX;
+                jl_atomic_store_relaxed(&mark_queue->bottom, bottom);
+            }
+        }
+    }
+    else {
+        pc = (void*)_GC_MARK_L_MAX;
+        jl_atomic_store_relaxed(&mark_queue->bottom, bottom);
+    }
+    return pc;
+}
+
+// Pop a data struct from the mark data queue (i.e. decrease the data queue bottom pointer).
+// This should only be used after dispatch, i.e. once the corresponding pc has already been
+// popped from the pc queue.
+STATIC_INLINE void *gc_mark_deque_pop_data(jl_gc_ws_queue_t *mark_queue)
 {
-    jl_gc_mark_data_t *old_data = gc_cache->data_stack;
-    void **pc_stack = sp->pc_start;
-    size_t stack_size = (char*)sp->pc_end - (char*)pc_stack;
-    gc_cache->data_stack = (jl_gc_mark_data_t *)realloc_s(old_data, stack_size * 2 * sizeof(jl_gc_mark_data_t));
-    sp->data = (jl_gc_mark_data_t *)(((char*)sp->data) + (((char*)gc_cache->data_stack) - ((char*)old_data)));
+    jl_gc_ws_bottom_t bottom = jl_atomic_load_relaxed(&mark_queue->bottom);
+    bottom.data_offset--;
+    jl_gc_ws_array_t *array = jl_atomic_load_relaxed(&mark_queue->array);
+    jl_gc_mark_data_t *data = &array->data_start[bottom.data_offset % array->size];
+    jl_atomic_store_relaxed(&mark_queue->bottom, bottom);
+    return data;
+}
+
+// Double the mark queue (both pc and data)
+static jl_gc_ws_array_t *NOINLINE gc_mark_deque_resize(jl_gc_ws_queue_t *mark_queue,
+                                                       jl_gc_ws_array_t *old_array,
+                                                       jl_gc_ws_top_t top) JL_NOTSAFEPOINT
+{
+    // Resize/copy pc queue
+    void **old_pc_start = old_array->pc_start;
+    void **new_pc_start = (void**)malloc_s(2 * old_array->size * sizeof(void*));
+    for (size_t i = 0; i < old_array->size; i++) {
+        new_pc_start[(top.offset + i) % (2 * old_array->size)] =
+            old_pc_start[(top.offset + i) % old_array->size];
+    }
+
+    // Resize/copy data queue
+    jl_gc_mark_data_t *old_data_start = old_array->data_start;
+    jl_gc_mark_data_t *new_data_start =
+        (jl_gc_mark_data_t*)malloc_s(2 * old_array->size * sizeof(jl_gc_mark_data_t));
+    for (size_t i = 0; i < old_array->size; i++) {
+        new_data_start[(top.offset + i) % (2 * old_array->size)] =
+            old_data_start[(top.offset + i) % old_array->size];
+    }

-    sp->pc_start = gc_cache->pc_stack = (void**)realloc_s(pc_stack, stack_size * 2 * sizeof(void*));
-    gc_cache->pc_stack_end = sp->pc_end = sp->pc_start + stack_size * 2;
-    sp->pc = sp->pc_start + (sp->pc - pc_stack);
+    jl_gc_ws_array_t *new_array = (jl_gc_ws_array_t*)malloc_s(sizeof(jl_gc_ws_array_t));
+    new_array->pc_start = new_pc_start;
+    new_array->data_start = new_data_start;
+    new_array->size = 2 * old_array->size;
+    jl_atomic_store_release(&mark_queue->array, new_array);
+
+    // enqueue `old_array` to be freed at the end of the mark loop
+    arraylist_push(mark_queue->reclaim_set, old_array);
+
+    return new_array;
 }

-// Push a work item to the stack. The type of the work item is marked with `pc`.
-// The data needed is in `data` and is of size `data_size`.
-// If there isn't enough space on the stack, the stack will be resized with the stack
-// lock held. The caller should invalidate any local cache of the stack addresses that's not
-// in `gc_cache` or `sp`
-// The `sp` will be updated on return if `inc` is true.
-STATIC_INLINE void gc_mark_stack_push(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp,
-                                      void *pc, void *data, size_t data_size, int inc) JL_NOTSAFEPOINT
+// Push a work item to the queue. The type of the work item is marked with `pc`.
+// The data needed is in `data` and is of size `data_size`. If the queue overflows, it is
+// resized, and the caller should invalidate any locally cached pointers into the old arrays.
+// `pm` is set to 'no_inc', 'inc' or 'inc_data_only' to indicate which queues should have
+// their bottom updated (none, both, or only the data queue, respectively).
+STATIC_INLINE void gc_mark_deque_push(jl_gc_mark_cache_t *gc_cache, void *pc,
+                                      void *data, size_t data_size, jl_gc_push_mode_t pm) JL_NOTSAFEPOINT
 {
     assert(data_size <= sizeof(jl_gc_mark_data_t));
-    if (__unlikely(sp->pc == sp->pc_end))
-        gc_mark_stack_resize(gc_cache, sp);
-    *sp->pc = pc;
-    memcpy(sp->data, data, data_size);
-    if (inc) {
-        sp->data = (jl_gc_mark_data_t *)(((char*)sp->data) + data_size);
-        sp->pc++;
-    }
+    jl_gc_ws_queue_t *mark_queue = &gc_cache->mark_queue;
+    jl_gc_ws_bottom_t bottom = jl_atomic_load_relaxed(&mark_queue->bottom);
+    jl_gc_ws_top_t top = jl_atomic_load_acquire(&mark_queue->top);
+    jl_gc_ws_array_t *array = jl_atomic_load_relaxed(&mark_queue->array);
+    int64_t size = bottom.data_offset - top.offset;
+    // Queue overflow
+    if (__unlikely(size >= array->size))
+        array = gc_mark_deque_resize(mark_queue, array, top);
+    // Copy pc/data items
+    memcpy(&array->data_start[bottom.data_offset % array->size], data, data_size);
+    jl_atomic_store_relaxed((_Atomic(void *) *)&array->pc_start[bottom.pc_offset % array->size], pc);
+    jl_fence();
+    bottom.pc_offset += (pm == inc);
+    bottom.data_offset += (pm == inc || pm == inc_data_only);
+    jl_atomic_store_relaxed(&mark_queue->bottom, bottom);
+}
+
+// Try to steal a work item from the queue in `gc_cache2`
+STATIC_INLINE int gc_mark_deque_try_steal(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_cache_t *gc_cache2)
+{
+    jl_gc_ws_queue_t *mark_queue2 = &gc_cache2->mark_queue;
+    jl_gc_ws_top_t top = jl_atomic_load_acquire(&mark_queue2->top);
+    jl_fence();
+    jl_gc_ws_bottom_t bottom = jl_atomic_load_relaxed(&mark_queue2->bottom);
+    // No items to steal
+    if (bottom.pc_offset - top.offset <= 0)
+        return 0;
+    // Try stealing
+    jl_gc_ws_array_t *array2 = jl_atomic_load_acquire(&mark_queue2->array);
+    void *pc = jl_atomic_load_relaxed((_Atomic(void *) *)&array2->pc_start[top.offset % array2->size]);
+    jl_gc_mark_data_t *data = &array2->data_start[top.offset % array2->size];
+    jl_gc_ws_top_t top2 = {top.offset + 1, top.version + 1};
+    // Top already claimed by another thief: abort stealing
+    if (!jl_atomic_cmpswap(&mark_queue2->top, &top, top2))
+        return 0;
+    // Push stolen items to thief's queue
+    size_t data_size = gc_mark_label_sizes[(int)(uintptr_t)pc];
+    gc_mark_deque_push(gc_cache, pc, data, data_size, inc);
+    return 1;
 }

 // Check if the reference is non-NULL and atomically set the mark bit.
@@ -1773,7 +1858,7 @@ STATIC_INLINE int gc_try_setmark(jl_value_t *obj, uintptr_t *nptr,
 }

 // Queue a finalizer list to be scanned in the mark loop. Start marking from index `start`.
-void gc_mark_queue_finlist(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, +void gc_mark_queue_finlist(jl_gc_mark_cache_t *gc_cache, arraylist_t *list, size_t start) { size_t len = list->len; @@ -1781,13 +1866,13 @@ void gc_mark_queue_finlist(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, return; jl_value_t **items = (jl_value_t**)list->items; gc_mark_finlist_t markdata = {items + start, items + len}; - gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[GC_MARK_L_finlist], - &markdata, sizeof(markdata), 1); + gc_mark_deque_push(gc_cache, gc_mark_label_addrs[GC_MARK_L_finlist], + &markdata, sizeof(markdata), inc); } // Queue a object to be scanned. The object should already be marked and the GC metadata // should already be updated for it. Only scanning of the object should be performed. -STATIC_INLINE void gc_mark_queue_scan_obj(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, +STATIC_INLINE void gc_mark_queue_scan_obj(jl_gc_mark_cache_t *gc_cache, jl_value_t *obj) { jl_taggedvalue_t *o = jl_astaggedvalue(obj); @@ -1795,15 +1880,15 @@ STATIC_INLINE void gc_mark_queue_scan_obj(jl_gc_mark_cache_t *gc_cache, jl_gc_ma uint8_t bits = tag & 0xf; tag = tag & ~(uintptr_t)0xf; gc_mark_marked_obj_t data = {obj, tag, bits}; - gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[GC_MARK_L_scan_only], - &data, sizeof(data), 1); + gc_mark_deque_push(gc_cache, gc_mark_label_addrs[GC_MARK_L_scan_only], + &data, sizeof(data), inc); } // Mark and queue a object to be scanned. // The object will be marked atomically which can also happen concurrently. // It will be queued if the object wasn't marked already (or concurrently by another thread) // Returns whether the object is young. -STATIC_INLINE int gc_mark_queue_obj(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, void *_obj) JL_NOTSAFEPOINT +STATIC_INLINE int gc_mark_queue_obj(jl_gc_mark_cache_t *gc_cache, void *_obj) JL_NOTSAFEPOINT { jl_value_t *obj = (jl_value_t*)jl_assume(_obj); uintptr_t nptr = 0; @@ -1812,19 +1897,19 @@ STATIC_INLINE int gc_mark_queue_obj(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_ if (!gc_try_setmark(obj, &nptr, &tag, &bits)) return (int)nptr; gc_mark_marked_obj_t data = {obj, tag, bits}; - gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[GC_MARK_L_marked_obj], - &data, sizeof(data), 1); + gc_mark_deque_push(gc_cache, gc_mark_label_addrs[GC_MARK_L_marked_obj], + &data, sizeof(data), inc); return (int)nptr; } -int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_value_t *obj) +int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_value_t *obj) { - return gc_mark_queue_obj(gc_cache, sp, obj); + return gc_mark_queue_obj(gc_cache, obj); } JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) { - return gc_mark_queue_obj(&ptls->gc_cache, &ptls->gc_mark_sp, obj); + return gc_mark_queue_obj(&ptls->gc_cache, obj); } JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, @@ -1832,9 +1917,9 @@ JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, { gc_mark_objarray_t data = { parent, objs, objs + nobjs, 1, jl_astaggedvalue(parent)->bits.gc & 2 }; - gc_mark_stack_push(&ptls->gc_cache, &ptls->gc_mark_sp, + gc_mark_deque_push(&ptls->gc_cache, gc_mark_label_addrs[GC_MARK_L_objarray], - &data, sizeof(data), 1); + &data, sizeof(data), inc); } @@ -1857,12 +1942,13 @@ STATIC_INLINE void gc_mark_push_remset(jl_ptls_t ptls, jl_value_t *obj, uintptr_ } // Scan a dense array of object references, see `gc_mark_objarray_t` 
-STATIC_INLINE int gc_mark_scan_objarray(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, +STATIC_INLINE int gc_mark_scan_objarray(jl_ptls_t ptls, gc_mark_objarray_t *objary, jl_value_t **begin, jl_value_t **end, jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) { - (void)jl_assume(objary == (gc_mark_objarray_t*)sp->data); + jl_gc_ws_queue_t *mark_queue = &((&ptls->gc_cache)->mark_queue); + (void)jl_assume(objary == (gc_mark_objarray_t*)gc_mark_deque_data_bottom(mark_queue)); for (; begin < end; begin += objary->step) { *pnew_obj = *begin; if (*pnew_obj) @@ -1875,7 +1961,7 @@ STATIC_INLINE int gc_mark_scan_objarray(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, if (begin < end) { // Haven't done with this one yet. Update the content and push it back objary->begin = begin; - gc_repush_markdata(sp, gc_mark_objarray_t); + gc_mark_deque_repush(mark_queue); } else { // Finished scanning this one, finish up by checking the GC invariance @@ -1889,13 +1975,14 @@ STATIC_INLINE int gc_mark_scan_objarray(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, } // Scan a sparse array of object references, see `gc_mark_objarray_t` -STATIC_INLINE int gc_mark_scan_array8(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, +STATIC_INLINE int gc_mark_scan_array8(jl_ptls_t ptls, gc_mark_array8_t *ary8, jl_value_t **begin, jl_value_t **end, uint8_t *elem_begin, uint8_t *elem_end, jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) { - (void)jl_assume(ary8 == (gc_mark_array8_t*)sp->data); + jl_gc_ws_queue_t *mark_queue = &((&ptls->gc_cache)->mark_queue); + (void)jl_assume(ary8 == (gc_mark_array8_t*)gc_mark_deque_data_bottom(mark_queue)); size_t elsize = ((jl_array_t*)ary8->elem.parent)->elsize / sizeof(jl_value_t*); for (; begin < end; begin += elsize) { for (; elem_begin < elem_end; elem_begin++) { @@ -1912,7 +1999,7 @@ STATIC_INLINE int gc_mark_scan_array8(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, // Haven't done with this one yet. Update the content and push it back ary8->elem.begin = elem_begin; ary8->begin = begin; - gc_repush_markdata(sp, gc_mark_array8_t); + gc_mark_deque_repush(mark_queue); } else { begin += elsize; @@ -1920,7 +2007,7 @@ STATIC_INLINE int gc_mark_scan_array8(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, // Haven't done with this array yet. Reset the content and push it back ary8->elem.begin = ary8->rebegin; ary8->begin = begin; - gc_repush_markdata(sp, gc_mark_array8_t); + gc_mark_deque_repush(mark_queue); } else { // Finished scanning this one, finish up by checking the GC invariance @@ -1937,13 +2024,14 @@ STATIC_INLINE int gc_mark_scan_array8(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, } // Scan a sparse array of object references, see `gc_mark_objarray_t` -STATIC_INLINE int gc_mark_scan_array16(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, +STATIC_INLINE int gc_mark_scan_array16(jl_ptls_t ptls, gc_mark_array16_t *ary16, jl_value_t **begin, jl_value_t **end, uint16_t *elem_begin, uint16_t *elem_end, jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) { - (void)jl_assume(ary16 == (gc_mark_array16_t*)sp->data); + jl_gc_ws_queue_t *mark_queue = &((&ptls->gc_cache)->mark_queue); + (void)jl_assume(ary16 == (gc_mark_array16_t*)gc_mark_deque_data_bottom(mark_queue)); size_t elsize = ((jl_array_t*)ary16->elem.parent)->elsize / sizeof(jl_value_t*); for (; begin < end; begin += elsize) { for (; elem_begin < elem_end; elem_begin++) { @@ -1960,7 +2048,7 @@ STATIC_INLINE int gc_mark_scan_array16(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, // Haven't done with this one yet. 
Update the content and push it back ary16->elem.begin = elem_begin; ary16->begin = begin; - gc_repush_markdata(sp, gc_mark_array16_t); + gc_mark_deque_repush(mark_queue); } else { begin += elsize; @@ -1968,7 +2056,7 @@ STATIC_INLINE int gc_mark_scan_array16(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, // Haven't done with this array yet. Reset the content and push it back ary16->elem.begin = ary16->rebegin; ary16->begin = begin; - gc_repush_markdata(sp, gc_mark_array16_t); + gc_mark_deque_repush(mark_queue); } else { // Finished scanning this one, finish up by checking the GC invariance @@ -1986,11 +2074,12 @@ STATIC_INLINE int gc_mark_scan_array16(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, // Scan an object with 8bits field descriptors. see `gc_mark_obj8_t` -STATIC_INLINE int gc_mark_scan_obj8(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark_obj8_t *obj8, +STATIC_INLINE int gc_mark_scan_obj8(jl_ptls_t ptls, gc_mark_obj8_t *obj8, char *parent, uint8_t *begin, uint8_t *end, jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) { - (void)jl_assume(obj8 == (gc_mark_obj8_t*)sp->data); + jl_gc_ws_queue_t *mark_queue = &((&ptls->gc_cache)->mark_queue); + (void)jl_assume(obj8 == (gc_mark_obj8_t*)gc_mark_deque_data_bottom(mark_queue)); (void)jl_assume(begin < end); for (; begin < end; begin++) { jl_value_t **slot = &((jl_value_t**)parent)[*begin]; @@ -2005,7 +2094,7 @@ STATIC_INLINE int gc_mark_scan_obj8(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark if (begin < end) { // Haven't done with this one yet. Update the content and push it back obj8->begin = begin; - gc_repush_markdata(sp, gc_mark_obj8_t); + gc_mark_deque_repush(mark_queue); } else { // Finished scanning this one, finish up by checking the GC invariance @@ -2019,11 +2108,13 @@ STATIC_INLINE int gc_mark_scan_obj8(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark } // Scan an object with 16bits field descriptors. see `gc_mark_obj16_t` -STATIC_INLINE int gc_mark_scan_obj16(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark_obj16_t *obj16, +STATIC_INLINE int gc_mark_scan_obj16(jl_ptls_t ptls, gc_mark_obj16_t *obj16, char *parent, uint16_t *begin, uint16_t *end, jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) JL_NOTSAFEPOINT { - (void)jl_assume(obj16 == (gc_mark_obj16_t*)sp->data); + + jl_gc_ws_queue_t *mark_queue = &((&ptls->gc_cache)->mark_queue); + (void)jl_assume(obj16 == (gc_mark_obj16_t*)gc_mark_deque_data_bottom(mark_queue)); (void)jl_assume(begin < end); for (; begin < end; begin++) { jl_value_t **slot = &((jl_value_t**)parent)[*begin]; @@ -2038,7 +2129,7 @@ STATIC_INLINE int gc_mark_scan_obj16(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mar if (begin < end) { // Haven't done with this one yet. Update the content and push it back obj16->begin = begin; - gc_repush_markdata(sp, gc_mark_obj16_t); + gc_mark_deque_repush(mark_queue); } else { // Finished scanning this one, finish up by checking the GC invariance @@ -2052,11 +2143,12 @@ STATIC_INLINE int gc_mark_scan_obj16(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mar } // Scan an object with 32bits field descriptors. 
see `gc_mark_obj32_t` -STATIC_INLINE int gc_mark_scan_obj32(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark_obj32_t *obj32, +STATIC_INLINE int gc_mark_scan_obj32(jl_ptls_t ptls, gc_mark_obj32_t *obj32, char *parent, uint32_t *begin, uint32_t *end, jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) { - (void)jl_assume(obj32 == (gc_mark_obj32_t*)sp->data); + jl_gc_ws_queue_t *mark_queue = &((&ptls->gc_cache)->mark_queue); + (void)jl_assume(obj32 == (gc_mark_obj32_t*)gc_mark_deque_data_bottom(mark_queue)); (void)jl_assume(begin < end); for (; begin < end; begin++) { jl_value_t **slot = &((jl_value_t**)parent)[*begin]; @@ -2071,7 +2163,7 @@ STATIC_INLINE int gc_mark_scan_obj32(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mar if (begin < end) { // Haven't done with this one yet. Update the content and push it back obj32->begin = begin; - gc_repush_markdata(sp, gc_mark_obj32_t); + gc_mark_deque_repush(mark_queue); } else { // Finished scanning this one, finish up by checking the GC invariance @@ -2084,10 +2176,28 @@ STATIC_INLINE int gc_mark_scan_obj32(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mar return 0; } -#if defined(__GNUC__) && !defined(_OS_EMSCRIPTEN_) -# define gc_mark_laddr(name) (&&name) -# define gc_mark_jmp(ptr) goto *(ptr) -#else +// Set the mark loop recruitment location (periodically checked by threads that are not +// running gc) +void jl_gc_set_recruit(jl_ptls_t ptls, void *addr) +{ + jl_fence(); + jl_atomic_store_release(&jl_gc_recruiting_location, addr); + if (jl_n_threads > 1) + jl_wake_libuv(); + for (int i = 0; i < jl_n_threads; i++) { + if (i == ptls->tid) + continue; + jl_wakeup_thread(i); + } +} + +// FIXME - GNU's labels as values are commented out for now +// to make the bookkeeping easier in `gc_mark_label_sizes` + +// #if defined(__GNUC__) && !defined(_OS_EMSCRIPTEN_) +// # define gc_mark_laddr(name) (&&name) +// # define gc_mark_jmp(ptr) goto *(ptr) +// #else #define gc_mark_laddr(name) ((void*)(uintptr_t)GC_MARK_L_##name) #define gc_mark_jmp(ptr) do { \ switch ((int)(uintptr_t)ptr) { \ @@ -2119,7 +2229,7 @@ STATIC_INLINE int gc_mark_scan_obj32(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mar abort(); \ } \ } while (0) -#endif +// #endif // This is the main marking loop. // It uses an iterative (mostly) Depth-first search (DFS) to mark all the objects. @@ -2173,7 +2283,7 @@ STATIC_INLINE int gc_mark_scan_obj32(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mar // 1. When encountering an pointer (julia object reference) slots, load, perform NULL check // and atomically set the mark bits to determine if the object needs to be scanned. // 2. If yes, it'll push itself back onto the mark stack (after updating fields that are changed) -// using `gc_repush_markdata` to increment the stack pointers. +// using `gc_mark_deque_repush` to increment the stack pointers. // This step can also be replaced by a tail call by finishing up the marking of the current // object when the end of the current object is reached. // 3. Jump to `mark`. The marking of the current object will be resumed after the child is @@ -2187,7 +2297,7 @@ STATIC_INLINE int gc_mark_scan_obj32(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mar // Additional optimizations are done for some of the common cases by skipping // the unnecessary data stack pointer increment and the load from the stack // (i.e. store to load forwaring). See `objary_loaded`, `obj8_loaded` and `obj16_loaded`. 
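
Before the mark loop itself, it helps to isolate the deque protocol that `gc_mark_deque_push`, `gc_mark_deque_pop_pc` and `gc_mark_deque_try_steal` above implement. The following is a minimal, self-contained sketch of the same versioned Chase-Lev-style discipline, written against C11 `stdatomic.h` instead of Julia's `jl_atomic_*` wrappers; the names (`ws_deque_t`, `ws_push`, `ws_pop`, `ws_steal`) and the fixed capacity are illustrative, not part of the patch:

```c
#include <stdatomic.h>
#include <stdint.h>

#define WS_EMPTY ((void*)-1) // failure sentinel, playing the role of _GC_MARK_L_MAX
#define WS_CAP   1024        // fixed capacity; the real queue doubles on overflow

typedef struct {
    int32_t offset;  // index of the oldest (stealable) item
    int32_t version; // bumped on every top CAS; guards against ABA
} ws_top_t;          // assumed to fit one atomic word, like jl_gc_ws_top_t

typedef struct {
    _Atomic(ws_top_t) top;   // shared end, advanced by thieves
    _Atomic(int32_t) bottom; // private end, written only by the owner
    _Atomic(void*) items[WS_CAP];
} ws_deque_t;

// Owner only: publish one item at the bottom. No CAS is needed because
// each deque has a single producer.
static void ws_push(ws_deque_t *q, void *item)
{
    int32_t b = atomic_load_explicit(&q->bottom, memory_order_relaxed);
    atomic_store_explicit(&q->items[b % WS_CAP], item, memory_order_relaxed);
    atomic_thread_fence(memory_order_release); // item visible before bottom moves
    atomic_store_explicit(&q->bottom, b + 1, memory_order_relaxed);
}

// Owner only: pop from the bottom; races with thieves only on the last item.
static void *ws_pop(ws_deque_t *q)
{
    int32_t b = atomic_load_explicit(&q->bottom, memory_order_relaxed) - 1;
    atomic_store_explicit(&q->bottom, b, memory_order_relaxed);
    atomic_thread_fence(memory_order_seq_cst); // bottom store before top load
    ws_top_t t = atomic_load_explicit(&q->top, memory_order_relaxed);
    void *item;
    if (b >= t.offset) {
        item = atomic_load_explicit(&q->items[b % WS_CAP], memory_order_relaxed);
        if (b == t.offset) {
            // Last item: claim it by bumping the version while keeping the
            // offset, so a concurrent thief's CAS on the stale top fails.
            ws_top_t t2 = {t.offset, t.version + 1};
            if (!atomic_compare_exchange_strong(&q->top, &t, t2)) {
                item = WS_EMPTY; // a thief won the race
                atomic_store_explicit(&q->bottom, b + 1, memory_order_relaxed);
            }
        }
    }
    else {
        item = WS_EMPTY; // deque was already empty: undo the decrement
        atomic_store_explicit(&q->bottom, b + 1, memory_order_relaxed);
    }
    return item;
}

// Any thread: steal the oldest item. The version counter keeps the CAS
// safe even when top.offset returns to a previously observed value.
static void *ws_steal(ws_deque_t *q)
{
    ws_top_t t = atomic_load_explicit(&q->top, memory_order_acquire);
    atomic_thread_fence(memory_order_seq_cst);
    int32_t b = atomic_load_explicit(&q->bottom, memory_order_acquire);
    if (b - t.offset <= 0)
        return WS_EMPTY; // nothing to steal
    void *item = atomic_load_explicit(&q->items[t.offset % WS_CAP], memory_order_relaxed);
    ws_top_t t2 = {t.offset + 1, t.version + 1};
    if (!atomic_compare_exchange_strong(&q->top, &t, t2))
        return WS_EMPTY; // lost to another thief (or the owner's last-item pop)
    return item;
}
```

The patch splits this single structure into a pc array and a data array that share one top, which is why `jl_gc_ws_bottom_t` carries separate `pc_offset` and `data_offset` fields and why `gc_mark_label_sizes` is needed to know how many data bytes travel with a stolen pc.
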
-JL_EXTENSION NOINLINE void gc_mark_loop(jl_ptls_t ptls, jl_gc_mark_sp_t sp)
+JL_EXTENSION NOINLINE void gc_mark_loop(jl_ptls_t ptls)
 {
     if (__unlikely(ptls == NULL)) {
         gc_mark_label_addrs[GC_MARK_L_marked_obj] = gc_mark_laddr(marked_obj);
@@ -2202,9 +2312,27 @@ JL_EXTENSION NOINLINE void gc_mark_loop(jl_ptls_t ptls, jl_gc_mark_sp_t sp)
         gc_mark_label_addrs[GC_MARK_L_stack] = gc_mark_laddr(stack);
         gc_mark_label_addrs[GC_MARK_L_excstack] = gc_mark_laddr(excstack);
         gc_mark_label_addrs[GC_MARK_L_module_binding] = gc_mark_laddr(module_binding);
+
+        gc_mark_label_sizes[GC_MARK_L_marked_obj] = sizeof(gc_mark_marked_obj_t);
+        gc_mark_label_sizes[GC_MARK_L_scan_only] = sizeof(gc_mark_marked_obj_t);
+        gc_mark_label_sizes[GC_MARK_L_finlist] = sizeof(gc_mark_finlist_t);
+        gc_mark_label_sizes[GC_MARK_L_objarray] = sizeof(gc_mark_objarray_t);
+        gc_mark_label_sizes[GC_MARK_L_array8] = sizeof(gc_mark_array8_t);
+        gc_mark_label_sizes[GC_MARK_L_array16] = sizeof(gc_mark_array16_t);
+        gc_mark_label_sizes[GC_MARK_L_obj8] = sizeof(gc_mark_obj8_t);
+        gc_mark_label_sizes[GC_MARK_L_obj16] = sizeof(gc_mark_obj16_t);
+        gc_mark_label_sizes[GC_MARK_L_obj32] = sizeof(gc_mark_obj32_t);
+        gc_mark_label_sizes[GC_MARK_L_stack] = sizeof(gc_mark_stackframe_t);
+        gc_mark_label_sizes[GC_MARK_L_excstack] = sizeof(gc_mark_excstack_t);
+        gc_mark_label_sizes[GC_MARK_L_module_binding] = sizeof(gc_mark_binding_t);
+
         return;
     }

+    jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache;
+    jl_gc_ws_queue_t *mark_queue = &gc_cache->mark_queue;
+    void *pc;
+
     jl_value_t *new_obj = NULL;
     uintptr_t tag = 0;
     uint8_t bits = 0;
@@ -2227,17 +2355,26 @@ JL_EXTENSION NOINLINE void gc_mark_loop(jl_ptls_t ptls, jl_gc_mark_sp_t sp)
     uint16_t *obj16_begin;
     uint16_t *obj16_end;

-pop:
-    if (sp.pc == sp.pc_start) {
-        // TODO: stealing form another thread
+pop: {
+    pc = gc_mark_deque_pop_pc(mark_queue);
+    // `_GC_MARK_L_MAX` is used as a sentinel to indicate that a pop from the `pc` queue
+    // failed (e.g. no items left)
+    if (pc != (void*)_GC_MARK_L_MAX)
+        gc_mark_jmp(pc);
+    // Empty mark queue: try to steal from another thread
+    for (int i = 0; i < jl_n_threads; i++) {
+        uint64_t victim = rand() % jl_n_threads;
+        if (victim == ptls->tid)
+            continue;
+        if (gc_mark_deque_try_steal(gc_cache, &jl_all_tls_states[victim]->gc_cache))
+            goto pop;
+    }
     return;
 }
-    sp.pc--;
-    gc_mark_jmp(*sp.pc); // computed goto

 marked_obj: {
     // An object that has been marked and needs have metadata updated and scanned.
-    gc_mark_marked_obj_t *obj = gc_pop_markdata(&sp, gc_mark_marked_obj_t);
+    gc_mark_marked_obj_t *obj = gc_mark_deque_pop_data(mark_queue);
     new_obj = obj->obj;
     tag = obj->tag;
     bits = obj->bits;
@@ -2246,7 +2383,7 @@ marked_obj: {

 scan_only: {
     // An object that has been marked and needs to be scanned.
- gc_mark_marked_obj_t *obj = gc_pop_markdata(&sp, gc_mark_marked_obj_t); + gc_mark_marked_obj_t *obj = gc_mark_deque_pop_data(mark_queue); new_obj = obj->obj; tag = obj->tag; bits = obj->bits; @@ -2255,67 +2392,67 @@ scan_only: { } objarray: - objary = gc_pop_markdata(&sp, gc_mark_objarray_t); + objary = gc_mark_deque_pop_data(mark_queue); objary_begin = objary->begin; objary_end = objary->end; objarray_loaded: - if (gc_mark_scan_objarray(ptls, &sp, objary, objary_begin, objary_end, + if (gc_mark_scan_objarray(ptls, objary, objary_begin, objary_end, &new_obj, &tag, &bits)) goto mark; goto pop; array8: - ary8 = gc_pop_markdata(&sp, gc_mark_array8_t); + ary8 = gc_mark_deque_pop_data(mark_queue); objary_begin = ary8->begin; objary_end = ary8->end; obj8_begin = ary8->elem.begin; obj8_end = ary8->elem.end; array8_loaded: - if (gc_mark_scan_array8(ptls, &sp, ary8, objary_begin, objary_end, obj8_begin, obj8_end, + if (gc_mark_scan_array8(ptls, ary8, objary_begin, objary_end, obj8_begin, obj8_end, &new_obj, &tag, &bits)) goto mark; goto pop; array16: - ary16 = gc_pop_markdata(&sp, gc_mark_array16_t); + ary16 = gc_mark_deque_pop_data(mark_queue); objary_begin = ary16->begin; objary_end = ary16->end; obj16_begin = ary16->elem.begin; obj16_end = ary16->elem.end; array16_loaded: - if (gc_mark_scan_array16(ptls, &sp, ary16, objary_begin, objary_end, obj16_begin, obj16_end, + if (gc_mark_scan_array16(ptls, ary16, objary_begin, objary_end, obj16_begin, obj16_end, &new_obj, &tag, &bits)) goto mark; goto pop; obj8: - obj8 = gc_pop_markdata(&sp, gc_mark_obj8_t); + obj8 = gc_mark_deque_pop_data(mark_queue); obj8_parent = (char*)obj8->parent; obj8_begin = obj8->begin; obj8_end = obj8->end; obj8_loaded: - if (gc_mark_scan_obj8(ptls, &sp, obj8, obj8_parent, obj8_begin, obj8_end, + if (gc_mark_scan_obj8(ptls, obj8, obj8_parent, obj8_begin, obj8_end, &new_obj, &tag, &bits)) goto mark; goto pop; obj16: - obj16 = gc_pop_markdata(&sp, gc_mark_obj16_t); + obj16 = gc_mark_deque_pop_data(mark_queue); obj16_parent = (char*)obj16->parent; obj16_begin = obj16->begin; obj16_end = obj16->end; obj16_loaded: - if (gc_mark_scan_obj16(ptls, &sp, obj16, obj16_parent, obj16_begin, obj16_end, + if (gc_mark_scan_obj16(ptls, obj16, obj16_parent, obj16_begin, obj16_end, &new_obj, &tag, &bits)) goto mark; goto pop; obj32: { - gc_mark_obj32_t *obj32 = gc_pop_markdata(&sp, gc_mark_obj32_t); + gc_mark_obj32_t *obj32 = gc_mark_deque_pop_data(mark_queue); char *parent = (char*)obj32->parent; uint32_t *begin = obj32->begin; uint32_t *end = obj32->end; - if (gc_mark_scan_obj32(ptls, &sp, obj32, parent, begin, end, &new_obj, &tag, &bits)) + if (gc_mark_scan_obj32(ptls, obj32, parent, begin, end, &new_obj, &tag, &bits)) goto mark; goto pop; } @@ -2324,7 +2461,7 @@ stack: { // Scan the stack. see `gc_mark_stackframe_t` // The task object this stack belongs to is being scanned separately as a normal // 8bit field descriptor object. - gc_mark_stackframe_t *stack = gc_pop_markdata(&sp, gc_mark_stackframe_t); + gc_mark_stackframe_t *stack = gc_mark_deque_pop_data(mark_queue); jl_gcframe_t *s = stack->s; uint32_t i = stack->i; uint32_t nroots = stack->nroots; @@ -2354,7 +2491,7 @@ stack: { if (i < nr) { // Haven't done with this one yet. 
Update the content and push it back stack->i = i; - gc_repush_markdata(&sp, gc_mark_stackframe_t); + gc_mark_deque_repush(mark_queue); } else if ((s = (jl_gcframe_t*)gc_read_stack(&s->prev, offset, lb, ub))) { stack->s = s; @@ -2362,7 +2499,7 @@ stack: { uintptr_t new_nroots = gc_read_stack(&s->nroots, offset, lb, ub); assert(new_nroots <= UINT32_MAX); stack->nroots = (uint32_t)new_nroots; - gc_repush_markdata(&sp, gc_mark_stackframe_t); + gc_mark_deque_repush(mark_queue); } goto mark; } @@ -2382,7 +2519,7 @@ stack: { excstack: { // Scan an exception stack - gc_mark_excstack_t *stackitr = gc_pop_markdata(&sp, gc_mark_excstack_t); + gc_mark_excstack_t *stackitr = gc_mark_deque_pop_data(mark_queue); jl_excstack_t *excstack = stackitr->s; size_t itr = stackitr->itr; size_t bt_index = stackitr->bt_index; @@ -2405,7 +2542,7 @@ excstack: { stackitr->itr = itr; stackitr->bt_index = bt_index; stackitr->jlval_index = jlval_index; - gc_repush_markdata(&sp, gc_mark_excstack_t); + gc_mark_deque_repush(mark_queue); goto mark; } } @@ -2421,7 +2558,7 @@ excstack: { stackitr->itr = itr; stackitr->bt_index = bt_index; stackitr->jlval_index = jlval_index; - gc_repush_markdata(&sp, gc_mark_excstack_t); + gc_mark_deque_repush(mark_queue); goto mark; } } @@ -2431,7 +2568,7 @@ excstack: { module_binding: { // Scan a module. see `gc_mark_binding_t` // Other fields of the module will be scanned after the bindings are scanned - gc_mark_binding_t *binding = gc_pop_markdata(&sp, gc_mark_binding_t); + gc_mark_binding_t *binding = gc_mark_deque_pop_data(mark_queue); jl_binding_t **begin = binding->begin; jl_binding_t **end = binding->end; uint8_t mbits = binding->bits; @@ -2461,13 +2598,13 @@ module_binding: { new_obj = value; begin += 2; binding->begin = begin; - gc_repush_markdata(&sp, gc_mark_binding_t); + gc_mark_deque_repush(mark_queue); uintptr_t gr_tag; uint8_t gr_bits; if (gc_try_setmark(globalref, &binding->nptr, &gr_tag, &gr_bits)) { gc_mark_marked_obj_t data = {globalref, gr_tag, gr_bits}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(marked_obj), - &data, sizeof(data), 1); + gc_mark_deque_push(&ptls->gc_cache, gc_mark_laddr(marked_obj), + &data, sizeof(data), inc); } goto mark; } @@ -2475,7 +2612,7 @@ module_binding: { if (gc_try_setmark(globalref, &binding->nptr, &tag, &bits)) { begin += 2; binding->begin = begin; - gc_repush_markdata(&sp, gc_mark_binding_t); + gc_mark_deque_repush(mark_queue); new_obj = globalref; goto mark; } @@ -2491,14 +2628,13 @@ module_binding: { objary_begin = (jl_value_t**)m->usings.items; objary_end = objary_begin + nusings; gc_mark_objarray_t data = {(jl_value_t*)m, objary_begin, objary_end, 1, binding->nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &data, sizeof(data), 0); + gc_mark_deque_push(&ptls->gc_cache, gc_mark_laddr(objarray), + &data, sizeof(data), no_inc); if (!scanparent) { - objary = (gc_mark_objarray_t*)sp.data; + objary = gc_mark_deque_data_bottom(mark_queue); goto objarray_loaded; } - sp.data = (jl_gc_mark_data_t *)(((char*)sp.data) + sizeof(data)); - sp.pc++; + gc_mark_deque_repush(mark_queue); } else { gc_mark_push_remset(ptls, (jl_value_t*)m, binding->nptr); @@ -2512,7 +2648,7 @@ module_binding: { finlist: { // Scan a finalizer (or format compatible) list. 
see `gc_mark_finlist_t` - gc_mark_finlist_t *finlist = gc_pop_markdata(&sp, gc_mark_finlist_t); + gc_mark_finlist_t *finlist = gc_mark_deque_pop_data(mark_queue); jl_value_t **begin = finlist->begin; jl_value_t **end = finlist->end; for (; begin < end; begin++) { @@ -2532,7 +2668,7 @@ finlist: { if (begin < end) { // Haven't done with this one yet. Update the content and push it back finlist->begin = begin; - gc_repush_markdata(&sp, gc_mark_finlist_t); + gc_mark_deque_repush(mark_queue); } goto mark; } @@ -2569,9 +2705,9 @@ mark: { objary_begin = data; objary_end = data + l; gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, 1, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &markdata, sizeof(markdata), 0); - objary = (gc_mark_objarray_t*)sp.data; + gc_mark_deque_push(&ptls->gc_cache, gc_mark_laddr(objarray), + &markdata, sizeof(markdata), no_inc); + objary = gc_mark_deque_data_bottom(mark_queue); goto objarray_loaded; } else if (vt->name == jl_array_typename) { @@ -2625,9 +2761,9 @@ mark: { objary_begin = (jl_value_t**)a->data; objary_end = objary_begin + l; gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, 1, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &markdata, sizeof(markdata), 0); - objary = (gc_mark_objarray_t*)sp.data; + gc_mark_deque_push(&ptls->gc_cache, gc_mark_laddr(objarray), + &markdata, sizeof(markdata), no_inc); + objary = gc_mark_deque_data_bottom(mark_queue); goto objarray_loaded; } else if (flags.hasptr) { @@ -2642,27 +2778,27 @@ mark: { if (npointers == 1) { // TODO: detect anytime time stride is uniform? objary_begin += layout->first_ptr; gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, elsize, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &markdata, sizeof(markdata), 0); - objary = (gc_mark_objarray_t*)sp.data; + gc_mark_deque_push(&ptls->gc_cache, gc_mark_laddr(objarray), + &markdata, sizeof(markdata), no_inc); + objary = gc_mark_deque_data_bottom(mark_queue); goto objarray_loaded; } else if (layout->fielddesc_type == 0) { obj8_begin = (uint8_t*)jl_dt_layout_ptrs(layout); obj8_end = obj8_begin + npointers; gc_mark_array8_t markdata = {objary_begin, objary_end, obj8_begin, {new_obj, obj8_begin, obj8_end, nptr}}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(array8), - &markdata, sizeof(markdata), 0); - ary8 = (gc_mark_array8_t*)sp.data; + gc_mark_deque_push(&ptls->gc_cache, gc_mark_laddr(array8), + &markdata, sizeof(markdata), no_inc); + ary8 = gc_mark_deque_data_bottom(mark_queue); goto array8_loaded; } else if (layout->fielddesc_type == 1) { obj16_begin = (uint16_t*)jl_dt_layout_ptrs(layout); obj16_end = obj16_begin + npointers; gc_mark_array16_t markdata = {objary_begin, objary_end, obj16_begin, {new_obj, obj16_begin, obj16_end, nptr}}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(array16), - &markdata, sizeof(markdata), 0); - ary16 = (gc_mark_array16_t*)sp.data; + gc_mark_deque_push(&ptls->gc_cache, gc_mark_laddr(array16), + &markdata, sizeof(markdata), no_inc); + ary16 = gc_mark_deque_data_bottom(mark_queue); goto array16_loaded; } else { @@ -2681,9 +2817,8 @@ mark: { size_t bsize = m->bindings.size; uintptr_t nptr = ((bsize + m->usings.len + 1) << 2) | (bits & GC_OLD); gc_mark_binding_t markdata = {m, table + 1, table + bsize, nptr, bits}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(module_binding), - &markdata, sizeof(markdata), 0); - sp.data = (jl_gc_mark_data_t *)(((char*)sp.data) + 
sizeof(markdata)); + gc_mark_deque_push(&ptls->gc_cache, gc_mark_laddr(module_binding), + &markdata, sizeof(markdata), inc_data_only); goto module_binding; } else if (vt == jl_task_type) { @@ -2694,12 +2829,10 @@ mark: { jl_task_t *ta = (jl_task_t*)new_obj; gc_scrub_record_task(ta); if (gc_cblist_task_scanner) { - export_gc_state(ptls, &sp); int16_t tid = jl_atomic_load_relaxed(&ta->tid); gc_invoke_callbacks(jl_gc_cb_task_scanner_t, gc_cblist_task_scanner, (ta, tid != -1 && ta == jl_all_tls_states[tid]->root_task)); - import_gc_state(ptls, &sp); } #ifdef COPY_STACKS void *stkbuf = ta->stkbuf; @@ -2725,15 +2858,15 @@ mark: { nroots = gc_read_stack(&s->nroots, offset, lb, ub); assert(nroots <= UINT32_MAX); gc_mark_stackframe_t stackdata = {s, 0, (uint32_t)nroots, offset, lb, ub}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(stack), - &stackdata, sizeof(stackdata), 1); + gc_mark_deque_push(&ptls->gc_cache, gc_mark_laddr(stack), + &stackdata, sizeof(stackdata), inc); } if (ta->excstack) { gc_setmark_buf_(ptls, ta->excstack, bits, sizeof(jl_excstack_t) + sizeof(uintptr_t)*ta->excstack->reserved_size); gc_mark_excstack_t stackdata = {ta->excstack, ta->excstack->top, 0, 0}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(excstack), - &stackdata, sizeof(stackdata), 1); + gc_mark_deque_push(&ptls->gc_cache, gc_mark_laddr(excstack), + &stackdata, sizeof(stackdata), inc); } const jl_datatype_layout_t *layout = jl_task_type->layout; assert(layout->fielddesc_type == 0); @@ -2744,9 +2877,9 @@ mark: { // assume tasks always reference young objects: set lowest bit uintptr_t nptr = (npointers << 2) | 1 | bits; gc_mark_obj8_t markdata = {new_obj, obj8_begin, obj8_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj8), - &markdata, sizeof(markdata), 0); - obj8 = (gc_mark_obj8_t*)sp.data; + gc_mark_deque_push(&ptls->gc_cache, gc_mark_laddr(obj8), + &markdata, sizeof(markdata), no_inc); + obj8 = gc_mark_deque_data_bottom(mark_queue); obj8_parent = (char*)ta; goto obj8_loaded; } @@ -2760,7 +2893,7 @@ mark: { } else { if (__unlikely(!jl_is_datatype(vt))) - gc_assert_datatype_fail(ptls, vt, sp); + gc_assert_datatype_fail(ptls, vt); size_t dtsz = jl_datatype_size(vt); if (update_meta) gc_setmark(ptls, o, bits, dtsz); @@ -2780,9 +2913,9 @@ mark: { obj8_end = obj8_begin + npointers; assert(obj8_begin < obj8_end); gc_mark_obj8_t markdata = {new_obj, obj8_begin, obj8_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj8), - &markdata, sizeof(markdata), 0); - obj8 = (gc_mark_obj8_t*)sp.data; + gc_mark_deque_push(&ptls->gc_cache, gc_mark_laddr(obj8), + &markdata, sizeof(markdata), no_inc); + obj8 = gc_mark_deque_data_bottom(mark_queue); goto obj8_loaded; } else if (layout->fielddesc_type == 1) { @@ -2791,9 +2924,9 @@ mark: { obj16_end = obj16_begin + npointers; assert(obj16_begin < obj16_end); gc_mark_obj16_t markdata = {new_obj, obj16_begin, obj16_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj16), - &markdata, sizeof(markdata), 0); - obj16 = (gc_mark_obj16_t*)sp.data; + gc_mark_deque_push(&ptls->gc_cache, gc_mark_laddr(obj16), + &markdata, sizeof(markdata), no_inc); + obj16 = gc_mark_deque_data_bottom(mark_queue); goto obj16_loaded; } else if (layout->fielddesc_type == 2) { @@ -2802,18 +2935,15 @@ mark: { uint32_t *obj32_begin = (uint32_t*)jl_dt_layout_ptrs(layout); uint32_t *obj32_end = obj32_begin + npointers; gc_mark_obj32_t markdata = {new_obj, obj32_begin, obj32_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, 
gc_mark_laddr(obj32), - &markdata, sizeof(markdata), 0); - sp.data = (jl_gc_mark_data_t *)(((char*)sp.data) + sizeof(markdata)); + gc_mark_deque_push(&ptls->gc_cache, gc_mark_laddr(obj32), + &markdata, sizeof(markdata), inc_data_only); goto obj32; } else { assert(layout->fielddesc_type == 3); jl_fielddescdyn_t *desc = (jl_fielddescdyn_t*)jl_dt_layout_fields(layout); int old = jl_astaggedvalue(new_obj)->bits.gc & 2; - export_gc_state(ptls, &sp); uintptr_t young = desc->markfunc(ptls, new_obj); - import_gc_state(ptls, &sp); if (old && young) gc_mark_push_remset(ptls, new_obj, young * 4 + 3); goto pop; @@ -2822,52 +2952,52 @@ mark: { } } -static void jl_gc_queue_thread_local(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, +static void jl_gc_queue_thread_local(jl_gc_mark_cache_t *gc_cache, jl_ptls_t ptls2) { - gc_mark_queue_obj(gc_cache, sp, jl_atomic_load_relaxed(&ptls2->current_task)); - gc_mark_queue_obj(gc_cache, sp, ptls2->root_task); + gc_mark_queue_obj(gc_cache, jl_atomic_load_relaxed(&ptls2->current_task)); + gc_mark_queue_obj(gc_cache, ptls2->root_task); if (ptls2->next_task) - gc_mark_queue_obj(gc_cache, sp, ptls2->next_task); + gc_mark_queue_obj(gc_cache, ptls2->next_task); if (ptls2->previous_task) // shouldn't be necessary, but no reason not to - gc_mark_queue_obj(gc_cache, sp, ptls2->previous_task); + gc_mark_queue_obj(gc_cache, ptls2->previous_task); if (ptls2->previous_exception) - gc_mark_queue_obj(gc_cache, sp, ptls2->previous_exception); + gc_mark_queue_obj(gc_cache, ptls2->previous_exception); } extern jl_value_t *cmpswap_names JL_GLOBALLY_ROOTED; // mark the initial root set -static void mark_roots(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp) +static void mark_roots(jl_gc_mark_cache_t *gc_cache) { // modules - gc_mark_queue_obj(gc_cache, sp, jl_main_module); + gc_mark_queue_obj(gc_cache, jl_main_module); // invisible builtin values if (jl_an_empty_vec_any != NULL) - gc_mark_queue_obj(gc_cache, sp, jl_an_empty_vec_any); + gc_mark_queue_obj(gc_cache, jl_an_empty_vec_any); if (jl_module_init_order != NULL) - gc_mark_queue_obj(gc_cache, sp, jl_module_init_order); + gc_mark_queue_obj(gc_cache, jl_module_init_order); for (size_t i = 0; i < jl_current_modules.size; i += 2) { if (jl_current_modules.table[i + 1] != HT_NOTFOUND) { - gc_mark_queue_obj(gc_cache, sp, jl_current_modules.table[i]); + gc_mark_queue_obj(gc_cache, jl_current_modules.table[i]); } } - gc_mark_queue_obj(gc_cache, sp, jl_anytuple_type_type); + gc_mark_queue_obj(gc_cache, jl_anytuple_type_type); for (size_t i = 0; i < N_CALL_CACHE; i++) { jl_typemap_entry_t *v = jl_atomic_load_relaxed(&call_cache[i]); if (v != NULL) - gc_mark_queue_obj(gc_cache, sp, v); + gc_mark_queue_obj(gc_cache, v); } if (jl_all_methods != NULL) - gc_mark_queue_obj(gc_cache, sp, jl_all_methods); + gc_mark_queue_obj(gc_cache, jl_all_methods); if (_jl_debug_method_invalidation != NULL) - gc_mark_queue_obj(gc_cache, sp, _jl_debug_method_invalidation); + gc_mark_queue_obj(gc_cache, _jl_debug_method_invalidation); // constants - gc_mark_queue_obj(gc_cache, sp, jl_emptytuple_type); + gc_mark_queue_obj(gc_cache, jl_emptytuple_type); if (cmpswap_names != NULL) - gc_mark_queue_obj(gc_cache, sp, cmpswap_names); + gc_mark_queue_obj(gc_cache, cmpswap_names); } // find unmarked objects that need to be finalized from the finalizer list "list". 
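
A note on the three `jl_gc_push_mode_t` values that the new `gc_mark_deque_push` call sites use: the enum definition itself is not visible in this diff, so the sketch below reconstructs it from the call sites (`no_inc` followed by `gc_mark_deque_data_bottom` and a `goto`, `inc_data_only` followed by a direct `goto` to a label whose `gc_mark_deque_pop_data` takes the frame right back). Treat the comments as inferred, not quoted:

```c
// Reconstructed from the call sites in this patch -- an assumption, not a quote.
typedef enum {
    no_inc,        // stage pc+data in the bottom slots without publishing either;
                   // the caller consumes the frame in place via
                   // gc_mark_deque_data_bottom() and a goto, and only a later
                   // gc_mark_deque_repush() makes it visible (and stealable)
    inc,           // publish both pc and data: a complete, stealable work item
    inc_data_only, // publish only the data bottom; the caller jumps straight to
                   // the handler label, whose gc_mark_deque_pop_data() immediately
                   // pops the frame back (the module_binding and obj32 paths)
} jl_gc_push_mode_t;
```

The asymmetry appears to exist because the pc slot is only consulted at dispatch time, while the data frame must remain addressable at the bottom of the queue while the scan functions mutate it in place.
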
@@ -3019,12 +3149,12 @@ static void jl_gc_premark(jl_ptls_t ptls2) } } -static void jl_gc_queue_remset(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_ptls_t ptls2) +static void jl_gc_queue_remset(jl_gc_mark_cache_t *gc_cache, jl_ptls_t ptls2) { size_t len = ptls2->heap.last_remset->len; void **items = ptls2->heap.last_remset->items; for (size_t i = 0; i < len; i++) - gc_mark_queue_scan_obj(gc_cache, sp, (jl_value_t*)items[i]); + gc_mark_queue_scan_obj(gc_cache, (jl_value_t*)items[i]); int n_bnd_refyoung = 0; len = ptls2->heap.rem_bindings.len; items = ptls2->heap.rem_bindings.items; @@ -3033,7 +3163,7 @@ static void jl_gc_queue_remset(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp // A null pointer can happen here when the binding is cleaned up // as an exception is thrown after it was already queued (#10221) jl_value_t *v = jl_atomic_load_relaxed(&ptr->value); - if (v != NULL && gc_mark_queue_obj(gc_cache, sp, v)) { + if (v != NULL && gc_mark_queue_obj(gc_cache, v)) { items[n_bnd_refyoung] = ptr; n_bnd_refyoung++; } @@ -3041,7 +3171,7 @@ static void jl_gc_queue_remset(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp ptls2->heap.rem_bindings.len = n_bnd_refyoung; } -static void jl_gc_queue_bt_buf(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_ptls_t ptls2) +static void jl_gc_queue_bt_buf(jl_gc_mark_cache_t *gc_cache, jl_ptls_t ptls2) { jl_bt_element_t *bt_data = ptls2->bt_data; size_t bt_size = ptls2->bt_size; @@ -3051,20 +3181,19 @@ static void jl_gc_queue_bt_buf(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp continue; size_t njlvals = jl_bt_num_jlvals(bt_entry); for (size_t j = 0; j < njlvals; j++) - gc_mark_queue_obj(gc_cache, sp, jl_bt_entry_jlvalue(bt_entry, j)); + gc_mark_queue_obj(gc_cache, jl_bt_entry_jlvalue(bt_entry, j)); } } size_t jl_maxrss(void); + // Only one thread should be running in this function static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) { combine_thread_gc_counts(&gc_num); jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; - jl_gc_mark_sp_t sp; - gc_mark_sp_init(gc_cache, &sp); uint64_t gc_start_time = jl_hrtime(); int64_t last_perm_scanned_bytes = perm_scanned_bytes; @@ -3078,25 +3207,31 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) for (int t_i = 0; t_i < jl_n_threads; t_i++) { jl_ptls_t ptls2 = jl_all_tls_states[t_i]; // 2.1. mark every object in the `last_remsets` and `rem_binding` - jl_gc_queue_remset(gc_cache, &sp, ptls2); - // 2.2. mark every thread local root - jl_gc_queue_thread_local(gc_cache, &sp, ptls2); - // 2.3. mark any managed objects in the backtrace buffer - jl_gc_queue_bt_buf(gc_cache, &sp, ptls2); + jl_gc_queue_remset(gc_cache, ptls2); + // 2.2. thread local root + jl_gc_queue_thread_local(gc_cache, ptls2); + // 2.3. managed objects in the backtrace buffer + jl_gc_queue_bt_buf(gc_cache, ptls2); } // 3. 
@@ -3122,13 +3257,12 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
     }
     for (int i = 0;i < jl_n_threads;i++) {
         jl_ptls_t ptls2 = jl_all_tls_states[i];
-        gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0);
+        gc_mark_queue_finlist(gc_cache, &ptls2->finalizers, 0);
     }
-    gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, orig_marked_len);
+    gc_mark_queue_finlist(gc_cache, &finalizer_list_marked, orig_marked_len);
     // "Flush" the mark stack before flipping the reset_age bit
     // so that the objects are not incorrectly reset.
-    gc_mark_loop(ptls, sp);
-    gc_mark_sp_init(gc_cache, &sp);
+    gc_mark_loop(ptls);
     // Conservative marking relies on age to tell allocated objects
     // and freelist entries apart.
     mark_reset_age = !jl_gc_conservative_gc_support_enabled();
@@ -3136,8 +3270,8 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
     // `to_finalize` list. These objects are only reachable from this list
     // and should not be referenced by any old objects so this won't break
     // the GC invariant.
-    gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0);
-    gc_mark_loop(ptls, sp);
+    gc_mark_queue_finlist(gc_cache, &to_finalize, 0);
+    gc_mark_loop(ptls);
     mark_reset_age = 0;
     gc_settime_postmark_end();
@@ -3388,12 +3522,12 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
         errno = last_errno;
 }
 
-void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_mark_sp_t *sp)
+void gc_mark_queue_all_roots(jl_ptls_t ptls)
 {
     jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache;
     for (size_t i = 0; i < jl_n_threads; i++)
-        jl_gc_queue_thread_local(gc_cache, sp, jl_all_tls_states[i]);
-    mark_roots(gc_cache, sp);
+        jl_gc_queue_thread_local(gc_cache, jl_all_tls_states[i]);
+    mark_roots(gc_cache);
 }
 
 // allocator entry points
@@ -3432,10 +3566,26 @@ void jl_init_thread_heap(jl_ptls_t ptls)
     gc_cache->perm_scanned_bytes = 0;
     gc_cache->scanned_bytes = 0;
     gc_cache->nbig_obj = 0;
-    size_t init_size = 1024;
-    gc_cache->pc_stack = (void**)malloc_s(init_size * sizeof(void*));
-    gc_cache->pc_stack_end = gc_cache->pc_stack + init_size;
-    gc_cache->data_stack = (jl_gc_mark_data_t *)malloc_s(init_size * sizeof(jl_gc_mark_data_t));
+
+    jl_gc_ws_queue_t *mark_queue = &gc_cache->mark_queue;
+
+    jl_gc_ws_top_t top0 = {0, 0};
+    jl_gc_ws_bottom_t bottom0 = {0, 0};
+    jl_atomic_store_release(&mark_queue->top, top0);
+    jl_atomic_store_relaxed(&mark_queue->bottom, bottom0);
+
+    jl_gc_ws_array_t *array = (jl_gc_ws_array_t*)malloc_s(sizeof(jl_gc_ws_array_t));
+
+    size_t init_size = 1 << 10;
+    array->pc_start = (void**)malloc_s(init_size * sizeof(void*));
+    array->data_start = (jl_gc_mark_data_t*)malloc_s(init_size * sizeof(jl_gc_mark_data_t));
+    array->size = init_size;
+
+    jl_atomic_store_release(&mark_queue->array, array);
+
+    size_t reclaim_set_size = 10; // TODO: does `reclaim_set` need resize?
+    arraylist_t *a = (arraylist_t*)malloc(sizeof(arraylist_t));
+    mark_queue->reclaim_set = arraylist_new(a, reclaim_set_size);
 
     memset(&ptls->gc_num, 0, sizeof(ptls->gc_num));
     assert(gc_num.interval == default_collect_interval);
@@ -3471,8 +3621,7 @@ void jl_gc_init(void)
     if (maxmem > max_collect_interval)
         max_collect_interval = maxmem;
 #endif
-    jl_gc_mark_sp_t sp = {NULL, NULL, NULL, NULL};
-    gc_mark_loop(NULL, sp);
+    gc_mark_loop(NULL);
     t_start = jl_hrtime();
 }
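The queue initialized above is a Chase-Lev-style work-stealing deque split across two parallel rings (pc and data), with offsets taken modulo the array size. The thief-side pop is not part of the visible hunks; here is a sketch of what a steal could look like, using the `jl_gc_ws_top_t.version` counter to keep the CAS safe against ABA once `offset` wraps the ring. It is illustrative only, and simplified in that it assumes pc and data slots advance in lockstep, which `inc_data_only` pushes do not guarantee.

```c
// Thief side, sketched: claim the oldest frame by CASing `top` forward.
// Returns 1 on success, 0 if the queue looked empty or the race was lost.
STATIC_INLINE int gc_mark_deque_steal_sketch(jl_gc_ws_queue_t *q,
                                             void **pc_out, jl_gc_mark_data_t *data_out)
{
    jl_gc_ws_top_t top = jl_atomic_load_acquire(&q->top);
    jl_gc_ws_bottom_t bottom = jl_atomic_load_acquire(&q->bottom);
    if (bottom.pc_offset <= top.offset)
        return 0; // nothing published by the owner
    jl_gc_ws_array_t *array = jl_atomic_load_acquire(&q->array);
    void *pc = array->pc_start[top.offset % array->size];
    jl_gc_mark_data_t data = array->data_start[top.offset % array->size];
    // Bump the version together with the offset so a stale `top` from a
    // previous wrap-around can never pass this compare-and-swap.
    jl_gc_ws_top_t new_top = {top.offset + 1, top.version + 1};
    if (!jl_atomic_cmpswap(&q->top, &top, new_top))
        return 0; // lost to another thief (or the owner draining its queue)
    *pc_out = pc;
    *data_out = data;
    return 1;
}
```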
diff --git a/src/gc.h b/src/gc.h
index 00c3d48b52935..b63aef95151f7 100644
--- a/src/gc.h
+++ b/src/gc.h
@@ -217,28 +217,35 @@ union _jl_gc_mark_data {
     gc_mark_finlist_t finlist;
 };
 
-// Pop a data struct from the mark data stack (i.e. decrease the stack pointer)
-// This should be used after dispatch and therefore the pc stack pointer is already popped from
-// the stack.
-STATIC_INLINE void *gc_pop_markdata_(jl_gc_mark_sp_t *sp, size_t size)
+// Return a pointer to the bottom of the data queue
+STATIC_INLINE void *gc_mark_deque_data_bottom(jl_gc_ws_queue_t *mark_queue) JL_NOTSAFEPOINT
 {
-    jl_gc_mark_data_t *data = (jl_gc_mark_data_t *)(((char*)sp->data) - size);
-    sp->data = data;
-    return data;
+    jl_gc_ws_bottom_t bottom = jl_atomic_load_relaxed(&mark_queue->bottom);
+    jl_gc_ws_array_t *array = jl_atomic_load_relaxed(&mark_queue->array);
+    return &array->data_start[bottom.data_offset % array->size];
 }
-#define gc_pop_markdata(sp, type) ((type*)gc_pop_markdata_(sp, sizeof(type)))
 
-// Re-push a frame to the mark stack (both data and pc)
-// The data and pc are expected to be on the stack (or updated in place) already.
+// Re-push a frame to the mark queue (both data and pc)
+// The data and pc are expected to be on the queue (or updated in place) already.
 // Mainly useful to pause the current scanning in order to scan an new object.
-STATIC_INLINE void *gc_repush_markdata_(jl_gc_mark_sp_t *sp, size_t size) JL_NOTSAFEPOINT
+STATIC_INLINE void *gc_mark_deque_repush(jl_gc_ws_queue_t *mark_queue) JL_NOTSAFEPOINT
 {
-    jl_gc_mark_data_t *data = sp->data;
-    sp->pc++;
-    sp->data = (jl_gc_mark_data_t *)(((char*)sp->data) + size);
+    jl_gc_ws_bottom_t bottom = jl_atomic_load_relaxed(&mark_queue->bottom);
+    jl_gc_ws_array_t *array = jl_atomic_load_relaxed(&mark_queue->array);
+    jl_gc_mark_data_t *data = &array->data_start[bottom.data_offset % array->size];
+    bottom.pc_offset++;
+    bottom.data_offset++;
+    jl_atomic_store_relaxed(&mark_queue->bottom, bottom);
     return data;
 }
-#define gc_repush_markdata(sp, type) ((type*)gc_repush_markdata_(sp, sizeof(type)))
+
+// Used to determine whether the bottom of pc/data queue should be incremented
+// on a push
+typedef enum {
+    no_inc,
+    inc,
+    inc_data_only
+} jl_gc_push_mode_t;
 
 // layout for big (>2k) objects
 
@@ -505,18 +512,10 @@ STATIC_INLINE void gc_big_object_link(bigval_t *hdr, bigval_t **list) JL_NOTSAFE
     *list = hdr;
 }
 
-STATIC_INLINE void gc_mark_sp_init(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp)
-{
-    sp->pc = gc_cache->pc_stack;
-    sp->data = gc_cache->data_stack;
-    sp->pc_start = gc_cache->pc_stack;
-    sp->pc_end = gc_cache->pc_stack_end;
-}
-
-void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_mark_sp_t *sp);
-void gc_mark_queue_finlist(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp,
+void gc_mark_queue_all_roots(jl_ptls_t ptls);
+void gc_mark_queue_finlist(jl_gc_mark_cache_t *gc_cache,
                            arraylist_t *list, size_t start);
-void gc_mark_loop(jl_ptls_t ptls, jl_gc_mark_sp_t sp);
+void gc_mark_loop(jl_ptls_t ptls);
 void sweep_stack_pools(void);
 void jl_gc_debug_init(void);
 
@@ -648,7 +647,7 @@ extern int gc_verifying;
 #endif
 int gc_slot_to_fieldidx(void *_obj, void *slot);
 int gc_slot_to_arrayidx(void *_obj, void *begin);
-NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_mark_sp_t sp, int pc_offset);
+// NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, int pc_offset);
 
 #ifdef GC_DEBUG_ENV
 JL_DLLEXPORT extern jl_gc_debug_env_t jl_gc_debug_env;
diff --git a/src/julia_internal.h b/src/julia_internal.h
index fe38812d5c962..1017d1c9835a7 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -832,7 +832,7 @@ typedef jl_gcframe_t ***(*jl_pgcstack_key_t)(void) JL_NOTSAFEPOINT;
 #endif
 JL_DLLEXPORT void jl_pgcstack_getkey(jl_get_pgcstack_func **f, jl_pgcstack_key_t *k);
 
-#if !defined(__clang_gcanalyzer__) && !defined(_OS_DARWIN_)
+#if !defined(__clang_gcanalyzer__)
 static inline void jl_set_gc_and_wait(void)
 {
     jl_task_t *ct = jl_current_task;
@@ -1570,6 +1570,8 @@ JL_DLLEXPORT uint16_t julia__truncdfhf2(double param) JL_NOTSAFEPOINT;
 #define JL_PROBE_GC_STOP_THE_WORLD() do ; while (0)
 #define JL_PROBE_GC_MARK_BEGIN() do ; while (0)
 #define JL_PROBE_GC_MARK_END(scanned_bytes, perm_scanned_bytes) do ; while (0)
+#define JL_PROBE_GC_MARK_STOP_THE_WORLD_SWEEP_BEGIN() do ; while (0)
+#define JL_PROBE_GC_MARK_STOP_THE_WORLD_SWEEP_END() do ; while (0)
 #define JL_PROBE_GC_SWEEP_BEGIN(full) do ; while (0)
 #define JL_PROBE_GC_SWEEP_END() do ; while (0)
 #define JL_PROBE_GC_END() do ; while (0)
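Usage of `gc_mark_deque_repush` mirrors the old `gc_repush_markdata(&sp, type)` macro: the frame being scanned still sits at the bottom of the queue, so "re-pushing" just advances both bottom offsets past it again. An illustrative composition (not from the patch; assumes the owner pops from the bottom of the deque, as in Chase-Lev, so the child is processed before the re-pushed frame):

```c
// Pause the current obj8 frame to descend into a child object.
// `gc_cache` and `child` are assumed in scope as they are in gc_mark_loop.
static void gc_mark_pause_and_descend_sketch(jl_gc_mark_cache_t *gc_cache,
                                             jl_value_t *child)
{
    jl_gc_ws_queue_t *mq = &gc_cache->mark_queue;
    // The frame's pc/data were already consumed by dispatch; bump the
    // bottom offsets so the frame gets dispatched again later.
    gc_mark_obj8_t *frame = (gc_mark_obj8_t*)gc_mark_deque_repush(mq);
    (void)frame; // callers typically update the frame's cursor in place first
    // Queue the child; with LIFO pops at the owner's end it runs next.
    gc_mark_queue_obj(gc_cache, child);
}
```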
diff --git a/src/julia_threads.h b/src/julia_threads.h
index 8228d1e056cb5..641812733102b 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -173,11 +173,25 @@ typedef struct {
 typedef union _jl_gc_mark_data jl_gc_mark_data_t;
 
 typedef struct {
-    void **pc; // Current stack address for the pc (up growing)
-    jl_gc_mark_data_t *data; // Current stack address for the data (up growing)
-    void **pc_start; // Cached value of `gc_cache->pc_stack`
-    void **pc_end; // Cached value of `gc_cache->pc_stack_end`
-} jl_gc_mark_sp_t;
+    int32_t offset, version;
+} jl_gc_ws_top_t;
+
+typedef struct {
+    int32_t pc_offset, data_offset;
+} jl_gc_ws_bottom_t;
+
+typedef struct {
+    void **pc_start;
+    jl_gc_mark_data_t *data_start;
+    size_t size;
+} jl_gc_ws_array_t;
+
+typedef struct {
+    _Atomic(jl_gc_ws_top_t) top;
+    _Atomic(jl_gc_ws_bottom_t) bottom;
+    _Atomic(jl_gc_ws_array_t *) array;
+    arraylist_t *reclaim_set;
+} jl_gc_ws_queue_t;
 
 typedef struct {
     // thread local increment of `perm_scanned_bytes`
@@ -195,9 +209,7 @@ typedef struct {
     // this makes sure that a single objects can only appear once in
     // the lists (the mark bit cannot be flipped to `0` without sweeping)
     void *big_obj[1024];
-    void **pc_stack;
-    void **pc_stack_end;
-    jl_gc_mark_data_t *data_stack;
+    jl_gc_ws_queue_t mark_queue;
 } jl_gc_mark_cache_t;
 
 struct _jl_bt_element_t;
@@ -217,6 +229,9 @@ typedef struct _jl_tls_states_t {
 #define JL_GC_STATE_SAFE 2
     // gc_state = 2 means the thread is running unmanaged code that can be
     // execute at the same time with the GC.
+#define JL_GC_STATE_PARALLEL 3
+    // gc_state = 3 means the thread is doing GC work that can be executed
+    // concurrently on multiple threads.
     _Atomic(int8_t) gc_state; // read from foreign threads
     // execution of certain certain impure
     // statements is prohibited from certain
@@ -264,7 +279,6 @@ typedef struct _jl_tls_states_t {
     arraylist_t finalizers;
     jl_gc_mark_cache_t gc_cache;
     arraylist_t sweep_objs;
-    jl_gc_mark_sp_t gc_mark_sp;
     // Saved exception for previous *external* API call or NULL if cleared.
     // Access via jl_exception_occurred().
     struct _jl_value_t *previous_exception;
@@ -357,7 +371,20 @@ int8_t jl_gc_safe_leave(jl_ptls_t ptls, int8_t state); // Can be a safepoint
 #define jl_gc_safe_enter(ptls) jl_gc_state_save_and_set(ptls, JL_GC_STATE_SAFE)
 #define jl_gc_safe_leave(ptls, state) ((void)jl_gc_state_set(ptls, (state), JL_GC_STATE_SAFE))
 #endif
+#define jl_gc_mark_loop_enter(ptls) do { \
+        jl_atomic_fetch_add(&nworkers_marking, 1); \
+        jl_fence(); \
+        jl_atomic_store_release(&ptls->gc_state, JL_GC_STATE_PARALLEL); \
+    } while (0)
+#define jl_gc_mark_loop_leave(ptls) do { \
+        jl_atomic_store_release(&ptls->gc_state, JL_GC_STATE_WAITING); \
+        jl_fence(); \
+        jl_atomic_fetch_add(&nworkers_marking, -1); \
+    } while (0)
 JL_DLLEXPORT void (jl_gc_safepoint)(void);
+// Either NULL, or the address of a function that threads can call while
+// waiting for the GC, which will recruit them into a concurrent GC operation.
+extern _Atomic(void *) jl_gc_recruiting_location;
 
 JL_DLLEXPORT void jl_gc_enable_finalizers(struct _jl_task_t *ct, int on);
 JL_DLLEXPORT void jl_gc_disable_finalizers_internal(void);
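The ordering in these macros matters: a worker is counted in `nworkers_marking` before it becomes visible in `JL_GC_STATE_PARALLEL`, and it leaves the count only after it is back in `JL_GC_STATE_WAITING`, so `jl_safepoint_master_end_marking` (in `safepoint.c` below) cannot observe a zero worker count while someone is still marking. Put together, a recruited thread effectively runs the following (illustrative composition of pieces defined elsewhere in the patch):

```c
extern void gc_mark_loop(jl_ptls_t ptls); // declared in gc.h above

// What one recruitment "stint" amounts to; cf. jl_safepoint_try_recruit.
static void gc_worker_stint_sketch(jl_ptls_t ptls)
{
    jl_gc_mark_loop_enter(ptls); // nworkers_marking++, then advertise PARALLEL
    gc_mark_loop(ptls);          // drain own queue, steal from other threads
    jl_gc_mark_loop_leave(ptls); // back to WAITING, then nworkers_marking--
}
```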
diff --git a/src/partr.c b/src/partr.c
index c128ba76f3e60..fdfb302a4ef5b 100644
--- a/src/partr.c
+++ b/src/partr.c
@@ -47,8 +47,8 @@ uint64_t io_wakeup_enter;
 uint64_t io_wakeup_leave;
 );
 
-uv_mutex_t *sleep_locks;
-uv_cond_t *wake_signals;
+uv_mutex_t *sleep_locks, *safepoint_sleep_locks;
+uv_cond_t *wake_signals, *safepoint_wake_signals;
 
 JL_DLLEXPORT int jl_set_task_tid(jl_task_t *task, int16_t tid) JL_NOTSAFEPOINT
 {
@@ -70,8 +70,8 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA
 }
 
 // GC functions used
-extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache,
-                                         jl_gc_mark_sp_t *sp, jl_value_t *obj) JL_NOTSAFEPOINT;
+extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache,
+                                         jl_value_t *obj) JL_NOTSAFEPOINT;
 
 // parallel task runtime
 // ---
@@ -99,10 +99,14 @@ void jl_init_threadinginfra(void)
     int16_t tid;
 
     sleep_locks = (uv_mutex_t*)calloc(jl_n_threads, sizeof(uv_mutex_t));
+    safepoint_sleep_locks = (uv_mutex_t*)calloc(jl_n_threads, sizeof(uv_mutex_t));
     wake_signals = (uv_cond_t*)calloc(jl_n_threads, sizeof(uv_cond_t));
+    safepoint_wake_signals = (uv_cond_t*)calloc(jl_n_threads, sizeof(uv_cond_t));
     for (tid = 0; tid < jl_n_threads; tid++) {
         uv_mutex_init(&sleep_locks[tid]);
+        uv_mutex_init(&safepoint_sleep_locks[tid]);
         uv_cond_init(&wake_signals[tid]);
+        uv_cond_init(&safepoint_wake_signals[tid]);
     }
 }
 
@@ -409,7 +413,7 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q,
             uv_mutex_lock(&sleep_locks[ptls->tid]);
             while (may_sleep(ptls)) {
                 uv_cond_wait(&wake_signals[ptls->tid], &sleep_locks[ptls->tid]);
-                // TODO: help with gc work here, if applicable
+                jl_safepoint_wait_gc();
             }
             assert(jl_atomic_load_relaxed(&ptls->sleep_check_state) == not_sleeping);
             uv_mutex_unlock(&sleep_locks[ptls->tid]);
diff --git a/src/safepoint.c b/src/safepoint.c
index 17c37a66c3a16..dd6f72e7f50e6 100644
--- a/src/safepoint.c
+++ b/src/safepoint.c
@@ -44,6 +44,15 @@ uint8_t jl_safepoint_enable_cnt[3] = {0, 0, 0};
 // fight on the safepoint lock...
 uv_mutex_t safepoint_lock;
 
+_Atomic(void *) jl_gc_recruiting_location = NULL;
+_Atomic(int32_t) jl_gc_safepoint_master = -1;
+_Atomic(int32_t) nworkers_marking = 0;
+
+extern uv_mutex_t *safepoint_sleep_locks;
+extern uv_cond_t *safepoint_wake_signals;
+
+const uint64_t timeout_ns = 500;
+
 static void jl_safepoint_enable(int idx) JL_NOTSAFEPOINT
 {
     // safepoint_lock should be held
@@ -146,21 +155,125 @@ void jl_safepoint_end_gc(void)
     jl_safepoint_disable(2);
     jl_safepoint_disable(1);
     jl_atomic_store_release(&jl_gc_running, 0);
-# ifdef __APPLE__
-    // This wakes up other threads on mac.
-    jl_mach_gc_end();
-# endif
     uv_mutex_unlock(&safepoint_lock);
 }
 
+// Thread recruitment scheme inspired by Hassanein,
+// `Understanding and Improving JVM GC Work Stealing at the
+// Data Center Scale`
+
+void jl_safepoint_try_recruit(jl_ptls_t ptls)
+{
+    if (jl_atomic_load_relaxed(&jl_gc_recruiting_location)) {
+        jl_gc_mark_loop_enter(ptls);
+        void *location = jl_atomic_load_acquire(&jl_gc_recruiting_location);
+        if (location)
+            ((void (*)(jl_ptls_t))location)(ptls);
+        jl_gc_mark_loop_leave(ptls);
+    }
+}
+
+size_t jl_safepoint_master_count_work(jl_ptls_t ptls)
+{
+    size_t work = 0;
+    for (int i = 0; i < jl_n_threads; i++) {
+        if (i == ptls->tid)
+            continue;
+        jl_ptls_t ptls2 = jl_all_tls_states[i];
+        if (jl_atomic_load_relaxed(&ptls2->gc_state) == JL_GC_STATE_PARALLEL) {
+            jl_gc_mark_cache_t *gc_cache2 = &ptls2->gc_cache;
+            jl_gc_ws_queue_t *mark_queue2 = &gc_cache2->mark_queue;
+            // This count can be slightly off, but it doesn't matter
+            // for recruitment heuristics
+            jl_gc_ws_bottom_t bottom2 = jl_atomic_load_relaxed(&mark_queue2->bottom);
+            jl_gc_ws_top_t top2 = jl_atomic_load_relaxed(&mark_queue2->top);
+            work += bottom2.pc_offset - top2.offset;
+        }
+    }
+    return work;
+}
+
+void jl_safepoint_master_notify_all(jl_ptls_t ptls)
+{
+    for (int i = 0; i < jl_n_threads; i++) {
+        if (i == ptls->tid)
+            continue;
+        uv_mutex_lock(&safepoint_sleep_locks[i]);
+        uv_cond_signal(&safepoint_wake_signals[i]);
+        uv_mutex_unlock(&safepoint_sleep_locks[i]);
+    }
+}
+
+void jl_safepoint_master_recruit_workers(jl_ptls_t ptls, size_t nworkers)
+{
+    for (int i = 0; i < jl_n_threads && nworkers > 0; i++) {
+        if (i == ptls->tid)
+            continue;
+        jl_ptls_t ptls2 = jl_all_tls_states[i];
+        if (jl_atomic_load_acquire(&ptls2->gc_state) == JL_GC_STATE_WAITING) {
+            uv_mutex_lock(&safepoint_sleep_locks[i]);
+            uv_cond_signal(&safepoint_wake_signals[i]);
+            uv_mutex_unlock(&safepoint_sleep_locks[i]);
+            nworkers--;
+        }
+    }
+}
+
+int jl_safepoint_master_end_marking(jl_ptls_t ptls)
+{
+    // All workers done with marking
+    if (jl_atomic_load_acquire(&nworkers_marking) == 0)
+        return 1;
+    int no_master = -1;
+    if (jl_atomic_cmpswap(&jl_gc_safepoint_master, &no_master, ptls->tid)) {
+        spin: {
+            if (jl_atomic_load_acquire(&nworkers_marking) > 0) {
+                size_t work = jl_safepoint_master_count_work(ptls);
+                // If there is enough work, recruit workers and also become a worker,
+                // relinquishing the safepoint master status
+                if (work > 2) {
+                    jl_safepoint_master_recruit_workers(ptls, work - 1);
+                    jl_atomic_store_release(&jl_gc_safepoint_master, -1);
+                    jl_safepoint_try_recruit(ptls);
+                    return 0;
+                }
+                goto spin;
+            }
+        }
+        jl_atomic_store_release(&jl_gc_safepoint_master, -1);
+        jl_safepoint_master_notify_all(ptls);
+        return 1;
+    }
+    return 0;
+}
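A worked example of the recruitment heuristic, with invented numbers (nothing below comes from the patch): the pending-work estimate is simply the sum of `bottom.pc_offset - top.offset` over every thread currently in `JL_GC_STATE_PARALLEL`.

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

// Hypothetical snapshot: two threads are marking; their queue extents
// below are made up purely for illustration.
int main(void)
{
    int32_t pc_bottom[] = {40, 12}; // each owner's bottom.pc_offset
    int32_t top[]       = {25, 10}; // each queue's top.offset
    size_t work = 0;
    for (int i = 0; i < 2; i++)
        work += (size_t)(pc_bottom[i] - top[i]); // 15 + 2 pending frames
    assert(work == 17);
    // work > 2, so the master would wake up to (work - 1) == 16 waiting
    // threads, clear jl_gc_safepoint_master, and join the marking itself
    // via jl_safepoint_try_recruit.
    return 0;
}
```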
+
 void jl_safepoint_wait_gc(void)
 {
-    // The thread should have set this is already
-    assert(jl_atomic_load_relaxed(&jl_current_task->ptls->gc_state) != 0);
-    // Use normal volatile load in the loop for speed until GC finishes.
-    // Then use an acquire load to make sure the GC result is visible on this thread.
+    jl_ptls_t ptls = jl_current_task->ptls;
     while (jl_atomic_load_relaxed(&jl_gc_running) ||
            jl_atomic_load_acquire(&jl_gc_running)) {
-        jl_cpu_pause(); // yield?
+        if (jl_safepoint_master_end_marking(ptls)) {
+            // Clean-up buffers from `reclaim_set`
+            jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache;
+            jl_gc_ws_queue_t *mark_queue = &gc_cache->mark_queue;
+            arraylist_t *rs = mark_queue->reclaim_set;
+            jl_gc_ws_array_t *a;
+            while ((a = (jl_gc_ws_array_t*)arraylist_pop(rs))) {
+                free(a->pc_start);
+                free(a->data_start);
+                free(a);
+            }
+            break;
+        }
+        uv_mutex_lock(&safepoint_sleep_locks[ptls->tid]);
+        if (!uv_cond_timedwait(&safepoint_wake_signals[ptls->tid],
+                               &safepoint_sleep_locks[ptls->tid], timeout_ns)) {
+            // Stopped waiting because we got a notification
+            // from safepoint master: try to get recruited
+            jl_safepoint_try_recruit(ptls);
+        }
+        uv_mutex_unlock(&safepoint_sleep_locks[ptls->tid]);
+        // Otherwise, just go to the top of the loop and try
+        // to become a safepoint master
     }
 }
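The grow path that populates `reclaim_set` is not in the visible hunks, but the drain loop above only makes sense if queue growth retires the old ring instead of freeing it, since thieves may still hold a pointer to it. A sketch of that presumed owner-side resize (illustrative; the function name and doubling policy are assumptions):

```c
// Owner-side grow, sketched: allocate a larger ring, copy the live window,
// publish it, and defer freeing the old one until marking ends.
static jl_gc_ws_array_t *gc_mark_deque_resize_sketch(jl_gc_ws_queue_t *q,
                                                     jl_gc_ws_array_t *old)
{
    jl_gc_ws_array_t *bigger = (jl_gc_ws_array_t*)malloc_s(sizeof(jl_gc_ws_array_t));
    bigger->size = 2 * old->size;
    bigger->pc_start = (void**)malloc_s(bigger->size * sizeof(void*));
    bigger->data_start = (jl_gc_mark_data_t*)malloc_s(bigger->size * sizeof(jl_gc_mark_data_t));
    jl_gc_ws_top_t top = jl_atomic_load_acquire(&q->top);
    jl_gc_ws_bottom_t bottom = jl_atomic_load_relaxed(&q->bottom);
    // Offsets are absolute, so entries keep their offsets; only the modulus
    // changes. Copy the unconsumed windows of both rings.
    for (int32_t i = top.offset; i < bottom.pc_offset; i++)
        bigger->pc_start[i % bigger->size] = old->pc_start[i % old->size];
    for (int32_t i = top.offset; i < bottom.data_offset; i++)
        bigger->data_start[i % bigger->size] = old->data_start[i % old->size];
    jl_atomic_store_release(&q->array, bigger);
    // Deferred free: this is exactly what jl_safepoint_wait_gc drains above.
    arraylist_push(q->reclaim_set, old);
    return bigger;
}
```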
diff --git a/src/signals-mach.c b/src/signals-mach.c
index 0da7ba915f0c7..3f4c8aa1f1252 100644
--- a/src/signals-mach.c
+++ b/src/signals-mach.c
@@ -19,6 +19,7 @@
 #endif
 
 #include "julia_assert.h"
+#include "julia_internal.h"
 
 // private keymgr stuff
 #define KEYMGR_GCC3_DW2_OBJ_LIST 302
@@ -41,50 +42,6 @@ static void attach_exception_port(thread_port_t thread, int segv_only);
 // low 16 bits are the thread id, the next 8 bits are the original gc_state
 static arraylist_t suspended_threads;
 
-void jl_mach_gc_end(void)
-{
-    // Requires the safepoint lock to be held
-    for (size_t i = 0; i < suspended_threads.len; i++) {
-        uintptr_t item = (uintptr_t)suspended_threads.items[i];
-        int16_t tid = (int16_t)item;
-        int8_t gc_state = (int8_t)(item >> 8);
-        jl_ptls_t ptls2 = jl_all_tls_states[tid];
-        jl_atomic_store_release(&ptls2->gc_state, gc_state);
-        thread_resume(pthread_mach_thread_np(ptls2->system_id));
-    }
-    suspended_threads.len = 0;
-}
-
-// Suspend the thread and return `1` if the GC is running.
-// Otherwise return `0`
-static int jl_mach_gc_wait(jl_ptls_t ptls2,
-                           mach_port_t thread, int16_t tid)
-{
-    uv_mutex_lock(&safepoint_lock);
-    if (!jl_atomic_load_relaxed(&jl_gc_running)) {
-        // relaxed, since gets set to zero only while the safepoint_lock was held
-        // this means we can tell if GC is done before we got the message or
-        // the safepoint was enabled for SIGINT.
-        uv_mutex_unlock(&safepoint_lock);
-        return 0;
-    }
-    // Otherwise, set the gc state of the thread, suspend and record it
-    // TODO: TSAN will complain that it never saw the faulting task do an
-    // atomic release (it was in the kernel). And our attempt here does
-    // nothing, since we are a different thread, and it is not transitive).
-    //
-    // This also means we are not making this thread available for GC work.
-    // Eventually, we should probably release this signal to the original
-    // thread, (return KERN_FAILURE instead of KERN_SUCCESS) so that it
-    // triggers a SIGSEGV and gets handled by the usual codepath for unix.
-    int8_t gc_state = ptls2->gc_state;
-    jl_atomic_store_release(&ptls2->gc_state, JL_GC_STATE_WAITING);
-    uintptr_t item = tid | (((uintptr_t)gc_state) << 16);
-    arraylist_push(&suspended_threads, (void*)item);
-    thread_suspend(thread);
-    uv_mutex_unlock(&safepoint_lock);
-    return 1;
-}
 
 static mach_port_t segv_port = 0;
 
@@ -233,11 +190,24 @@ static void jl_throw_in_thread(int tid, mach_port_t thread, jl_value_t *exception)
 static void segv_handler(int sig, siginfo_t *info, void *context)
 {
     assert(sig == SIGSEGV || sig == SIGBUS);
+    jl_task_t *ct = jl_get_current_task();
     if (jl_get_safe_restore()) { // restarting jl_ or jl_unwind_stepn
-        jl_task_t *ct = jl_get_current_task();
         jl_ptls_t ptls = ct == NULL ? NULL : ct->ptls;
         jl_call_in_state(ptls, (host_thread_state_t*)jl_to_bt_context(context), &jl_sig_throw);
     }
+    else if (jl_addr_is_safepoint((uintptr_t)info->si_addr)) {
+        jl_set_gc_and_wait();
+        // Do not raise sigint on worker thread
+        if (jl_atomic_load_relaxed(&ct->tid) != 0)
+            return;
+        if (ct->ptls->defer_signal) {
+            jl_safepoint_defer_sigint();
+        }
+        else if (jl_safepoint_consume_sigint()) {
+            jl_clear_force_sigint();
+            jl_throw_in_ctx(ct, jl_interrupt_exception, sig, context);
+        }
+    }
     else {
         sigdie_handler(sig, info, context);
     }
@@ -288,8 +258,9 @@ kern_return_t catch_mach_exception_raise(
     uint64_t fault_addr = exc_state.__far;
 #endif
     if (jl_addr_is_safepoint(fault_addr)) {
-        if (jl_mach_gc_wait(ptls2, thread, tid))
-            return KERN_SUCCESS;
+        if (jl_atomic_load_acquire(&jl_gc_running))
+            // Fallback to POSIX signals and handle GC thread recruitment there
+            return KERN_FAILURE;
         if (ptls2->tid != 0)
             return KERN_SUCCESS;
         if (ptls2->defer_signal) {