diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index a6b2ce6b4..cfb680bbd 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -405,7 +405,20 @@ function cleanup_proc(state, p, log_sink) delete!(WORKER_MONITOR_CHANS[wid], state.uid) end end - remote_do(_cleanup_proc, wid, state.uid, log_sink) + + # If the worker process is still alive, clean it up + if wid in workers() + try + remotecall_wait(_cleanup_proc, wid, state.uid, log_sink) + catch ex + # We allow ProcessExitedException's, which means that the worker + # shutdown halfway through cleanup. + if !(ex isa ProcessExitedException) + rethrow() + end + end + end + timespan_finish(ctx, :cleanup_proc, (;worker=wid), nothing) end diff --git a/src/sch/dynamic.jl b/src/sch/dynamic.jl index 7c52bf748..df78a1dd1 100644 --- a/src/sch/dynamic.jl +++ b/src/sch/dynamic.jl @@ -33,9 +33,18 @@ function safepoint(state) if state.halt.set # Force dynamic thunks and listeners to terminate for (inp_chan,out_chan) in values(state.worker_chans) - close(inp_chan) - close(out_chan) + # Closing these channels will fail if the worker died, which we + # allow. + try + close(inp_chan) + close(out_chan) + catch ex + if !(ex isa ProcessExitedException) + rethrow() + end + end end + # Throw out of scheduler throw(SchedulerHaltedException()) end