-
-
Notifications
You must be signed in to change notification settings - Fork 5.6k
Description
Executive summary: I occasionally get segfaults when running an @threads
loop in Julia 1.7.1. It is a Heisenbug, and I'm still trying to get rr
working on my system.
I've been trying to pare down the example code, but this is definitely a Heisenbug: it often takes me several tries to confirm that the program still segfaults after a set of changes, so progress has been slow. I also have not been able to get an rr trace, since rr
refuses to run at all on my system at the moment; I'll work on getting it running in my spare time.
I'm running on an Arch Linux system (I confirmed that this still happens after a reboot with no updates), with Julia installed from the julia-bin
package, which downloads and unpacks the official binaries.
I see segfaults with 32 threads; they're much rarer with just 16, but can still occur after a very long time.
Current Code
I will continue to pare this down to get as minimal an example as possible.
using Base.Iterators
using Base.Threads
using Serialization
using Distributions
"""
    bin_data(data, lo, hi, nbins)

Map each value of `data` to the zero-based index of the equal-width bin it falls in.

The interval from `lo` to `hi` is split into bins of width `(hi - lo) / nbins`.
Indices are returned as `UInt8`; a value equal to `hi` maps to index `nbins`.
Values outside the interval are clamped to index `0` or `nbins`.

Throws `InexactError` if a resulting index cannot be represented as a `UInt8`
(for example when `nbins` exceeds 255 or the bin width is not finite).
"""
function bin_data(data, lo, hi, nbins)
    dx = (hi - lo) / nbins
    # Clamp while the indices are still floating point. Converting to `UInt8`
    # first would throw for any index below 0 or above 255, before the clamp
    # ever had a chance to bring it back into range.
    bins = clamp.(floor.((data .- lo) ./ dx), 0, nbins)
    return UInt8.(bins)
end
"""
    compress_data(data)

Return the size in bytes of `data` after compression with the external `xz` tool.

`data` is written to a fresh temporary file, which is compressed with
`xz -9e --keep --format=raw` into a sibling file carrying the `.xz` suffix. The
size of that compressed file is returned. Both files are removed before the
function returns, including when writing or compressing fails.

Requires `xz` to be available on the `PATH`.
"""
function compress_data(data)
    # `mktemp` creates the file atomically, so concurrent callers can never end
    # up with the same name and no lock is required. The file is deleted in the
    # `finally` below, hence `cleanup = false`.
    tmpfn, io = mktemp(; cleanup = false)
    xzfn = tmpfn * ".xz"
    try
        try
            write(io, data)
        finally
            # Close (and thereby flush) before xz reads the file.
            close(io)
        end
        run(
            pipeline(
                `xz -9e --keep --format=raw --suffix=.xz $(tmpfn)`,
                stdout = devnull,
                stderr = devnull,
            ),
        )
        return filesize(xzfn)
    finally
        # `force = true` tolerates a file that was never created.
        rm(xzfn; force = true)
        rm(tmpfn; force = true)
    end
end
"""
    compressed_size_bytes(data)

Return the compressed size of `data` in bytes, as reported by `compress_data`.
"""
function compressed_size_bytes(data)
    return compress_data(data)
end
"""
    compressed_size_bits(data)

Return the compressed size of `data` in bits: eight times the byte count
reported by `compress_data`.
"""
function compressed_size_bits(data)
    nbytes = compress_data(data)
    return 8 * nbytes
end
"""
    emission_times_exp(n, k, Γ)

Draw `n` random samples from `Exponential(η)`, where `η = (k + Γ) / (k * Γ)`.
"""
function emission_times_exp(n, k, Γ)
    rate_sum = k + Γ
    rate_product = k * Γ
    η = rate_sum / rate_product
    return rand(Exponential(η), n)
end
"""
    lose_data(lagtimes, γ)

Randomly thin a sequence of events and return the lags between the survivors.

`lagtimes` holds the waiting times between consecutive events and is asserted to
be non-negative. Events are visited in order and each one is kept when a fresh
`rand()` draw is below `γ`. The result is the vector of differences between the
cumulative times of consecutive kept events.
"""
function lose_data(lagtimes, γ)
    @assert(all(lagtimes .>= 0.0))
    event_times = cumsum(lagtimes)
    kept = Int[]
    for (i, _) in enumerate(lagtimes)
        # One draw per event, in order.
        rand() < γ && push!(kept, i)
    end
    return diff(event_times[kept])
end
# Parameter grid for the sweep.

# Number of samples generated per experiment.
ns = [10^5, 10^6, 10^7]
# ns = [1_000] # testing only

# Values swept for the two rate parameters k and Γ.
ks = [0.1, 0.5, 1.0, 5.0, 10.0]
Γs = [0.1, 0.5, 1.0, 5.0, 10.0]

# Probabilities with which each event is kept by `lose_data`.
γs = 0.1:0.1:1.0

# Number of independent trials.
ntrials = 1000

# Every (k, Γ) combination, flattened into a vector.
smrates = vec(collect(Iterators.product(ks, Γs)))
# Spin lock held while a result is written into a trial's dictionary.
l = SpinLock()

# Create the output directory before starting, so that a missing directory
# cannot make `serialize` fail after a trial's work has already been done.
mkpath("../data")

# Run `ntrials` trials across the available threads. Each trial sweeps every
# (k, Γ) pair in `smrates`, every sample count in `ns` and every keep
# probability in `γs`, then serializes its results to its own file.
@threads for trialnum = 1:ntrials
    # One dictionary per trial, keyed by the experiment parameters. Left untyped
    # so that the serialized files keep the same container type.
    data = Dict()
    for p in smrates
        (k, Γ) = p
        for n in ns
            # Uniform random lags stand in for `get_emission_dt(n, k, Γ)` and
            # `emission_times_exp(n, k, Γ)`.
            nm_times = 10.0 .* rand(n)
            mar_times = 10.0 .* rand(n)
            for γ in γs
                nm_lost = lose_data(nm_times, γ)
                mar_lost = lose_data(mar_times, γ)
                # Both data sets are binned over the same range.
                hi = max(maximum(nm_lost), maximum(mar_lost))
                @assert(all(nm_lost .>= 0.0))
                @assert(all(mar_lost .>= 0.0))
                nm_binned = bin_data(nm_lost, 0.0, hi, 100)
                mar_binned = bin_data(mar_lost, 0.0, hi, 100)
                # The sizes are computed but not stored; a fixed (1.0, 1.0) is
                # recorded for every experiment below.
                nm_size = compressed_size_bytes(nm_binned)
                mar_size = compressed_size_bytes(mar_binned)
                experiment_index = (n = n, k = k, Γ = Γ, γ = γ, trial = trialnum)
                # `lock(f, l)` acquires the lock before entering its own
                # try/finally, so the unlock only runs once the lock is held.
                lock(l) do
                    data[experiment_index] = (1.0, 1.0)
                end
            end
        end
    end
    serialize("../data/compression_sweep_$(trialnum).jls", data)
    @info "Finishing trial $(trialnum)"
end
Output of `versioninfo()`
Julia Version 1.7.1
Commit ac5cc99908 (2021-12-22 19:35 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: AMD Ryzen 9 5950X 16-Core Processor
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-12.0.1 (ORCJIT, znver3)
Stack trace from the one time in many that the program managed to print one before dying
The only line that originates in my code (line 65, reported at the top of the trace) is just the @threads
loop.
signal (11): Segmentation fault
in expression starting at /mnt/ssd-data/Experiments/02-2022/simple-photon-model/code/gen_paramsweep_segfault.jl:65
jl_uv_call_close_callback at /buildworker/worker/package_linux64/build/src/jl_uv.c:88 [inlined]
jl_uv_closeHandle at /buildworker/worker/package_linux64/build/src/jl_uv.c:111
uv__finish_close at /workspace/srcdir/libuv/src/unix/core.c:301
uv__run_closing_handles at /workspace/srcdir/libuv/src/unix/core.c:315
uv_run at /workspace/srcdir/libuv/src/unix/core.c:393
jl_process_events at /buildworker/worker/package_linux64/build/src/jl_uv.c:214
jl_task_get_next at /buildworker/worker/package_linux64/build/src/partr.c:528
poptask at ./task.jl:827
wait at ./task.jl:836
Exception: julia killed by signal segmentation fault (core dumped)
[tty 13], line 1: E:JULIA_NUM_THREADS=32 julia gen_paramsweep_segfault.jl
Let me know if any other information would be helpful!