-
Notifications
You must be signed in to change notification settings - Fork 253
Closed
Labels
bugSomething isn't workingSomething isn't workingupstreamSomebody else's problem.Somebody else's problem.
Description
We're seeing some compute-sanitizer failures in #772, which I can reproduce locally with the following snippet:
using CUDA
a = CuArray{Float32}(undef, 1)
@show pointer(a)
b = CuArray{Float32}(undef, 1)
@show pointer(b)
c = CuArray{Float64}(undef, 1)
@show pointer(c)
a .^ b
device_synchronize()
a .^ b
device_synchronize()
a .^ c
device_synchronize()
Full API trace.
cuDeviceGet(Base.RefValue{Int32}, 0)cuDriverGetVersion(Base.RefValue{Int32}) = CUDA_SUCCESS
1: 11020
cuDeviceGetCount(Base.RefValue{Int32}) = CUDA_SUCCESS
1: 1
cuDeviceGet(Base.RefValue{Int32}, 0) = CUDA_SUCCESS
1: 0
cuDeviceGet(Base.RefValue{Int32}, 0) = CUDA_SUCCESS
1: 0
cuDeviceGet(Base.RefValue{Int32}, 0) = CUDA_SUCCESS
1: 0
cuDeviceGet(Base.RefValue{Int32}, 0) = CUDA_SUCCESS
1: 0
cuDeviceGet(Base.RefValue{Int32}, 0) = CUDA_SUCCESS
1: 0
cuDeviceGet(Base.RefValue{Int32}, 0) = CUDA_SUCCESS
1: 0
cuDeviceGet(Base.RefValue{Int32}, 0) = CUDA_SUCCESS
1: 0
cuDeviceGetAttribute(Base.RefValue{Int32}, CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, CuDevice(0)) = CUDA_SUCCESS
1: 1
cuDeviceGetAttribute(Base.RefValue{Int32}, CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, CuDevice(0)) = CUDA_SUCCESS
1: 1
= CUDA_SUCCESS
1: 0
cuDeviceGetAttribute(Base.RefValue{Int32}, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, CuDevice(0)) = CUDA_SUCCESS
1: 7
cuDeviceGetAttribute(Base.RefValue{Int32}, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, CuDevice(0)) = CUDA_SUCCESS
1: 5
cuDevicePrimaryCtxRetain(Base.RefValue{Ptr{Nothing}}, CuDevice(0)) = CUDA_SUCCESS
1: Ptr{Nothing} @0x00000000026454f0
cuCtxSetCurrent(CuContext(0x00000000026454f0, instance f4adfa9382129458)) = CUDA_SUCCESS
cuCtxGetDevice(Base.RefValue{Int32}) = CUDA_SUCCESS
1: 0
cuMemGetInfo_v2(Base.RefValue{UInt64}, Base.RefValue{UInt64}) = CUDA_SUCCESS
1: 16784883712
2: 16908615680
cuDeviceGetAttribute(Base.RefValue{Int32}, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, CuDevice(0)) = CUDA_SUCCESS
1: 7
cuDeviceGetAttribute(Base.RefValue{Int32}, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, CuDevice(0)) = CUDA_SUCCESS
1: 5
cuStreamCreate(Base.RefValue{Ptr{Nothing}}, CU_STREAM_NON_BLOCKING) = CUDA_SUCCESS
1: Ptr{Nothing} @0x00000000027b5eb0
cuCtxGetCurrent(Base.RefValue{Ptr{Nothing}}) = CUDA_SUCCESS
1: Ptr{Nothing} @0x00000000026454f0
cuMemAllocAsync(Base.RefValue{CuPtr{Nothing}}, 4, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
1: CuPtr{Nothing}(0x0000000302000000)
pointer(a) = CuPtr{Float32}(0x0000000302000000)
cuMemAllocAsync(Base.RefValue{CuPtr{Nothing}}, 4, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
1: CuPtr{Nothing}(0x0000000302000200)
pointer(b) = CuPtr{Float32}(0x0000000302000200)
cuMemAllocAsync(Base.RefValue{CuPtr{Nothing}}, 8, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
1: CuPtr{Nothing}(0x0000000302000400)
pointer(c) = CuPtr{Float64}(0x0000000302000400)
cuMemAllocAsync(Base.RefValue{CuPtr{Nothing}}, 4, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
1: CuPtr{Nothing}(0x0000000302000600)
cuDeviceGetAttribute(Base.RefValue{Int32}, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, CuDevice(0)) = CUDA_SUCCESS
1: 7
cuDeviceGetAttribute(Base.RefValue{Int32}, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, CuDevice(0)) = CUDA_SUCCESS
1: 5
cuLinkCreate_v2(3, 3-element Vector{CUDA.CUjit_option_enum}, 3-element Vector{Ptr{Nothing}}, Base.RefValue{Ptr{Nothing}}) = CUDA_SUCCESS
4: Ptr{Nothing} @0x0000000005c146f0
cuCtxGetCurrent(Base.RefValue{Ptr{Nothing}}) = CUDA_SUCCESS
1: Ptr{Nothing} @0x00000000026454f0
cuLinkAddFile_v2(CuLink(0x0000000005c146f0, CuContext(0x00000000026454f0, instance f4adfa9382129458)), CU_JIT_INPUT_LIBRARY, /home/tim/Julia/depot/artifacts/53345f55f74b563606017c90cbafd05b61cd1768/lib/libcudadevrt.a, 0, Ptr{Nothing} @0x0000000000000000, Ptr{Nothing} @0x0000000000000000) = CUDA_SUCCESS
cuLinkAddData_v2(CuLink(0x0000000005c146f0, CuContext(0x00000000026454f0, instance f4adfa9382129458)), CU_JIT_INPUT_PTX, Ptr{Int8} @0x0000000005f05548, 16705, _Z27julia_broadcast_kernel_199715CuKernelContext13CuDeviceArrayI7Float32Li1ELi1EE11BroadcastedIv5TupleI5OneToI5Int64EE2__S3_I8ExtrudedIS0_IS1_Li1ELi1EES3_I4BoolES3_IS5_EES7_IS0_IS1_Li1ELi1EES3_IS8_ES3_IS5_EEEES5_, 0, Ptr{Nothing} @0x0000000000000000, Ptr{Nothing} @0x0000000000000000) = CUDA_SUCCESS
cuLinkComplete(CuLink(0x0000000005c146f0, CuContext(0x00000000026454f0, instance f4adfa9382129458)), Base.RefValue{Ptr{Nothing}}, Base.RefValue{UInt64}) = CUDA_SUCCESS
2: Ptr{Nothing} @0x000000000606ebc8
3: 210088
cuModuleLoadDataEx(Base.RefValue{Ptr{Nothing}}, Ptr{UInt8} @0x000000000606ebc8, 3, 3-element Vector{CUDA.CUjit_option_enum}, 3-element Vector{Ptr{Nothing}}) = CUDA_SUCCESS
1: Ptr{Nothing} @0x0000000005dbcc90
cuCtxGetCurrent(Base.RefValue{Ptr{Nothing}}) = CUDA_SUCCESS
1: Ptr{Nothing} @0x00000000026454f0
cuModuleGetFunction(Base.RefValue{Ptr{Nothing}}, CuModule(Ptr{Nothing} @0x0000000005dbcc90, CuContext(0x00000000026454f0, instance f4adfa9382129458)), _Z27julia_broadcast_kernel_199715CuKernelContext13CuDeviceArrayI7Float32Li1ELi1EE11BroadcastedIv5TupleI5OneToI5Int64EE2__S3_I8ExtrudedIS0_IS1_Li1ELi1EES3_I4BoolES3_IS5_EES7_IS0_IS1_Li1ELi1EES3_IS8_ES3_IS5_EEEES5_) = CUDA_SUCCESS
1: Ptr{Nothing} @0x0000000005c41750
cuModuleGetGlobal_v2(Base.RefValue{CuPtr{Nothing}}, Base.RefValue{UInt64}, CuModule(Ptr{Nothing} @0x0000000005dbcc90, CuContext(0x00000000026454f0, instance f4adfa9382129458)), exception_flag) = CUDA_SUCCESS
1: CuPtr{Nothing}(0x00007fda06c0f800)
2: 8
cuMemHostAlloc(Base.RefValue{Ptr{Nothing}}, 8, 2) = CUDA_SUCCESS
1: Ptr{Nothing} @0x00007fda01a00000
cuMemHostGetDevicePointer_v2(Base.RefValue{CuPtr{Nothing}}, Ptr{Nothing} @0x00007fda01a00000, 0) = CUDA_SUCCESS
1: CuPtr{Nothing}(0x00007fda01a00000)
cuMemcpyHtoDAsync_v2(CuGlobal{Ptr{Nothing}}(DeviceBuffer(8 bytes at 0x00007fda06c0f800)), Base.RefValue{Ptr{Nothing}}, 8, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
2: Ptr{Nothing} @0x00007fda01a00000
cuOccupancyMaxPotentialBlockSize(Base.RefValue{Int32}, Base.RefValue{Int32}, CuFunction(Ptr{Nothing} @0x0000000005c41750, CuModule(Ptr{Nothing} @0x0000000005dbcc90, CuContext(0x00000000026454f0, instance f4adfa9382129458))), Ptr{Nothing} @0x0000000000000000, 0, 256) = CUDA_SUCCESS
1: 192
2: 256
cuLaunchKernel(CuFunction(Ptr{Nothing} @0x0000000005c41750, CuModule(Ptr{Nothing} @0x0000000005dbcc90, CuContext(0x00000000026454f0, instance f4adfa9382129458))), 1, 1, 1, 1, 1, 1, 0, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458)), 3-element Vector{Ptr{Nothing}}, Ptr{Nothing} @0x0000000000000000) = CUDA_SUCCESS
cuCtxSynchronize() = CUDA_SUCCESS
cuMemAllocAsync(Base.RefValue{CuPtr{Nothing}}, 4, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
1: CuPtr{Nothing}(0x0000000302000800)
cuOccupancyMaxPotentialBlockSize(Base.RefValue{Int32}, Base.RefValue{Int32}, CuFunction(Ptr{Nothing} @0x0000000005c41750, CuModule(Ptr{Nothing} @0x0000000005dbcc90, CuContext(0x00000000026454f0, instance f4adfa9382129458))), Ptr{Nothing} @0x0000000000000000, 0, 256) = CUDA_SUCCESS
1: 192
2: 256
cuLaunchKernel(CuFunction(Ptr{Nothing} @0x0000000005c41750, CuModule(Ptr{Nothing} @0x0000000005dbcc90, CuContext(0x00000000026454f0, instance f4adfa9382129458))), 1, 1, 1, 1, 1, 1, 0, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458)), 3-element Vector{Ptr{Nothing}}, Ptr{Nothing} @0x0000000000000000) = CUDA_SUCCESS
cuCtxSynchronize() = CUDA_SUCCESS
cuMemFree_v2(DeviceBuffer(4 bytes at 0x0000000302000800)) = CUDA_SUCCESS
cuCtxPushCurrent_v2(CuContext(0x00000000026454f0, instance f4adfa9382129458)) = CUDA_SUCCESS
cuLinkDestroy(CuLink(0x0000000005c146f0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
cuCtxPopCurrent_v2(Base.RefValue{Ptr{Nothing}}) = CUDA_SUCCESS
1: Ptr{Nothing} @0x00000000026454f0
cuMemAllocAsync(Base.RefValue{CuPtr{Nothing}}, 8, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
1: CuPtr{Nothing}(0x0000000302000800)
cuLinkCreate_v2(3, 3-element Vector{CUDA.CUjit_option_enum}, 3-element Vector{Ptr{Nothing}}, Base.RefValue{Ptr{Nothing}}) = CUDA_SUCCESS
4: Ptr{Nothing} @0x0000000001da7520
cuCtxGetCurrent(Base.RefValue{Ptr{Nothing}}) = CUDA_SUCCESS
1: Ptr{Nothing} @0x00000000026454f0
cuLinkAddFile_v2(CuLink(0x0000000001da7520, CuContext(0x00000000026454f0, instance f4adfa9382129458)), CU_JIT_INPUT_LIBRARY, /home/tim/Julia/depot/artifacts/53345f55f74b563606017c90cbafd05b61cd1768/lib/libcudadevrt.a, 0, Ptr{Nothing} @0x0000000000000000, Ptr{Nothing} @0x0000000000000000) = CUDA_SUCCESS
cuLinkAddData_v2(CuLink(0x0000000001da7520, CuContext(0x00000000026454f0, instance f4adfa9382129458)), CU_JIT_INPUT_PTX, Ptr{Int8} @0x0000000006d68248, 21912, _Z27julia_broadcast_kernel_344815CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedIv5TupleI5OneToI5Int64EE2__S3_I8ExtrudedIS0_I7Float32Li1ELi1EES3_I4BoolES3_IS5_EES7_IS0_IS1_Li1ELi1EES3_IS9_ES3_IS5_EEEES5_, 0, Ptr{Nothing} @0x0000000000000000, Ptr{Nothing} @0x0000000000000000) = CUDA_SUCCESS
cuLinkComplete(CuLink(0x0000000001da7520, CuContext(0x00000000026454f0, instance f4adfa9382129458)), Base.RefValue{Ptr{Nothing}}, Base.RefValue{UInt64}) = CUDA_SUCCESS
2: Ptr{Nothing} @0x00000000069f3b68
3: 220016
cuModuleLoadDataEx(Base.RefValue{Ptr{Nothing}}, Ptr{UInt8} @0x00000000069f3b68, 3, 3-element Vector{CUDA.CUjit_option_enum}, 3-element Vector{Ptr{Nothing}}) = CUDA_SUCCESS
1: Ptr{Nothing} @0x00000000061f3310
cuCtxGetCurrent(Base.RefValue{Ptr{Nothing}}) = CUDA_SUCCESS
1: Ptr{Nothing} @0x00000000026454f0
cuModuleGetFunction(Base.RefValue{Ptr{Nothing}}, CuModule(Ptr{Nothing} @0x00000000061f3310, CuContext(0x00000000026454f0, instance f4adfa9382129458)), _Z27julia_broadcast_kernel_344815CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedIv5TupleI5OneToI5Int64EE2__S3_I8ExtrudedIS0_I7Float32Li1ELi1EES3_I4BoolES3_IS5_EES7_IS0_IS1_Li1ELi1EES3_IS9_ES3_IS5_EEEES5_) = CUDA_SUCCESS
1: Ptr{Nothing} @0x00000000072a2f80
cuModuleGetGlobal_v2(Base.RefValue{CuPtr{Nothing}}, Base.RefValue{UInt64}, CuModule(Ptr{Nothing} @0x00000000061f3310, CuContext(0x00000000026454f0, instance f4adfa9382129458)), exception_flag) = CUDA_SUCCESS
1: CuPtr{Nothing}(0x00007fda06c1fa00)
2: 8
cuMemHostAlloc(Base.RefValue{Ptr{Nothing}}, 8, 2) = CUDA_SUCCESS
1: Ptr{Nothing} @0x00007fda01a00200
cuMemHostGetDevicePointer_v2(Base.RefValue{CuPtr{Nothing}}, Ptr{Nothing} @0x00007fda01a00000, 0) = CUDA_SUCCESS
1: CuPtr{Nothing}(0x00007fda01a00000)
cuMemcpyHtoDAsync_v2(CuGlobal{Ptr{Nothing}}(DeviceBuffer(8 bytes at 0x00007fda06c1fa00)), Base.RefValue{Ptr{Nothing}}, 8, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
2: Ptr{Nothing} @0x00007fda01a00000
cuMemFree_v2(DeviceBuffer(4 bytes at 0x0000000302000600)) = CUDA_SUCCESS
cuOccupancyMaxPotentialBlockSize(Base.RefValue{Int32}, Base.RefValue{Int32}, CuFunction(Ptr{Nothing} @0x00000000072a2f80, CuModule(Ptr{Nothing} @0x00000000061f3310, CuContext(0x00000000026454f0, instance f4adfa9382129458))), Ptr{Nothing} @0x0000000000000000, 0, 256) = CUDA_SUCCESS
1: 192
2: 256
cuLaunchKernel(CuFunction(Ptr{Nothing} @0x00000000072a2f80, CuModule(Ptr{Nothing} @0x00000000061f3310, CuContext(0x00000000026454f0, instance f4adfa9382129458))), 1, 1, 1, 1, 1, 1, 0, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458)), 3-element Vector{Ptr{Nothing}}, Ptr{Nothing} @0x0000000000000000) = CUDA_SUCCESS
cuCtxSynchronize() = CUDA_ERROR_LAUNCH_FAILED
========= Invalid __global__ write of size 8 bytes
========= at 0x9a0 in julia_broadcast_kernel_3448(CuKernelContext,CuDeviceArray<Float64,int=1,int=1>,Broadcasted<void,Tuple<OneTo<Int64>>,__,Broadcasted<Extruded<CuDeviceArray<Float32,int=1,int=1>,Broadcasted<Bool>,Broadcasted<OneTo>>,OneTo<Int64,CuDeviceArray<Float64,int=1,int=1>,Broadcasted<__>,Broadcasted<OneTo>>>>,OneTo)
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x302000800 is out of bounds
=========
Relevant parts:
cuMemAllocAsync(Base.RefValue{CuPtr{Nothing}}, 4, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
1: CuPtr{Nothing}(0x0000000302000800)
cuLaunchKernel(CuFunction(Ptr{Nothing} @0x0000000005c41750, CuModule(Ptr{Nothing} @0x0000000005dbcc90, CuContext(0x00000000026454f0, instance f4adfa9382129458))), 1, 1, 1, 1, 1, 1, 0, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458)), 3-element Vector{Ptr{Nothing}}, Ptr{Nothing} @0x0000000000000000) = CUDA_SUCCESS
cuCtxSynchronize() = CUDA_SUCCESS
cuMemFree_v2(DeviceBuffer(4 bytes at 0x0000000302000800)) = CUDA_SUCCESS
cuMemAllocAsync(Base.RefValue{CuPtr{Nothing}}, 8, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
1: CuPtr{Nothing}(0x0000000302000800)
cuLaunchKernel(CuFunction(Ptr{Nothing} @0x00000000072a2f80, CuModule(Ptr{Nothing} @0x00000000061f3310, CuContext(0x00000000026454f0, instance f4adfa9382129458))), 1, 1, 1, 1, 1, 1, 0, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458)), 3-element Vector{Ptr{Nothing}}, Ptr{Nothing} @0x0000000000000000) = CUDA_SUCCESS
cuCtxSynchronize() = CUDA_ERROR_LAUNCH_FAILED
========= Invalid __global__ write of size 8 bytes
========= at 0x9a0 in julia_broadcast_kernel_3448(CuKernelContext,CuDeviceArray<Float64,int=1,int=1>,Broadcasted<void,Tuple<OneTo<Int64>>,__,Broadcasted<Extruded<CuDeviceArray<Float32,int=1,int=1>,Broadcasted<Bool>,Broadcasted<OneTo>>,OneTo<Int64,CuDeviceArray<Float64,int=1,int=1>,Broadcasted<__>,Broadcasted<OneTo>>>>,OneTo)
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x302000800 is out of bounds
=========
So we're allocating and freeing a buffer at 0x302000800, after which a new allocation requests returns 0x302000800 again. All of this happens asynchronously on the same stream, but compute-sanitizer seems to think the pointer is invalid. I think this is a compute-sanitizer bug?
Metadata
Metadata
Assignees
Labels
bugSomething isn't workingSomething isn't workingupstreamSomebody else's problem.Somebody else's problem.