Skip to content

compute-sanitizer out-of-bounds failure #780

@maleadt

Description

@maleadt

We're seeing some compute-sanitizer failures in #772, which I can reproduce locally with the following snippet:

using CUDA

a = CuArray{Float32}(undef, 1)
@show pointer(a)
b = CuArray{Float32}(undef, 1)
@show pointer(b)
c = CuArray{Float64}(undef, 1)
@show pointer(c)

a .^ b
device_synchronize()
a .^ b
device_synchronize()

a .^ c
device_synchronize()
Full API trace.
cuDeviceGet(Base.RefValue{Int32}, 0)cuDriverGetVersion(Base.RefValue{Int32}) = CUDA_SUCCESS
 1: 11020
cuDeviceGetCount(Base.RefValue{Int32}) = CUDA_SUCCESS
 1: 1
cuDeviceGet(Base.RefValue{Int32}, 0) = CUDA_SUCCESS
 1: 0
cuDeviceGet(Base.RefValue{Int32}, 0) = CUDA_SUCCESS
 1: 0
cuDeviceGet(Base.RefValue{Int32}, 0) = CUDA_SUCCESS
 1: 0
cuDeviceGet(Base.RefValue{Int32}, 0) = CUDA_SUCCESS
 1: 0
cuDeviceGet(Base.RefValue{Int32}, 0) = CUDA_SUCCESS
 1: 0
cuDeviceGet(Base.RefValue{Int32}, 0) = CUDA_SUCCESS
 1: 0
cuDeviceGet(Base.RefValue{Int32}, 0) = CUDA_SUCCESS
 1: 0
cuDeviceGetAttribute(Base.RefValue{Int32}, CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, CuDevice(0)) = CUDA_SUCCESS
 1: 1
cuDeviceGetAttribute(Base.RefValue{Int32}, CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, CuDevice(0)) = CUDA_SUCCESS
 1: 1
 = CUDA_SUCCESS
 1: 0
cuDeviceGetAttribute(Base.RefValue{Int32}, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, CuDevice(0)) = CUDA_SUCCESS
 1: 7
cuDeviceGetAttribute(Base.RefValue{Int32}, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, CuDevice(0)) = CUDA_SUCCESS
 1: 5
cuDevicePrimaryCtxRetain(Base.RefValue{Ptr{Nothing}}, CuDevice(0)) = CUDA_SUCCESS
 1: Ptr{Nothing} @0x00000000026454f0
cuCtxSetCurrent(CuContext(0x00000000026454f0, instance f4adfa9382129458)) = CUDA_SUCCESS
cuCtxGetDevice(Base.RefValue{Int32}) = CUDA_SUCCESS
 1: 0
cuMemGetInfo_v2(Base.RefValue{UInt64}, Base.RefValue{UInt64}) = CUDA_SUCCESS
 1: 16784883712
 2: 16908615680
cuDeviceGetAttribute(Base.RefValue{Int32}, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, CuDevice(0)) = CUDA_SUCCESS
 1: 7
cuDeviceGetAttribute(Base.RefValue{Int32}, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, CuDevice(0)) = CUDA_SUCCESS
 1: 5
cuStreamCreate(Base.RefValue{Ptr{Nothing}}, CU_STREAM_NON_BLOCKING) = CUDA_SUCCESS
 1: Ptr{Nothing} @0x00000000027b5eb0
cuCtxGetCurrent(Base.RefValue{Ptr{Nothing}}) = CUDA_SUCCESS
 1: Ptr{Nothing} @0x00000000026454f0
cuMemAllocAsync(Base.RefValue{CuPtr{Nothing}}, 4, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
 1: CuPtr{Nothing}(0x0000000302000000)
pointer(a) = CuPtr{Float32}(0x0000000302000000)
cuMemAllocAsync(Base.RefValue{CuPtr{Nothing}}, 4, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
 1: CuPtr{Nothing}(0x0000000302000200)
pointer(b) = CuPtr{Float32}(0x0000000302000200)
cuMemAllocAsync(Base.RefValue{CuPtr{Nothing}}, 8, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
 1: CuPtr{Nothing}(0x0000000302000400)
pointer(c) = CuPtr{Float64}(0x0000000302000400)
cuMemAllocAsync(Base.RefValue{CuPtr{Nothing}}, 4, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
 1: CuPtr{Nothing}(0x0000000302000600)
cuDeviceGetAttribute(Base.RefValue{Int32}, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, CuDevice(0)) = CUDA_SUCCESS
 1: 7
cuDeviceGetAttribute(Base.RefValue{Int32}, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, CuDevice(0)) = CUDA_SUCCESS
 1: 5
cuLinkCreate_v2(3, 3-element Vector{CUDA.CUjit_option_enum}, 3-element Vector{Ptr{Nothing}}, Base.RefValue{Ptr{Nothing}}) = CUDA_SUCCESS
 4: Ptr{Nothing} @0x0000000005c146f0
cuCtxGetCurrent(Base.RefValue{Ptr{Nothing}}) = CUDA_SUCCESS
 1: Ptr{Nothing} @0x00000000026454f0
cuLinkAddFile_v2(CuLink(0x0000000005c146f0, CuContext(0x00000000026454f0, instance f4adfa9382129458)), CU_JIT_INPUT_LIBRARY, /home/tim/Julia/depot/artifacts/53345f55f74b563606017c90cbafd05b61cd1768/lib/libcudadevrt.a, 0, Ptr{Nothing} @0x0000000000000000, Ptr{Nothing} @0x0000000000000000) = CUDA_SUCCESS
cuLinkAddData_v2(CuLink(0x0000000005c146f0, CuContext(0x00000000026454f0, instance f4adfa9382129458)), CU_JIT_INPUT_PTX, Ptr{Int8} @0x0000000005f05548, 16705, _Z27julia_broadcast_kernel_199715CuKernelContext13CuDeviceArrayI7Float32Li1ELi1EE11BroadcastedIv5TupleI5OneToI5Int64EE2__S3_I8ExtrudedIS0_IS1_Li1ELi1EES3_I4BoolES3_IS5_EES7_IS0_IS1_Li1ELi1EES3_IS8_ES3_IS5_EEEES5_, 0, Ptr{Nothing} @0x0000000000000000, Ptr{Nothing} @0x0000000000000000) = CUDA_SUCCESS
cuLinkComplete(CuLink(0x0000000005c146f0, CuContext(0x00000000026454f0, instance f4adfa9382129458)), Base.RefValue{Ptr{Nothing}}, Base.RefValue{UInt64}) = CUDA_SUCCESS
 2: Ptr{Nothing} @0x000000000606ebc8
 3: 210088
cuModuleLoadDataEx(Base.RefValue{Ptr{Nothing}}, Ptr{UInt8} @0x000000000606ebc8, 3, 3-element Vector{CUDA.CUjit_option_enum}, 3-element Vector{Ptr{Nothing}}) = CUDA_SUCCESS
 1: Ptr{Nothing} @0x0000000005dbcc90
cuCtxGetCurrent(Base.RefValue{Ptr{Nothing}}) = CUDA_SUCCESS
 1: Ptr{Nothing} @0x00000000026454f0
cuModuleGetFunction(Base.RefValue{Ptr{Nothing}}, CuModule(Ptr{Nothing} @0x0000000005dbcc90, CuContext(0x00000000026454f0, instance f4adfa9382129458)), _Z27julia_broadcast_kernel_199715CuKernelContext13CuDeviceArrayI7Float32Li1ELi1EE11BroadcastedIv5TupleI5OneToI5Int64EE2__S3_I8ExtrudedIS0_IS1_Li1ELi1EES3_I4BoolES3_IS5_EES7_IS0_IS1_Li1ELi1EES3_IS8_ES3_IS5_EEEES5_) = CUDA_SUCCESS
 1: Ptr{Nothing} @0x0000000005c41750
cuModuleGetGlobal_v2(Base.RefValue{CuPtr{Nothing}}, Base.RefValue{UInt64}, CuModule(Ptr{Nothing} @0x0000000005dbcc90, CuContext(0x00000000026454f0, instance f4adfa9382129458)), exception_flag) = CUDA_SUCCESS
 1: CuPtr{Nothing}(0x00007fda06c0f800)
 2: 8
cuMemHostAlloc(Base.RefValue{Ptr{Nothing}}, 8, 2) = CUDA_SUCCESS
 1: Ptr{Nothing} @0x00007fda01a00000
cuMemHostGetDevicePointer_v2(Base.RefValue{CuPtr{Nothing}}, Ptr{Nothing} @0x00007fda01a00000, 0) = CUDA_SUCCESS
 1: CuPtr{Nothing}(0x00007fda01a00000)
cuMemcpyHtoDAsync_v2(CuGlobal{Ptr{Nothing}}(DeviceBuffer(8 bytes at 0x00007fda06c0f800)), Base.RefValue{Ptr{Nothing}}, 8, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
 2: Ptr{Nothing} @0x00007fda01a00000
cuOccupancyMaxPotentialBlockSize(Base.RefValue{Int32}, Base.RefValue{Int32}, CuFunction(Ptr{Nothing} @0x0000000005c41750, CuModule(Ptr{Nothing} @0x0000000005dbcc90, CuContext(0x00000000026454f0, instance f4adfa9382129458))), Ptr{Nothing} @0x0000000000000000, 0, 256) = CUDA_SUCCESS
 1: 192
 2: 256
cuLaunchKernel(CuFunction(Ptr{Nothing} @0x0000000005c41750, CuModule(Ptr{Nothing} @0x0000000005dbcc90, CuContext(0x00000000026454f0, instance f4adfa9382129458))), 1, 1, 1, 1, 1, 1, 0, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458)), 3-element Vector{Ptr{Nothing}}, Ptr{Nothing} @0x0000000000000000) = CUDA_SUCCESS
cuCtxSynchronize() = CUDA_SUCCESS
cuMemAllocAsync(Base.RefValue{CuPtr{Nothing}}, 4, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
 1: CuPtr{Nothing}(0x0000000302000800)
cuOccupancyMaxPotentialBlockSize(Base.RefValue{Int32}, Base.RefValue{Int32}, CuFunction(Ptr{Nothing} @0x0000000005c41750, CuModule(Ptr{Nothing} @0x0000000005dbcc90, CuContext(0x00000000026454f0, instance f4adfa9382129458))), Ptr{Nothing} @0x0000000000000000, 0, 256) = CUDA_SUCCESS
 1: 192
 2: 256
cuLaunchKernel(CuFunction(Ptr{Nothing} @0x0000000005c41750, CuModule(Ptr{Nothing} @0x0000000005dbcc90, CuContext(0x00000000026454f0, instance f4adfa9382129458))), 1, 1, 1, 1, 1, 1, 0, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458)), 3-element Vector{Ptr{Nothing}}, Ptr{Nothing} @0x0000000000000000) = CUDA_SUCCESS
cuCtxSynchronize() = CUDA_SUCCESS
cuMemFree_v2(DeviceBuffer(4 bytes at 0x0000000302000800)) = CUDA_SUCCESS
cuCtxPushCurrent_v2(CuContext(0x00000000026454f0, instance f4adfa9382129458)) = CUDA_SUCCESS
cuLinkDestroy(CuLink(0x0000000005c146f0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
cuCtxPopCurrent_v2(Base.RefValue{Ptr{Nothing}}) = CUDA_SUCCESS
 1: Ptr{Nothing} @0x00000000026454f0
cuMemAllocAsync(Base.RefValue{CuPtr{Nothing}}, 8, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
 1: CuPtr{Nothing}(0x0000000302000800)
cuLinkCreate_v2(3, 3-element Vector{CUDA.CUjit_option_enum}, 3-element Vector{Ptr{Nothing}}, Base.RefValue{Ptr{Nothing}}) = CUDA_SUCCESS
 4: Ptr{Nothing} @0x0000000001da7520
cuCtxGetCurrent(Base.RefValue{Ptr{Nothing}}) = CUDA_SUCCESS
 1: Ptr{Nothing} @0x00000000026454f0
cuLinkAddFile_v2(CuLink(0x0000000001da7520, CuContext(0x00000000026454f0, instance f4adfa9382129458)), CU_JIT_INPUT_LIBRARY, /home/tim/Julia/depot/artifacts/53345f55f74b563606017c90cbafd05b61cd1768/lib/libcudadevrt.a, 0, Ptr{Nothing} @0x0000000000000000, Ptr{Nothing} @0x0000000000000000) = CUDA_SUCCESS
cuLinkAddData_v2(CuLink(0x0000000001da7520, CuContext(0x00000000026454f0, instance f4adfa9382129458)), CU_JIT_INPUT_PTX, Ptr{Int8} @0x0000000006d68248, 21912, _Z27julia_broadcast_kernel_344815CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedIv5TupleI5OneToI5Int64EE2__S3_I8ExtrudedIS0_I7Float32Li1ELi1EES3_I4BoolES3_IS5_EES7_IS0_IS1_Li1ELi1EES3_IS9_ES3_IS5_EEEES5_, 0, Ptr{Nothing} @0x0000000000000000, Ptr{Nothing} @0x0000000000000000) = CUDA_SUCCESS
cuLinkComplete(CuLink(0x0000000001da7520, CuContext(0x00000000026454f0, instance f4adfa9382129458)), Base.RefValue{Ptr{Nothing}}, Base.RefValue{UInt64}) = CUDA_SUCCESS
 2: Ptr{Nothing} @0x00000000069f3b68
 3: 220016
cuModuleLoadDataEx(Base.RefValue{Ptr{Nothing}}, Ptr{UInt8} @0x00000000069f3b68, 3, 3-element Vector{CUDA.CUjit_option_enum}, 3-element Vector{Ptr{Nothing}}) = CUDA_SUCCESS
 1: Ptr{Nothing} @0x00000000061f3310
cuCtxGetCurrent(Base.RefValue{Ptr{Nothing}}) = CUDA_SUCCESS
 1: Ptr{Nothing} @0x00000000026454f0
cuModuleGetFunction(Base.RefValue{Ptr{Nothing}}, CuModule(Ptr{Nothing} @0x00000000061f3310, CuContext(0x00000000026454f0, instance f4adfa9382129458)), _Z27julia_broadcast_kernel_344815CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedIv5TupleI5OneToI5Int64EE2__S3_I8ExtrudedIS0_I7Float32Li1ELi1EES3_I4BoolES3_IS5_EES7_IS0_IS1_Li1ELi1EES3_IS9_ES3_IS5_EEEES5_) = CUDA_SUCCESS
 1: Ptr{Nothing} @0x00000000072a2f80
cuModuleGetGlobal_v2(Base.RefValue{CuPtr{Nothing}}, Base.RefValue{UInt64}, CuModule(Ptr{Nothing} @0x00000000061f3310, CuContext(0x00000000026454f0, instance f4adfa9382129458)), exception_flag) = CUDA_SUCCESS
 1: CuPtr{Nothing}(0x00007fda06c1fa00)
 2: 8
cuMemHostAlloc(Base.RefValue{Ptr{Nothing}}, 8, 2) = CUDA_SUCCESS
 1: Ptr{Nothing} @0x00007fda01a00200
cuMemHostGetDevicePointer_v2(Base.RefValue{CuPtr{Nothing}}, Ptr{Nothing} @0x00007fda01a00000, 0) = CUDA_SUCCESS
 1: CuPtr{Nothing}(0x00007fda01a00000)
cuMemcpyHtoDAsync_v2(CuGlobal{Ptr{Nothing}}(DeviceBuffer(8 bytes at 0x00007fda06c1fa00)), Base.RefValue{Ptr{Nothing}}, 8, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
 2: Ptr{Nothing} @0x00007fda01a00000
cuMemFree_v2(DeviceBuffer(4 bytes at 0x0000000302000600)) = CUDA_SUCCESS
cuOccupancyMaxPotentialBlockSize(Base.RefValue{Int32}, Base.RefValue{Int32}, CuFunction(Ptr{Nothing} @0x00000000072a2f80, CuModule(Ptr{Nothing} @0x00000000061f3310, CuContext(0x00000000026454f0, instance f4adfa9382129458))), Ptr{Nothing} @0x0000000000000000, 0, 256) = CUDA_SUCCESS
 1: 192
 2: 256
cuLaunchKernel(CuFunction(Ptr{Nothing} @0x00000000072a2f80, CuModule(Ptr{Nothing} @0x00000000061f3310, CuContext(0x00000000026454f0, instance f4adfa9382129458))), 1, 1, 1, 1, 1, 1, 0, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458)), 3-element Vector{Ptr{Nothing}}, Ptr{Nothing} @0x0000000000000000) = CUDA_SUCCESS
cuCtxSynchronize() = CUDA_ERROR_LAUNCH_FAILED

========= Invalid __global__ write of size 8 bytes
=========     at 0x9a0 in julia_broadcast_kernel_3448(CuKernelContext,CuDeviceArray<Float64,int=1,int=1>,Broadcasted<void,Tuple<OneTo<Int64>>,__,Broadcasted<Extruded<CuDeviceArray<Float32,int=1,int=1>,Broadcasted<Bool>,Broadcasted<OneTo>>,OneTo<Int64,CuDeviceArray<Float64,int=1,int=1>,Broadcasted<__>,Broadcasted<OneTo>>>>,OneTo)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x302000800 is out of bounds
========= 

Relevant parts:

cuMemAllocAsync(Base.RefValue{CuPtr{Nothing}}, 4, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
 1: CuPtr{Nothing}(0x0000000302000800)
cuLaunchKernel(CuFunction(Ptr{Nothing} @0x0000000005c41750, CuModule(Ptr{Nothing} @0x0000000005dbcc90, CuContext(0x00000000026454f0, instance f4adfa9382129458))), 1, 1, 1, 1, 1, 1, 0, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458)), 3-element Vector{Ptr{Nothing}}, Ptr{Nothing} @0x0000000000000000) = CUDA_SUCCESS
cuCtxSynchronize() = CUDA_SUCCESS
cuMemFree_v2(DeviceBuffer(4 bytes at 0x0000000302000800)) = CUDA_SUCCESS

cuMemAllocAsync(Base.RefValue{CuPtr{Nothing}}, 8, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458))) = CUDA_SUCCESS
 1: CuPtr{Nothing}(0x0000000302000800)
cuLaunchKernel(CuFunction(Ptr{Nothing} @0x00000000072a2f80, CuModule(Ptr{Nothing} @0x00000000061f3310, CuContext(0x00000000026454f0, instance f4adfa9382129458))), 1, 1, 1, 1, 1, 1, 0, CuStream(0x00000000027b5eb0, CuContext(0x00000000026454f0, instance f4adfa9382129458)), 3-element Vector{Ptr{Nothing}}, Ptr{Nothing} @0x0000000000000000) = CUDA_SUCCESS

cuCtxSynchronize() = CUDA_ERROR_LAUNCH_FAILED
========= Invalid __global__ write of size 8 bytes
=========     at 0x9a0 in julia_broadcast_kernel_3448(CuKernelContext,CuDeviceArray<Float64,int=1,int=1>,Broadcasted<void,Tuple<OneTo<Int64>>,__,Broadcasted<Extruded<CuDeviceArray<Float32,int=1,int=1>,Broadcasted<Bool>,Broadcasted<OneTo>>,OneTo<Int64,CuDeviceArray<Float64,int=1,int=1>,Broadcasted<__>,Broadcasted<OneTo>>>>,OneTo)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x302000800 is out of bounds
========= 

So we're allocating and freeing a buffer at 0x302000800, after which a new allocation requests returns 0x302000800 again. All of this happens asynchronously on the same stream, but compute-sanitizer seems to think the pointer is invalid. I think this is a compute-sanitizer bug?

Metadata

Metadata

Assignees

No one assigned

    Labels

    bugSomething isn't workingupstreamSomebody else's problem.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions