Skip to content

Commit eee494f

Browse files
committed
common/cuda: Fix near-hang when remote side has exited
Ignore errors caused by the remote side having exited when closing CUDA IPC mappings.

See openmpi/ompi#3244.

Signed-off-by: Sylvain Jeaugey <[email protected]>
1 parent ebc4eb3 commit eee494f

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

opal/mca/common/cuda/common_cuda.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1157,10 +1157,10 @@ int cuda_closememhandle(void *reg_data, mca_rcache_base_registration_t *reg)
11571157
if (ctx_ok) {
11581158
result = cuFunc.cuIpcCloseMemHandle((CUdeviceptr)cuda_reg->base.alloc_base);
11591159
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1160-
opal_show_help("help-mpi-common-cuda.txt", "cuIpcCloseMemHandle failed",
1161-
true, result, cuda_reg->base.alloc_base);
1162-
opal_output(0, "Sleep on %d", getpid());
1163-
sleep(20);
1160+
if (CUDA_ERROR_DEINITIALIZED != result) {
1161+
opal_show_help("help-mpi-common-cuda.txt", "cuIpcCloseMemHandle failed",
1162+
true, result, cuda_reg->base.alloc_base);
1163+
}
11641164
/* We will just continue on and hope things continue to work. */
11651165
} else {
11661166
opal_output_verbose(10, mca_common_cuda_output,

0 commit comments

Comments
 (0)