diff --git a/config/opal_check_cuda.m4 b/config/opal_check_cuda.m4 index 43b4a3662ac..6405ed0ad81 100644 --- a/config/opal_check_cuda.m4 +++ b/config/opal_check_cuda.m4 @@ -27,7 +27,22 @@ dnl dnl $HEADER$ dnl + +# OPAL_CHECK_CUDA(prefix, [action-if-found], [action-if-not-found]) +# -------------------------------------------------------- +# check if CUDA support can be found. sets prefix_{CPPFLAGS, +# LDFLAGS, LIBS} as needed and runs action-if-found if there is +# support, otherwise executes action-if-not-found + +# +# Check for CUDA support +# AC_DEFUN([OPAL_CHECK_CUDA],[ +OPAL_VAR_SCOPE_PUSH([cuda_save_CPPFLAGS cuda_save_LDFLAGS cuda_save_LIBS]) + +cuda_save_CPPFLAGS="$CPPFLAGS" +cuda_save_LDFLAGS="$LDFLAGS" +cuda_save_LIBS="$LIBS" # # Check to see if user wants CUDA support # @@ -72,12 +87,15 @@ AS_IF([test "$with_cuda" = "no" || test "x$with_cuda" = "x"], opal_cuda_incdir="$with_cuda/include" AC_MSG_RESULT([found ($opal_cuda_incdir/cuda.h)])])])])]) -dnl We cannot have CUDA support without dlopen support. HOWEVER, at -dnl this point in configure, we can't know whether the DL framework -dnl has been configured or not yet (it likely hasn't, since CUDA is a -dnl common framework, and likely configured first). So we have to -dnl defer this check until later (see the OPAL_CHECK_CUDA_AFTER_OPAL_DL m4 -dnl macro, below). :-( +AS_IF([test "$opal_check_cuda_happy" = "yes"], + [OAC_CHECK_PACKAGE([cuda], + [$1], + [cuda.h], + [cuda], + [cuMemFree], + [opal_check_cuda_happy="yes"], + [opal_check_cuda_happy="no"])], + []) # We require CUDA IPC support which started in CUDA 4.1. Error # out if the support is not there. 
@@ -144,22 +162,9 @@ AM_CONDITIONAL([OPAL_cuda_gdr_support], [test "x$CUDA_VERSION_60_OR_GREATER" = " AC_DEFINE_UNQUOTED([OPAL_CUDA_GDR_SUPPORT],$CUDA_VERSION_60_OR_GREATER, [Whether we have CUDA GDR support available]) +CPPFLAGS=${cuda_save_CPPFLAGS} +LDFLAGS=${cuda_save_LDFLAGS} +LIBS=${cuda_save_LIBS} +OPAL_VAR_SCOPE_POP ]) -dnl -dnl CUDA support requires DL support (it dynamically opens the CUDA -dnl library at run time). But we do not check for OPAL DL support -dnl until lafter the initial OPAL_CHECK_CUDA is called. So put the -dnl CUDA+DL check in a separate macro that can be called after the DL MCA -dnl framework checks in the top-level configure.ac. -dnl -AC_DEFUN([OPAL_CHECK_CUDA_AFTER_OPAL_DL],[ - - # We cannot have CUDA support without OPAL DL support. Error out - # if the user wants CUDA but we do not have OPAL DL support. - AS_IF([test $OPAL_HAVE_DL_SUPPORT -eq 0 && \ - test "$opal_check_cuda_happy" = "yes"], - [AC_MSG_WARN([--with-cuda was specified, but dlopen support is disabled.]) - AC_MSG_WARN([You must reconfigure Open MPI with dlopen ("dl") support.]) - AC_MSG_ERROR([Cannot continue.])]) -]) diff --git a/config/opal_config_files.m4 b/config/opal_config_files.m4 index 18cbe0066e6..78358d998c1 100644 --- a/config/opal_config_files.m4 +++ b/config/opal_config_files.m4 @@ -17,7 +17,6 @@ AC_DEFUN([OPAL_CONFIG_FILES],[ AC_CONFIG_FILES([ opal/Makefile - opal/cuda/Makefile opal/etc/Makefile opal/include/Makefile opal/datatype/Makefile diff --git a/configure.ac b/configure.ac index c87f5f64c78..6ee1de964a2 100644 --- a/configure.ac +++ b/configure.ac @@ -987,7 +987,6 @@ AC_CACHE_SAVE opal_show_title "System-specific tests" -OPAL_CHECK_CUDA ################################## OPAL_CHECK_OS_FLAVORS @@ -1233,8 +1232,6 @@ AC_CACHE_SAVE # be done better by having some kind of "run this check at the end of # all other MCA checks" hook...? 
-OPAL_CHECK_CUDA_AFTER_OPAL_DL - OPAL_CHECK_ROCM_AFTER_OPAL_DL ################################## diff --git a/opal/Makefile.am b/opal/Makefile.am index a24d3d3114a..1aad41b8ffb 100644 --- a/opal/Makefile.am +++ b/opal/Makefile.am @@ -22,26 +22,18 @@ # $HEADER$ # -if OPAL_cuda_support -LIBOPAL_GPU_SUBDIR = cuda -LIBOPAL_GPU_LA = cuda/libopalcuda.la -endif - - SUBDIRS = \ include \ datatype \ etc \ util \ mca/base \ - $(LIBOPAL_GPU_SUBDIR) \ $(MCA_opal_FRAMEWORKS_SUBDIRS) \ $(MCA_opal_FRAMEWORK_COMPONENT_STATIC_SUBDIRS) \ . \ $(MCA_opal_FRAMEWORK_COMPONENT_DSO_SUBDIRS) DIST_SUBDIRS = \ include \ - cuda \ datatype \ etc \ util \ @@ -67,13 +59,11 @@ lib@OPAL_LIB_NAME@_la_LIBADD = \ libopen-pal_core.la \ datatype/libdatatype.la \ util/libopalutil.la \ - $(LIBOPAL_GPU_LA) \ $(MCA_opal_FRAMEWORK_LIBS) lib@OPAL_LIB_NAME@_la_DEPENDENCIES = \ libopen-pal_core.la \ datatype/libdatatype.la \ util/libopalutil.la \ - $(LIBOPAL_GPU_LA) \ $(MCA_opal_FRAMEWORK_LIBS) lib@OPAL_LIB_NAME@_la_LDFLAGS = -version-info @libopen_pal_so_version@ diff --git a/opal/cuda/Makefile.am b/opal/cuda/Makefile.am deleted file mode 100644 index b9f6db41ff6..00000000000 --- a/opal/cuda/Makefile.am +++ /dev/null @@ -1,44 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2013 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved. -# Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -AM_CPPFLAGS = $(common_cuda_CPPFLAGS) - -# Header files -headers = \ - common_cuda.h - -# Source files -sources = \ - common_cuda.c - -dist_opaldata_DATA = help-mpi-common-cuda.txt - -noinst_LTLIBRARIES = libopalcuda.la - -libopalcuda_la_SOURCES = $(headers) $(sources) -libopalcuda_la_LDFLAGS = -libopalcuda_la_LIBADD = - -# Conditionally install the header files -if WANT_INSTALL_HEADERS -opaldir = $(opalincludedir)/$(subdir) -opal_HEADERS = $(headers) -endif diff --git a/opal/cuda/README.md b/opal/cuda/README.md deleted file mode 100644 index 770c367f69d..00000000000 --- a/opal/cuda/README.md +++ /dev/null @@ -1,47 +0,0 @@ -# A Developer's Note on OMPI CUDA Code - -The initial CUDA implementation in Open MPI was not well factored. -Most of the developers at the time didn't really understand CUDA (or -GPUs), and the developers working on CUDA were new to Open MPI's -abstractions. It was also unclear whether there would be another -interface for someone else's GPUs or whether the world would choose -CUDA. With this background, choices were made. - -The initial implementation put much of the cuda buffer handling -functions in the datatype engine, including the code to determine if -an address referred to a CUDA buffer. Many of the users of those -functions were also users of the datatype engine, so it made sense. -There was also a common/cuda library, which provided wrappers around -common cuda functions. The common/cuda library (usually itself -built as a dso) dlopen'ed the base cuda library, so that no part of -Open MPI had a loader-time dependency on the cuda library. - -In 2021, the default build mode for components (including common -components) was changed from DSO to static (ie, part of the base -library, which may still be a dynamic library) to reduce startup -time. The OFI MTL was also updated to support CUDA buffers, which -required some changes to the datatype interface. 
During those -changes, George rightly pushed that the CUDA specific code belonged -not in the datatype engine, but in a CUDA-specific library. The -develoepr working on the OFI MTL code dutifully moved the code, not -realizing that he had created a circular dependency that broke the -ability of common/cuda to build as a DSO. The datatype engine -depended on functions in the common/cuda library, but the common/cuda -library depended on libopen-pal. - -To fix this issue with minimal interruption to the 5.0 schedule, we -moved the common/cuda component into libopen-pal (ie, it is no longer -a component, but just part of the base library). Because the cuda -libraries are still dlopen'ed by the OMPI cuda code, this does not -introduce a loader-time dependency on the cuda libraries from Open -MPI, but does break the cycle described above. This is not a great -abstraction situation, but works. - -The "right" solution is an accelerator framework that is in OPAL, -which encapsulates the functions that Open MPI requires from an -accelerator (CUDA, ROCm, Xe, etc.), as we now know there will be more -than one accelerator interface in the world. An initial take is -proposed in https://github.com/open-mpi/ompi/pull/10069, although -significant work remains to prove that said interface is sufficient to -abstract an accelerator interface (where sufficient is defined as "no -`#if HAVE_CUDA` macros in the general codebase"). diff --git a/opal/cuda/common_cuda.c b/opal/cuda/common_cuda.c deleted file mode 100644 index 667ad6bdcad..00000000000 --- a/opal/cuda/common_cuda.c +++ /dev/null @@ -1,2309 +0,0 @@ -/* - * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
- * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * This file contains various support functions for doing CUDA - * operations. - */ -#include "opal_config.h" - -#include -#include -#include - -#include "opal/align.h" -#include "opal/datatype/opal_convertor.h" -#include "opal/util/argv.h" -#include "opal/util/output.h" -#include "opal/util/printf.h" -#include "opal/util/proc.h" -#include "opal/util/show_help.h" - -#include "opal/mca/dl/base/base.h" -#include "opal/mca/rcache/base/base.h" -#include "opal/mca/timer/base/base.h" -#include "opal/runtime/opal_params.h" - -#include "common_cuda.h" - -/** - * Since function names can get redefined in cuda.h file, we need to do this - * stringifying to get the latest function name from the header file. For - * example, cuda.h may have something like this: - * #define cuMemFree cuMemFree_v2 - * We want to make sure we find cuMemFree_v2, not cuMemFree. 
- */ -#define STRINGIFY2(x) #x -#define STRINGIFY(x) STRINGIFY2(x) - -#define OPAL_CUDA_DLSYM(libhandle, funcName) \ - do { \ - char *err_msg; \ - void *ptr; \ - if (OPAL_SUCCESS != opal_dl_lookup(libhandle, STRINGIFY(funcName), &ptr, &err_msg)) { \ - opal_show_help("help-mpi-common-cuda.txt", "dlsym failed", true, STRINGIFY(funcName), \ - err_msg); \ - return 1; \ - } else { \ - *(void **) (&cuFunc.funcName) = ptr; \ - opal_output_verbose(15, mca_common_cuda_output, "CUDA: successful dlsym of %s", \ - STRINGIFY(funcName)); \ - } \ - } while (0) - -/* Structure to hold CUDA function pointers that get dynamically loaded. */ -struct cudaFunctionTable { - int (*cuPointerGetAttribute)(void *, CUpointer_attribute, CUdeviceptr); - int (*cuMemcpyAsync)(CUdeviceptr, CUdeviceptr, size_t, CUstream); - int (*cuMemcpy)(CUdeviceptr, CUdeviceptr, size_t); - int (*cuMemAlloc)(CUdeviceptr *, size_t); - int (*cuMemFree)(CUdeviceptr buf); - int (*cuCtxGetCurrent)(void *cuContext); - int (*cuStreamCreate)(CUstream *, int); - int (*cuEventCreate)(CUevent *, int); - int (*cuEventRecord)(CUevent, CUstream); - int (*cuMemHostRegister)(void *, size_t, unsigned int); - int (*cuMemHostUnregister)(void *); - int (*cuEventQuery)(CUevent); - int (*cuEventDestroy)(CUevent); - int (*cuStreamWaitEvent)(CUstream, CUevent, unsigned int); - int (*cuMemGetAddressRange)(CUdeviceptr *, size_t *, CUdeviceptr); - int (*cuIpcGetEventHandle)(CUipcEventHandle *, CUevent); - int (*cuIpcOpenEventHandle)(CUevent *, CUipcEventHandle); - int (*cuIpcOpenMemHandle)(CUdeviceptr *, CUipcMemHandle, unsigned int); - int (*cuIpcCloseMemHandle)(CUdeviceptr); - int (*cuIpcGetMemHandle)(CUipcMemHandle *, CUdeviceptr); - int (*cuCtxGetDevice)(CUdevice *); - int (*cuDeviceCanAccessPeer)(int *, CUdevice, CUdevice); - int (*cuDeviceGet)(CUdevice *, int); -#if OPAL_CUDA_GDR_SUPPORT - int (*cuPointerSetAttribute)(const void *, CUpointer_attribute, CUdeviceptr); -#endif /* OPAL_CUDA_GDR_SUPPORT */ - int 
(*cuCtxSetCurrent)(CUcontext); - int (*cuEventSynchronize)(CUevent); - int (*cuStreamSynchronize)(CUstream); - int (*cuStreamDestroy)(CUstream); -#if OPAL_CUDA_GET_ATTRIBUTES - int (*cuPointerGetAttributes)(unsigned int, CUpointer_attribute *, void **, CUdeviceptr); -#endif /* OPAL_CUDA_GET_ATTRIBUTES */ -}; -typedef struct cudaFunctionTable cudaFunctionTable_t; -static cudaFunctionTable_t cuFunc; - -static int stage_one_init_ref_count = 0; -static bool stage_three_init_complete = false; -static bool common_cuda_initialized = false; -static bool common_cuda_mca_parames_registered = false; -static int mca_common_cuda_verbose; -static int mca_common_cuda_output = 0; -bool mca_common_cuda_enabled = false; -static bool mca_common_cuda_register_memory = true; -static bool mca_common_cuda_warning = false; -static opal_list_t common_cuda_memory_registrations; -static CUstream ipcStream = NULL; -static CUstream dtohStream = NULL; -static CUstream htodStream = NULL; -static CUstream memcpyStream = NULL; -static int mca_common_cuda_gpu_mem_check_workaround = (CUDA_VERSION > 7000) ? 
0 : 1; -static opal_mutex_t common_cuda_init_lock; -static opal_mutex_t common_cuda_htod_lock; -static opal_mutex_t common_cuda_dtoh_lock; -static opal_mutex_t common_cuda_ipc_lock; - -/* Functions called by opal layer - plugged into opal function table */ -static int mca_common_cuda_is_gpu_buffer(const void *, opal_convertor_t *); -static int mca_common_cuda_memmove(void *, void *, size_t); -static int mca_common_cuda_cu_memcpy_async(void *, const void *, size_t, opal_convertor_t *); -static int mca_common_cuda_cu_memcpy(void *, const void *, size_t); - -/* Function that gets plugged into opal layer */ -static int mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t *); - -/* Structure to hold memory registrations that are delayed until first - * call to send or receive a GPU pointer */ -struct common_cuda_mem_regs_t { - opal_list_item_t super; - void *ptr; - size_t amount; - char *msg; -}; -typedef struct common_cuda_mem_regs_t common_cuda_mem_regs_t; -OBJ_CLASS_DECLARATION(common_cuda_mem_regs_t); -OBJ_CLASS_INSTANCE(common_cuda_mem_regs_t, opal_list_item_t, NULL, NULL); - -static int mca_common_cuda_async = 1; -static int mca_common_cuda_cumemcpy_async; -#if OPAL_ENABLE_DEBUG -static int mca_common_cuda_cumemcpy_timing; -#endif /* OPAL_ENABLE_DEBUG */ - -/* Array of CUDA events to be queried for IPC stream, sending side and - * receiving side. 
*/ -CUevent *cuda_event_ipc_array = NULL; -CUevent *cuda_event_dtoh_array = NULL; -CUevent *cuda_event_htod_array = NULL; - -/* Array of fragments currently being moved by cuda async non-blocking - * operations */ -struct mca_btl_base_descriptor_t **cuda_event_ipc_frag_array = NULL; -struct mca_btl_base_descriptor_t **cuda_event_dtoh_frag_array = NULL; -struct mca_btl_base_descriptor_t **cuda_event_htod_frag_array = NULL; - -/* First free/available location in cuda_event_status_array */ -static int cuda_event_ipc_first_avail, cuda_event_dtoh_first_avail, cuda_event_htod_first_avail; - -/* First currently-being used location in the cuda_event_status_array */ -static int cuda_event_ipc_first_used, cuda_event_dtoh_first_used, cuda_event_htod_first_used; - -/* Number of status items currently in use */ -static volatile int cuda_event_ipc_num_used, cuda_event_dtoh_num_used, cuda_event_htod_num_used; - -/* Size of array holding events */ -int cuda_event_max = 400; -static int cuda_event_ipc_most = 0; -static int cuda_event_dtoh_most = 0; -static int cuda_event_htod_most = 0; - -/* Handle to libcuda.so */ -opal_dl_handle_t *libcuda_handle = NULL; - -/* Unused variable that we register at init time and unregister at fini time. - * This is used to detect if user has done a device reset prior to MPI_Finalize. - * This is a workaround to avoid SEGVs. - */ -static int checkmem; -static int ctx_ok = 1; - -#define CUDA_COMMON_TIMING 0 -#if OPAL_ENABLE_DEBUG -/* Some timing support structures. Enable this to help analyze - * internal performance issues. */ -static opal_timer_t ts_start; -static opal_timer_t ts_end; -static double accum; -# define THOUSAND 1000L -# define MILLION 1000000L -static float mydifftime(opal_timer_t ts_start, opal_timer_t ts_end); -#endif /* OPAL_ENABLE_DEBUG */ - -/* These functions are typically unused in the optimized builds. 
*/ -static void cuda_dump_evthandle(int, void *, char *) __opal_attribute_unused__; -static void cuda_dump_memhandle(int, void *, char *) __opal_attribute_unused__; -#if OPAL_ENABLE_DEBUG -# define CUDA_DUMP_MEMHANDLE(a) cuda_dump_memhandle a -# define CUDA_DUMP_EVTHANDLE(a) cuda_dump_evthandle a -#else -# define CUDA_DUMP_MEMHANDLE(a) -# define CUDA_DUMP_EVTHANDLE(a) -#endif /* OPAL_ENABLE_DEBUG */ - -/* This is a separate function so we can see these variables with ompi_info and - * also set them with the tools interface */ -void mca_common_cuda_register_mca_variables(void) -{ - - if (false == common_cuda_mca_parames_registered) { - common_cuda_mca_parames_registered = true; - } - /* Set different levels of verbosity in the cuda related code. */ - mca_common_cuda_verbose = 0; - (void) mca_base_var_register("ompi", "mpi", "common_cuda", "verbose", - "Set level of common cuda verbosity", MCA_BASE_VAR_TYPE_INT, NULL, - 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &mca_common_cuda_verbose); - - /* Control whether system buffers get CUDA pinned or not. Allows for - * performance analysis. */ - mca_common_cuda_register_memory = true; - (void) mca_base_var_register("ompi", "mpi", "common_cuda", "register_memory", - "Whether to cuMemHostRegister preallocated BTL buffers", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &mca_common_cuda_register_memory); - - /* Control whether we see warnings when CUDA memory registration fails. This is - * useful when CUDA support is configured in, but we are running a regular MPI - * application without CUDA. 
*/ - mca_common_cuda_warning = true; - (void) mca_base_var_register("ompi", "mpi", "common_cuda", "warning", - "Whether to print warnings when CUDA registration fails", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &mca_common_cuda_warning); - - /* Use this flag to test async vs sync copies */ - mca_common_cuda_async = 1; - (void) mca_base_var_register("ompi", "mpi", "common_cuda", "memcpy_async", - "Set to 0 to force CUDA sync copy instead of async", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &mca_common_cuda_async); - - /* Use this parameter to increase the number of outstanding events allows */ - (void) mca_base_var_register("ompi", "mpi", "common_cuda", "event_max", - "Set number of outstanding CUDA events", MCA_BASE_VAR_TYPE_INT, - NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &cuda_event_max); - - /* Use this flag to test cuMemcpyAsync vs cuMemcpy */ - mca_common_cuda_cumemcpy_async = 1; - (void) mca_base_var_register( - "ompi", "mpi", "common_cuda", "cumemcpy_async", - "Set to 0 to force CUDA cuMemcpy instead of cuMemcpyAsync/cuStreamSynchronize", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, - &mca_common_cuda_cumemcpy_async); - -#if OPAL_ENABLE_DEBUG - /* Use this flag to dump out timing of cumempcy sync and async */ - mca_common_cuda_cumemcpy_timing = 0; - (void) mca_base_var_register("ompi", "mpi", "common_cuda", "cumemcpy_timing", - "Set to 1 to dump timing of eager copies", MCA_BASE_VAR_TYPE_INT, - NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, - &mca_common_cuda_cumemcpy_timing); -#endif /* OPAL_ENABLE_DEBUG */ - - (void) mca_base_var_register( - "ompi", "mpi", "common_cuda", "gpu_mem_check_workaround", - "Set to 0 to disable GPU memory check workaround. 
A user would rarely have to do this.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &mca_common_cuda_gpu_mem_check_workaround); -} - -/** - * This is the first stage of initialization. This function is called - * explicitly by any BTLs that can support CUDA-aware. It is called during - * the component open phase of initialization. This function will look for - * the SONAME of the library which is libcuda.so.1. In most cases, this will - * result in the library found. However, there are some setups that require - * the extra steps for searching. This function will then load the symbols - * needed from the CUDA driver library. Any failure will result in this - * initialization failing and status will be set showing that. - */ -int mca_common_cuda_stage_one_init(void) -{ - int retval, i, j; - char *cudalibs[] = {"libcuda.so.1", "libcuda.dylib", NULL}; - char *searchpaths[] = {"", "/usr/lib64", NULL}; - char **errmsgs = NULL; - char *errmsg = NULL; - int errsize; - bool stage_one_init_passed = false; - - stage_one_init_ref_count++; - if (stage_one_init_ref_count > 1) { - opal_output_verbose(10, mca_common_cuda_output, - "CUDA: stage_one_init_ref_count is now %d, no need to init", - stage_one_init_ref_count); - return OPAL_SUCCESS; - } - - /* This is a no-op in most cases as the parameters were registered earlier */ - mca_common_cuda_register_mca_variables(); - - OBJ_CONSTRUCT(&common_cuda_init_lock, opal_mutex_t); - OBJ_CONSTRUCT(&common_cuda_htod_lock, opal_mutex_t); - OBJ_CONSTRUCT(&common_cuda_dtoh_lock, opal_mutex_t); - OBJ_CONSTRUCT(&common_cuda_ipc_lock, opal_mutex_t); - - mca_common_cuda_output = opal_output_open(NULL); - opal_output_set_verbosity(mca_common_cuda_output, mca_common_cuda_verbose); - - opal_output_verbose(10, mca_common_cuda_output, - "CUDA: stage_one_init_ref_count is now %d, initializing", - stage_one_init_ref_count); - - /* First check if the support is enabled. 
In the case that the user has - * turned it off, we do not need to continue with any CUDA specific - * initialization. Do this after MCA parameter registration. */ - if (!opal_cuda_support) { - return 1; - } - - if (!OPAL_HAVE_DL_SUPPORT) { - opal_show_help("help-mpi-common-cuda.txt", "dlopen disabled", true); - return 1; - } - - /* Now walk through all the potential names libcuda and find one - * that works. If it does, all is good. If not, print out all - * the messages about why things failed. This code was careful - * to try and save away all error messages if the loading ultimately - * failed to help with debugging. - * - * NOTE: On the first loop we just utilize the default loading - * paths from the system. For the second loop, set /usr/lib64 to - * the search path and try again. This is done to handle the case - * where we have both 32 and 64 bit libcuda.so libraries - * installed. Even when running in 64-bit mode, the /usr/lib - * directory is searched first and we may find a 32-bit - * libcuda.so.1 library. Loading of this library will fail as the - * OPAL DL framework does not handle having the wrong ABI in the - * search path (unlike ld or ld.so). Note that we only set this - * search path after the original search. This is so that - * LD_LIBRARY_PATH and run path settings are respected. Setting - * this search path overrides them (rather then being - * appended). 
*/ - j = 0; - while (searchpaths[j] != NULL) { - i = 0; - while (cudalibs[i] != NULL) { - char *filename = NULL; - char *str = NULL; - - /* If there's a non-empty search path, prepend it - to the library filename */ - if (strlen(searchpaths[j]) > 0) { - opal_asprintf(&filename, "%s/%s", searchpaths[j], cudalibs[i]); - } else { - filename = strdup(cudalibs[i]); - } - if (NULL == filename) { - opal_show_help("help-mpi-common-cuda.txt", "No memory", true, - OPAL_PROC_MY_HOSTNAME); - return 1; - } - - retval = opal_dl_open(filename, false, false, &libcuda_handle, &str); - if (OPAL_SUCCESS != retval || NULL == libcuda_handle) { - if (NULL != str) { - opal_argv_append(&errsize, &errmsgs, str); - } else { - opal_argv_append(&errsize, &errmsgs, "opal_dl_open() returned NULL."); - } - opal_output_verbose(10, mca_common_cuda_output, "CUDA: Library open error: %s", - errmsgs[errsize - 1]); - } else { - opal_output_verbose(10, mca_common_cuda_output, - "CUDA: Library successfully opened %s", cudalibs[i]); - stage_one_init_passed = true; - break; - } - i++; - - free(filename); - } - if (true == stage_one_init_passed) { - break; /* Break out of outer loop */ - } - j++; - } - - if (true != stage_one_init_passed) { - errmsg = opal_argv_join(errmsgs, '\n'); - if (opal_warn_on_missing_libcuda) { - opal_show_help("help-mpi-common-cuda.txt", "dlopen failed", true, errmsg); - } - opal_cuda_support = 0; - } - opal_argv_free(errmsgs); - free(errmsg); - - if (true != stage_one_init_passed) { - return 1; - } - opal_cuda_add_initialization_function(&mca_common_cuda_stage_two_init); - OBJ_CONSTRUCT(&common_cuda_memory_registrations, opal_list_t); - - /* Map in the functions that we need. Note that if there is an error - * the macro OPAL_CUDA_DLSYM will print an error and call return. 
*/ - OPAL_CUDA_DLSYM(libcuda_handle, cuStreamCreate); - OPAL_CUDA_DLSYM(libcuda_handle, cuCtxGetCurrent); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventCreate); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventRecord); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemHostRegister); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemHostUnregister); - OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttribute); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventQuery); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventDestroy); - OPAL_CUDA_DLSYM(libcuda_handle, cuStreamWaitEvent); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemcpyAsync); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemcpy); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemFree); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemAlloc); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemGetAddressRange); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcGetEventHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcOpenEventHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcOpenMemHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcCloseMemHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcGetMemHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuCtxGetDevice); - OPAL_CUDA_DLSYM(libcuda_handle, cuDeviceCanAccessPeer); - OPAL_CUDA_DLSYM(libcuda_handle, cuDeviceGet); -#if OPAL_CUDA_GDR_SUPPORT - OPAL_CUDA_DLSYM(libcuda_handle, cuPointerSetAttribute); -#endif /* OPAL_CUDA_GDR_SUPPORT */ - OPAL_CUDA_DLSYM(libcuda_handle, cuCtxSetCurrent); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventSynchronize); - OPAL_CUDA_DLSYM(libcuda_handle, cuStreamSynchronize); - OPAL_CUDA_DLSYM(libcuda_handle, cuStreamDestroy); -#if OPAL_CUDA_GET_ATTRIBUTES - OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttributes); -#endif /* OPAL_CUDA_GET_ATTRIBUTES */ - opal_cuda_runtime_initialized = true; - return 0; -} - -/** - * This function is registered with the OPAL CUDA support. In that way, - * these function pointers will be loaded into the OPAL CUDA code when - * the first convertor is initialized. 
This does not trigger any CUDA - * specific initialization as this may just be a host buffer that is - * triggering this call. - */ -static int mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t *ftable) -{ - if (OPAL_UNLIKELY(!opal_cuda_support)) { - return OPAL_ERROR; - } - - ftable->gpu_is_gpu_buffer = &mca_common_cuda_is_gpu_buffer; - ftable->gpu_cu_memcpy_async = &mca_common_cuda_cu_memcpy_async; - ftable->gpu_cu_memcpy = &mca_common_cuda_cu_memcpy; - ftable->gpu_memmove = &mca_common_cuda_memmove; - ftable->gpu_malloc = &mca_common_cuda_malloc; - ftable->gpu_free = &mca_common_cuda_free; - - opal_output_verbose(30, mca_common_cuda_output, "CUDA: support functions initialized"); - return OPAL_SUCCESS; -} - -/** - * This is the last phase of initialization. This is triggered when we examine - * a buffer pointer and determine it is a GPU buffer. We then assume the user - * has selected their GPU and we can go ahead with all the CUDA related - * initializations. If we get an error, just return. Cleanup of resources - * will happen when fini is called. 
- */ -static int mca_common_cuda_stage_three_init(void) -{ - int i, s, rc; - CUresult res; - CUcontext cuContext; - common_cuda_mem_regs_t *mem_reg; - - OPAL_THREAD_LOCK(&common_cuda_init_lock); - opal_output_verbose(20, mca_common_cuda_output, "CUDA: entering stage three init"); - - /* Compiled without support or user disabled support */ - if (OPAL_UNLIKELY(!opal_cuda_support)) { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: No mpi cuda support, exiting stage three init"); - stage_three_init_complete = true; - OPAL_THREAD_UNLOCK(&common_cuda_init_lock); - return OPAL_ERROR; - } - - /* In case another thread snuck in and completed the initialization */ - if (true == stage_three_init_complete) { - if (common_cuda_initialized) { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: Stage three already complete, exiting stage three init"); - OPAL_THREAD_UNLOCK(&common_cuda_init_lock); - return OPAL_SUCCESS; - } else { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: Stage three already complete, failed during the init"); - OPAL_THREAD_UNLOCK(&common_cuda_init_lock); - return OPAL_ERROR; - } - } - - /* Check to see if this process is running in a CUDA context. If - * so, all is good. If not, then disable registration of memory. */ - res = cuFunc.cuCtxGetCurrent(&cuContext); - if (CUDA_SUCCESS != res) { - if (mca_common_cuda_warning) { - /* Check for the not initialized error since we can make suggestions to - * user for this error. 
*/ - if (CUDA_ERROR_NOT_INITIALIZED == res) { - opal_show_help("help-mpi-common-cuda.txt", "cuCtxGetCurrent failed not initialized", - true); - } else { - opal_show_help("help-mpi-common-cuda.txt", "cuCtxGetCurrent failed", true, res); - } - } - mca_common_cuda_enabled = false; - mca_common_cuda_register_memory = false; - } else if ((CUDA_SUCCESS == res) && (NULL == cuContext)) { - if (mca_common_cuda_warning) { - opal_show_help("help-mpi-common-cuda.txt", "cuCtxGetCurrent returned NULL", true); - } - mca_common_cuda_enabled = false; - mca_common_cuda_register_memory = false; - } else { - /* All is good. mca_common_cuda_register_memory will retain its original - * value. Normally, that is 1, but the user can override it to disable - * registration of the internal buffers. */ - mca_common_cuda_enabled = true; - opal_output_verbose(20, mca_common_cuda_output, "CUDA: cuCtxGetCurrent succeeded"); - } - - /* No need to go on at this point. If we cannot create a context and we are at - * the point where we are making MPI calls, it is time to fully disable - * CUDA support. - */ - if (false == mca_common_cuda_enabled) { - OPAL_THREAD_UNLOCK(&common_cuda_init_lock); - return OPAL_ERROR; - } - - if (true == mca_common_cuda_enabled) { - /* Set up an array to store outstanding IPC async copy events */ - cuda_event_ipc_num_used = 0; - cuda_event_ipc_first_avail = 0; - cuda_event_ipc_first_used = 0; - - cuda_event_ipc_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *)); - if (NULL == cuda_event_ipc_array) { - opal_show_help("help-mpi-common-cuda.txt", "No memory", true, OPAL_PROC_MY_HOSTNAME); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - - /* Create the events since they can be reused. 
*/ - for (i = 0; i < cuda_event_max; i++) { - res = cuFunc.cuEventCreate(&cuda_event_ipc_array[i], CU_EVENT_DISABLE_TIMING); - if (CUDA_SUCCESS != res) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", true, - OPAL_PROC_MY_HOSTNAME, res); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - } - - /* The first available status index is 0. Make an empty frag - array. */ - cuda_event_ipc_frag_array = (struct mca_btl_base_descriptor_t **) malloc( - sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max); - if (NULL == cuda_event_ipc_frag_array) { - opal_show_help("help-mpi-common-cuda.txt", "No memory", true, OPAL_PROC_MY_HOSTNAME); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - } - - if (true == mca_common_cuda_enabled) { - /* Set up an array to store outstanding async dtoh events. Used on the - * sending side for asynchronous copies. */ - cuda_event_dtoh_num_used = 0; - cuda_event_dtoh_first_avail = 0; - cuda_event_dtoh_first_used = 0; - - cuda_event_dtoh_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *)); - if (NULL == cuda_event_dtoh_array) { - opal_show_help("help-mpi-common-cuda.txt", "No memory", true, OPAL_PROC_MY_HOSTNAME); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - - /* Create the events since they can be reused. */ - for (i = 0; i < cuda_event_max; i++) { - res = cuFunc.cuEventCreate(&cuda_event_dtoh_array[i], CU_EVENT_DISABLE_TIMING); - if (CUDA_SUCCESS != res) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", true, - OPAL_PROC_MY_HOSTNAME, res); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - } - - /* The first available status index is 0. Make an empty frag - array. 
*/ - cuda_event_dtoh_frag_array = (struct mca_btl_base_descriptor_t **) malloc( - sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max); - if (NULL == cuda_event_dtoh_frag_array) { - opal_show_help("help-mpi-common-cuda.txt", "No memory", true, OPAL_PROC_MY_HOSTNAME); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - - /* Set up an array to store outstanding async htod events. Used on the - * receiving side for asynchronous copies. */ - cuda_event_htod_num_used = 0; - cuda_event_htod_first_avail = 0; - cuda_event_htod_first_used = 0; - - cuda_event_htod_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *)); - if (NULL == cuda_event_htod_array) { - opal_show_help("help-mpi-common-cuda.txt", "No memory", true, OPAL_PROC_MY_HOSTNAME); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - - /* Create the events since they can be reused. */ - for (i = 0; i < cuda_event_max; i++) { - res = cuFunc.cuEventCreate(&cuda_event_htod_array[i], CU_EVENT_DISABLE_TIMING); - if (CUDA_SUCCESS != res) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", true, - OPAL_PROC_MY_HOSTNAME, res); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - } - - /* The first available status index is 0. Make an empty frag - array. */ - cuda_event_htod_frag_array = (struct mca_btl_base_descriptor_t **) malloc( - sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max); - if (NULL == cuda_event_htod_frag_array) { - opal_show_help("help-mpi-common-cuda.txt", "No memory", true, OPAL_PROC_MY_HOSTNAME); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - } - - s = opal_list_get_size(&common_cuda_memory_registrations); - for (i = 0; i < s; i++) { - mem_reg = (common_cuda_mem_regs_t *) opal_list_remove_first( - &common_cuda_memory_registrations); - if (mca_common_cuda_enabled && mca_common_cuda_register_memory) { - res = cuFunc.cuMemHostRegister(mem_reg->ptr, mem_reg->amount, 0); - if (res != CUDA_SUCCESS) { - /* If registering the memory fails, print a message and continue. 
- * This is not a fatal error. */ - opal_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister during init failed", - true, mem_reg->ptr, mem_reg->amount, OPAL_PROC_MY_HOSTNAME, res, - mem_reg->msg); - } else { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuMemHostRegister OK on rcache %s: " - "address=%p, bufsize=%d", - mem_reg->msg, mem_reg->ptr, (int) mem_reg->amount); - } - } - free(mem_reg->msg); - OBJ_RELEASE(mem_reg); - } - - /* Create stream for use in ipc asynchronous copies */ - res = cuFunc.cuStreamCreate(&ipcStream, 0); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed", true, - OPAL_PROC_MY_HOSTNAME, res); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - - /* Create stream for use in dtoh asynchronous copies */ - res = cuFunc.cuStreamCreate(&dtohStream, 0); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed", true, - OPAL_PROC_MY_HOSTNAME, res); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - - /* Create stream for use in htod asynchronous copies */ - res = cuFunc.cuStreamCreate(&htodStream, 0); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed", true, - OPAL_PROC_MY_HOSTNAME, res); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - - if (mca_common_cuda_cumemcpy_async) { - /* Create stream for use in cuMemcpyAsync synchronous copies */ - res = cuFunc.cuStreamCreate(&memcpyStream, 0); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed", true, - OPAL_PROC_MY_HOSTNAME, res); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - } - - res = cuFunc.cuMemHostRegister(&checkmem, sizeof(int), 0); - if (res != CUDA_SUCCESS) { - /* If registering the memory fails, print a message and continue. - * This is not a fatal error. 
*/ - opal_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister during init failed", true, - &checkmem, sizeof(int), OPAL_PROC_MY_HOSTNAME, res, "checkmem"); - - } else { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuMemHostRegister OK on test region"); - } - - opal_output_verbose(20, mca_common_cuda_output, "CUDA: the extra gpu memory check is %s", - (mca_common_cuda_gpu_mem_check_workaround == 1) ? "on" : "off"); - - opal_output_verbose(30, mca_common_cuda_output, "CUDA: initialized"); - opal_atomic_mb(); /* Make sure next statement does not get reordered */ - common_cuda_initialized = true; - stage_three_init_complete = true; - OPAL_THREAD_UNLOCK(&common_cuda_init_lock); - return OPAL_SUCCESS; - - /* If we are here, something went wrong. Cleanup and return an error. */ -cleanup_and_error: - opal_atomic_mb(); /* Make sure next statement does not get reordered */ - stage_three_init_complete = true; - OPAL_THREAD_UNLOCK(&common_cuda_init_lock); - return rc; -} - -/** - * Cleanup all CUDA resources. - * - * Note: Still figuring out how to get cuMemHostUnregister called from the smcuda sm - * rcache. Looks like with the memory pool from openib (grdma), the unregistering is - * called as the free list is destructed. Not true for the sm mpool. This means we - * are currently still leaking some host memory we registered with CUDA. 
- */ -void mca_common_cuda_fini(void) -{ - int i; - CUresult res; - - if (false == common_cuda_initialized) { - stage_one_init_ref_count--; - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: mca_common_cuda_fini, never completed initialization so " - "skipping fini, ref_count is now %d", - stage_one_init_ref_count); - return; - } - - if (0 == stage_one_init_ref_count) { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: mca_common_cuda_fini, ref_count=%d, fini is already complete", - stage_one_init_ref_count); - return; - } - - if (1 == stage_one_init_ref_count) { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: mca_common_cuda_fini, ref_count=%d, cleaning up started", - stage_one_init_ref_count); - - /* This call is in here to make sure the context is still valid. - * This was the one way of checking which did not cause problems - * while calling into the CUDA library. This check will detect if - * a user has called cudaDeviceReset prior to MPI_Finalize. If so, - * then this call will fail and we skip cleaning up CUDA resources. 
*/ - res = cuFunc.cuMemHostUnregister(&checkmem); - if (CUDA_SUCCESS != res) { - ctx_ok = 0; - } - opal_output_verbose( - 20, mca_common_cuda_output, - "CUDA: mca_common_cuda_fini, cuMemHostUnregister returned %d, ctx_ok=%d", res, ctx_ok); - - if (NULL != cuda_event_ipc_array) { - if (ctx_ok) { - for (i = 0; i < cuda_event_max; i++) { - if (NULL != cuda_event_ipc_array[i]) { - cuFunc.cuEventDestroy(cuda_event_ipc_array[i]); - } - } - } - free(cuda_event_ipc_array); - } - if (NULL != cuda_event_htod_array) { - if (ctx_ok) { - for (i = 0; i < cuda_event_max; i++) { - if (NULL != cuda_event_htod_array[i]) { - cuFunc.cuEventDestroy(cuda_event_htod_array[i]); - } - } - } - free(cuda_event_htod_array); - } - - if (NULL != cuda_event_dtoh_array) { - if (ctx_ok) { - for (i = 0; i < cuda_event_max; i++) { - if (NULL != cuda_event_dtoh_array[i]) { - cuFunc.cuEventDestroy(cuda_event_dtoh_array[i]); - } - } - } - free(cuda_event_dtoh_array); - } - - if (NULL != cuda_event_ipc_frag_array) { - free(cuda_event_ipc_frag_array); - } - if (NULL != cuda_event_htod_frag_array) { - free(cuda_event_htod_frag_array); - } - if (NULL != cuda_event_dtoh_frag_array) { - free(cuda_event_dtoh_frag_array); - } - if ((NULL != ipcStream) && ctx_ok) { - cuFunc.cuStreamDestroy(ipcStream); - } - if ((NULL != dtohStream) && ctx_ok) { - cuFunc.cuStreamDestroy(dtohStream); - } - if ((NULL != htodStream) && ctx_ok) { - cuFunc.cuStreamDestroy(htodStream); - } - if ((NULL != memcpyStream) && ctx_ok) { - cuFunc.cuStreamDestroy(memcpyStream); - } - OBJ_DESTRUCT(&common_cuda_init_lock); - OBJ_DESTRUCT(&common_cuda_htod_lock); - OBJ_DESTRUCT(&common_cuda_dtoh_lock); - OBJ_DESTRUCT(&common_cuda_ipc_lock); - if (NULL != libcuda_handle) { - opal_dl_close(libcuda_handle); - } - - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: mca_common_cuda_fini, ref_count=%d, cleaning up all done", - stage_one_init_ref_count); - - opal_output_close(mca_common_cuda_output); - - } else { - opal_output_verbose(20, 
mca_common_cuda_output, - "CUDA: mca_common_cuda_fini, ref_count=%d, cuda still in use", - stage_one_init_ref_count); - } - stage_one_init_ref_count--; -} - -/** - * Call the CUDA register function so we pin the memory in the CUDA - * space. - */ -void mca_common_cuda_register(void *ptr, size_t amount, char *msg) -{ - int res; - - /* Always first check if the support is enabled. If not, just return */ - if (!opal_cuda_support) - return; - - if (!common_cuda_initialized) { - OPAL_THREAD_LOCK(&common_cuda_init_lock); - if (!common_cuda_initialized) { - common_cuda_mem_regs_t *regptr; - regptr = OBJ_NEW(common_cuda_mem_regs_t); - regptr->ptr = ptr; - regptr->amount = amount; - regptr->msg = strdup(msg); - opal_list_append(&common_cuda_memory_registrations, (opal_list_item_t *) regptr); - OPAL_THREAD_UNLOCK(&common_cuda_init_lock); - return; - } - OPAL_THREAD_UNLOCK(&common_cuda_init_lock); - } - - if (mca_common_cuda_enabled && mca_common_cuda_register_memory) { - res = cuFunc.cuMemHostRegister(ptr, amount, 0); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - /* If registering the memory fails, print a message and continue. - * This is not a fatal error. */ - opal_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister failed", true, ptr, - amount, OPAL_PROC_MY_HOSTNAME, res, msg); - } else { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuMemHostRegister OK on rcache %s: " - "address=%p, bufsize=%d", - msg, ptr, (int) amount); - } - } -} - -/** - * Call the CUDA unregister function so we unpin the memory in the CUDA - * space. - */ -void mca_common_cuda_unregister(void *ptr, char *msg) -{ - int res, i, s; - common_cuda_mem_regs_t *mem_reg; - - /* This can happen if memory was queued up to be registered, but - * no CUDA operations happened, so it never was registered. - * Therefore, just release any of the resources. 
*/ - if (!common_cuda_initialized) { - s = opal_list_get_size(&common_cuda_memory_registrations); - for (i = 0; i < s; i++) { - mem_reg = (common_cuda_mem_regs_t *) opal_list_remove_first( - &common_cuda_memory_registrations); - free(mem_reg->msg); - OBJ_RELEASE(mem_reg); - } - return; - } - - if (mca_common_cuda_enabled && mca_common_cuda_register_memory) { - res = cuFunc.cuMemHostUnregister(ptr); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - /* If unregistering the memory fails, just continue. This is during - * shutdown. Only print when running in verbose mode. */ - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuMemHostUnregister failed: ptr=%p, res=%d, rcache=%s", ptr, - res, msg); - - } else { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuMemHostUnregister OK on rcache %s: " - "address=%p", - msg, ptr); - } - } -} - -/* - * Get the memory handle of a local section of memory that can be sent - * to the remote size so it can access the memory. This is the - * registration function for the sending side of a message transfer. - */ -int cuda_getmemhandle(void *base, size_t size, mca_rcache_base_registration_t *newreg, - mca_rcache_base_registration_t *hdrreg) - -{ - CUmemorytype memType; - CUresult result; - CUipcMemHandle *memHandle; - CUdeviceptr pbase; - size_t psize; - - mca_rcache_common_cuda_reg_t *cuda_reg = (mca_rcache_common_cuda_reg_t *) newreg; - memHandle = (CUipcMemHandle *) cuda_reg->data.memHandle; - - /* We should only be there if this is a CUDA device pointer */ - result = cuFunc.cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - (CUdeviceptr) base); - assert(CUDA_SUCCESS == result); - assert(CU_MEMORYTYPE_DEVICE == memType); - - /* Get the memory handle so we can send it to the remote process. 
*/ - result = cuFunc.cuIpcGetMemHandle(memHandle, (CUdeviceptr) base); - CUDA_DUMP_MEMHANDLE((100, memHandle, "GetMemHandle-After")); - - if (CUDA_SUCCESS != result) { - opal_show_help("help-mpi-common-cuda.txt", "cuIpcGetMemHandle failed", true, result, base); - return OPAL_ERROR; - } else { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuIpcGetMemHandle passed: base=%p size=%d", base, (int) size); - } - - /* Need to get the real base and size of the memory handle. This is - * how the remote side saves the handles in a cache. */ - result = cuFunc.cuMemGetAddressRange(&pbase, &psize, (CUdeviceptr) base); - if (CUDA_SUCCESS != result) { - opal_show_help("help-mpi-common-cuda.txt", "cuMemGetAddressRange failed", true, result, - base); - return OPAL_ERROR; - } else { - opal_output_verbose( - 10, mca_common_cuda_output, - "CUDA: cuMemGetAddressRange passed: addr=%p, size=%d, pbase=%p, psize=%d ", base, - (int) size, (void *) pbase, (int) psize); - } - - /* Store all the information in the registration */ - cuda_reg->base.base = (void *) pbase; - cuda_reg->base.bound = (unsigned char *) pbase + psize - 1; - cuda_reg->data.memh_seg_addr.pval = (void *) pbase; - cuda_reg->data.memh_seg_len = psize; - -#if OPAL_CUDA_SYNC_MEMOPS - /* With CUDA 6.0, we can set an attribute on the memory pointer that will - * ensure any synchronous copies are completed prior to any other access - * of the memory region. This means we do not need to record an event - * and send to the remote side. - */ - memType = 1; /* Just use this variable since we already have it */ - result = cuFunc.cuPointerSetAttribute(&memType, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, - (CUdeviceptr) base); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuPointerSetAttribute failed", true, - OPAL_PROC_MY_HOSTNAME, result, base); - return OPAL_ERROR; - } -#else - /* Need to record the event to ensure that any memcopies into the - * device memory have completed. 
The event handle associated with - * this event is sent to the remote process so that it will wait - * on this event prior to copying data out of the device memory. - * Note that this needs to be the NULL stream to make since it is - * unknown what stream any copies into the device memory were done - * with. */ - result = cuFunc.cuEventRecord((CUevent) cuda_reg->data.event, 0); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", true, result, base); - return OPAL_ERROR; - } -#endif /* OPAL_CUDA_SYNC_MEMOPS */ - - return OPAL_SUCCESS; -} - -/* - * This function is called by the local side that called the cuda_getmemhandle. - * There is nothing to be done so just return. - */ -int cuda_ungetmemhandle(void *reg_data, mca_rcache_base_registration_t *reg) -{ - opal_output_verbose(10, mca_common_cuda_output, "CUDA: cuda_ungetmemhandle (no-op): base=%p", - reg->base); - CUDA_DUMP_MEMHANDLE( - (100, ((mca_rcache_common_cuda_reg_t *) reg)->data.memHandle, "cuda_ungetmemhandle")); - - return OPAL_SUCCESS; -} - -/* - * Open a memory handle that refers to remote memory so we can get an address - * that works on the local side. This is the registration function for the - * remote side of a transfer. newreg contains the new handle. hddrreg contains - * the memory handle that was received from the remote side. - */ -int cuda_openmemhandle(void *base, size_t size, mca_rcache_base_registration_t *newreg, - mca_rcache_base_registration_t *hdrreg) -{ - CUresult result; - CUipcMemHandle *memHandle; - mca_rcache_common_cuda_reg_t *cuda_newreg = (mca_rcache_common_cuda_reg_t *) newreg; - - /* Save in local variable to avoid ugly casting */ - memHandle = (CUipcMemHandle *) cuda_newreg->data.memHandle; - CUDA_DUMP_MEMHANDLE((100, memHandle, "Before call to cuIpcOpenMemHandle")); - - /* Open the memory handle and store it into the registration structure. 
*/ - result = cuFunc.cuIpcOpenMemHandle((CUdeviceptr *) &newreg->alloc_base, *memHandle, - CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); - - /* If there are some stale entries in the cache, they can cause other - * registrations to fail. Let the caller know that so that can attempt - * to clear them out. */ - if (CUDA_ERROR_ALREADY_MAPPED == result) { - opal_output_verbose(10, mca_common_cuda_output, - "CUDA: cuIpcOpenMemHandle returned CUDA_ERROR_ALREADY_MAPPED for " - "p=%p,size=%d: notify memory pool\n", - base, (int) size); - return OPAL_ERR_WOULD_BLOCK; - } - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuIpcOpenMemHandle failed", true, - OPAL_PROC_MY_HOSTNAME, result, base); - /* Currently, this is a non-recoverable error */ - return OPAL_ERROR; - } else { - opal_output_verbose(10, mca_common_cuda_output, - "CUDA: cuIpcOpenMemHandle passed: base=%p (remote base=%p,size=%d)", - newreg->alloc_base, base, (int) size); - CUDA_DUMP_MEMHANDLE((200, memHandle, "cuIpcOpenMemHandle")); - } - - return OPAL_SUCCESS; -} - -/* - * Close a memory handle that refers to remote memory. - */ -int cuda_closememhandle(void *reg_data, mca_rcache_base_registration_t *reg) -{ - CUresult result; - mca_rcache_common_cuda_reg_t *cuda_reg = (mca_rcache_common_cuda_reg_t *) reg; - - /* Only attempt to close if we have valid context. This can change if a call - * to the fini function is made and we discover context is gone. */ - if (ctx_ok) { - result = cuFunc.cuIpcCloseMemHandle((CUdeviceptr) cuda_reg->base.alloc_base); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - if (CUDA_ERROR_DEINITIALIZED != result) { - opal_show_help("help-mpi-common-cuda.txt", "cuIpcCloseMemHandle failed", true, - result, cuda_reg->base.alloc_base); - } - /* We will just continue on and hope things continue to work. 
*/ - } else { - opal_output_verbose(10, mca_common_cuda_output, - "CUDA: cuIpcCloseMemHandle passed: base=%p", - cuda_reg->base.alloc_base); - CUDA_DUMP_MEMHANDLE((100, cuda_reg->data.memHandle, "cuIpcCloseMemHandle")); - } - } - - return OPAL_SUCCESS; -} - -void mca_common_cuda_construct_event_and_handle(uintptr_t *event, void *handle) -{ - CUresult result; - - result = cuFunc.cuEventCreate((CUevent *) event, - CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", true, - OPAL_PROC_MY_HOSTNAME, result); - } - - result = cuFunc.cuIpcGetEventHandle((CUipcEventHandle *) handle, (CUevent) *event); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuIpcGetEventHandle failed", true, result); - } - - CUDA_DUMP_EVTHANDLE((10, handle, "construct_event_and_handle")); -} - -void mca_common_cuda_destruct_event(uintptr_t event) -{ - CUresult result; - - /* Only attempt to destroy if we have valid context. This can change if a call - * to the fini function is made and we discover context is gone. */ - if (ctx_ok) { - result = cuFunc.cuEventDestroy((CUevent) event); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed", true, result); - } - } -} - -/* - * Put remote event on stream to ensure that the the start of the - * copy does not start until the completion of the event. 
- */ -void mca_common_wait_stream_synchronize(mca_rcache_common_cuda_reg_t *rget_reg) -{ -#if OPAL_CUDA_SYNC_MEMOPS - /* No need for any of this with SYNC_MEMOPS feature */ - return; -#else /* OPAL_CUDA_SYNC_MEMOPS */ - CUipcEventHandle evtHandle; - CUevent event; - CUresult result; - - memcpy(&evtHandle, rget_reg->data.evtHandle, sizeof(evtHandle)); - CUDA_DUMP_EVTHANDLE((100, &evtHandle, "stream_synchronize")); - - result = cuFunc.cuIpcOpenEventHandle(&event, evtHandle); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuIpcOpenEventHandle failed", true, result); - } - - /* BEGIN of Workaround - There is a bug in CUDA 4.1 RC2 and earlier - * versions. Need to record an event on the stream, even though - * it is not used, to make sure we do not short circuit our way - * out of the cuStreamWaitEvent test. - */ - result = cuFunc.cuEventRecord(event, 0); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", true, - OPAL_PROC_MY_HOSTNAME, result); - } - /* END of Workaround */ - - result = cuFunc.cuStreamWaitEvent(0, event, 0); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuStreamWaitEvent failed", true, result); - } - - /* All done with this event. */ - result = cuFunc.cuEventDestroy(event); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed", true, result); - } -#endif /* OPAL_CUDA_SYNC_MEMOPS */ -} - -/* - * Start the asynchronous copy. Then record and save away an event that will - * be queried to indicate the copy has completed. - */ -int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg, - struct mca_btl_base_descriptor_t *frag, int *done) -{ - CUresult result; - int iter; - - OPAL_THREAD_LOCK(&common_cuda_ipc_lock); - /* First make sure there is room to store the event. If not, then - * return an error. 
The error message will tell the user to try and - * run again, but with a larger array for storing events. */ - if (cuda_event_ipc_num_used == cuda_event_max) { - opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles", true, cuda_event_max, - cuda_event_max + 100, cuda_event_max + 100); - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - if (cuda_event_ipc_num_used > cuda_event_ipc_most) { - cuda_event_ipc_most = cuda_event_ipc_num_used; - /* Just print multiples of 10 */ - if (0 == (cuda_event_ipc_most % 10)) { - opal_output_verbose(20, mca_common_cuda_output, "Maximum ipc events used is now %d", - cuda_event_ipc_most); - } - } - - /* This is the standard way to run. Running with synchronous copies is available - * to measure the advantages of asynchronous copies. */ - if (OPAL_LIKELY(mca_common_cuda_async)) { - result = cuFunc.cuMemcpyAsync((CUdeviceptr) dst, (CUdeviceptr) src, amount, ipcStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed", true, dst, src, - amount, result); - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return OPAL_ERROR; - } else { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d", dst, src, - (int) amount); - } - result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", true, - OPAL_PROC_MY_HOSTNAME, result); - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return OPAL_ERROR; - } - cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag; - - /* Bump up the first available slot and number used by 1 */ - cuda_event_ipc_first_avail++; - if (cuda_event_ipc_first_avail >= cuda_event_max) { - cuda_event_ipc_first_avail = 0; - } - cuda_event_ipc_num_used++; - - *done = 0; - } else { - /* Mimic the async function so they use 
the same memcpy call. */ - result = cuFunc.cuMemcpyAsync((CUdeviceptr) dst, (CUdeviceptr) src, amount, ipcStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed", true, dst, src, - amount, result); - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return OPAL_ERROR; - } else { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d", dst, src, - (int) amount); - } - - /* Record an event, then wait for it to complete with calls to cuEventQuery */ - result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", true, - OPAL_PROC_MY_HOSTNAME, result); - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return OPAL_ERROR; - } - - cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag; - - /* Bump up the first available slot and number used by 1 */ - cuda_event_ipc_first_avail++; - if (cuda_event_ipc_first_avail >= cuda_event_max) { - cuda_event_ipc_first_avail = 0; - } - cuda_event_ipc_num_used++; - - result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]); - if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed", true, result); - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return OPAL_ERROR; - } - - iter = 0; - while (CUDA_ERROR_NOT_READY == result) { - if (0 == (iter % 10)) { - opal_output(-1, "EVENT NOT DONE (iter=%d)", iter); - } - result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]); - if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed", true, result); - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return OPAL_ERROR; - } - iter++; - } - - --cuda_event_ipc_num_used; - ++cuda_event_ipc_first_used; - if 
(cuda_event_ipc_first_used >= cuda_event_max) { - cuda_event_ipc_first_used = 0; - } - *done = 1; - } - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return OPAL_SUCCESS; -} - -/* - * Record an event and save the frag. This is called by the sending side and - * is used to queue an event when a htod copy has been initiated. - */ -int mca_common_cuda_record_dtoh_event(char *msg, struct mca_btl_base_descriptor_t *frag) -{ - CUresult result; - - /* First make sure there is room to store the event. If not, then - * return an error. The error message will tell the user to try and - * run again, but with a larger array for storing events. */ - OPAL_THREAD_LOCK(&common_cuda_dtoh_lock); - if (cuda_event_dtoh_num_used == cuda_event_max) { - opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles", true, cuda_event_max, - cuda_event_max + 100, cuda_event_max + 100); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - if (cuda_event_dtoh_num_used > cuda_event_dtoh_most) { - cuda_event_dtoh_most = cuda_event_dtoh_num_used; - /* Just print multiples of 10 */ - if (0 == (cuda_event_dtoh_most % 10)) { - opal_output_verbose(20, mca_common_cuda_output, "Maximum DtoH events used is now %d", - cuda_event_dtoh_most); - } - } - - result = cuFunc.cuEventRecord(cuda_event_dtoh_array[cuda_event_dtoh_first_avail], dtohStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", true, - OPAL_PROC_MY_HOSTNAME, result); - OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock); - return OPAL_ERROR; - } - cuda_event_dtoh_frag_array[cuda_event_dtoh_first_avail] = frag; - - /* Bump up the first available slot and number used by 1 */ - cuda_event_dtoh_first_avail++; - if (cuda_event_dtoh_first_avail >= cuda_event_max) { - cuda_event_dtoh_first_avail = 0; - } - cuda_event_dtoh_num_used++; - - OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock); - return OPAL_SUCCESS; -} - -/* - * Record an event and save the frag. 
This is called by the receiving side and - * is used to queue an event when a dtoh copy has been initiated. - */ -int mca_common_cuda_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *frag) -{ - CUresult result; - - OPAL_THREAD_LOCK(&common_cuda_htod_lock); - /* First make sure there is room to store the event. If not, then - * return an error. The error message will tell the user to try and - * run again, but with a larger array for storing events. */ - if (cuda_event_htod_num_used == cuda_event_max) { - opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles", true, cuda_event_max, - cuda_event_max + 100, cuda_event_max + 100); - OPAL_THREAD_UNLOCK(&common_cuda_htod_lock); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - if (cuda_event_htod_num_used > cuda_event_htod_most) { - cuda_event_htod_most = cuda_event_htod_num_used; - /* Just print multiples of 10 */ - if (0 == (cuda_event_htod_most % 10)) { - opal_output_verbose(20, mca_common_cuda_output, "Maximum HtoD events used is now %d", - cuda_event_htod_most); - } - } - - result = cuFunc.cuEventRecord(cuda_event_htod_array[cuda_event_htod_first_avail], htodStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", true, - OPAL_PROC_MY_HOSTNAME, result); - OPAL_THREAD_UNLOCK(&common_cuda_htod_lock); - return OPAL_ERROR; - } - cuda_event_htod_frag_array[cuda_event_htod_first_avail] = frag; - - /* Bump up the first available slot and number used by 1 */ - cuda_event_htod_first_avail++; - if (cuda_event_htod_first_avail >= cuda_event_max) { - cuda_event_htod_first_avail = 0; - } - cuda_event_htod_num_used++; - - OPAL_THREAD_UNLOCK(&common_cuda_htod_lock); - return OPAL_SUCCESS; -} - -/** - * Used to get the dtoh stream for initiating asynchronous copies. - */ -void *mca_common_cuda_get_dtoh_stream(void) -{ - return (void *) dtohStream; -} - -/** - * Used to get the htod stream for initiating asynchronous copies. 
- */ -void *mca_common_cuda_get_htod_stream(void) -{ - return (void *) htodStream; -} - -/* - * Function is called every time progress is called with the sm BTL. If there - * are outstanding events, check to see if one has completed. If so, hand - * back the fragment for further processing. - */ -int progress_one_cuda_ipc_event(struct mca_btl_base_descriptor_t **frag) -{ - CUresult result; - - if (OPAL_LIKELY(0 == cuda_event_ipc_num_used)) - return 0; - - OPAL_THREAD_LOCK(&common_cuda_ipc_lock); - if (cuda_event_ipc_num_used > 0) { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: progress_one_cuda_ipc_event, outstanding_events=%d", - cuda_event_ipc_num_used); - - result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]); - - /* We found an event that is not ready, so return. */ - if (CUDA_ERROR_NOT_READY == result) { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY"); - *frag = NULL; - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return 0; - } else if (CUDA_SUCCESS != result) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed", true, result); - *frag = NULL; - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return OPAL_ERROR; - } - - *frag = cuda_event_ipc_frag_array[cuda_event_ipc_first_used]; - opal_output_verbose(10, mca_common_cuda_output, "CUDA: cuEventQuery returned %d", result); - - /* Bump counters, loop around the circular buffer if necessary */ - --cuda_event_ipc_num_used; - ++cuda_event_ipc_first_used; - if (cuda_event_ipc_first_used >= cuda_event_max) { - cuda_event_ipc_first_used = 0; - } - /* A return value of 1 indicates an event completed and a frag was returned */ - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return 1; - } - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return 0; -} - -/** - * Progress any dtoh event completions. 
- */ -int progress_one_cuda_dtoh_event(struct mca_btl_base_descriptor_t **frag) -{ - CUresult result; - - OPAL_THREAD_LOCK(&common_cuda_dtoh_lock); - if (cuda_event_dtoh_num_used > 0) { - opal_output_verbose(30, mca_common_cuda_output, - "CUDA: progress_one_cuda_dtoh_event, outstanding_events=%d", - cuda_event_dtoh_num_used); - - result = cuFunc.cuEventQuery(cuda_event_dtoh_array[cuda_event_dtoh_first_used]); - - /* We found an event that is not ready, so return. */ - if (CUDA_ERROR_NOT_READY == result) { - opal_output_verbose(30, mca_common_cuda_output, - "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY"); - *frag = NULL; - OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock); - return 0; - } else if (CUDA_SUCCESS != result) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed", true, result); - *frag = NULL; - OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock); - return OPAL_ERROR; - } - - *frag = cuda_event_dtoh_frag_array[cuda_event_dtoh_first_used]; - opal_output_verbose(30, mca_common_cuda_output, "CUDA: cuEventQuery returned %d", result); - - /* Bump counters, loop around the circular buffer if necessary */ - --cuda_event_dtoh_num_used; - ++cuda_event_dtoh_first_used; - if (cuda_event_dtoh_first_used >= cuda_event_max) { - cuda_event_dtoh_first_used = 0; - } - /* A return value of 1 indicates an event completed and a frag was returned */ - OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock); - return 1; - } - OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock); - return 0; -} - -/** - * Progress any dtoh event completions. 
- */ -int progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t **frag) -{ - CUresult result; - - OPAL_THREAD_LOCK(&common_cuda_htod_lock); - if (cuda_event_htod_num_used > 0) { - opal_output_verbose(30, mca_common_cuda_output, - "CUDA: progress_one_cuda_htod_event, outstanding_events=%d", - cuda_event_htod_num_used); - - result = cuFunc.cuEventQuery(cuda_event_htod_array[cuda_event_htod_first_used]); - - /* We found an event that is not ready, so return. */ - if (CUDA_ERROR_NOT_READY == result) { - opal_output_verbose(30, mca_common_cuda_output, - "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY"); - *frag = NULL; - OPAL_THREAD_UNLOCK(&common_cuda_htod_lock); - return 0; - } else if (CUDA_SUCCESS != result) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed", true, result); - *frag = NULL; - OPAL_THREAD_UNLOCK(&common_cuda_htod_lock); - return OPAL_ERROR; - } - - *frag = cuda_event_htod_frag_array[cuda_event_htod_first_used]; - opal_output_verbose(30, mca_common_cuda_output, "CUDA: cuEventQuery returned %d", result); - - /* Bump counters, loop around the circular buffer if necessary */ - --cuda_event_htod_num_used; - ++cuda_event_htod_first_used; - if (cuda_event_htod_first_used >= cuda_event_max) { - cuda_event_htod_first_used = 0; - } - /* A return value of 1 indicates an event completed and a frag was returned */ - OPAL_THREAD_UNLOCK(&common_cuda_htod_lock); - return 1; - } - OPAL_THREAD_UNLOCK(&common_cuda_htod_lock); - return OPAL_ERR_RESOURCE_BUSY; -} - -/** - * Need to make sure the handle we are retrieving from the cache is still - * valid. Compare the cached handle to the one received. - */ -int mca_common_cuda_memhandle_matches(mca_rcache_common_cuda_reg_t *new_reg, - mca_rcache_common_cuda_reg_t *old_reg) -{ - - if (0 - == memcmp(new_reg->data.memHandle, old_reg->data.memHandle, - sizeof(new_reg->data.memHandle))) { - return 1; - } else { - return 0; - } -} - -/* - * Function to dump memory handle information. 
This is based on - * definitions from cuiinterprocess_private.h. - */ -static void cuda_dump_memhandle(int verbose, void *memHandle, char *str) -{ - - struct InterprocessMemHandleInternal { - /* The first two entries are the CUinterprocessCtxHandle */ - int64_t ctxId; /* unique (within a process) id of the sharing context */ - int pid; /* pid of sharing context */ - - int64_t size; - int64_t blocksize; - int64_t offset; - int gpuId; - int subDeviceIndex; - int64_t serial; - } memH; - - if (NULL == str) { - str = "CUDA"; - } - memcpy(&memH, memHandle, sizeof(memH)); - opal_output_verbose(verbose, mca_common_cuda_output, - "%s:ctxId=0x%" PRIx64 ", pid=%d, size=%" PRIu64 ", blocksize=%" PRIu64 - ", offset=%" PRIu64 ", gpuId=%d, subDeviceIndex=%d, serial=%" PRIu64, - str, memH.ctxId, memH.pid, memH.size, memH.blocksize, memH.offset, - memH.gpuId, memH.subDeviceIndex, memH.serial); -} - -/* - * Function to dump memory handle information. This is based on - * definitions from cuiinterprocess_private.h. - */ -static void cuda_dump_evthandle(int verbose, void *evtHandle, char *str) -{ - - struct InterprocessEventHandleInternal { - unsigned long pid; - unsigned long serial; - int index; - } evtH; - - if (NULL == str) { - str = "CUDA"; - } - memcpy(&evtH, evtHandle, sizeof(evtH)); - opal_output_verbose(verbose, mca_common_cuda_output, "CUDA: %s:pid=%lu, serial=%lu, index=%d", - str, evtH.pid, evtH.serial, evtH.index); -} - -/* Return microseconds of elapsed time. Microseconds are relevant when - * trying to understand the fixed overhead of the communication. Used - * when trying to time various functions. - * - * Cut and past the following to get timings where wanted. 
- * - * clock_gettime(CLOCK_MONOTONIC, &ts_start); - * FUNCTION OF INTEREST - * clock_gettime(CLOCK_MONOTONIC, &ts_end); - * accum = mydifftime(ts_start, ts_end); - * opal_output(0, "Function took %7.2f usecs\n", accum); - * - */ -#if OPAL_ENABLE_DEBUG -static float mydifftime(opal_timer_t ts_start, opal_timer_t ts_end) -{ - return (ts_end - ts_start); -} -#endif /* OPAL_ENABLE_DEBUG */ - -/* Routines that get plugged into the opal datatype code */ -static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t *convertor) -{ - int res; - CUmemorytype memType = 0; - CUdeviceptr dbuf = (CUdeviceptr) pUserBuf; - CUcontext ctx = NULL, memCtx = NULL; -#if OPAL_CUDA_GET_ATTRIBUTES - uint32_t isManaged = 0; - /* With CUDA 7.0, we can get multiple attributes with a single call */ - CUpointer_attribute attributes[3] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - CU_POINTER_ATTRIBUTE_CONTEXT, - CU_POINTER_ATTRIBUTE_IS_MANAGED}; - void *attrdata[] = {(void *) &memType, (void *) &memCtx, (void *) &isManaged}; - - res = cuFunc.cuPointerGetAttributes(3, attributes, attrdata, dbuf); - OPAL_OUTPUT_VERBOSE((101, mca_common_cuda_output, - "dbuf=%p, memType=%d, memCtx=%p, isManaged=%d, res=%d", (void *) dbuf, - (int) memType, (void *) memCtx, isManaged, res)); - - /* Mark unified memory buffers with a flag. This will allow all unified - * memory to be forced through host buffers. Note that this memory can - * be either host or device so we need to set this flag prior to that check. */ - if (1 == isManaged) { - if (NULL != convertor) { - convertor->flags |= CONVERTOR_ACCELERATOR_UNIFIED; - } - } - if (res != CUDA_SUCCESS) { - /* If we cannot determine it is device pointer, - * just assume it is not. 
*/ - return 0; - } else if (memType == CU_MEMORYTYPE_HOST) { - /* Host memory, nothing to do here */ - return 0; - } else if (memType == 0) { - /* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */ - return 0; - } - /* Must be a device pointer */ - assert(memType == CU_MEMORYTYPE_DEVICE); -#else /* OPAL_CUDA_GET_ATTRIBUTES */ - res = cuFunc.cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf); - if (res != CUDA_SUCCESS) { - /* If we cannot determine it is device pointer, - * just assume it is not. */ - return 0; - } else if (memType == CU_MEMORYTYPE_HOST) { - /* Host memory, nothing to do here */ - return 0; - } - /* Must be a device pointer */ - assert(memType == CU_MEMORYTYPE_DEVICE); -#endif /* OPAL_CUDA_GET_ATTRIBUTES */ - - /* This piece of code was added in to handle in a case involving - * OMP threads. The user had initialized CUDA and then spawned - * two threads. The first thread had the CUDA context, but the - * second thread did not. We therefore had no context to act upon - * and future CUDA driver calls would fail. Therefore, if we have - * GPU memory, but no context, get the context from the GPU memory - * and set the current context to that. It is rare that we will not - * have a context. 
*/ - res = cuFunc.cuCtxGetCurrent(&ctx); - if (OPAL_UNLIKELY(NULL == ctx)) { - if (CUDA_SUCCESS == res) { -#if !OPAL_CUDA_GET_ATTRIBUTES - res = cuFunc.cuPointerGetAttribute(&memCtx, CU_POINTER_ATTRIBUTE_CONTEXT, dbuf); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_output(0, - "CUDA: error calling cuPointerGetAttribute: " - "res=%d, ptr=%p aborting...", - res, pUserBuf); - return OPAL_ERROR; - } -#endif /* OPAL_CUDA_GET_ATTRIBUTES */ - res = cuFunc.cuCtxSetCurrent(memCtx); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_output(0, - "CUDA: error calling cuCtxSetCurrent: " - "res=%d, ptr=%p aborting...", - res, pUserBuf); - return OPAL_ERROR; - } else { - OPAL_OUTPUT_VERBOSE( - (10, mca_common_cuda_output, "CUDA: cuCtxSetCurrent passed: ptr=%p", pUserBuf)); - } - } else { - /* Print error and proceed */ - opal_output(0, - "CUDA: error calling cuCtxGetCurrent: " - "res=%d, ptr=%p aborting...", - res, pUserBuf); - return OPAL_ERROR; - } - } - - /* WORKAROUND - They are times when the above code determines a piece of memory - * is GPU memory, but it actually is not. That has been seen on multi-GPU systems - * with 6 or 8 GPUs on them. Therefore, we will do this extra check. Note if we - * made it this far, then the assumption at this point is we have GPU memory. - * Unfortunately, this extra call is costing us another 100 ns almost doubling - * the cost of this entire function. */ - if (OPAL_LIKELY(mca_common_cuda_gpu_mem_check_workaround)) { - CUdeviceptr pbase; - size_t psize; - res = cuFunc.cuMemGetAddressRange(&pbase, &psize, dbuf); - if (CUDA_SUCCESS != res) { - opal_output_verbose(5, mca_common_cuda_output, - "CUDA: cuMemGetAddressRange failed on this pointer: res=%d, buf=%p " - "Overriding check and setting to host pointer. ", - res, (void *) dbuf); - /* This cannot be GPU memory if the previous call failed */ - return 0; - } - } - - /* First access on a device pointer finalizes CUDA support initialization. - * If initialization fails, disable support. 
*/ - if (!stage_three_init_complete) { - if (0 != mca_common_cuda_stage_three_init()) { - opal_cuda_support = 0; - } - } - - return 1; -} - -static int mca_common_cuda_cu_memcpy_async(void *dest, const void *src, size_t size, - opal_convertor_t *convertor) -{ - return cuFunc.cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, - (CUstream) convertor->stream); -} - -/** - * This function is plugged into various areas where a cuMemcpy would be called. - * This is a synchronous operation that will not return until the copy is complete. - */ -static int mca_common_cuda_cu_memcpy(void *dest, const void *src, size_t size) -{ - CUresult result; -#if OPAL_ENABLE_DEBUG - CUmemorytype memTypeSrc, memTypeDst; - if (OPAL_UNLIKELY(mca_common_cuda_cumemcpy_timing)) { - /* Nice to know type of source and destination for timing output. Do - * not care about return code as memory type will just be set to 0 */ - result = cuFunc.cuPointerGetAttribute(&memTypeDst, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - (CUdeviceptr) dest); - result = cuFunc.cuPointerGetAttribute(&memTypeSrc, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - (CUdeviceptr) src); - ts_start = opal_timer_base_get_usec(); - } -#endif - if (mca_common_cuda_cumemcpy_async) { - result = cuFunc.cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, memcpyStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed", true, dest, src, - size, result); - return OPAL_ERROR; - } - result = cuFunc.cuStreamSynchronize(memcpyStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuStreamSynchronize failed", true, - OPAL_PROC_MY_HOSTNAME, result); - return OPAL_ERROR; - } - } else { - result = cuFunc.cuMemcpy((CUdeviceptr) dest, (CUdeviceptr) src, size); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuMemcpy failed", true, - OPAL_PROC_MY_HOSTNAME, result); - return OPAL_ERROR; - } - } -#if 
OPAL_ENABLE_DEBUG - if (OPAL_UNLIKELY(mca_common_cuda_cumemcpy_timing)) { - ts_end = opal_timer_base_get_usec(); - accum = mydifftime(ts_start, ts_end); - if (mca_common_cuda_cumemcpy_async) { - opal_output(0, - "cuMemcpyAsync took %7.2f usecs, size=%d, (src=%p (%d), dst=%p (%d))\n", - accum, (int) size, src, memTypeSrc, dest, memTypeDst); - } else { - opal_output(0, "cuMemcpy took %7.2f usecs, size=%d, (src=%p (%d), dst=%p (%d))\n", - accum, (int) size, src, memTypeSrc, dest, memTypeDst); - } - } -#endif - return OPAL_SUCCESS; -} - -int mca_common_cuda_malloc(void **dptr, size_t size) -{ - int res, count = 0; - if (size > 0) { - res = cuFunc.cuMemAlloc((CUdeviceptr *) dptr, size); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_output(0, "CUDA: cuMemAlloc failed: res=%d", res); - return res; - } - } - return 0; -} - -int mca_common_cuda_free(void *dptr) -{ - int res; - if (NULL != dptr) { - res = cuFunc.cuMemFree((CUdeviceptr) dptr); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_output(0, "CUDA: cuMemFree failed: res=%d", res); - return res; - } - } - return 0; -} - -static int mca_common_cuda_memmove(void *dest, void *src, size_t size) -{ - CUdeviceptr tmp; - int result; - - result = cuFunc.cuMemAlloc(&tmp, size); - if (mca_common_cuda_cumemcpy_async) { - result = cuFunc.cuMemcpyAsync(tmp, (CUdeviceptr) src, size, memcpyStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed", true, tmp, src, size, - result); - return OPAL_ERROR; - } - result = cuFunc.cuMemcpyAsync((CUdeviceptr) dest, tmp, size, memcpyStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed", true, dest, tmp, - size, result); - return OPAL_ERROR; - } - result = cuFunc.cuStreamSynchronize(memcpyStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuStreamSynchronize failed", true, - OPAL_PROC_MY_HOSTNAME, 
result); - return OPAL_ERROR; - } - } else { - result = cuFunc.cuMemcpy(tmp, (CUdeviceptr) src, size); - if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { - opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", - result, (void *) tmp, src, (int) size); - return OPAL_ERROR; - } - result = cuFunc.cuMemcpy((CUdeviceptr) dest, tmp, size); - if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { - opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", - result, dest, (void *) tmp, (int) size); - return OPAL_ERROR; - } - } - cuFunc.cuMemFree(tmp); - return OPAL_SUCCESS; -} - -int mca_common_cuda_get_device(int *devicenum) -{ - CUdevice cuDev; - int res; - - res = cuFunc.cuCtxGetDevice(&cuDev); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_output(0, "CUDA: cuCtxGetDevice failed: res=%d", res); - return res; - } - *devicenum = cuDev; - return 0; -} - -int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2) -{ - int res; - res = cuFunc.cuDeviceCanAccessPeer(access, (CUdevice) dev1, (CUdevice) dev2); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_output(0, "CUDA: cuDeviceCanAccessPeer failed: res=%d", res); - return res; - } - return 0; -} - -int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base) -{ - CUresult result; - result = cuFunc.cuMemGetAddressRange((CUdeviceptr *) pbase, psize, (CUdeviceptr) base); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuMemGetAddressRange failed 2", true, - OPAL_PROC_MY_HOSTNAME, result, base); - return OPAL_ERROR; - } else { - opal_output_verbose(50, mca_common_cuda_output, - "CUDA: cuMemGetAddressRange passed: addr=%p, pbase=%p, psize=%lu ", - base, *(char **) pbase, *psize); - } - return 0; -} - -#if OPAL_CUDA_GDR_SUPPORT -/* Check to see if the memory was freed between the time it was stored in - * the registration cache and now. Return true if the memory was previously - * freed. 
This is indicated by the BUFFER_ID value in the registration cache - * not matching the BUFFER_ID of the buffer we are checking. Return false - * if the registration is still good. - */ -bool mca_common_cuda_previously_freed_memory(mca_rcache_base_registration_t *reg) -{ - int res; - unsigned long long bufID; - unsigned char *dbuf = reg->base; - - res = cuFunc.cuPointerGetAttribute(&bufID, CU_POINTER_ATTRIBUTE_BUFFER_ID, (CUdeviceptr) dbuf); - /* If we cannot determine the BUFFER_ID, then print a message and default - * to forcing the registration to be kicked out. */ - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_show_help("help-mpi-common-cuda.txt", "bufferID failed", true, OPAL_PROC_MY_HOSTNAME, - res); - return true; - } - opal_output_verbose(50, mca_common_cuda_output, - "CUDA: base=%p, bufID=%llu, reg->gpu_bufID=%llu, %s", dbuf, bufID, - reg->gpu_bufID, - (reg->gpu_bufID == bufID ? "BUFFER_ID match" : "BUFFER_ID do not match")); - if (bufID != reg->gpu_bufID) { - return true; - } else { - return false; - } -} - -/* - * Get the buffer ID from the memory and store it in the registration. - * This is needed to ensure the cached registration is not stale. If - * we fail to get buffer ID, print an error and set buffer ID to 0. - * Also set SYNC_MEMOPS on any GPU registration to ensure that - * synchronous copies complete before the buffer is accessed. 
- */ -void mca_common_cuda_get_buffer_id(mca_rcache_base_registration_t *reg) -{ - int res; - unsigned long long bufID = 0; - unsigned char *dbuf = reg->base; - int enable = 1; - - res = cuFunc.cuPointerGetAttribute(&bufID, CU_POINTER_ATTRIBUTE_BUFFER_ID, (CUdeviceptr) dbuf); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_show_help("help-mpi-common-cuda.txt", "bufferID failed", true, OPAL_PROC_MY_HOSTNAME, - res); - } - reg->gpu_bufID = bufID; - - res = cuFunc.cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, - (CUdeviceptr) dbuf); - if (OPAL_UNLIKELY(CUDA_SUCCESS != res)) { - opal_show_help("help-mpi-common-cuda.txt", "cuPointerSetAttribute failed", true, - OPAL_PROC_MY_HOSTNAME, res, dbuf); - } -} - -static bool initialized = false; -int opal_cuda_verbose = 0; -static int opal_cuda_enabled = 0; /* Starts out disabled */ -static int opal_cuda_output = 0; -static void opal_cuda_support_init(void); -static int (*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL; -static opal_common_cuda_function_table_t ftable; - -/* This function allows the common cuda code to register an - * initialization function that gets called the first time an attempt - * is made to send or receive a GPU pointer. This allows us to delay - * some CUDA initialization until after MPI_Init(). - */ -void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)) -{ - common_cuda_initialization_function = fptr; -} - -/** - * This function is called when a convertor is instantiated. It has to call - * the opal_cuda_support_init() function once to figure out if CUDA support - * is enabled or not. If CUDA is not enabled, then short circuit out - * for all future calls. 
- */ -void mca_cuda_convertor_init(opal_convertor_t *convertor, const void *pUserBuf) -{ - /* Only do the initialization on the first GPU access */ - if (!initialized) { - opal_cuda_support_init(); - } - - /* This is needed to handle case where convertor is not fully initialized - * like when trying to do a sendi with convertor on the statck */ - convertor->cbmemcpy = (memcpy_fct_t) &opal_cuda_memcpy; - - /* If not enabled, then nothing else to do */ - if (!opal_cuda_enabled) { - return; - } - - if (ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) { - convertor->flags |= CONVERTOR_ACCELERATOR; - } -} - -/* Checks the type of pointer - * - * @param dest One pointer to check - * @param source Another pointer to check - */ -bool opal_cuda_check_bufs(char *dest, char *src) -{ - /* Only do the initialization on the first GPU access */ - if (!initialized) { - opal_cuda_support_init(); - } - - if (!opal_cuda_enabled) { - return false; - } - - if (ftable.gpu_is_gpu_buffer(dest, NULL) || ftable.gpu_is_gpu_buffer(src, NULL)) { - return true; - } else { - return false; - } -} - -/* - * With CUDA enabled, all contiguous copies will pass through this function. - * Therefore, the first check is to see if the convertor is a GPU buffer. - * Note that if there is an error with any of the CUDA calls, the program - * aborts as there is no recovering. - */ - -/* Checks the type of pointer - * - * @param buf check one pointer providing a convertor. - * Provides additional information, e.g. managed vs. unmanaged GPU buffer - */ -bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor) -{ - /* Only do the initialization on the first GPU access */ - if (!initialized) { - opal_cuda_support_init(); - } - - if (!opal_cuda_enabled) { - return false; - } - - return (ftable.gpu_is_gpu_buffer(buf, convertor)); -} - -/* - * This function allocates a buffer using either cuMemAlloc - * or malloc, depending on if the convertor flag CONVERTOR_CUDA - * is set. 
- * - * @param size Size of buffer to be allocated - * @param convertor The convertor with flags describing if the buf - * should be a Host or Cuda buffer. - * - * @returns void * A pointer to the newly allocated buffer. - */ -void *opal_cuda_malloc(size_t size, opal_convertor_t *convertor) -{ - int res; - void *buffer; - if (!(convertor->flags & CONVERTOR_ACCELERATOR)) { - return malloc(size); - } - res = ftable.gpu_malloc(buffer, size); - if (res != 0) { - opal_output(0, "CUDA: Error in cuMemAlloc: size=%d", (int) size); - abort(); - } else { - return buffer; - } -} - -/* - * This function frees a buffer using either cuMemFree() or free(), - * depending on if the convertor flag CONVERTOR_CUDA is set. - * - * @param buffer Pointer to buffer to be freed - * @param convertor The convertor with flags describing if the buf - * should be a Host or Cuda buffer. - * - */ -void opal_cuda_free(void *buffer, opal_convertor_t *convertor) -{ - int res; - if (!(convertor->flags & CONVERTOR_ACCELERATOR)) { - free(buffer); - return; - } - res = ftable.gpu_free(buffer); - if (res != 0) { - opal_output(0, "CUDA: Error in cuMemFree: ptr=%p", buffer); - abort(); - } - return; -} - -/* - * With CUDA enabled, all contiguous copies will pass through this function. - * Therefore, the first check is to see if the convertor is a GPU buffer. - * Note that if there is an error with any of the CUDA calls, the program - * aborts as there is no recovering. 
- */ - -void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t *convertor) -{ - int res; - - if (!(convertor->flags & CONVERTOR_ACCELERATOR)) { - return memcpy(dest, src, size); - } - - if (convertor->flags & CONVERTOR_ACCELERATOR_ASYNC) { - res = ftable.gpu_cu_memcpy_async(dest, (void *) src, size, convertor); - } else { - res = ftable.gpu_cu_memcpy(dest, (void *) src, size); - } - - if (res != 0) { - opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, - (int) size); - abort(); - } else { - return dest; - } -} - -/* - * This function is needed in cases where we do not have contiguous - * datatypes. The current code has macros that cannot handle a convertor - * argument to the memcpy call. - */ -void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size) -{ - int res; - res = ftable.gpu_cu_memcpy(dest, src, size); - if (res != 0) { - opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, - (int) size); - abort(); - } else { - return dest; - } -} - -/* - * In some cases, need an implementation of memmove. This is not fast, but - * it is not often needed. - */ -void *opal_cuda_memmove(void *dest, void *src, size_t size) -{ - int res; - - res = ftable.gpu_memmove(dest, src, size); - if (res != 0) { - opal_output(0, "CUDA: Error in gpu memmove: res=%d, dest=%p, src=%p, size=%d", res, dest, - src, (int) size); - abort(); - } - return dest; -} - -/** - * This function gets called once to check if the program is running in a cuda - * environment. - */ -static void opal_cuda_support_init(void) -{ - if (initialized) { - return; - } - - /* Set different levels of verbosity in the cuda related code. */ - opal_cuda_output = opal_output_open(NULL); - opal_output_set_verbosity(opal_cuda_output, opal_cuda_verbose); - - /* Callback into the common cuda initialization routine. 
This is only - * set if some work had been done already in the common cuda code.*/ - if (NULL != common_cuda_initialization_function) { - if (0 == common_cuda_initialization_function(&ftable)) { - opal_cuda_enabled = 1; - } - } - - if (1 == opal_cuda_enabled) { - opal_output_verbose(10, opal_cuda_output, - "CUDA: enabled successfully, CUDA device pointers will work"); - } else { - opal_output_verbose(10, opal_cuda_output, - "CUDA: not enabled, CUDA device pointers will not work"); - } - - initialized = true; -} - -/** - * Tell the convertor that copies will be asynchronous CUDA copies. The - * flags are cleared when the convertor is reinitialized. - */ -void opal_cuda_set_copy_function_async(opal_convertor_t *convertor, void *stream) -{ - convertor->flags |= CONVERTOR_ACCELERATOR_ASYNC; - convertor->stream = stream; -} -#endif /* OPAL_CUDA_GDR_SUPPORT */ diff --git a/opal/cuda/common_cuda.h b/opal/cuda/common_cuda.h deleted file mode 100644 index 431fff6daa7..00000000000 --- a/opal/cuda/common_cuda.h +++ /dev/null @@ -1,139 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef OPAL_MCA_COMMON_CUDA_H -#define OPAL_MCA_COMMON_CUDA_H -#include "opal/datatype/opal_convertor.h" -#include "opal/mca/btl/btl.h" - -#define MEMHANDLE_SIZE 8 -#define EVTHANDLE_SIZE 8 - -struct mca_rcache_common_cuda_reg_data_t { - uint64_t memHandle[MEMHANDLE_SIZE]; - uint64_t evtHandle[EVTHANDLE_SIZE]; - uint64_t event; - opal_ptr_t memh_seg_addr; - size_t memh_seg_len; -}; -typedef struct mca_rcache_common_cuda_reg_data_t mca_rcache_common_cuda_reg_data_t; - -struct mca_rcache_common_cuda_reg_t { - mca_rcache_base_registration_t base; - mca_rcache_common_cuda_reg_data_t data; -}; -typedef struct mca_rcache_common_cuda_reg_t mca_rcache_common_cuda_reg_t; -extern bool mca_common_cuda_enabled; - -OPAL_DECLSPEC void mca_common_cuda_register_mca_variables(void); - -OPAL_DECLSPEC void mca_common_cuda_register(void *ptr, size_t amount, char *msg); - -OPAL_DECLSPEC void mca_common_cuda_unregister(void *ptr, char *msg); - -OPAL_DECLSPEC void mca_common_wait_stream_synchronize(mca_rcache_common_cuda_reg_t *rget_reg); - -OPAL_DECLSPEC int mca_common_cuda_malloc(void **buffer, size_t size); -OPAL_DECLSPEC int mca_common_cuda_free(void *buffer); - -OPAL_DECLSPEC int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg, - struct mca_btl_base_descriptor_t *, int *done); - -OPAL_DECLSPEC int mca_common_cuda_record_ipc_event(char *msg, - struct mca_btl_base_descriptor_t *frag); -OPAL_DECLSPEC int mca_common_cuda_record_dtoh_event(char *msg, - struct mca_btl_base_descriptor_t *frag); -OPAL_DECLSPEC int mca_common_cuda_record_htod_event(char *msg, - struct mca_btl_base_descriptor_t *frag); - -OPAL_DECLSPEC void *mca_common_cuda_get_dtoh_stream(void); -OPAL_DECLSPEC void *mca_common_cuda_get_htod_stream(void); - -OPAL_DECLSPEC int progress_one_cuda_ipc_event(struct mca_btl_base_descriptor_t **); -OPAL_DECLSPEC int progress_one_cuda_dtoh_event(struct mca_btl_base_descriptor_t 
**); -OPAL_DECLSPEC int progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t **); - -OPAL_DECLSPEC int mca_common_cuda_memhandle_matches(mca_rcache_common_cuda_reg_t *new_reg, - mca_rcache_common_cuda_reg_t *old_reg); - -OPAL_DECLSPEC void mca_common_cuda_construct_event_and_handle(uintptr_t *event, void *handle); -OPAL_DECLSPEC void mca_common_cuda_destruct_event(uintptr_t event); - -OPAL_DECLSPEC int cuda_getmemhandle(void *base, size_t, mca_rcache_base_registration_t *newreg, - mca_rcache_base_registration_t *hdrreg); -OPAL_DECLSPEC int cuda_ungetmemhandle(void *reg_data, mca_rcache_base_registration_t *reg); -OPAL_DECLSPEC int cuda_openmemhandle(void *base, size_t size, - mca_rcache_base_registration_t *newreg, - mca_rcache_base_registration_t *hdrreg); -OPAL_DECLSPEC int cuda_closememhandle(void *reg_data, mca_rcache_base_registration_t *reg); -OPAL_DECLSPEC int mca_common_cuda_get_device(int *devicenum); -OPAL_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2); -OPAL_DECLSPEC int mca_common_cuda_stage_one_init(void); -OPAL_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base); -OPAL_DECLSPEC void mca_common_cuda_fini(void); -#if OPAL_CUDA_GDR_SUPPORT -OPAL_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_rcache_base_registration_t *reg); -OPAL_DECLSPEC void mca_common_cuda_get_buffer_id(mca_rcache_base_registration_t *reg); -#endif /* OPAL_CUDA_GDR_SUPPORT */ -/** - * Return: 0 if no packing is required for sending (the upper layer - * can use directly the pointer to the contiguous user - * buffer). - * 1 if data does need to be packed, i.e. heterogeneous peers - * (source arch != dest arch) or non contiguous memory - * layout. 
- */ -static inline int32_t opal_convertor_cuda_need_buffers(opal_convertor_t *pConvertor) -{ - int32_t retval; - uint32_t cudaflag = pConvertor->flags & CONVERTOR_ACCELERATOR; /* Save CUDA flag */ - pConvertor->flags &= ~CONVERTOR_ACCELERATOR; /* Clear CUDA flag if it exists */ - retval = opal_convertor_need_buffers(pConvertor); - pConvertor->flags |= cudaflag; /* Restore CUDA flag */ - return retval; -} - -/* Structure to hold CUDA support functions that gets filled in when the - * common cuda code is initialized. This removes any dependency on - * in the opal cuda datatype code. */ -struct opal_common_cuda_function_table { - int (*gpu_is_gpu_buffer)(const void *, opal_convertor_t *); - int (*gpu_cu_memcpy_async)(void *, const void *, size_t, opal_convertor_t *); - int (*gpu_cu_memcpy)(void *, const void *, size_t); - int (*gpu_memmove)(void *, void *, size_t); - int (*gpu_malloc)(void *, size_t); - int (*gpu_free)(void *); -}; -typedef struct opal_common_cuda_function_table opal_common_cuda_function_table_t; - -void mca_cuda_convertor_init(opal_convertor_t *convertor, const void *pUserBuf); -bool opal_cuda_check_bufs(char *dest, char *src); -bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor); -void *opal_cuda_malloc(size_t size, opal_convertor_t *convertor); -void opal_cuda_free(void *buffer, opal_convertor_t *convertor); -void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t *convertor); -void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size); -void *opal_cuda_memmove(void *dest, void *src, size_t size); -void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)); -void opal_cuda_set_copy_function_async(opal_convertor_t *convertor, void *stream); - -#endif /* OPAL_MCA_COMMON_CUDA_H */ diff --git a/opal/cuda/help-mpi-common-cuda.txt b/opal/cuda/help-mpi-common-cuda.txt deleted file mode 100644 index e6f7913316b..00000000000 --- a/opal/cuda/help-mpi-common-cuda.txt +++ 
/dev/null @@ -1,212 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2011-2015 NVIDIA. All rights reserved. -# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -[cuCtxGetCurrent failed not initialized] -WARNING: The call to cuCtxGetCurrent() failed while attempting to register -internal memory with the CUDA environment. The program will continue to run, -but the performance of GPU memory transfers may be reduced. This failure -indicates that the CUDA environment is not yet initialized. To eliminate -this warning, ensure that CUDA is initialized prior to calling MPI_Init. - -NOTE: You can turn off this warning by setting the MCA parameter - mpi_common_cuda_warning to 0. -# -[cuCtxGetCurrent failed] -WARNING: The call to cuCtxGetCurrent() failed while attempting to register -internal memory with the CUDA environment. The program will continue to run, -but the performance of GPU memory transfers may be reduced. - cuCtxGetCurrent return value: %d - -NOTE: You can turn off this warning by setting the MCA parameter - mpi_common_cuda_warning to 0. -# -[cuCtxGetCurrent returned NULL] -WARNING: The call to cuCtxGetCurrent() failed while attempting to register -internal memory with the CUDA environment. The program will continue to run, -but the performance of GPU memory transfers may be reduced. This failure -indicates that there is no CUDA context yet. To eliminate this warning, -ensure that there is a CUDA context prior to calling MPI_Init. - -NOTE: You can turn off this warning by setting the MCA parameter - mpi_common_cuda_warning to 0. -# -[cuMemHostRegister during init failed] -The call to cuMemHostRegister(%p, %d, 0) failed. - Host: %s - cuMemHostRegister return value: %d - Registration cache: %s -# -[cuMemHostRegister failed] -The call to cuMemHostRegister(%p, %d, 0) failed. 
- Host: %s - cuMemHostRegister return value: %d - Registration cache: %s -# -[cuIpcGetMemHandle failed] -The call to cuIpcGetMemHandle failed. This means the GPU RDMA protocol -cannot be used. - cuIpcGetMemHandle return value: %d - address: %p -Check the cuda.h file for what the return value means. Perhaps a reboot -of the node will clear the problem. -# -[cuMemGetAddressRange failed] -The call to cuMemGetAddressRange failed. This means the GPU RDMA protocol -cannot be used. - cuMemGetAddressRange return value: %d - address: %p -Check the cuda.h file for what the return value means. Perhaps a reboot -of the node will clear the problem. -# -[cuMemGetAddressRange failed 2] -The call to cuMemGetAddressRange failed during the GPU RDMA protocol. - Host: %s - cuMemGetAddressRange return value: %d - address: %p -Check the cuda.h file for what the return value means. This is highly -unusual and should not happen. The program will probably abort. -# -[Out of cuEvent handles] -The library has exceeded its number of outstanding event handles. -For better performance, this number should be increased. - Current maximum handles: %4d - Suggested new maximum: %4d -Rerun with --mca mpi_common_cuda_event_max %d -# -[cuIpcOpenMemHandle failed] -The call to cuIpcOpenMemHandle failed. This is an unrecoverable error -and will cause the program to abort. - Hostname: %s - cuIpcOpenMemHandle return value: %d - address: %p -Check the cuda.h file for what the return value means. A possible cause -for this is not enough free device memory. Try to reduce the device -memory footprint of your application. -# -[cuIpcCloseMemHandle failed] -The call to cuIpcCloseMemHandle failed. This is a warning and the program -will continue to run. - cuIpcCloseMemHandle return value: %d - address: %p -Check the cuda.h file for what the return value means. Perhaps a reboot -of the node will clear the problem. -# -[cuMemcpyAsync failed] -The call to cuMemcpyAsync failed. 
This is a unrecoverable error and will -cause the program to abort. - cuMemcpyAsync(%p, %p, %d) returned value %d -Check the cuda.h file for what the return value means. -# -[cuEventCreate failed] -The call to cuEventCreate failed. This is a unrecoverable error and will -cause the program to abort. - Hostname: %s - cuEventCreate return value: %d -Check the cuda.h file for what the return value means. -# -[cuEventRecord failed] -The call to cuEventRecord failed. This is a unrecoverable error and will -cause the program to abort. - Hostname: %s - cuEventRecord return value: %d -Check the cuda.h file for what the return value means. -# -[cuEventQuery failed] -The call to cuEventQuery failed. This is a unrecoverable error and will -cause the program to abort. - cuEventQuery return value: %d -Check the cuda.h file for what the return value means. -# -[cuIpcGetEventHandle failed] -The call to cuIpcGetEventHandle failed. This is a unrecoverable error and will -cause the program to abort. - cuIpcGetEventHandle return value: %d -Check the cuda.h file for what the return value means. -# -[cuIpcOpenEventHandle failed] -The call to cuIpcOpenEventHandle failed. This is a unrecoverable error and will -cause the program to abort. - cuIpcOpenEventHandle return value: %d -Check the cuda.h file for what the return value means. -# -[cuStreamWaitEvent failed] -The call to cuStreamWaitEvent failed. This is a unrecoverable error and will -cause the program to abort. - cuStreamWaitEvent return value: %d -Check the cuda.h file for what the return value means. -# -[cuEventDestroy failed] -The call to cuEventDestory failed. This is a unrecoverable error and will -cause the program to abort. - cuEventDestory return value: %d -Check the cuda.h file for what the return value means. -# -[cuStreamCreate failed] -The call to cuStreamCreate failed. This is a unrecoverable error and will -cause the program to abort. 
- Hostname: %s - cuStreamCreate return value: %d -Check the cuda.h file for what the return vale means. -# -[dlopen disabled] -Open MPI was compiled without dynamic library support (e.g., with the - --disable-dlopen flag), and therefore cannot utilize CUDA support. - -If you need CUDA support, reconfigure Open MPI with dynamic library support enabled. -# -[dlopen failed] -The library attempted to open the following supporting CUDA libraries, -but each of them failed. CUDA-aware support is disabled. -%s -If you do not require CUDA-aware support, then run with ---mca opal_warn_on_missing_libcuda 0 to suppress this message. If you do -require CUDA-aware support, then try setting LD_LIBRARY_PATH to the location -of libcuda.so.1 to resolve this issue. -# -[dlsym failed] -An error occurred while trying to map in the address of a function. - Function Name: %s - Error string: %s -CUDA-aware support is disabled. -# -[bufferID failed] -An error occurred while trying to get the BUFFER_ID of a GPU memory -region. This could cause incorrect results. Turn of GPU Direct RDMA -support by running with --mca btl_openib_cuda_want_gdr_support 0. - Hostname: %s - cuPointerGetAttribute return value: %d -Check the cuda.h file for what the return value means. -[cuPointerSetAttribute failed] -The call to cuPointerSetAttribute with CU_POINTER_ATTRIBUTE_SYNC_MEMOPS -failed. This is highly unusual and should not happen. The program will -continue, but report this error to the Open MPI developers. - Hostname: %s - cuPointerSetAttribute return value: %d - Address: %p -Check the cuda.h file for what the return value means. -# -[cuStreamSynchronize failed] -The call to cuStreamSynchronize failed. This is highly unusual and should -not happen. Please report this error to the Open MPI developers. - Hostname: %s - cuStreamSynchronize return value: %d -Check the cuda.h file for what the return value means. -# -[cuMemcpy failed] -The call to cuMemcpy failed. 
This is highly unusual and should -not happen. Please report this error to the Open MPI developers. - Hostname: %s - cuMemcpy return value: %d -Check the cuda.h file for what the return value means. -# -[No memory] -A call to allocate memory within the CUDA support failed. This is -an unrecoverable error and will cause the program to abort. - Hostname: %s diff --git a/opal/cuda/owner.txt b/opal/cuda/owner.txt deleted file mode 100644 index 9a3b6b5a6d4..00000000000 --- a/opal/cuda/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: NVIDIA -status:active diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index fc9573eef70..527bb310bb1 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -48,8 +48,6 @@ bool opal_ddt_copy_debug = false; bool opal_ddt_raw_debug = false; int opal_ddt_verbose = -1; /* Has the datatype verbose it's own output stream */ -extern int opal_cuda_verbose; - /* Using this macro implies that at this point _all_ information needed * to fill up the datatype are known. * We fill all the static information, the pointer to desc.desc is setup @@ -226,16 +224,6 @@ int opal_datatype_register_params(void) if (0 > ret) { return ret; } -# if OPAL_CUDA_SUPPORT - /* Set different levels of verbosity in the cuda related code. 
*/ - ret = mca_base_var_register("opal", "opal", NULL, "cuda_verbose", - "Set level of opal cuda verbosity", MCA_BASE_VAR_TYPE_INT, NULL, 0, - MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, - MCA_BASE_VAR_SCOPE_LOCAL, &opal_cuda_verbose); - if (0 > ret) { - return ret; - } -# endif #endif /* OPAL_ENABLE_DEBUG */ diff --git a/opal/include/opal/Makefile.am b/opal/include/opal/Makefile.am index ed657307caf..baf470529eb 100644 --- a/opal/include/opal/Makefile.am +++ b/opal/include/opal/Makefile.am @@ -29,7 +29,8 @@ headers += \ opal/hash_string.h \ opal/frameworks.h \ opal/opal_portable_platform.h \ - opal/opal_portable_platform_real.h + opal/opal_portable_platform_real.h \ + opal/opal_cuda.h nodist_headers += \ opal/version.h diff --git a/opal/include/opal/opal_cuda.h b/opal/include/opal/opal_cuda.h new file mode 100644 index 00000000000..5c91716cc2d --- /dev/null +++ b/opal/include/opal/opal_cuda.h @@ -0,0 +1,50 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2013 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2006 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + * This file is intended only to carry shared types. If actual cuda + * symbols are required, they need to be added to a new common cuda + * component. 
+ */ + +#ifndef OPAL_CUDA_H +#define OPAL_CUDA_H +#include "opal/mca/rcache/rcache.h" + +#define MEMHANDLE_SIZE 8 +#define EVTHANDLE_SIZE 8 + +struct mca_opal_cuda_reg_data_t { + uint64_t memHandle[MEMHANDLE_SIZE]; + uint64_t evtHandle[EVTHANDLE_SIZE]; + uint64_t event; + opal_ptr_t memh_seg_addr; + size_t memh_seg_len; +}; +typedef struct mca_opal_cuda_reg_data_t mca_opal_cuda_reg_data_t; + +struct mca_opal_cuda_reg_t { + mca_rcache_base_registration_t base; + mca_opal_cuda_reg_data_t data; +}; +typedef struct mca_opal_cuda_reg_t mca_opal_cuda_reg_t; +#endif /* OPAL_CUDA_H */ diff --git a/opal/mca/accelerator/cuda/Makefile.am b/opal/mca/accelerator/cuda/Makefile.am index a2463d729ef..5646890bab3 100644 --- a/opal/mca/accelerator/cuda/Makefile.am +++ b/opal/mca/accelerator/cuda/Makefile.am @@ -32,10 +32,13 @@ endif mcacomponentdir = $(opallibdir) mcacomponent_LTLIBRARIES = $(component_install) + mca_accelerator_cuda_la_SOURCES = $(sources) mca_accelerator_cuda_la_LDFLAGS = -module -avoid-version -mca_accelerator_cuda_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la +mca_accelerator_cuda_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \ + $(accelerator_cuda_LIBS) noinst_LTLIBRARIES = $(component_noinst) libmca_accelerator_cuda_la_SOURCES =$(sources) libmca_accelerator_cuda_la_LDFLAGS = -module -avoid-version +libmca_accelerator_cuda_la_LIBADD = $(accelerator_cuda_LIBS) diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index 5369680839c..9a955bac596 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -97,7 +97,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * CU_POINTER_ATTRIBUTE_IS_MANAGED}; void *attrdata[] = {(void *) &mem_type, (void *) &mem_ctx, (void *) &is_managed}; - result = opal_accelerator_cuda_func.cuPointerGetAttributes(3, attributes, attrdata, dbuf); + result = cuPointerGetAttributes(3, 
attributes, attrdata, dbuf); OPAL_OUTPUT_VERBOSE((101, opal_accelerator_base_framework.framework_output, "dbuf=%p, mem_type=%d, mem_ctx=%p, is_managed=%d, result=%d", (void *) dbuf, (int) mem_type, (void *) mem_ctx, is_managed, result)); @@ -121,7 +121,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * /* Must be a device pointer */ assert(CU_MEMORYTYPE_DEVICE == mem_type); #else /* OPAL_CUDA_GET_ATTRIBUTES */ - result = opal_accelerator_cuda_func.cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf); + result = cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf); if (CUDA_SUCCESS != result) { /* If we cannot determine it is device pointer, * just assume it is not. */ @@ -142,11 +142,11 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * * GPU memory, but no context, get the context from the GPU memory * and set the current context to that. It is rare that we will not * have a context. */ - result = opal_accelerator_cuda_func.cuCtxGetCurrent(&ctx); + result = cuCtxGetCurrent(&ctx); if (OPAL_UNLIKELY(NULL == ctx)) { if (CUDA_SUCCESS == result) { #if !OPAL_CUDA_GET_ATTRIBUTES - result = opal_accelerator_cuda_func.cuPointerGetAttribute(&mem_ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dbuf); + result = cuPointerGetAttribute(&mem_ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dbuf); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_output(0, "CUDA: error calling cuPointerGetAttribute: " @@ -155,7 +155,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * return OPAL_ERROR; } #endif /* OPAL_CUDA_GET_ATTRIBUTES */ - result = opal_accelerator_cuda_func.cuCtxSetCurrent(mem_ctx); + result = cuCtxSetCurrent(mem_ctx); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_output(0, "CUDA: error calling cuCtxSetCurrent: " @@ -185,7 +185,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * if (OPAL_LIKELY(((CUDA_VERSION > 7000) ? 
0 : 1))) { CUdeviceptr pbase; size_t psize; - result = opal_accelerator_cuda_func.cuMemGetAddressRange(&pbase, &psize, dbuf); + result = cuMemGetAddressRange(&pbase, &psize, dbuf); if (CUDA_SUCCESS != result) { opal_output_verbose(5, opal_accelerator_base_framework.framework_output, "CUDA: cuMemGetAddressRange failed on this pointer: result=%d, buf=%p " @@ -214,7 +214,7 @@ static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t return OPAL_ERR_OUT_OF_RESOURCE; } - result = opal_accelerator_cuda_func.cuStreamCreate((*stream)->stream, 0); + result = cuStreamCreate((*stream)->stream, 0); if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true, OPAL_PROC_MY_HOSTNAME, result); @@ -230,7 +230,7 @@ static void opal_accelerator_cuda_stream_destruct(opal_accelerator_cuda_stream_t CUresult result; if (NULL != stream->base.stream) { - result = opal_accelerator_cuda_func.cuStreamDestroy(*(CUstream *)stream->base.stream); + result = cuStreamDestroy(*(CUstream *)stream->base.stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamDestroy failed", true, result); @@ -259,7 +259,7 @@ static int accelerator_cuda_create_event(int dev_id, opal_accelerator_event_t ** OBJ_RELEASE(*event); return OPAL_ERR_OUT_OF_RESOURCE; } - result = opal_accelerator_cuda_func.cuEventCreate((*event)->event, CU_EVENT_DISABLE_TIMING); + result = cuEventCreate((*event)->event, CU_EVENT_DISABLE_TIMING); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuEventCreate failed", true, OPAL_PROC_MY_HOSTNAME, result); @@ -274,7 +274,7 @@ static void opal_accelerator_cuda_event_destruct(opal_accelerator_cuda_event_t * { CUresult result; if (NULL != event->base.event) { - result = opal_accelerator_cuda_func.cuEventDestroy(*(CUevent *)event->base.event); + result = cuEventDestroy(*(CUevent *)event->base.event); if (OPAL_UNLIKELY(CUDA_SUCCESS 
!= result)) { opal_show_help("help-accelerator-cuda.txt", "cuEventDestroy failed", true, result); @@ -297,7 +297,7 @@ static int accelerator_cuda_record_event(int dev_id, opal_accelerator_event_t *e return OPAL_ERR_BAD_PARAM; } - result = opal_accelerator_cuda_func.cuEventRecord(*(CUevent *)event->event, *(CUstream *)stream->stream); + result = cuEventRecord(*(CUevent *)event->event, *(CUstream *)stream->stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuEventRecord failed", true, OPAL_PROC_MY_HOSTNAME, result); @@ -314,7 +314,7 @@ static int accelerator_cuda_query_event(int dev_id, opal_accelerator_event_t *ev return OPAL_ERR_BAD_PARAM; } - result = opal_accelerator_cuda_func.cuEventQuery(*(CUevent *)event->event); + result = cuEventQuery(*(CUevent *)event->event); switch (result) { case CUDA_SUCCESS: { @@ -344,7 +344,7 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void * return OPAL_ERR_BAD_PARAM; } - result = opal_accelerator_cuda_func.cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, *(CUstream *)stream->stream); + result = cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, *(CUstream *)stream->stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, src, size, result); @@ -370,13 +370,13 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, * Additionally, cuMemcpy is not necessarily always synchronous. 
See: * https://docs.nvidia.com/cuda/cuda-driver-api/api-sync-behavior.html * TODO: Add optimizations for type field */ - result = opal_accelerator_cuda_func.cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream); + result = cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, src, size, result); return OPAL_ERROR; } - result = opal_accelerator_cuda_func.cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream); + result = cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, OPAL_PROC_MY_HOSTNAME, result); @@ -395,29 +395,29 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, return OPAL_ERR_BAD_PARAM; } - result = opal_accelerator_cuda_func.cuMemAlloc(&tmp, size); + result = cuMemAlloc(&tmp, size); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { return OPAL_ERROR; } - result = opal_accelerator_cuda_func.cuMemcpyAsync(tmp, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream); + result = cuMemcpyAsync(tmp, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, tmp, src, size, result); return OPAL_ERROR; } - result = opal_accelerator_cuda_func.cuMemcpyAsync((CUdeviceptr) dest, tmp, size, opal_accelerator_cuda_memcpy_stream); + result = cuMemcpyAsync((CUdeviceptr) dest, tmp, size, opal_accelerator_cuda_memcpy_stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, tmp, size, result); return OPAL_ERROR; } - result = opal_accelerator_cuda_func.cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream); 
+ result = cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, OPAL_PROC_MY_HOSTNAME, result); return OPAL_ERROR; } - opal_accelerator_cuda_func.cuMemFree(tmp); + cuMemFree(tmp); return OPAL_SUCCESS; } @@ -430,7 +430,7 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size) } if (size > 0) { - result = opal_accelerator_cuda_func.cuMemAlloc((CUdeviceptr *) ptr, size); + result = cuMemAlloc((CUdeviceptr *) ptr, size); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true, OPAL_PROC_MY_HOSTNAME, result); @@ -444,7 +444,7 @@ static int accelerator_cuda_mem_release(int dev_id, void *ptr) { CUresult result; if (NULL != ptr) { - result = opal_accelerator_cuda_func.cuMemFree((CUdeviceptr) ptr); + result = cuMemFree((CUdeviceptr) ptr); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemFree failed", true, OPAL_PROC_MY_HOSTNAME, result); @@ -463,7 +463,7 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void return OPAL_ERR_BAD_PARAM; } - result = opal_accelerator_cuda_func.cuMemGetAddressRange((CUdeviceptr *) base, size, (CUdeviceptr) ptr); + result = cuMemGetAddressRange((CUdeviceptr *) base, size, (CUdeviceptr) ptr); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemGetAddressRange failed 2", true, OPAL_PROC_MY_HOSTNAME, result, ptr); @@ -483,7 +483,7 @@ static int accelerator_cuda_host_register(int dev_id, void *ptr, size_t size) return OPAL_ERR_BAD_PARAM; } - result = opal_accelerator_cuda_func.cuMemHostRegister(ptr, size, 0); + result = cuMemHostRegister(ptr, size, 0); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemHostRegister failed", true, ptr, size, OPAL_PROC_MY_HOSTNAME, result); @@ -497,7 
+497,7 @@ static int accelerator_cuda_host_unregister(int dev_id, void *ptr) { CUresult result; if (NULL != ptr) { - result = opal_accelerator_cuda_func.cuMemHostUnregister(ptr); + result = cuMemHostUnregister(ptr); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemHostUnregister failed", true, ptr, OPAL_PROC_MY_HOSTNAME, result); @@ -516,7 +516,7 @@ static int accelerator_cuda_get_device(int *dev_id) return OPAL_ERR_BAD_PARAM; } - result = opal_accelerator_cuda_func.cuCtxGetDevice(&cuDev); + result = cuCtxGetDevice(&cuDev); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuCtxGetDevice failed", true, result); @@ -534,7 +534,7 @@ static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int de return OPAL_ERR_BAD_PARAM; } - result = opal_accelerator_cuda_func.cuDeviceCanAccessPeer(access, (CUdevice) dev1, (CUdevice) dev2); + result = cuDeviceCanAccessPeer(access, (CUdevice) dev1, (CUdevice) dev2); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuDeviceCanAccessPeer failed", true, OPAL_PROC_MY_HOSTNAME, result); @@ -554,13 +554,13 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc { CUresult result; int enable = 1; - result = opal_accelerator_cuda_func.cuPointerGetAttribute((unsigned long long *)buf_id, CU_POINTER_ATTRIBUTE_BUFFER_ID, (CUdeviceptr) addr); + result = cuPointerGetAttribute((unsigned long long *)buf_id, CU_POINTER_ATTRIBUTE_BUFFER_ID, (CUdeviceptr) addr); if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { opal_show_help("help-accelerator-cuda.txt", "bufferID failed", true, OPAL_PROC_MY_HOSTNAME, result); return result; } - result = opal_accelerator_cuda_func.cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, + result = cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr) addr); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { 
opal_show_help("help-accelerator-cuda.txt", "cuPointerSetAttribute failed", true, diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.h b/opal/mca/accelerator/cuda/accelerator_cuda.h index 4646029ce06..8efde778761 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.h +++ b/opal/mca/accelerator/cuda/accelerator_cuda.h @@ -25,41 +25,6 @@ typedef struct { opal_accelerator_base_component_t super; } opal_accelerator_cuda_component_t; -/* Structure to hold CUDA function pointers that get dynamically loaded. */ -struct accelerator_cuda_func_table { - int (*cuPointerGetAttribute)(void *, CUpointer_attribute, CUdeviceptr); - int (*cuMemcpyAsync)(CUdeviceptr, CUdeviceptr, size_t, CUstream); - int (*cuMemcpy)(CUdeviceptr, CUdeviceptr, size_t); - int (*cuMemcpy2D)(const CUDA_MEMCPY2D* pCopy); - int (*cuMemAlloc)(CUdeviceptr *, size_t); - int (*cuMemFree)(CUdeviceptr buf); - int (*cuCtxGetCurrent)(void *cuContext); - int (*cuStreamCreate)(CUstream *, int); - int (*cuEventCreate)(CUevent *, int); - int (*cuEventRecord)(CUevent, CUstream); - int (*cuEventQuery)(CUevent); - int (*cuEventDestroy)(CUevent); - int (*cuMemHostRegister)(void *, size_t, unsigned int); - int (*cuMemHostUnregister)(void *); - int (*cuMemGetAddressRange)(CUdeviceptr *, size_t *, CUdeviceptr); - int (*cuIpcGetEventHandle)(CUipcEventHandle *, CUevent); - int (*cuIpcOpenEventHandle)(CUevent *, CUipcEventHandle); - int (*cuIpcOpenMemHandle)(CUdeviceptr *, CUipcMemHandle, unsigned int); - int (*cuIpcCloseMemHandle)(CUdeviceptr); - int (*cuIpcGetMemHandle)(CUipcMemHandle *, CUdeviceptr); - int (*cuCtxGetDevice)(CUdevice *); - int (*cuDeviceCanAccessPeer)(int *, CUdevice, CUdevice); - int (*cuCtxSetCurrent)(CUcontext); - int (*cuEventSynchronize)(CUevent); - int (*cuStreamSynchronize)(CUstream); - int (*cuStreamDestroy)(CUstream); - int (*cuPointerSetAttribute)(const void *, CUpointer_attribute, CUdeviceptr); -#if OPAL_CUDA_GET_ATTRIBUTES - int (*cuPointerGetAttributes)(unsigned int, CUpointer_attribute 
*, void **, CUdeviceptr); -#endif /* OPAL_CUDA_GET_ATTRIBUTES */ -}; -typedef struct accelerator_cuda_func_table accelerator_cuda_func_table_t; - struct opal_accelerator_cuda_stream_t { opal_accelerator_stream_t base; }; @@ -73,7 +38,6 @@ typedef struct opal_accelerator_cuda_event_t opal_accelerator_cuda_event_t; OBJ_CLASS_DECLARATION(opal_accelerator_cuda_event_t); /* Declare extern variables, defined in accelerator_cuda_component.c */ -OPAL_DECLSPEC extern accelerator_cuda_func_table_t opal_accelerator_cuda_func; OPAL_DECLSPEC extern CUstream opal_accelerator_cuda_memcpy_stream; OPAL_DECLSPEC extern opal_mutex_t opal_accelerator_cuda_stream_lock; diff --git a/opal/mca/accelerator/cuda/accelerator_cuda_component.c b/opal/mca/accelerator/cuda/accelerator_cuda_component.c index dd3f9aade7f..2ffeebafd00 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda_component.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda_component.c @@ -34,26 +34,11 @@ /* Define global variables, used in accelerator_cuda.c */ -accelerator_cuda_func_table_t opal_accelerator_cuda_func = {0}; CUstream opal_accelerator_cuda_memcpy_stream = NULL; opal_mutex_t opal_accelerator_cuda_stream_lock = {0}; #define STRINGIFY2(x) #x #define STRINGIFY(x) STRINGIFY2(x) -#define OPAL_CUDA_DLSYM(libhandle, func_name) \ - do { \ - char *err_msg; \ - void *ptr; \ - if (OPAL_SUCCESS != opal_dl_lookup(libhandle, STRINGIFY(func_name), &ptr, &err_msg)) { \ - opal_show_help("help-mpi-accelerator-cuda.txt", "dlsym failed", true, STRINGIFY(func_name), \ - err_msg); \ - return -1; \ - } else { \ - *(void **) (&opal_accelerator_cuda_func.func_name) = ptr; \ - opal_output_verbose(15, opal_accelerator_base_framework.framework_output, "CUDA: successful dlsym of %s", \ - STRINGIFY(funcName)); \ - } \ - } while (0) /* Unused variable that we register at init time and unregister at fini time. * This is used to detect if user has done a device reset prior to MPI_Finalize. 
@@ -76,7 +61,6 @@ static int accelerator_cuda_close(void); static int accelerator_cuda_component_register(void); static opal_accelerator_base_module_t* accelerator_cuda_init(void); static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module); -static int accelerator_cuda_populate_func_table(opal_dl_handle_t *libcuda_handle); /* * Instantiate the public struct with all of our public information * and pointers to our public functions in it @@ -134,13 +118,6 @@ static int accelerator_cuda_component_register(void) static opal_accelerator_base_module_t* accelerator_cuda_init(void) { int retval, i, j; - char *cudalibs[] = {"libcuda.so.1", "libcuda.dylib", NULL}; - char *searchpaths[] = {"", "/usr/lib64", NULL}; - char **errmsgs = NULL; - char *errmsg = NULL; - int errsize; - bool found_libraries = false; - opal_dl_handle_t *libcuda_handle = NULL; CUresult result; CUcontext cuContext; @@ -153,95 +130,9 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void) return NULL; } - if (!OPAL_HAVE_DL_SUPPORT) { - opal_show_help("help-accelerator-cuda.txt", "dlopen disabled", true); - return NULL; - } - - /* Now walk through all the potential names libcuda and find one - * that works. If it does, all is good. If not, print out all - * the messages about why things failed. This code was careful - * to try and save away all error messages if the loading ultimately - * failed to help with debugging. - * - * NOTE: On the first loop we just utilize the default loading - * paths from the system. For the second loop, set /usr/lib64 to - * the search path and try again. This is done to handle the case - * where we have both 32 and 64 bit libcuda.so libraries - * installed. Even when running in 64-bit mode, the /usr/lib - * directory is searched first and we may find a 32-bit - * libcuda.so.1 library. Loading of this library will fail as the - * OPAL DL framework does not handle having the wrong ABI in the - * search path (unlike ld or ld.so). 
Note that we only set this - * search path after the original search. This is so that - * LD_LIBRARY_PATH and run path settings are respected. Setting - * this search path overrides them (rather then being - * appended). */ - - j = 0; - while (searchpaths[j] != NULL) { - i = 0; - while (cudalibs[i] != NULL) { - char *filename = NULL; - char *str = NULL; - - /* If there's a non-empty search path, prepend it - * to the library filename */ - if (strlen(searchpaths[j]) > 0) { - opal_asprintf(&filename, "%s/%s", searchpaths[j], cudalibs[i]); - } else { - filename = strdup(cudalibs[i]); - } - if (NULL == filename) { - opal_show_help("help-accelerator-cuda.txt", "No memory", true, - OPAL_PROC_MY_HOSTNAME); - return NULL; - } - - retval = opal_dl_open(filename, false, false, &libcuda_handle, &str); - if (OPAL_SUCCESS != retval || NULL == libcuda_handle) { - if (NULL != str) { - opal_argv_append(&errsize, &errmsgs, str); - } else { - opal_argv_append(&errsize, &errmsgs, "opal_dl_open() returned NULL."); - } - opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "CUDA: Library open error: %s", - errmsgs[errsize - 1]); - } else { - opal_output_verbose(10, opal_accelerator_base_framework.framework_output, - "CUDA: Library successfully opened %s", cudalibs[i]); - found_libraries = true; - break; - } - i++; - free(filename); - } - if (true == found_libraries) { - break; /* Break out of outer loop */ - } - j++; - } - - if (true != found_libraries) { - errmsg = opal_argv_join(errmsgs, '\n'); - if (opal_warn_on_missing_libcuda) { - opal_show_help("help-accelerator-cuda.txt", "dlopen failed", true, errmsg); - } - } - opal_argv_free(errmsgs); - free(errmsg); - - if (true != found_libraries) { - return NULL; - } - - if (OPAL_SUCCESS != accelerator_cuda_populate_func_table(libcuda_handle)) { - return NULL; - } - /* Check to see if this process is running in a CUDA context. If * so, all is good. If not, then disable registration of memory. 
*/ - result = opal_accelerator_cuda_func.cuCtxGetCurrent(&cuContext); + result = cuCtxGetCurrent(&cuContext); if (CUDA_SUCCESS != result) { opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent failed"); return NULL; @@ -253,14 +144,14 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void) } /* Create stream for use in cuMemcpyAsync synchronous copies */ - result = opal_accelerator_cuda_func.cuStreamCreate(&opal_accelerator_cuda_memcpy_stream, 0); + result = cuStreamCreate(&opal_accelerator_cuda_memcpy_stream, 0); if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true, OPAL_PROC_MY_HOSTNAME, result); return NULL; } - result = opal_accelerator_cuda_func.cuMemHostRegister(&checkmem, sizeof(int), 0); + result = cuMemHostRegister(&checkmem, sizeof(int), 0); if (result != CUDA_SUCCESS) { /* If registering the memory fails, print a message and continue. * This is not a fatal error. */ @@ -283,51 +174,14 @@ static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module) * while calling into the CUDA library. This check will detect if * a user has called cudaDeviceReset prior to MPI_Finalize. If so, * then this call will fail and we skip cleaning up CUDA resources. */ - result = opal_accelerator_cuda_func.cuMemHostUnregister(&checkmem); + result = cuMemHostUnregister(&checkmem); if (CUDA_SUCCESS != result) { ctx_ok = 0; } if ((NULL != opal_accelerator_cuda_memcpy_stream) && ctx_ok) { - opal_accelerator_cuda_func.cuStreamDestroy(opal_accelerator_cuda_memcpy_stream); + cuStreamDestroy(opal_accelerator_cuda_memcpy_stream); } OBJ_DESTRUCT(&opal_accelerator_cuda_stream_lock); return; } - -static int accelerator_cuda_populate_func_table(opal_dl_handle_t *libcuda_handle) -{ - /* Map in the functions that we need. Note that if there is an error - * the macro OPAL_CUDA_DLSYM will print an error and call return. 
*/ - OPAL_CUDA_DLSYM(libcuda_handle, cuStreamCreate); - OPAL_CUDA_DLSYM(libcuda_handle, cuCtxGetCurrent); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventCreate); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventRecord); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventQuery); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventSynchronize); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventDestroy); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemHostRegister); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemHostUnregister); - OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttribute); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemcpyAsync); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemcpy); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemcpy2D); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemFree); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemAlloc); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemGetAddressRange); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcGetEventHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcOpenEventHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcOpenMemHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcCloseMemHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcGetMemHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuCtxGetDevice); - OPAL_CUDA_DLSYM(libcuda_handle, cuDeviceCanAccessPeer); - OPAL_CUDA_DLSYM(libcuda_handle, cuCtxSetCurrent); - OPAL_CUDA_DLSYM(libcuda_handle, cuStreamSynchronize); - OPAL_CUDA_DLSYM(libcuda_handle, cuStreamDestroy); - OPAL_CUDA_DLSYM(libcuda_handle, cuPointerSetAttribute); -#if OPAL_CUDA_GET_ATTRIBUTES - OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttributes); -#endif /* OPAL_CUDA_GET_ATTRIBUTES */ - return OPAL_SUCCESS; -} diff --git a/opal/mca/accelerator/cuda/configure.m4 b/opal/mca/accelerator/cuda/configure.m4 index 499801c34ca..aa67623c8b2 100644 --- a/opal/mca/accelerator/cuda/configure.m4 +++ b/opal/mca/accelerator/cuda/configure.m4 @@ -15,18 +15,22 @@ # # If CUDA support was requested, then build the CUDA support library. -# This code checks just makes sure the check was done earlier by the -# opal_check_cuda.m4 code. 
-# +# This code checks to make sure the check was done earlier by the +# opal_check_cuda.m4 code. It also copies the flags and libs under +# accelerator_cuda_CPPFLAGS, accelerator_cuda_LDFLAGS, and accelerator_cuda_LIBS AC_DEFUN([MCA_opal_accelerator_cuda_CONFIG],[ + AC_CONFIG_FILES([opal/mca/accelerator/cuda/Makefile]) - # make sure that CUDA-aware checks have been done - AC_REQUIRE([OPAL_CHECK_CUDA]) + OPAL_CHECK_CUDA([accelerator_cuda]) AS_IF([test "x$CUDA_SUPPORT" = "x1"], [$1], [$2]) + AC_SUBST([accelerator_cuda_CPPFLAGS]) + AC_SUBST([accelerator_cuda_LDFLAGS]) + AC_SUBST([accelerator_cuda_LIBS]) + ])dnl diff --git a/opal/mca/btl/smcuda/Makefile.am b/opal/mca/btl/smcuda/Makefile.am index 3b465af577e..f1a89df8dce 100644 --- a/opal/mca/btl/smcuda/Makefile.am +++ b/opal/mca/btl/smcuda/Makefile.am @@ -53,10 +53,12 @@ mcacomponent_LTLIBRARIES = $(component_install) mca_btl_smcuda_la_SOURCES = $(libmca_btl_smcuda_la_sources) mca_btl_smcuda_la_LDFLAGS = -module -avoid-version mca_btl_smcuda_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \ - $(OPAL_TOP_BUILDDIR)/opal/mca/common/sm/lib@OPAL_LIB_NAME@mca_common_sm.la + $(OPAL_TOP_BUILDDIR)/opal/mca/common/sm/lib@OPAL_LIB_NAME@mca_common_sm.la \ + $(btl_smcuda_LIBS) mca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS) noinst_LTLIBRARIES = $(component_noinst) libmca_btl_smcuda_la_SOURCES = $(libmca_btl_smcuda_la_sources) libmca_btl_smcuda_la_LDFLAGS = -module -avoid-version libmca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS) +libmca_btl_smcuda_la_LIBADD = $(btl_smcuda_LIBS) diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 7d73892950f..748568c289b 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -68,7 +68,8 @@ #include "btl_smcuda_frag.h" #include "btl_smcuda_accelerator.h" -#include "opal/cuda/common_cuda.h" + +#include "opal/include/opal/opal_cuda.h" static struct mca_btl_base_registration_handle_t * mca_btl_smcuda_register_mem(struct mca_btl_base_module_t 
*btl, @@ -1000,7 +1001,7 @@ mca_btl_smcuda_register_mem(struct mca_btl_base_module_t *btl, uint32_t flags) { mca_btl_smcuda_t *smcuda_module = (mca_btl_smcuda_t *) btl; - mca_rcache_common_cuda_reg_t *reg; + mca_opal_cuda_reg_t *reg; int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY; int rcache_flags = 0; @@ -1023,15 +1024,64 @@ static int mca_btl_smcuda_deregister_mem(struct mca_btl_base_module_t *btl, struct mca_btl_base_registration_handle_t *handle) { mca_btl_smcuda_t *smcuda_module = (mca_btl_smcuda_t *) btl; - mca_rcache_common_cuda_reg_t *reg = (mca_rcache_common_cuda_reg_t + mca_opal_cuda_reg_t *reg = (mca_opal_cuda_reg_t *) ((intptr_t) handle - - offsetof(mca_rcache_common_cuda_reg_t, data)); + - offsetof(mca_opal_cuda_reg_t, data)); smcuda_module->rcache->rcache_deregister(smcuda_module->rcache, &reg->base); return OPAL_SUCCESS; } +/* + * Put remote event on stream to ensure that the start of the + * copy does not start until the completion of the event. + */ +static void mca_btl_smcuda_wait_stream_synchronize(mca_opal_cuda_reg_t *rget_reg) +{ +#if OPAL_CUDA_SYNC_MEMOPS + /* No need for any of this with SYNC_MEMOPS feature */ + return; +#else /* OPAL_CUDA_SYNC_MEMOPS */ + CUipcEventHandle evtHandle; + CUevent event; + CUresult result; + + memcpy(&evtHandle, rget_reg->data.evtHandle, sizeof(evtHandle)); + + result = cuIpcOpenEventHandle(&event, evtHandle); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output, + "cuIpcOpenEventHandle failed"); + } + + /* BEGIN of Workaround - There is a bug in CUDA 4.1 RC2 and earlier + * versions. Need to record an event on the stream, even though + * it is not used, to make sure we do not short circuit our way + * out of the cuStreamWaitEvent test. 
+ */ + result = cuEventRecord(event, 0); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output, + "cuEventRecord failed"); + } + /* END of Workaround */ + + result = cuStreamWaitEvent(0, event, 0); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output, + "cuStreamWaitEvent failed"); + } + + /* All done with this event. */ + result = cuEventDestroy(event); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output, + "cuEventDestroy failed"); + } +#endif /* OPAL_CUDA_SYNC_MEMOPS */ +} + int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address, uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, @@ -1039,8 +1089,8 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t *btl, struct mca_btl_ba int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_rcache_common_cuda_reg_t rget_reg; - mca_rcache_common_cuda_reg_t *reg_ptr = &rget_reg; + mca_opal_cuda_reg_t rget_reg; + mca_opal_cuda_reg_t *reg_ptr = &rget_reg; int rc, done; void *remote_memory_address; size_t offset; @@ -1111,7 +1161,7 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t *btl, struct mca_btl_ba * is available in the sender's GPU buffer. Therefore, do a stream synchronize * on the IPC event that we received. Note that we pull it from * rget_reg, not reg_ptr, as we do not cache the event. 
*/ - mca_common_wait_stream_synchronize(&rget_reg); + mca_btl_smcuda_wait_stream_synchronize(&rget_reg); rc = mca_btl_smcuda_memcpy(local_address, remote_memory_address, size, "mca_btl_smcuda_get", (mca_btl_base_descriptor_t *) frag); diff --git a/opal/mca/btl/smcuda/btl_smcuda_frag.h b/opal/mca/btl/smcuda/btl_smcuda_frag.h index 53ececfe6e8..886dd1490ab 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_frag.h +++ b/opal/mca/btl/smcuda/btl_smcuda_frag.h @@ -31,7 +31,7 @@ #include "opal_config.h" #include "btl_smcuda.h" -#include "opal/cuda/common_cuda.h" +#include "opal/include/opal/opal_cuda.h" #define MCA_BTL_SMCUDA_FRAG_TYPE_MASK ((uintptr_t) 0x3) #define MCA_BTL_SMCUDA_FRAG_SEND ((uintptr_t) 0x0) @@ -52,7 +52,7 @@ struct mca_btl_smcuda_hdr_t { typedef struct mca_btl_smcuda_hdr_t mca_btl_smcuda_hdr_t; struct mca_btl_base_registration_handle_t { - mca_rcache_common_cuda_reg_data_t reg_data; + mca_opal_cuda_reg_data_t reg_data; }; struct mca_btl_smcuda_segment_t { diff --git a/opal/mca/btl/smcuda/configure.m4 b/opal/mca/btl/smcuda/configure.m4 index 82b71aa858b..10b3721022c 100644 --- a/opal/mca/btl/smcuda/configure.m4 +++ b/opal/mca/btl/smcuda/configure.m4 @@ -19,12 +19,15 @@ AC_DEFUN([MCA_opal_btl_smcuda_CONFIG],[ AC_CONFIG_FILES([opal/mca/btl/smcuda/Makefile]) - # make sure that CUDA-aware checks have been done - AC_REQUIRE([OPAL_CHECK_CUDA]) + OPAL_CHECK_CUDA([btl_smcuda]) # Only build if CUDA support is available AS_IF([test "x$CUDA_SUPPORT" = "x1"], [$1 OPAL_MCA_CHECK_DEPENDENCY([opal], [btl], [smcuda], [opal], [common], [sm])], [$2]) + + AC_SUBST([btl_smcuda_CPPFLAGS]) + AC_SUBST([btl_smcuda_LDFLAGS]) + AC_SUBST([btl_smcuda_LIBS]) ])dnl diff --git a/opal/mca/rcache/gpusm/configure.m4 b/opal/mca/rcache/gpusm/configure.m4 index 2b792d7cc8c..65dd94811c5 100644 --- a/opal/mca/rcache/gpusm/configure.m4 +++ b/opal/mca/rcache/gpusm/configure.m4 @@ -19,9 +19,14 @@ AC_DEFUN([MCA_opal_rcache_gpusm_CONFIG],[ AC_CONFIG_FILES([opal/mca/rcache/gpusm/Makefile]) + 
OPAL_CHECK_CUDA([rcache_gpusm]) + # Use CUDA_SUPPORT which was filled in by the opal configure code. AS_IF([test "x$CUDA_SUPPORT" = "x1"], [$1], [$2]) + AC_SUBST([rcache_gpusm_CPPFLAGS]) + AC_SUBST([rcache_gpusm_LDFLAGS]) + AC_SUBST([rcache_gpusm_LIBS]) ])dnl diff --git a/opal/mca/rcache/gpusm/rcache_gpusm_module.c b/opal/mca/rcache/gpusm/rcache_gpusm_module.c index 37aa6696a3f..a38ef3e89b6 100644 --- a/opal/mca/rcache/gpusm/rcache_gpusm_module.c +++ b/opal/mca/rcache/gpusm/rcache_gpusm_module.c @@ -41,7 +41,8 @@ #include "opal_config.h" #include "opal/mca/rcache/base/base.h" #include "opal/mca/rcache/gpusm/rcache_gpusm.h" -#include "opal/cuda/common_cuda.h" +#include "opal/include/opal/opal_cuda.h" +#include <cuda.h> /** * Called when the registration free list is created. An event is created @@ -49,7 +50,20 @@ */ static void mca_rcache_gpusm_registration_constructor(mca_rcache_gpusm_registration_t *item) { - mca_common_cuda_construct_event_and_handle(&item->event, (void *) &item->evtHandle); + uintptr_t *event = &item->event; + void *handle = (void *) &item->evtHandle; + CUresult result; + + result = cuEventCreate((CUevent *) event, + CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_output(0, "cuEventCreate failed\n"); + } + + result = cuIpcGetEventHandle((CUipcEventHandle *) handle, (CUevent) *event); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_output(0, "cuIpcGetEventHandle failed\n"); + } } /** @@ -57,8 +71,13 @@ static void mca_rcache_gpusm_registrat */ static void mca_rcache_gpusm_registration_destructor(mca_rcache_gpusm_registration_t *item) { - mca_common_cuda_destruct_event(item->event); + uintptr_t event = item->event; + CUresult result; + result = cuEventDestroy((CUevent) event); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_output(0, "cuEventDestroy failed"); + } } OBJ_CLASS_INSTANCE(mca_rcache_gpusm_registration_t, mca_rcache_base_registration_t, 
@@ -81,7 +100,7 @@ void mca_rcache_gpusm_module_init(mca_rcache_gpusm_module_t *rcache) /* Start with 0 entries in the free list since CUDA may not have * been initialized when this free list is created and there is * some CUDA specific activities that need to be done. */ - opal_free_list_init(&rcache->reg_list, sizeof(struct mca_rcache_common_cuda_reg_t), + opal_free_list_init(&rcache->reg_list, sizeof(struct mca_opal_cuda_reg_t), opal_cache_line_size, OBJ_CLASS(mca_rcache_gpusm_registration_t), 0, opal_cache_line_size, 0, -1, 64, NULL, 0, NULL, NULL, NULL); } @@ -96,6 +115,77 @@ int mca_rcache_gpusm_find(mca_rcache_base_module_t *rcache, void *addr, size_t s return mca_rcache_gpusm_register(rcache, addr, size, 0, 0, reg); } +/* + * Get the memory handle of a local section of memory that can be sent + * to the remote size so it can access the memory. This is the + * registration function for the sending side of a message transfer. + */ +static int mca_rcache_gpusm_get_mem_handle(void *base, size_t size, mca_rcache_base_registration_t *newreg) +{ + CUmemorytype memType; + CUresult result; + CUipcMemHandle *memHandle; + CUdeviceptr pbase; + size_t psize; + + mca_opal_cuda_reg_t *cuda_reg = (mca_opal_cuda_reg_t *) newreg; + memHandle = (CUipcMemHandle *) cuda_reg->data.memHandle; + + /* We should only be there if this is a CUDA device pointer */ + result = cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + (CUdeviceptr) base); + assert(CUDA_SUCCESS == result); + assert(CU_MEMORYTYPE_DEVICE == memType); + + /* Get the memory handle so we can send it to the remote process. */ + result = cuIpcGetMemHandle(memHandle, (CUdeviceptr) base); + + if (CUDA_SUCCESS != result) { + return OPAL_ERROR; + } + + /* Need to get the real base and size of the memory handle. This is + * how the remote side saves the handles in a cache. 
*/ + result = cuMemGetAddressRange(&pbase, &psize, (CUdeviceptr) base); + if (CUDA_SUCCESS != result) { + return OPAL_ERROR; + } + + /* Store all the information in the registration */ + cuda_reg->base.base = (void *) pbase; + cuda_reg->base.bound = (unsigned char *) pbase + psize - 1; + cuda_reg->data.memh_seg_addr.pval = (void *) pbase; + cuda_reg->data.memh_seg_len = psize; + +#if OPAL_CUDA_SYNC_MEMOPS + /* With CUDA 6.0, we can set an attribute on the memory pointer that will + * ensure any synchronous copies are completed prior to any other access + * of the memory region. This means we do not need to record an event + * and send to the remote side. + */ + memType = 1; /* Just use this variable since we already have it */ + result = cuPointerSetAttribute(&memType, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, + (CUdeviceptr) base); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + return OPAL_ERROR; + } +#else + /* Need to record the event to ensure that any memcopies into the + * device memory have completed. The event handle associated with + * this event is sent to the remote process so that it will wait + * on this event prior to copying data out of the device memory. + * Note that this needs to be the NULL stream to make since it is + * unknown what stream any copies into the device memory were done + * with. */ + result = cuEventRecord((CUevent) cuda_reg->data.event, 0); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + return OPAL_ERROR; + } +#endif /* OPAL_CUDA_SYNC_MEMOPS */ + + return OPAL_SUCCESS; +} + /* * This is the one function that does all the work. 
It will call into * the register function to get the memory handle for the sending @@ -133,7 +223,7 @@ int mca_rcache_gpusm_register(mca_rcache_base_module_t *rcache, void *addr, size gpusm_reg->flags = flags; gpusm_reg->access_flags = access_flags; - rc = cuda_getmemhandle(base, size, gpusm_reg, NULL); + rc = mca_rcache_gpusm_get_mem_handle(base, size, gpusm_reg); if (rc != OPAL_SUCCESS) { opal_free_list_return(&rcache_gpusm->reg_list, item); diff --git a/opal/mca/rcache/rgpusm/configure.m4 b/opal/mca/rcache/rgpusm/configure.m4 index a9bce3c39dd..f76c27b8c35 100644 --- a/opal/mca/rcache/rgpusm/configure.m4 +++ b/opal/mca/rcache/rgpusm/configure.m4 @@ -19,9 +19,14 @@ AC_DEFUN([MCA_opal_rcache_rgpusm_CONFIG],[ AC_CONFIG_FILES([opal/mca/rcache/rgpusm/Makefile]) + OPAL_CHECK_CUDA([rcache_rgpusm]) + # Use CUDA_SUPPORT which was filled in by the opal configure code. AS_IF([test "x$CUDA_SUPPORT" = "x1"], [$1], [$2]) + AC_SUBST([rcache_rgpusm_CPPFLAGS]) + AC_SUBST([rcache_rgpusm_LDFLAGS]) + AC_SUBST([rcache_rgpusm_LIBS]) ])dnl diff --git a/opal/mca/rcache/rgpusm/rcache_rgpusm_module.c b/opal/mca/rcache/rgpusm/rcache_rgpusm_module.c index 2859a14c7be..92287055dc3 100644 --- a/opal/mca/rcache/rgpusm/rcache_rgpusm_module.c +++ b/opal/mca/rcache/rgpusm/rcache_rgpusm_module.c @@ -86,10 +86,80 @@ #ifdef HAVE_MALLOC_H # include <malloc.h> #endif -#include "opal/cuda/common_cuda.h" +#include "opal/include/opal/opal_cuda.h" #include "opal/mca/rcache/base/base.h" #include "opal/mca/rcache/rcache.h" #include "opal/util/proc.h" +#include <cuda.h> + +/* + * Open a memory handle that refers to remote memory so we can get an address + * that works on the local side. This is the registration function for the + * remote side of a transfer. newreg contains the new registration and carries + * the memory handle that was received from the remote side. 
+ */ +static int mca_rcache_rgpusm_open_mem_handle(void *base, size_t size, mca_rcache_base_registration_t *newreg) +{ + CUresult result; + CUipcMemHandle *memHandle; + mca_opal_cuda_reg_t *cuda_newreg = (mca_opal_cuda_reg_t *) newreg; + + /* Save in local variable to avoid ugly casting */ + memHandle = (CUipcMemHandle *) cuda_newreg->data.memHandle; + + /* Open the memory handle and store it into the registration structure. */ + result = cuIpcOpenMemHandle((CUdeviceptr *) &newreg->alloc_base, *memHandle, + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + + /* If there are some stale entries in the cache, they can cause other + * registrations to fail. Let the caller know that so that can attempt + * to clear them out. */ + if (CUDA_ERROR_ALREADY_MAPPED == result) { + opal_output_verbose(10, mca_rcache_rgpusm_component.output, + "CUDA: cuIpcOpenMemHandle returned CUDA_ERROR_ALREADY_MAPPED for " + "p=%p,size=%d: notify memory pool\n", + base, (int) size); + return OPAL_ERR_WOULD_BLOCK; + } + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_output_verbose(10, mca_rcache_rgpusm_component.output, + "CUDA: cuIpcOpenMemHandle failed: base=%p (remote base=%p,size=%d)", + newreg->alloc_base, base, (int) size); + /* Currently, this is a non-recoverable error */ + return OPAL_ERROR; + } else { + opal_output_verbose(10, mca_rcache_rgpusm_component.output, + "CUDA: cuIpcOpenMemHandle passed: base=%p (remote base=%p,size=%d)", + newreg->alloc_base, base, (int) size); + } + + return OPAL_SUCCESS; +} + +/* + * Close a memory handle that refers to remote memory. 
+ */ +static int mca_rcache_rgpusm_close_mem_handle(void *reg_data, mca_rcache_base_registration_t *reg) +{ + CUresult result; + mca_opal_cuda_reg_t *cuda_reg = (mca_opal_cuda_reg_t *) reg; + + result = cuIpcCloseMemHandle((CUdeviceptr) cuda_reg->base.alloc_base); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + if (CUDA_ERROR_DEINITIALIZED != result) { + opal_output_verbose(10, mca_rcache_rgpusm_component.output, + "CUDA: cuIpcCloseMemHandle failed: base=%p", + cuda_reg->base.alloc_base); + } + /* We will just continue on and hope things continue to work. */ + } else { + opal_output_verbose(10, mca_rcache_rgpusm_component.output, + "CUDA: cuIpcCloseMemHandle passed: base=%p", + cuda_reg->base.alloc_base); + } + + return OPAL_SUCCESS; +} static int mca_rcache_rgpusm_deregister_no_lock(struct mca_rcache_base_module_t *, mca_rcache_base_registration_t *); @@ -113,7 +183,7 @@ static inline bool mca_rcache_rgpusm_deregister_lru(mca_rcache_base_module_t *rc /* Drop the rcache lock while we deregister the memory */ OPAL_THREAD_UNLOCK(&rcache->lock); assert(old_reg->ref_count == 0); - rc = cuda_closememhandle(NULL, old_reg); + rc = mca_rcache_rgpusm_close_mem_handle(NULL, old_reg); OPAL_THREAD_LOCK(&rcache->lock); /* This introduces a potential leak of registrations if @@ -145,7 +215,7 @@ void mca_rcache_rgpusm_module_init(mca_rcache_rgpusm_module_t *rcache) rcache->vma_module = mca_rcache_base_vma_module_alloc(); OBJ_CONSTRUCT(&rcache->reg_list, opal_free_list_t); - opal_free_list_init(&rcache->reg_list, sizeof(struct mca_rcache_common_cuda_reg_t), + opal_free_list_init(&rcache->reg_list, sizeof(struct mca_opal_cuda_reg_t), opal_cache_line_size, OBJ_CLASS(mca_rcache_base_registration_t), 0, opal_cache_line_size, 0, -1, 32, NULL, 0, NULL, NULL, NULL); OBJ_CONSTRUCT(&rcache->lru_list, opal_list_t); @@ -164,8 +234,8 @@ int mca_rcache_rgpusm_register(mca_rcache_base_module_t *rcache, void *addr, siz mca_rcache_base_registration_t **reg) { mca_rcache_rgpusm_module_t 
*rcache_rgpusm = (mca_rcache_rgpusm_module_t *) rcache; - mca_rcache_common_cuda_reg_t *rgpusm_reg; - mca_rcache_common_cuda_reg_t *rget_reg; + mca_opal_cuda_reg_t *rgpusm_reg; + mca_opal_cuda_reg_t *rget_reg; opal_free_list_item_t *item; int rc; int mypeer; /* just for debugging */ @@ -174,7 +244,7 @@ int mca_rcache_rgpusm_register(mca_rcache_base_module_t *rcache, void *addr, siz * function, we are using the **reg variable to not only get back the * registration information, but to hand in the memory handle received * from the remote side. */ - rget_reg = (mca_rcache_common_cuda_reg_t *) *reg; + rget_reg = (mca_opal_cuda_reg_t *) *reg; mypeer = flags; flags = 0; @@ -193,7 +263,7 @@ int mca_rcache_rgpusm_register(mca_rcache_base_module_t *rcache, void *addr, siz if (NULL == item) { return OPAL_ERR_OUT_OF_RESOURCE; } - rgpusm_reg = (mca_rcache_common_cuda_reg_t *) item; + rgpusm_reg = (mca_opal_cuda_reg_t *) item; rgpusm_reg->base.rcache = rcache; rgpusm_reg->base.base = addr; rgpusm_reg->base.bound = (unsigned char *) addr + size - 1; @@ -207,8 +277,7 @@ int mca_rcache_rgpusm_register(mca_rcache_base_module_t *rcache, void *addr, siz /* The rget_reg registration is holding the memory handle needed * to register the remote memory. This was received from the remote * process. A pointer to the memory is returned in the alloc_base field. */ - rc = cuda_openmemhandle(addr, size, (mca_rcache_base_registration_t *) rgpusm_reg, - (mca_rcache_base_registration_t *) rget_reg); + rc = mca_rcache_rgpusm_open_mem_handle(addr, size, (mca_rcache_base_registration_t *) rgpusm_reg); /* This error should not happen with no cache in use. 
*/ assert(OPAL_ERR_WOULD_BLOCK != rc); @@ -240,8 +309,8 @@ int mca_rcache_rgpusm_register(mca_rcache_base_module_t *rcache, void *addr, siz (int) size, (*reg)->base, (int) ((*reg)->bound - (*reg)->base)); if (0 == - memcmp(((mca_rcache_common_cuda_reg_t *)*reg)->data.memHandle, rget_reg->data.memHandle, - sizeof(((mca_rcache_common_cuda_reg_t *)*reg)->data.memHandle))) { + memcmp(((mca_opal_cuda_reg_t *)*reg)->data.memHandle, rget_reg->data.memHandle, + sizeof(((mca_opal_cuda_reg_t *)*reg)->data.memHandle))) { /* Registration matches what was requested. All is good. */ rcache_rgpusm->stat_cache_valid++; } else { @@ -306,7 +375,7 @@ int mca_rcache_rgpusm_register(mca_rcache_base_module_t *rcache, void *addr, siz OPAL_THREAD_UNLOCK(&rcache->lock); return OPAL_ERR_OUT_OF_RESOURCE; } - rgpusm_reg = (mca_rcache_common_cuda_reg_t *) item; + rgpusm_reg = (mca_opal_cuda_reg_t *) item; rgpusm_reg->base.rcache = rcache; rgpusm_reg->base.base = addr; @@ -321,8 +390,7 @@ int mca_rcache_rgpusm_register(mca_rcache_base_module_t *rcache, void *addr, siz * bound values may be changed by the registration. The memory * associated with the handle comes back in the alloc_base * value. */ - rc = cuda_openmemhandle(addr, size, (mca_rcache_base_registration_t *) rgpusm_reg, - (mca_rcache_base_registration_t *) rget_reg); + rc = mca_rcache_rgpusm_open_mem_handle(addr, size, (mca_rcache_base_registration_t *) rgpusm_reg); /* There is a chance we can get the OPAL_ERR_WOULD_BLOCK from the * CUDA codes attempt to register the memory. The case that this * can happen is as follows. A block of memory is registered. @@ -360,8 +428,7 @@ int mca_rcache_rgpusm_register(mca_rcache_base_module_t *rcache, void *addr, siz rcache_rgpusm->stat_evicted++; /* And try again. This one usually works. 
*/ - rc = cuda_openmemhandle(addr, size, (mca_rcache_base_registration_t *) rgpusm_reg, - (mca_rcache_base_registration_t *) rget_reg); + rc = mca_rcache_rgpusm_open_mem_handle(addr, size, (mca_rcache_base_registration_t *) rgpusm_reg); } /* There is a chance that another registration is blocking our @@ -373,8 +440,7 @@ int mca_rcache_rgpusm_register(mca_rcache_base_module_t *rcache, void *addr, siz break; } /* Clear out one registration. */ - rc = cuda_openmemhandle(addr, size, (mca_rcache_base_registration_t *) rgpusm_reg, - (mca_rcache_base_registration_t *) rget_reg); + rc = mca_rcache_rgpusm_open_mem_handle(addr, size, (mca_rcache_base_registration_t *) rgpusm_reg); } } @@ -507,7 +573,7 @@ int mca_rcache_rgpusm_deregister(struct mca_rcache_base_module_t *rcache, { assert(reg->ref_count == 0); - rc = cuda_closememhandle(NULL, reg); + rc = mca_rcache_rgpusm_close_mem_handle(NULL, reg); } OPAL_THREAD_LOCK(&rcache->lock); @@ -543,7 +609,7 @@ int mca_rcache_rgpusm_deregister_no_lock(struct mca_rcache_base_module_t *rcache mca_rcache_base_vma_delete(rcache_rgpusm->vma_module, reg); assert(reg->ref_count == 0); - rc = cuda_closememhandle(NULL, reg); + rc = mca_rcache_rgpusm_close_mem_handle(NULL, reg); if (OPAL_SUCCESS == rc) { opal_free_list_return(&rcache_rgpusm->reg_list, (opal_free_list_item_t *) reg); @@ -594,7 +660,7 @@ void mca_rcache_rgpusm_finalize(struct mca_rcache_base_module_t *rcache) /* Drop lock before deregistering memory */ OPAL_THREAD_UNLOCK(&rcache->lock); assert(reg->ref_count == 0); - rc = cuda_closememhandle(NULL, reg); + rc = mca_rcache_rgpusm_close_mem_handle(NULL, reg); OPAL_THREAD_LOCK(&rcache->lock); if (rc != OPAL_SUCCESS) {