diff --git a/config/opal_check_cuda.m4 b/config/opal_check_cuda.m4 index 43b4a3662ac..6405ed0ad81 100644 --- a/config/opal_check_cuda.m4 +++ b/config/opal_check_cuda.m4 @@ -27,7 +27,22 @@ dnl dnl $HEADER$ dnl + +# OPAL_CHECK_CUDA(prefix, [action-if-found], [action-if-not-found]) +# -------------------------------------------------------- +# check if CUDA support can be found. sets prefix_{CPPFLAGS, +# LDFLAGS, LIBS} as needed and runs action-if-found if there is +# support, otherwise executes action-if-not-found + +# +# Check for CUDA support +# AC_DEFUN([OPAL_CHECK_CUDA],[ +OPAL_VAR_SCOPE_PUSH([cuda_save_CPPFLAGS cuda_save_LDFLAGS cuda_save_LIBS]) + +cuda_save_CPPFLAGS="$CPPFLAGS" +cuda_save_LDFLAGS="$LDFLAGS" +cuda_save_LIBS="$LIBS" # # Check to see if user wants CUDA support # @@ -72,12 +87,15 @@ AS_IF([test "$with_cuda" = "no" || test "x$with_cuda" = "x"], opal_cuda_incdir="$with_cuda/include" AC_MSG_RESULT([found ($opal_cuda_incdir/cuda.h)])])])])]) -dnl We cannot have CUDA support without dlopen support. HOWEVER, at -dnl this point in configure, we can't know whether the DL framework -dnl has been configured or not yet (it likely hasn't, since CUDA is a -dnl common framework, and likely configured first). So we have to -dnl defer this check until later (see the OPAL_CHECK_CUDA_AFTER_OPAL_DL m4 -dnl macro, below). :-( +AS_IF([test "$opal_check_cuda_happy" = "yes"], + [OAC_CHECK_PACKAGE([cuda], + [$1], + [cuda.h], + [cuda], + [cuMemFree], + [opal_check_cuda_happy="yes"], + [opal_check_cuda_happy="no"])], + []) # We require CUDA IPC support which started in CUDA 4.1. Error # out if the support is not there. 
@@ -144,22 +162,9 @@ AM_CONDITIONAL([OPAL_cuda_gdr_support], [test "x$CUDA_VERSION_60_OR_GREATER" = " AC_DEFINE_UNQUOTED([OPAL_CUDA_GDR_SUPPORT],$CUDA_VERSION_60_OR_GREATER, [Whether we have CUDA GDR support available]) +CPPFLAGS=${cuda_save_CPPFLAGS} +LDFLAGS=${cuda_save_LDFLAGS} +LIBS=${cuda_save_LIBS} +OPAL_VAR_SCOPE_POP ]) -dnl -dnl CUDA support requires DL support (it dynamically opens the CUDA -dnl library at run time). But we do not check for OPAL DL support -dnl until lafter the initial OPAL_CHECK_CUDA is called. So put the -dnl CUDA+DL check in a separate macro that can be called after the DL MCA -dnl framework checks in the top-level configure.ac. -dnl -AC_DEFUN([OPAL_CHECK_CUDA_AFTER_OPAL_DL],[ - - # We cannot have CUDA support without OPAL DL support. Error out - # if the user wants CUDA but we do not have OPAL DL support. - AS_IF([test $OPAL_HAVE_DL_SUPPORT -eq 0 && \ - test "$opal_check_cuda_happy" = "yes"], - [AC_MSG_WARN([--with-cuda was specified, but dlopen support is disabled.]) - AC_MSG_WARN([You must reconfigure Open MPI with dlopen ("dl") support.]) - AC_MSG_ERROR([Cannot continue.])]) -]) diff --git a/config/opal_config_files.m4 b/config/opal_config_files.m4 index 18cbe0066e6..78358d998c1 100644 --- a/config/opal_config_files.m4 +++ b/config/opal_config_files.m4 @@ -17,7 +17,6 @@ AC_DEFUN([OPAL_CONFIG_FILES],[ AC_CONFIG_FILES([ opal/Makefile - opal/cuda/Makefile opal/etc/Makefile opal/include/Makefile opal/datatype/Makefile diff --git a/configure.ac b/configure.ac index c87f5f64c78..6ee1de964a2 100644 --- a/configure.ac +++ b/configure.ac @@ -987,7 +987,6 @@ AC_CACHE_SAVE opal_show_title "System-specific tests" -OPAL_CHECK_CUDA ################################## OPAL_CHECK_OS_FLAVORS @@ -1233,8 +1232,6 @@ AC_CACHE_SAVE # be done better by having some kind of "run this check at the end of # all other MCA checks" hook...? 
-OPAL_CHECK_CUDA_AFTER_OPAL_DL - OPAL_CHECK_ROCM_AFTER_OPAL_DL ################################## diff --git a/opal/Makefile.am b/opal/Makefile.am index a24d3d3114a..1aad41b8ffb 100644 --- a/opal/Makefile.am +++ b/opal/Makefile.am @@ -22,26 +22,18 @@ # $HEADER$ # -if OPAL_cuda_support -LIBOPAL_GPU_SUBDIR = cuda -LIBOPAL_GPU_LA = cuda/libopalcuda.la -endif - - SUBDIRS = \ include \ datatype \ etc \ util \ mca/base \ - $(LIBOPAL_GPU_SUBDIR) \ $(MCA_opal_FRAMEWORKS_SUBDIRS) \ $(MCA_opal_FRAMEWORK_COMPONENT_STATIC_SUBDIRS) \ . \ $(MCA_opal_FRAMEWORK_COMPONENT_DSO_SUBDIRS) DIST_SUBDIRS = \ include \ - cuda \ datatype \ etc \ util \ @@ -67,13 +59,11 @@ lib@OPAL_LIB_NAME@_la_LIBADD = \ libopen-pal_core.la \ datatype/libdatatype.la \ util/libopalutil.la \ - $(LIBOPAL_GPU_LA) \ $(MCA_opal_FRAMEWORK_LIBS) lib@OPAL_LIB_NAME@_la_DEPENDENCIES = \ libopen-pal_core.la \ datatype/libdatatype.la \ util/libopalutil.la \ - $(LIBOPAL_GPU_LA) \ $(MCA_opal_FRAMEWORK_LIBS) lib@OPAL_LIB_NAME@_la_LDFLAGS = -version-info @libopen_pal_so_version@ diff --git a/opal/cuda/Makefile.am b/opal/cuda/Makefile.am deleted file mode 100644 index b9f6db41ff6..00000000000 --- a/opal/cuda/Makefile.am +++ /dev/null @@ -1,44 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2013 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved. -# Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -AM_CPPFLAGS = $(common_cuda_CPPFLAGS) - -# Header files -headers = \ - common_cuda.h - -# Source files -sources = \ - common_cuda.c - -dist_opaldata_DATA = help-mpi-common-cuda.txt - -noinst_LTLIBRARIES = libopalcuda.la - -libopalcuda_la_SOURCES = $(headers) $(sources) -libopalcuda_la_LDFLAGS = -libopalcuda_la_LIBADD = - -# Conditionally install the header files -if WANT_INSTALL_HEADERS -opaldir = $(opalincludedir)/$(subdir) -opal_HEADERS = $(headers) -endif diff --git a/opal/cuda/README.md b/opal/cuda/README.md deleted file mode 100644 index 770c367f69d..00000000000 --- a/opal/cuda/README.md +++ /dev/null @@ -1,47 +0,0 @@ -# A Developer's Note on OMPI CUDA Code - -The initial CUDA implementation in Open MPI was not well factored. -Most of the developers at the time didn't really understand CUDA (or -GPUs), and the developers working on CUDA were new to Open MPI's -abstractions. It was also unclear whether there would be another -interface for someone else's GPUs or whether the world would choose -CUDA. With this background, choices were made. - -The initial implementation put much of the cuda buffer handling -functions in the datatype engine, including the code to determine if -an address referred to a CUDA buffer. Many of the users of those -functions were also users of the datatype engine, so it made sense. -There was also a common/cuda library, which provided wrappers around -common cuda functions. The common/cuda library (usually itself -built as a dso) dlopen'ed the base cuda library, so that no part of -Open MPI had a loader-time dependency on the cuda library. - -In 2021, the default build mode for components (including common -components) was changed from DSO to static (ie, part of the base -library, which may still be a dynamic library) to reduce startup -time. The OFI MTL was also updated to support CUDA buffers, which -required some changes to the datatype interface. 
During those -changes, George rightly pushed that the CUDA specific code belonged -not in the datatype engine, but in a CUDA-specific library. The -develoepr working on the OFI MTL code dutifully moved the code, not -realizing that he had created a circular dependency that broke the -ability of common/cuda to build as a DSO. The datatype engine -depended on functions in the common/cuda library, but the common/cuda -library depended on libopen-pal. - -To fix this issue with minimal interruption to the 5.0 schedule, we -moved the common/cuda component into libopen-pal (ie, it is no longer -a component, but just part of the base library). Because the cuda -libraries are still dlopen'ed by the OMPI cuda code, this does not -introduce a loader-time dependency on the cuda libraries from Open -MPI, but does break the cycle described above. This is not a great -abstraction situation, but works. - -The "right" solution is an accelerator framework that is in OPAL, -which encapsulates the functions that Open MPI requires from an -accelerator (CUDA, ROCm, Xe, etc.), as we now know there will be more -than one accelerator interface in the world. An initial take is -proposed in https://github.com/open-mpi/ompi/pull/10069, although -significant work remains to prove that said interface is sufficient to -abstract an accelerator interface (where sufficient is defined as "no -`#if HAVE_CUDA` macros in the general codebase"). diff --git a/opal/cuda/common_cuda.c b/opal/cuda/common_cuda.c deleted file mode 100644 index 667ad6bdcad..00000000000 --- a/opal/cuda/common_cuda.c +++ /dev/null @@ -1,2309 +0,0 @@ -/* - * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
- * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * This file contains various support functions for doing CUDA - * operations. - */ -#include "opal_config.h" - -#include -#include -#include - -#include "opal/align.h" -#include "opal/datatype/opal_convertor.h" -#include "opal/util/argv.h" -#include "opal/util/output.h" -#include "opal/util/printf.h" -#include "opal/util/proc.h" -#include "opal/util/show_help.h" - -#include "opal/mca/dl/base/base.h" -#include "opal/mca/rcache/base/base.h" -#include "opal/mca/timer/base/base.h" -#include "opal/runtime/opal_params.h" - -#include "common_cuda.h" - -/** - * Since function names can get redefined in cuda.h file, we need to do this - * stringifying to get the latest function name from the header file. For - * example, cuda.h may have something like this: - * #define cuMemFree cuMemFree_v2 - * We want to make sure we find cuMemFree_v2, not cuMemFree. 
- */ -#define STRINGIFY2(x) #x -#define STRINGIFY(x) STRINGIFY2(x) - -#define OPAL_CUDA_DLSYM(libhandle, funcName) \ - do { \ - char *err_msg; \ - void *ptr; \ - if (OPAL_SUCCESS != opal_dl_lookup(libhandle, STRINGIFY(funcName), &ptr, &err_msg)) { \ - opal_show_help("help-mpi-common-cuda.txt", "dlsym failed", true, STRINGIFY(funcName), \ - err_msg); \ - return 1; \ - } else { \ - *(void **) (&cuFunc.funcName) = ptr; \ - opal_output_verbose(15, mca_common_cuda_output, "CUDA: successful dlsym of %s", \ - STRINGIFY(funcName)); \ - } \ - } while (0) - -/* Structure to hold CUDA function pointers that get dynamically loaded. */ -struct cudaFunctionTable { - int (*cuPointerGetAttribute)(void *, CUpointer_attribute, CUdeviceptr); - int (*cuMemcpyAsync)(CUdeviceptr, CUdeviceptr, size_t, CUstream); - int (*cuMemcpy)(CUdeviceptr, CUdeviceptr, size_t); - int (*cuMemAlloc)(CUdeviceptr *, size_t); - int (*cuMemFree)(CUdeviceptr buf); - int (*cuCtxGetCurrent)(void *cuContext); - int (*cuStreamCreate)(CUstream *, int); - int (*cuEventCreate)(CUevent *, int); - int (*cuEventRecord)(CUevent, CUstream); - int (*cuMemHostRegister)(void *, size_t, unsigned int); - int (*cuMemHostUnregister)(void *); - int (*cuEventQuery)(CUevent); - int (*cuEventDestroy)(CUevent); - int (*cuStreamWaitEvent)(CUstream, CUevent, unsigned int); - int (*cuMemGetAddressRange)(CUdeviceptr *, size_t *, CUdeviceptr); - int (*cuIpcGetEventHandle)(CUipcEventHandle *, CUevent); - int (*cuIpcOpenEventHandle)(CUevent *, CUipcEventHandle); - int (*cuIpcOpenMemHandle)(CUdeviceptr *, CUipcMemHandle, unsigned int); - int (*cuIpcCloseMemHandle)(CUdeviceptr); - int (*cuIpcGetMemHandle)(CUipcMemHandle *, CUdeviceptr); - int (*cuCtxGetDevice)(CUdevice *); - int (*cuDeviceCanAccessPeer)(int *, CUdevice, CUdevice); - int (*cuDeviceGet)(CUdevice *, int); -#if OPAL_CUDA_GDR_SUPPORT - int (*cuPointerSetAttribute)(const void *, CUpointer_attribute, CUdeviceptr); -#endif /* OPAL_CUDA_GDR_SUPPORT */ - int 
(*cuCtxSetCurrent)(CUcontext); - int (*cuEventSynchronize)(CUevent); - int (*cuStreamSynchronize)(CUstream); - int (*cuStreamDestroy)(CUstream); -#if OPAL_CUDA_GET_ATTRIBUTES - int (*cuPointerGetAttributes)(unsigned int, CUpointer_attribute *, void **, CUdeviceptr); -#endif /* OPAL_CUDA_GET_ATTRIBUTES */ -}; -typedef struct cudaFunctionTable cudaFunctionTable_t; -static cudaFunctionTable_t cuFunc; - -static int stage_one_init_ref_count = 0; -static bool stage_three_init_complete = false; -static bool common_cuda_initialized = false; -static bool common_cuda_mca_parames_registered = false; -static int mca_common_cuda_verbose; -static int mca_common_cuda_output = 0; -bool mca_common_cuda_enabled = false; -static bool mca_common_cuda_register_memory = true; -static bool mca_common_cuda_warning = false; -static opal_list_t common_cuda_memory_registrations; -static CUstream ipcStream = NULL; -static CUstream dtohStream = NULL; -static CUstream htodStream = NULL; -static CUstream memcpyStream = NULL; -static int mca_common_cuda_gpu_mem_check_workaround = (CUDA_VERSION > 7000) ? 
0 : 1; -static opal_mutex_t common_cuda_init_lock; -static opal_mutex_t common_cuda_htod_lock; -static opal_mutex_t common_cuda_dtoh_lock; -static opal_mutex_t common_cuda_ipc_lock; - -/* Functions called by opal layer - plugged into opal function table */ -static int mca_common_cuda_is_gpu_buffer(const void *, opal_convertor_t *); -static int mca_common_cuda_memmove(void *, void *, size_t); -static int mca_common_cuda_cu_memcpy_async(void *, const void *, size_t, opal_convertor_t *); -static int mca_common_cuda_cu_memcpy(void *, const void *, size_t); - -/* Function that gets plugged into opal layer */ -static int mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t *); - -/* Structure to hold memory registrations that are delayed until first - * call to send or receive a GPU pointer */ -struct common_cuda_mem_regs_t { - opal_list_item_t super; - void *ptr; - size_t amount; - char *msg; -}; -typedef struct common_cuda_mem_regs_t common_cuda_mem_regs_t; -OBJ_CLASS_DECLARATION(common_cuda_mem_regs_t); -OBJ_CLASS_INSTANCE(common_cuda_mem_regs_t, opal_list_item_t, NULL, NULL); - -static int mca_common_cuda_async = 1; -static int mca_common_cuda_cumemcpy_async; -#if OPAL_ENABLE_DEBUG -static int mca_common_cuda_cumemcpy_timing; -#endif /* OPAL_ENABLE_DEBUG */ - -/* Array of CUDA events to be queried for IPC stream, sending side and - * receiving side. 
*/ -CUevent *cuda_event_ipc_array = NULL; -CUevent *cuda_event_dtoh_array = NULL; -CUevent *cuda_event_htod_array = NULL; - -/* Array of fragments currently being moved by cuda async non-blocking - * operations */ -struct mca_btl_base_descriptor_t **cuda_event_ipc_frag_array = NULL; -struct mca_btl_base_descriptor_t **cuda_event_dtoh_frag_array = NULL; -struct mca_btl_base_descriptor_t **cuda_event_htod_frag_array = NULL; - -/* First free/available location in cuda_event_status_array */ -static int cuda_event_ipc_first_avail, cuda_event_dtoh_first_avail, cuda_event_htod_first_avail; - -/* First currently-being used location in the cuda_event_status_array */ -static int cuda_event_ipc_first_used, cuda_event_dtoh_first_used, cuda_event_htod_first_used; - -/* Number of status items currently in use */ -static volatile int cuda_event_ipc_num_used, cuda_event_dtoh_num_used, cuda_event_htod_num_used; - -/* Size of array holding events */ -int cuda_event_max = 400; -static int cuda_event_ipc_most = 0; -static int cuda_event_dtoh_most = 0; -static int cuda_event_htod_most = 0; - -/* Handle to libcuda.so */ -opal_dl_handle_t *libcuda_handle = NULL; - -/* Unused variable that we register at init time and unregister at fini time. - * This is used to detect if user has done a device reset prior to MPI_Finalize. - * This is a workaround to avoid SEGVs. - */ -static int checkmem; -static int ctx_ok = 1; - -#define CUDA_COMMON_TIMING 0 -#if OPAL_ENABLE_DEBUG -/* Some timing support structures. Enable this to help analyze - * internal performance issues. */ -static opal_timer_t ts_start; -static opal_timer_t ts_end; -static double accum; -# define THOUSAND 1000L -# define MILLION 1000000L -static float mydifftime(opal_timer_t ts_start, opal_timer_t ts_end); -#endif /* OPAL_ENABLE_DEBUG */ - -/* These functions are typically unused in the optimized builds. 
*/ -static void cuda_dump_evthandle(int, void *, char *) __opal_attribute_unused__; -static void cuda_dump_memhandle(int, void *, char *) __opal_attribute_unused__; -#if OPAL_ENABLE_DEBUG -# define CUDA_DUMP_MEMHANDLE(a) cuda_dump_memhandle a -# define CUDA_DUMP_EVTHANDLE(a) cuda_dump_evthandle a -#else -# define CUDA_DUMP_MEMHANDLE(a) -# define CUDA_DUMP_EVTHANDLE(a) -#endif /* OPAL_ENABLE_DEBUG */ - -/* This is a separate function so we can see these variables with ompi_info and - * also set them with the tools interface */ -void mca_common_cuda_register_mca_variables(void) -{ - - if (false == common_cuda_mca_parames_registered) { - common_cuda_mca_parames_registered = true; - } - /* Set different levels of verbosity in the cuda related code. */ - mca_common_cuda_verbose = 0; - (void) mca_base_var_register("ompi", "mpi", "common_cuda", "verbose", - "Set level of common cuda verbosity", MCA_BASE_VAR_TYPE_INT, NULL, - 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &mca_common_cuda_verbose); - - /* Control whether system buffers get CUDA pinned or not. Allows for - * performance analysis. */ - mca_common_cuda_register_memory = true; - (void) mca_base_var_register("ompi", "mpi", "common_cuda", "register_memory", - "Whether to cuMemHostRegister preallocated BTL buffers", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &mca_common_cuda_register_memory); - - /* Control whether we see warnings when CUDA memory registration fails. This is - * useful when CUDA support is configured in, but we are running a regular MPI - * application without CUDA. 
*/ - mca_common_cuda_warning = true; - (void) mca_base_var_register("ompi", "mpi", "common_cuda", "warning", - "Whether to print warnings when CUDA registration fails", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &mca_common_cuda_warning); - - /* Use this flag to test async vs sync copies */ - mca_common_cuda_async = 1; - (void) mca_base_var_register("ompi", "mpi", "common_cuda", "memcpy_async", - "Set to 0 to force CUDA sync copy instead of async", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &mca_common_cuda_async); - - /* Use this parameter to increase the number of outstanding events allows */ - (void) mca_base_var_register("ompi", "mpi", "common_cuda", "event_max", - "Set number of outstanding CUDA events", MCA_BASE_VAR_TYPE_INT, - NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &cuda_event_max); - - /* Use this flag to test cuMemcpyAsync vs cuMemcpy */ - mca_common_cuda_cumemcpy_async = 1; - (void) mca_base_var_register( - "ompi", "mpi", "common_cuda", "cumemcpy_async", - "Set to 0 to force CUDA cuMemcpy instead of cuMemcpyAsync/cuStreamSynchronize", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, - &mca_common_cuda_cumemcpy_async); - -#if OPAL_ENABLE_DEBUG - /* Use this flag to dump out timing of cumempcy sync and async */ - mca_common_cuda_cumemcpy_timing = 0; - (void) mca_base_var_register("ompi", "mpi", "common_cuda", "cumemcpy_timing", - "Set to 1 to dump timing of eager copies", MCA_BASE_VAR_TYPE_INT, - NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, - &mca_common_cuda_cumemcpy_timing); -#endif /* OPAL_ENABLE_DEBUG */ - - (void) mca_base_var_register( - "ompi", "mpi", "common_cuda", "gpu_mem_check_workaround", - "Set to 0 to disable GPU memory check workaround. 
A user would rarely have to do this.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &mca_common_cuda_gpu_mem_check_workaround); -} - -/** - * This is the first stage of initialization. This function is called - * explicitly by any BTLs that can support CUDA-aware. It is called during - * the component open phase of initialization. This function will look for - * the SONAME of the library which is libcuda.so.1. In most cases, this will - * result in the library found. However, there are some setups that require - * the extra steps for searching. This function will then load the symbols - * needed from the CUDA driver library. Any failure will result in this - * initialization failing and status will be set showing that. - */ -int mca_common_cuda_stage_one_init(void) -{ - int retval, i, j; - char *cudalibs[] = {"libcuda.so.1", "libcuda.dylib", NULL}; - char *searchpaths[] = {"", "/usr/lib64", NULL}; - char **errmsgs = NULL; - char *errmsg = NULL; - int errsize; - bool stage_one_init_passed = false; - - stage_one_init_ref_count++; - if (stage_one_init_ref_count > 1) { - opal_output_verbose(10, mca_common_cuda_output, - "CUDA: stage_one_init_ref_count is now %d, no need to init", - stage_one_init_ref_count); - return OPAL_SUCCESS; - } - - /* This is a no-op in most cases as the parameters were registered earlier */ - mca_common_cuda_register_mca_variables(); - - OBJ_CONSTRUCT(&common_cuda_init_lock, opal_mutex_t); - OBJ_CONSTRUCT(&common_cuda_htod_lock, opal_mutex_t); - OBJ_CONSTRUCT(&common_cuda_dtoh_lock, opal_mutex_t); - OBJ_CONSTRUCT(&common_cuda_ipc_lock, opal_mutex_t); - - mca_common_cuda_output = opal_output_open(NULL); - opal_output_set_verbosity(mca_common_cuda_output, mca_common_cuda_verbose); - - opal_output_verbose(10, mca_common_cuda_output, - "CUDA: stage_one_init_ref_count is now %d, initializing", - stage_one_init_ref_count); - - /* First check if the support is enabled. 
In the case that the user has - * turned it off, we do not need to continue with any CUDA specific - * initialization. Do this after MCA parameter registration. */ - if (!opal_cuda_support) { - return 1; - } - - if (!OPAL_HAVE_DL_SUPPORT) { - opal_show_help("help-mpi-common-cuda.txt", "dlopen disabled", true); - return 1; - } - - /* Now walk through all the potential names libcuda and find one - * that works. If it does, all is good. If not, print out all - * the messages about why things failed. This code was careful - * to try and save away all error messages if the loading ultimately - * failed to help with debugging. - * - * NOTE: On the first loop we just utilize the default loading - * paths from the system. For the second loop, set /usr/lib64 to - * the search path and try again. This is done to handle the case - * where we have both 32 and 64 bit libcuda.so libraries - * installed. Even when running in 64-bit mode, the /usr/lib - * directory is searched first and we may find a 32-bit - * libcuda.so.1 library. Loading of this library will fail as the - * OPAL DL framework does not handle having the wrong ABI in the - * search path (unlike ld or ld.so). Note that we only set this - * search path after the original search. This is so that - * LD_LIBRARY_PATH and run path settings are respected. Setting - * this search path overrides them (rather then being - * appended). 
*/ - j = 0; - while (searchpaths[j] != NULL) { - i = 0; - while (cudalibs[i] != NULL) { - char *filename = NULL; - char *str = NULL; - - /* If there's a non-empty search path, prepend it - to the library filename */ - if (strlen(searchpaths[j]) > 0) { - opal_asprintf(&filename, "%s/%s", searchpaths[j], cudalibs[i]); - } else { - filename = strdup(cudalibs[i]); - } - if (NULL == filename) { - opal_show_help("help-mpi-common-cuda.txt", "No memory", true, - OPAL_PROC_MY_HOSTNAME); - return 1; - } - - retval = opal_dl_open(filename, false, false, &libcuda_handle, &str); - if (OPAL_SUCCESS != retval || NULL == libcuda_handle) { - if (NULL != str) { - opal_argv_append(&errsize, &errmsgs, str); - } else { - opal_argv_append(&errsize, &errmsgs, "opal_dl_open() returned NULL."); - } - opal_output_verbose(10, mca_common_cuda_output, "CUDA: Library open error: %s", - errmsgs[errsize - 1]); - } else { - opal_output_verbose(10, mca_common_cuda_output, - "CUDA: Library successfully opened %s", cudalibs[i]); - stage_one_init_passed = true; - break; - } - i++; - - free(filename); - } - if (true == stage_one_init_passed) { - break; /* Break out of outer loop */ - } - j++; - } - - if (true != stage_one_init_passed) { - errmsg = opal_argv_join(errmsgs, '\n'); - if (opal_warn_on_missing_libcuda) { - opal_show_help("help-mpi-common-cuda.txt", "dlopen failed", true, errmsg); - } - opal_cuda_support = 0; - } - opal_argv_free(errmsgs); - free(errmsg); - - if (true != stage_one_init_passed) { - return 1; - } - opal_cuda_add_initialization_function(&mca_common_cuda_stage_two_init); - OBJ_CONSTRUCT(&common_cuda_memory_registrations, opal_list_t); - - /* Map in the functions that we need. Note that if there is an error - * the macro OPAL_CUDA_DLSYM will print an error and call return. 
*/ - OPAL_CUDA_DLSYM(libcuda_handle, cuStreamCreate); - OPAL_CUDA_DLSYM(libcuda_handle, cuCtxGetCurrent); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventCreate); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventRecord); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemHostRegister); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemHostUnregister); - OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttribute); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventQuery); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventDestroy); - OPAL_CUDA_DLSYM(libcuda_handle, cuStreamWaitEvent); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemcpyAsync); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemcpy); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemFree); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemAlloc); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemGetAddressRange); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcGetEventHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcOpenEventHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcOpenMemHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcCloseMemHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcGetMemHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuCtxGetDevice); - OPAL_CUDA_DLSYM(libcuda_handle, cuDeviceCanAccessPeer); - OPAL_CUDA_DLSYM(libcuda_handle, cuDeviceGet); -#if OPAL_CUDA_GDR_SUPPORT - OPAL_CUDA_DLSYM(libcuda_handle, cuPointerSetAttribute); -#endif /* OPAL_CUDA_GDR_SUPPORT */ - OPAL_CUDA_DLSYM(libcuda_handle, cuCtxSetCurrent); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventSynchronize); - OPAL_CUDA_DLSYM(libcuda_handle, cuStreamSynchronize); - OPAL_CUDA_DLSYM(libcuda_handle, cuStreamDestroy); -#if OPAL_CUDA_GET_ATTRIBUTES - OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttributes); -#endif /* OPAL_CUDA_GET_ATTRIBUTES */ - opal_cuda_runtime_initialized = true; - return 0; -} - -/** - * This function is registered with the OPAL CUDA support. In that way, - * these function pointers will be loaded into the OPAL CUDA code when - * the first convertor is initialized. 
This does not trigger any CUDA - * specific initialization as this may just be a host buffer that is - * triggering this call. - */ -static int mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t *ftable) -{ - if (OPAL_UNLIKELY(!opal_cuda_support)) { - return OPAL_ERROR; - } - - ftable->gpu_is_gpu_buffer = &mca_common_cuda_is_gpu_buffer; - ftable->gpu_cu_memcpy_async = &mca_common_cuda_cu_memcpy_async; - ftable->gpu_cu_memcpy = &mca_common_cuda_cu_memcpy; - ftable->gpu_memmove = &mca_common_cuda_memmove; - ftable->gpu_malloc = &mca_common_cuda_malloc; - ftable->gpu_free = &mca_common_cuda_free; - - opal_output_verbose(30, mca_common_cuda_output, "CUDA: support functions initialized"); - return OPAL_SUCCESS; -} - -/** - * This is the last phase of initialization. This is triggered when we examine - * a buffer pointer and determine it is a GPU buffer. We then assume the user - * has selected their GPU and we can go ahead with all the CUDA related - * initializations. If we get an error, just return. Cleanup of resources - * will happen when fini is called. 
- */ -static int mca_common_cuda_stage_three_init(void) -{ - int i, s, rc; - CUresult res; - CUcontext cuContext; - common_cuda_mem_regs_t *mem_reg; - - OPAL_THREAD_LOCK(&common_cuda_init_lock); - opal_output_verbose(20, mca_common_cuda_output, "CUDA: entering stage three init"); - - /* Compiled without support or user disabled support */ - if (OPAL_UNLIKELY(!opal_cuda_support)) { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: No mpi cuda support, exiting stage three init"); - stage_three_init_complete = true; - OPAL_THREAD_UNLOCK(&common_cuda_init_lock); - return OPAL_ERROR; - } - - /* In case another thread snuck in and completed the initialization */ - if (true == stage_three_init_complete) { - if (common_cuda_initialized) { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: Stage three already complete, exiting stage three init"); - OPAL_THREAD_UNLOCK(&common_cuda_init_lock); - return OPAL_SUCCESS; - } else { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: Stage three already complete, failed during the init"); - OPAL_THREAD_UNLOCK(&common_cuda_init_lock); - return OPAL_ERROR; - } - } - - /* Check to see if this process is running in a CUDA context. If - * so, all is good. If not, then disable registration of memory. */ - res = cuFunc.cuCtxGetCurrent(&cuContext); - if (CUDA_SUCCESS != res) { - if (mca_common_cuda_warning) { - /* Check for the not initialized error since we can make suggestions to - * user for this error. 
*/ - if (CUDA_ERROR_NOT_INITIALIZED == res) { - opal_show_help("help-mpi-common-cuda.txt", "cuCtxGetCurrent failed not initialized", - true); - } else { - opal_show_help("help-mpi-common-cuda.txt", "cuCtxGetCurrent failed", true, res); - } - } - mca_common_cuda_enabled = false; - mca_common_cuda_register_memory = false; - } else if ((CUDA_SUCCESS == res) && (NULL == cuContext)) { - if (mca_common_cuda_warning) { - opal_show_help("help-mpi-common-cuda.txt", "cuCtxGetCurrent returned NULL", true); - } - mca_common_cuda_enabled = false; - mca_common_cuda_register_memory = false; - } else { - /* All is good. mca_common_cuda_register_memory will retain its original - * value. Normally, that is 1, but the user can override it to disable - * registration of the internal buffers. */ - mca_common_cuda_enabled = true; - opal_output_verbose(20, mca_common_cuda_output, "CUDA: cuCtxGetCurrent succeeded"); - } - - /* No need to go on at this point. If we cannot create a context and we are at - * the point where we are making MPI calls, it is time to fully disable - * CUDA support. - */ - if (false == mca_common_cuda_enabled) { - OPAL_THREAD_UNLOCK(&common_cuda_init_lock); - return OPAL_ERROR; - } - - if (true == mca_common_cuda_enabled) { - /* Set up an array to store outstanding IPC async copy events */ - cuda_event_ipc_num_used = 0; - cuda_event_ipc_first_avail = 0; - cuda_event_ipc_first_used = 0; - - cuda_event_ipc_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *)); - if (NULL == cuda_event_ipc_array) { - opal_show_help("help-mpi-common-cuda.txt", "No memory", true, OPAL_PROC_MY_HOSTNAME); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - - /* Create the events since they can be reused. 
*/ - for (i = 0; i < cuda_event_max; i++) { - res = cuFunc.cuEventCreate(&cuda_event_ipc_array[i], CU_EVENT_DISABLE_TIMING); - if (CUDA_SUCCESS != res) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", true, - OPAL_PROC_MY_HOSTNAME, res); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - } - - /* The first available status index is 0. Make an empty frag - array. */ - cuda_event_ipc_frag_array = (struct mca_btl_base_descriptor_t **) malloc( - sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max); - if (NULL == cuda_event_ipc_frag_array) { - opal_show_help("help-mpi-common-cuda.txt", "No memory", true, OPAL_PROC_MY_HOSTNAME); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - } - - if (true == mca_common_cuda_enabled) { - /* Set up an array to store outstanding async dtoh events. Used on the - * sending side for asynchronous copies. */ - cuda_event_dtoh_num_used = 0; - cuda_event_dtoh_first_avail = 0; - cuda_event_dtoh_first_used = 0; - - cuda_event_dtoh_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *)); - if (NULL == cuda_event_dtoh_array) { - opal_show_help("help-mpi-common-cuda.txt", "No memory", true, OPAL_PROC_MY_HOSTNAME); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - - /* Create the events since they can be reused. */ - for (i = 0; i < cuda_event_max; i++) { - res = cuFunc.cuEventCreate(&cuda_event_dtoh_array[i], CU_EVENT_DISABLE_TIMING); - if (CUDA_SUCCESS != res) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", true, - OPAL_PROC_MY_HOSTNAME, res); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - } - - /* The first available status index is 0. Make an empty frag - array. 
*/ - cuda_event_dtoh_frag_array = (struct mca_btl_base_descriptor_t **) malloc( - sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max); - if (NULL == cuda_event_dtoh_frag_array) { - opal_show_help("help-mpi-common-cuda.txt", "No memory", true, OPAL_PROC_MY_HOSTNAME); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - - /* Set up an array to store outstanding async htod events. Used on the - * receiving side for asynchronous copies. */ - cuda_event_htod_num_used = 0; - cuda_event_htod_first_avail = 0; - cuda_event_htod_first_used = 0; - - cuda_event_htod_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *)); - if (NULL == cuda_event_htod_array) { - opal_show_help("help-mpi-common-cuda.txt", "No memory", true, OPAL_PROC_MY_HOSTNAME); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - - /* Create the events since they can be reused. */ - for (i = 0; i < cuda_event_max; i++) { - res = cuFunc.cuEventCreate(&cuda_event_htod_array[i], CU_EVENT_DISABLE_TIMING); - if (CUDA_SUCCESS != res) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", true, - OPAL_PROC_MY_HOSTNAME, res); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - } - - /* The first available status index is 0. Make an empty frag - array. */ - cuda_event_htod_frag_array = (struct mca_btl_base_descriptor_t **) malloc( - sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max); - if (NULL == cuda_event_htod_frag_array) { - opal_show_help("help-mpi-common-cuda.txt", "No memory", true, OPAL_PROC_MY_HOSTNAME); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - } - - s = opal_list_get_size(&common_cuda_memory_registrations); - for (i = 0; i < s; i++) { - mem_reg = (common_cuda_mem_regs_t *) opal_list_remove_first( - &common_cuda_memory_registrations); - if (mca_common_cuda_enabled && mca_common_cuda_register_memory) { - res = cuFunc.cuMemHostRegister(mem_reg->ptr, mem_reg->amount, 0); - if (res != CUDA_SUCCESS) { - /* If registering the memory fails, print a message and continue. 
- * This is not a fatal error. */ - opal_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister during init failed", - true, mem_reg->ptr, mem_reg->amount, OPAL_PROC_MY_HOSTNAME, res, - mem_reg->msg); - } else { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuMemHostRegister OK on rcache %s: " - "address=%p, bufsize=%d", - mem_reg->msg, mem_reg->ptr, (int) mem_reg->amount); - } - } - free(mem_reg->msg); - OBJ_RELEASE(mem_reg); - } - - /* Create stream for use in ipc asynchronous copies */ - res = cuFunc.cuStreamCreate(&ipcStream, 0); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed", true, - OPAL_PROC_MY_HOSTNAME, res); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - - /* Create stream for use in dtoh asynchronous copies */ - res = cuFunc.cuStreamCreate(&dtohStream, 0); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed", true, - OPAL_PROC_MY_HOSTNAME, res); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - - /* Create stream for use in htod asynchronous copies */ - res = cuFunc.cuStreamCreate(&htodStream, 0); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed", true, - OPAL_PROC_MY_HOSTNAME, res); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - - if (mca_common_cuda_cumemcpy_async) { - /* Create stream for use in cuMemcpyAsync synchronous copies */ - res = cuFunc.cuStreamCreate(&memcpyStream, 0); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed", true, - OPAL_PROC_MY_HOSTNAME, res); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - } - - res = cuFunc.cuMemHostRegister(&checkmem, sizeof(int), 0); - if (res != CUDA_SUCCESS) { - /* If registering the memory fails, print a message and continue. - * This is not a fatal error. 
*/ - opal_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister during init failed", true, - &checkmem, sizeof(int), OPAL_PROC_MY_HOSTNAME, res, "checkmem"); - - } else { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuMemHostRegister OK on test region"); - } - - opal_output_verbose(20, mca_common_cuda_output, "CUDA: the extra gpu memory check is %s", - (mca_common_cuda_gpu_mem_check_workaround == 1) ? "on" : "off"); - - opal_output_verbose(30, mca_common_cuda_output, "CUDA: initialized"); - opal_atomic_mb(); /* Make sure next statement does not get reordered */ - common_cuda_initialized = true; - stage_three_init_complete = true; - OPAL_THREAD_UNLOCK(&common_cuda_init_lock); - return OPAL_SUCCESS; - - /* If we are here, something went wrong. Cleanup and return an error. */ -cleanup_and_error: - opal_atomic_mb(); /* Make sure next statement does not get reordered */ - stage_three_init_complete = true; - OPAL_THREAD_UNLOCK(&common_cuda_init_lock); - return rc; -} - -/** - * Cleanup all CUDA resources. - * - * Note: Still figuring out how to get cuMemHostUnregister called from the smcuda sm - * rcache. Looks like with the memory pool from openib (grdma), the unregistering is - * called as the free list is destructed. Not true for the sm mpool. This means we - * are currently still leaking some host memory we registered with CUDA. 
- */ -void mca_common_cuda_fini(void) -{ - int i; - CUresult res; - - if (false == common_cuda_initialized) { - stage_one_init_ref_count--; - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: mca_common_cuda_fini, never completed initialization so " - "skipping fini, ref_count is now %d", - stage_one_init_ref_count); - return; - } - - if (0 == stage_one_init_ref_count) { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: mca_common_cuda_fini, ref_count=%d, fini is already complete", - stage_one_init_ref_count); - return; - } - - if (1 == stage_one_init_ref_count) { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: mca_common_cuda_fini, ref_count=%d, cleaning up started", - stage_one_init_ref_count); - - /* This call is in here to make sure the context is still valid. - * This was the one way of checking which did not cause problems - * while calling into the CUDA library. This check will detect if - * a user has called cudaDeviceReset prior to MPI_Finalize. If so, - * then this call will fail and we skip cleaning up CUDA resources. 
*/ - res = cuFunc.cuMemHostUnregister(&checkmem); - if (CUDA_SUCCESS != res) { - ctx_ok = 0; - } - opal_output_verbose( - 20, mca_common_cuda_output, - "CUDA: mca_common_cuda_fini, cuMemHostUnregister returned %d, ctx_ok=%d", res, ctx_ok); - - if (NULL != cuda_event_ipc_array) { - if (ctx_ok) { - for (i = 0; i < cuda_event_max; i++) { - if (NULL != cuda_event_ipc_array[i]) { - cuFunc.cuEventDestroy(cuda_event_ipc_array[i]); - } - } - } - free(cuda_event_ipc_array); - } - if (NULL != cuda_event_htod_array) { - if (ctx_ok) { - for (i = 0; i < cuda_event_max; i++) { - if (NULL != cuda_event_htod_array[i]) { - cuFunc.cuEventDestroy(cuda_event_htod_array[i]); - } - } - } - free(cuda_event_htod_array); - } - - if (NULL != cuda_event_dtoh_array) { - if (ctx_ok) { - for (i = 0; i < cuda_event_max; i++) { - if (NULL != cuda_event_dtoh_array[i]) { - cuFunc.cuEventDestroy(cuda_event_dtoh_array[i]); - } - } - } - free(cuda_event_dtoh_array); - } - - if (NULL != cuda_event_ipc_frag_array) { - free(cuda_event_ipc_frag_array); - } - if (NULL != cuda_event_htod_frag_array) { - free(cuda_event_htod_frag_array); - } - if (NULL != cuda_event_dtoh_frag_array) { - free(cuda_event_dtoh_frag_array); - } - if ((NULL != ipcStream) && ctx_ok) { - cuFunc.cuStreamDestroy(ipcStream); - } - if ((NULL != dtohStream) && ctx_ok) { - cuFunc.cuStreamDestroy(dtohStream); - } - if ((NULL != htodStream) && ctx_ok) { - cuFunc.cuStreamDestroy(htodStream); - } - if ((NULL != memcpyStream) && ctx_ok) { - cuFunc.cuStreamDestroy(memcpyStream); - } - OBJ_DESTRUCT(&common_cuda_init_lock); - OBJ_DESTRUCT(&common_cuda_htod_lock); - OBJ_DESTRUCT(&common_cuda_dtoh_lock); - OBJ_DESTRUCT(&common_cuda_ipc_lock); - if (NULL != libcuda_handle) { - opal_dl_close(libcuda_handle); - } - - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: mca_common_cuda_fini, ref_count=%d, cleaning up all done", - stage_one_init_ref_count); - - opal_output_close(mca_common_cuda_output); - - } else { - opal_output_verbose(20, 
mca_common_cuda_output, - "CUDA: mca_common_cuda_fini, ref_count=%d, cuda still in use", - stage_one_init_ref_count); - } - stage_one_init_ref_count--; -} - -/** - * Call the CUDA register function so we pin the memory in the CUDA - * space. - */ -void mca_common_cuda_register(void *ptr, size_t amount, char *msg) -{ - int res; - - /* Always first check if the support is enabled. If not, just return */ - if (!opal_cuda_support) - return; - - if (!common_cuda_initialized) { - OPAL_THREAD_LOCK(&common_cuda_init_lock); - if (!common_cuda_initialized) { - common_cuda_mem_regs_t *regptr; - regptr = OBJ_NEW(common_cuda_mem_regs_t); - regptr->ptr = ptr; - regptr->amount = amount; - regptr->msg = strdup(msg); - opal_list_append(&common_cuda_memory_registrations, (opal_list_item_t *) regptr); - OPAL_THREAD_UNLOCK(&common_cuda_init_lock); - return; - } - OPAL_THREAD_UNLOCK(&common_cuda_init_lock); - } - - if (mca_common_cuda_enabled && mca_common_cuda_register_memory) { - res = cuFunc.cuMemHostRegister(ptr, amount, 0); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - /* If registering the memory fails, print a message and continue. - * This is not a fatal error. */ - opal_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister failed", true, ptr, - amount, OPAL_PROC_MY_HOSTNAME, res, msg); - } else { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuMemHostRegister OK on rcache %s: " - "address=%p, bufsize=%d", - msg, ptr, (int) amount); - } - } -} - -/** - * Call the CUDA unregister function so we unpin the memory in the CUDA - * space. - */ -void mca_common_cuda_unregister(void *ptr, char *msg) -{ - int res, i, s; - common_cuda_mem_regs_t *mem_reg; - - /* This can happen if memory was queued up to be registered, but - * no CUDA operations happened, so it never was registered. - * Therefore, just release any of the resources. 
*/ - if (!common_cuda_initialized) { - s = opal_list_get_size(&common_cuda_memory_registrations); - for (i = 0; i < s; i++) { - mem_reg = (common_cuda_mem_regs_t *) opal_list_remove_first( - &common_cuda_memory_registrations); - free(mem_reg->msg); - OBJ_RELEASE(mem_reg); - } - return; - } - - if (mca_common_cuda_enabled && mca_common_cuda_register_memory) { - res = cuFunc.cuMemHostUnregister(ptr); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - /* If unregistering the memory fails, just continue. This is during - * shutdown. Only print when running in verbose mode. */ - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuMemHostUnregister failed: ptr=%p, res=%d, rcache=%s", ptr, - res, msg); - - } else { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuMemHostUnregister OK on rcache %s: " - "address=%p", - msg, ptr); - } - } -} - -/* - * Get the memory handle of a local section of memory that can be sent - * to the remote size so it can access the memory. This is the - * registration function for the sending side of a message transfer. - */ -int cuda_getmemhandle(void *base, size_t size, mca_rcache_base_registration_t *newreg, - mca_rcache_base_registration_t *hdrreg) - -{ - CUmemorytype memType; - CUresult result; - CUipcMemHandle *memHandle; - CUdeviceptr pbase; - size_t psize; - - mca_rcache_common_cuda_reg_t *cuda_reg = (mca_rcache_common_cuda_reg_t *) newreg; - memHandle = (CUipcMemHandle *) cuda_reg->data.memHandle; - - /* We should only be there if this is a CUDA device pointer */ - result = cuFunc.cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - (CUdeviceptr) base); - assert(CUDA_SUCCESS == result); - assert(CU_MEMORYTYPE_DEVICE == memType); - - /* Get the memory handle so we can send it to the remote process. 
*/ - result = cuFunc.cuIpcGetMemHandle(memHandle, (CUdeviceptr) base); - CUDA_DUMP_MEMHANDLE((100, memHandle, "GetMemHandle-After")); - - if (CUDA_SUCCESS != result) { - opal_show_help("help-mpi-common-cuda.txt", "cuIpcGetMemHandle failed", true, result, base); - return OPAL_ERROR; - } else { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuIpcGetMemHandle passed: base=%p size=%d", base, (int) size); - } - - /* Need to get the real base and size of the memory handle. This is - * how the remote side saves the handles in a cache. */ - result = cuFunc.cuMemGetAddressRange(&pbase, &psize, (CUdeviceptr) base); - if (CUDA_SUCCESS != result) { - opal_show_help("help-mpi-common-cuda.txt", "cuMemGetAddressRange failed", true, result, - base); - return OPAL_ERROR; - } else { - opal_output_verbose( - 10, mca_common_cuda_output, - "CUDA: cuMemGetAddressRange passed: addr=%p, size=%d, pbase=%p, psize=%d ", base, - (int) size, (void *) pbase, (int) psize); - } - - /* Store all the information in the registration */ - cuda_reg->base.base = (void *) pbase; - cuda_reg->base.bound = (unsigned char *) pbase + psize - 1; - cuda_reg->data.memh_seg_addr.pval = (void *) pbase; - cuda_reg->data.memh_seg_len = psize; - -#if OPAL_CUDA_SYNC_MEMOPS - /* With CUDA 6.0, we can set an attribute on the memory pointer that will - * ensure any synchronous copies are completed prior to any other access - * of the memory region. This means we do not need to record an event - * and send to the remote side. - */ - memType = 1; /* Just use this variable since we already have it */ - result = cuFunc.cuPointerSetAttribute(&memType, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, - (CUdeviceptr) base); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuPointerSetAttribute failed", true, - OPAL_PROC_MY_HOSTNAME, result, base); - return OPAL_ERROR; - } -#else - /* Need to record the event to ensure that any memcopies into the - * device memory have completed. 
The event handle associated with - * this event is sent to the remote process so that it will wait - * on this event prior to copying data out of the device memory. - * Note that this needs to be the NULL stream to make since it is - * unknown what stream any copies into the device memory were done - * with. */ - result = cuFunc.cuEventRecord((CUevent) cuda_reg->data.event, 0); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", true, result, base); - return OPAL_ERROR; - } -#endif /* OPAL_CUDA_SYNC_MEMOPS */ - - return OPAL_SUCCESS; -} - -/* - * This function is called by the local side that called the cuda_getmemhandle. - * There is nothing to be done so just return. - */ -int cuda_ungetmemhandle(void *reg_data, mca_rcache_base_registration_t *reg) -{ - opal_output_verbose(10, mca_common_cuda_output, "CUDA: cuda_ungetmemhandle (no-op): base=%p", - reg->base); - CUDA_DUMP_MEMHANDLE( - (100, ((mca_rcache_common_cuda_reg_t *) reg)->data.memHandle, "cuda_ungetmemhandle")); - - return OPAL_SUCCESS; -} - -/* - * Open a memory handle that refers to remote memory so we can get an address - * that works on the local side. This is the registration function for the - * remote side of a transfer. newreg contains the new handle. hddrreg contains - * the memory handle that was received from the remote side. - */ -int cuda_openmemhandle(void *base, size_t size, mca_rcache_base_registration_t *newreg, - mca_rcache_base_registration_t *hdrreg) -{ - CUresult result; - CUipcMemHandle *memHandle; - mca_rcache_common_cuda_reg_t *cuda_newreg = (mca_rcache_common_cuda_reg_t *) newreg; - - /* Save in local variable to avoid ugly casting */ - memHandle = (CUipcMemHandle *) cuda_newreg->data.memHandle; - CUDA_DUMP_MEMHANDLE((100, memHandle, "Before call to cuIpcOpenMemHandle")); - - /* Open the memory handle and store it into the registration structure. 
*/ - result = cuFunc.cuIpcOpenMemHandle((CUdeviceptr *) &newreg->alloc_base, *memHandle, - CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); - - /* If there are some stale entries in the cache, they can cause other - * registrations to fail. Let the caller know that so that can attempt - * to clear them out. */ - if (CUDA_ERROR_ALREADY_MAPPED == result) { - opal_output_verbose(10, mca_common_cuda_output, - "CUDA: cuIpcOpenMemHandle returned CUDA_ERROR_ALREADY_MAPPED for " - "p=%p,size=%d: notify memory pool\n", - base, (int) size); - return OPAL_ERR_WOULD_BLOCK; - } - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuIpcOpenMemHandle failed", true, - OPAL_PROC_MY_HOSTNAME, result, base); - /* Currently, this is a non-recoverable error */ - return OPAL_ERROR; - } else { - opal_output_verbose(10, mca_common_cuda_output, - "CUDA: cuIpcOpenMemHandle passed: base=%p (remote base=%p,size=%d)", - newreg->alloc_base, base, (int) size); - CUDA_DUMP_MEMHANDLE((200, memHandle, "cuIpcOpenMemHandle")); - } - - return OPAL_SUCCESS; -} - -/* - * Close a memory handle that refers to remote memory. - */ -int cuda_closememhandle(void *reg_data, mca_rcache_base_registration_t *reg) -{ - CUresult result; - mca_rcache_common_cuda_reg_t *cuda_reg = (mca_rcache_common_cuda_reg_t *) reg; - - /* Only attempt to close if we have valid context. This can change if a call - * to the fini function is made and we discover context is gone. */ - if (ctx_ok) { - result = cuFunc.cuIpcCloseMemHandle((CUdeviceptr) cuda_reg->base.alloc_base); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - if (CUDA_ERROR_DEINITIALIZED != result) { - opal_show_help("help-mpi-common-cuda.txt", "cuIpcCloseMemHandle failed", true, - result, cuda_reg->base.alloc_base); - } - /* We will just continue on and hope things continue to work. 
*/ - } else { - opal_output_verbose(10, mca_common_cuda_output, - "CUDA: cuIpcCloseMemHandle passed: base=%p", - cuda_reg->base.alloc_base); - CUDA_DUMP_MEMHANDLE((100, cuda_reg->data.memHandle, "cuIpcCloseMemHandle")); - } - } - - return OPAL_SUCCESS; -} - -void mca_common_cuda_construct_event_and_handle(uintptr_t *event, void *handle) -{ - CUresult result; - - result = cuFunc.cuEventCreate((CUevent *) event, - CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", true, - OPAL_PROC_MY_HOSTNAME, result); - } - - result = cuFunc.cuIpcGetEventHandle((CUipcEventHandle *) handle, (CUevent) *event); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuIpcGetEventHandle failed", true, result); - } - - CUDA_DUMP_EVTHANDLE((10, handle, "construct_event_and_handle")); -} - -void mca_common_cuda_destruct_event(uintptr_t event) -{ - CUresult result; - - /* Only attempt to destroy if we have valid context. This can change if a call - * to the fini function is made and we discover context is gone. */ - if (ctx_ok) { - result = cuFunc.cuEventDestroy((CUevent) event); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed", true, result); - } - } -} - -/* - * Put remote event on stream to ensure that the the start of the - * copy does not start until the completion of the event. 
- */ -void mca_common_wait_stream_synchronize(mca_rcache_common_cuda_reg_t *rget_reg) -{ -#if OPAL_CUDA_SYNC_MEMOPS - /* No need for any of this with SYNC_MEMOPS feature */ - return; -#else /* OPAL_CUDA_SYNC_MEMOPS */ - CUipcEventHandle evtHandle; - CUevent event; - CUresult result; - - memcpy(&evtHandle, rget_reg->data.evtHandle, sizeof(evtHandle)); - CUDA_DUMP_EVTHANDLE((100, &evtHandle, "stream_synchronize")); - - result = cuFunc.cuIpcOpenEventHandle(&event, evtHandle); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuIpcOpenEventHandle failed", true, result); - } - - /* BEGIN of Workaround - There is a bug in CUDA 4.1 RC2 and earlier - * versions. Need to record an event on the stream, even though - * it is not used, to make sure we do not short circuit our way - * out of the cuStreamWaitEvent test. - */ - result = cuFunc.cuEventRecord(event, 0); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", true, - OPAL_PROC_MY_HOSTNAME, result); - } - /* END of Workaround */ - - result = cuFunc.cuStreamWaitEvent(0, event, 0); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuStreamWaitEvent failed", true, result); - } - - /* All done with this event. */ - result = cuFunc.cuEventDestroy(event); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed", true, result); - } -#endif /* OPAL_CUDA_SYNC_MEMOPS */ -} - -/* - * Start the asynchronous copy. Then record and save away an event that will - * be queried to indicate the copy has completed. - */ -int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg, - struct mca_btl_base_descriptor_t *frag, int *done) -{ - CUresult result; - int iter; - - OPAL_THREAD_LOCK(&common_cuda_ipc_lock); - /* First make sure there is room to store the event. If not, then - * return an error. 
The error message will tell the user to try and - * run again, but with a larger array for storing events. */ - if (cuda_event_ipc_num_used == cuda_event_max) { - opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles", true, cuda_event_max, - cuda_event_max + 100, cuda_event_max + 100); - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - if (cuda_event_ipc_num_used > cuda_event_ipc_most) { - cuda_event_ipc_most = cuda_event_ipc_num_used; - /* Just print multiples of 10 */ - if (0 == (cuda_event_ipc_most % 10)) { - opal_output_verbose(20, mca_common_cuda_output, "Maximum ipc events used is now %d", - cuda_event_ipc_most); - } - } - - /* This is the standard way to run. Running with synchronous copies is available - * to measure the advantages of asynchronous copies. */ - if (OPAL_LIKELY(mca_common_cuda_async)) { - result = cuFunc.cuMemcpyAsync((CUdeviceptr) dst, (CUdeviceptr) src, amount, ipcStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed", true, dst, src, - amount, result); - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return OPAL_ERROR; - } else { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d", dst, src, - (int) amount); - } - result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", true, - OPAL_PROC_MY_HOSTNAME, result); - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return OPAL_ERROR; - } - cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag; - - /* Bump up the first available slot and number used by 1 */ - cuda_event_ipc_first_avail++; - if (cuda_event_ipc_first_avail >= cuda_event_max) { - cuda_event_ipc_first_avail = 0; - } - cuda_event_ipc_num_used++; - - *done = 0; - } else { - /* Mimic the async function so they use 
the same memcpy call. */ - result = cuFunc.cuMemcpyAsync((CUdeviceptr) dst, (CUdeviceptr) src, amount, ipcStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed", true, dst, src, - amount, result); - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return OPAL_ERROR; - } else { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d", dst, src, - (int) amount); - } - - /* Record an event, then wait for it to complete with calls to cuEventQuery */ - result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", true, - OPAL_PROC_MY_HOSTNAME, result); - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return OPAL_ERROR; - } - - cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag; - - /* Bump up the first available slot and number used by 1 */ - cuda_event_ipc_first_avail++; - if (cuda_event_ipc_first_avail >= cuda_event_max) { - cuda_event_ipc_first_avail = 0; - } - cuda_event_ipc_num_used++; - - result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]); - if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed", true, result); - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return OPAL_ERROR; - } - - iter = 0; - while (CUDA_ERROR_NOT_READY == result) { - if (0 == (iter % 10)) { - opal_output(-1, "EVENT NOT DONE (iter=%d)", iter); - } - result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]); - if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed", true, result); - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return OPAL_ERROR; - } - iter++; - } - - --cuda_event_ipc_num_used; - ++cuda_event_ipc_first_used; - if 
(cuda_event_ipc_first_used >= cuda_event_max) { - cuda_event_ipc_first_used = 0; - } - *done = 1; - } - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return OPAL_SUCCESS; -} - -/* - * Record an event and save the frag. This is called by the sending side and - * is used to queue an event when a htod copy has been initiated. - */ -int mca_common_cuda_record_dtoh_event(char *msg, struct mca_btl_base_descriptor_t *frag) -{ - CUresult result; - - /* First make sure there is room to store the event. If not, then - * return an error. The error message will tell the user to try and - * run again, but with a larger array for storing events. */ - OPAL_THREAD_LOCK(&common_cuda_dtoh_lock); - if (cuda_event_dtoh_num_used == cuda_event_max) { - opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles", true, cuda_event_max, - cuda_event_max + 100, cuda_event_max + 100); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - if (cuda_event_dtoh_num_used > cuda_event_dtoh_most) { - cuda_event_dtoh_most = cuda_event_dtoh_num_used; - /* Just print multiples of 10 */ - if (0 == (cuda_event_dtoh_most % 10)) { - opal_output_verbose(20, mca_common_cuda_output, "Maximum DtoH events used is now %d", - cuda_event_dtoh_most); - } - } - - result = cuFunc.cuEventRecord(cuda_event_dtoh_array[cuda_event_dtoh_first_avail], dtohStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", true, - OPAL_PROC_MY_HOSTNAME, result); - OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock); - return OPAL_ERROR; - } - cuda_event_dtoh_frag_array[cuda_event_dtoh_first_avail] = frag; - - /* Bump up the first available slot and number used by 1 */ - cuda_event_dtoh_first_avail++; - if (cuda_event_dtoh_first_avail >= cuda_event_max) { - cuda_event_dtoh_first_avail = 0; - } - cuda_event_dtoh_num_used++; - - OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock); - return OPAL_SUCCESS; -} - -/* - * Record an event and save the frag. 
This is called by the receiving side and - * is used to queue an event when a dtoh copy has been initiated. - */ -int mca_common_cuda_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *frag) -{ - CUresult result; - - OPAL_THREAD_LOCK(&common_cuda_htod_lock); - /* First make sure there is room to store the event. If not, then - * return an error. The error message will tell the user to try and - * run again, but with a larger array for storing events. */ - if (cuda_event_htod_num_used == cuda_event_max) { - opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles", true, cuda_event_max, - cuda_event_max + 100, cuda_event_max + 100); - OPAL_THREAD_UNLOCK(&common_cuda_htod_lock); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - if (cuda_event_htod_num_used > cuda_event_htod_most) { - cuda_event_htod_most = cuda_event_htod_num_used; - /* Just print multiples of 10 */ - if (0 == (cuda_event_htod_most % 10)) { - opal_output_verbose(20, mca_common_cuda_output, "Maximum HtoD events used is now %d", - cuda_event_htod_most); - } - } - - result = cuFunc.cuEventRecord(cuda_event_htod_array[cuda_event_htod_first_avail], htodStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", true, - OPAL_PROC_MY_HOSTNAME, result); - OPAL_THREAD_UNLOCK(&common_cuda_htod_lock); - return OPAL_ERROR; - } - cuda_event_htod_frag_array[cuda_event_htod_first_avail] = frag; - - /* Bump up the first available slot and number used by 1 */ - cuda_event_htod_first_avail++; - if (cuda_event_htod_first_avail >= cuda_event_max) { - cuda_event_htod_first_avail = 0; - } - cuda_event_htod_num_used++; - - OPAL_THREAD_UNLOCK(&common_cuda_htod_lock); - return OPAL_SUCCESS; -} - -/** - * Used to get the dtoh stream for initiating asynchronous copies. - */ -void *mca_common_cuda_get_dtoh_stream(void) -{ - return (void *) dtohStream; -} - -/** - * Used to get the htod stream for initiating asynchronous copies. 
- */ -void *mca_common_cuda_get_htod_stream(void) -{ - return (void *) htodStream; -} - -/* - * Function is called every time progress is called with the sm BTL. If there - * are outstanding events, check to see if one has completed. If so, hand - * back the fragment for further processing. - */ -int progress_one_cuda_ipc_event(struct mca_btl_base_descriptor_t **frag) -{ - CUresult result; - - if (OPAL_LIKELY(0 == cuda_event_ipc_num_used)) - return 0; - - OPAL_THREAD_LOCK(&common_cuda_ipc_lock); - if (cuda_event_ipc_num_used > 0) { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: progress_one_cuda_ipc_event, outstanding_events=%d", - cuda_event_ipc_num_used); - - result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]); - - /* We found an event that is not ready, so return. */ - if (CUDA_ERROR_NOT_READY == result) { - opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY"); - *frag = NULL; - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return 0; - } else if (CUDA_SUCCESS != result) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed", true, result); - *frag = NULL; - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return OPAL_ERROR; - } - - *frag = cuda_event_ipc_frag_array[cuda_event_ipc_first_used]; - opal_output_verbose(10, mca_common_cuda_output, "CUDA: cuEventQuery returned %d", result); - - /* Bump counters, loop around the circular buffer if necessary */ - --cuda_event_ipc_num_used; - ++cuda_event_ipc_first_used; - if (cuda_event_ipc_first_used >= cuda_event_max) { - cuda_event_ipc_first_used = 0; - } - /* A return value of 1 indicates an event completed and a frag was returned */ - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return 1; - } - OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock); - return 0; -} - -/** - * Progress any dtoh event completions. 
- */ -int progress_one_cuda_dtoh_event(struct mca_btl_base_descriptor_t **frag) -{ - CUresult result; - - OPAL_THREAD_LOCK(&common_cuda_dtoh_lock); - if (cuda_event_dtoh_num_used > 0) { - opal_output_verbose(30, mca_common_cuda_output, - "CUDA: progress_one_cuda_dtoh_event, outstanding_events=%d", - cuda_event_dtoh_num_used); - - result = cuFunc.cuEventQuery(cuda_event_dtoh_array[cuda_event_dtoh_first_used]); - - /* We found an event that is not ready, so return. */ - if (CUDA_ERROR_NOT_READY == result) { - opal_output_verbose(30, mca_common_cuda_output, - "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY"); - *frag = NULL; - OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock); - return 0; - } else if (CUDA_SUCCESS != result) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed", true, result); - *frag = NULL; - OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock); - return OPAL_ERROR; - } - - *frag = cuda_event_dtoh_frag_array[cuda_event_dtoh_first_used]; - opal_output_verbose(30, mca_common_cuda_output, "CUDA: cuEventQuery returned %d", result); - - /* Bump counters, loop around the circular buffer if necessary */ - --cuda_event_dtoh_num_used; - ++cuda_event_dtoh_first_used; - if (cuda_event_dtoh_first_used >= cuda_event_max) { - cuda_event_dtoh_first_used = 0; - } - /* A return value of 1 indicates an event completed and a frag was returned */ - OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock); - return 1; - } - OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock); - return 0; -} - -/** - * Progress any dtoh event completions. 
- */ -int progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t **frag) -{ - CUresult result; - - OPAL_THREAD_LOCK(&common_cuda_htod_lock); - if (cuda_event_htod_num_used > 0) { - opal_output_verbose(30, mca_common_cuda_output, - "CUDA: progress_one_cuda_htod_event, outstanding_events=%d", - cuda_event_htod_num_used); - - result = cuFunc.cuEventQuery(cuda_event_htod_array[cuda_event_htod_first_used]); - - /* We found an event that is not ready, so return. */ - if (CUDA_ERROR_NOT_READY == result) { - opal_output_verbose(30, mca_common_cuda_output, - "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY"); - *frag = NULL; - OPAL_THREAD_UNLOCK(&common_cuda_htod_lock); - return 0; - } else if (CUDA_SUCCESS != result) { - opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed", true, result); - *frag = NULL; - OPAL_THREAD_UNLOCK(&common_cuda_htod_lock); - return OPAL_ERROR; - } - - *frag = cuda_event_htod_frag_array[cuda_event_htod_first_used]; - opal_output_verbose(30, mca_common_cuda_output, "CUDA: cuEventQuery returned %d", result); - - /* Bump counters, loop around the circular buffer if necessary */ - --cuda_event_htod_num_used; - ++cuda_event_htod_first_used; - if (cuda_event_htod_first_used >= cuda_event_max) { - cuda_event_htod_first_used = 0; - } - /* A return value of 1 indicates an event completed and a frag was returned */ - OPAL_THREAD_UNLOCK(&common_cuda_htod_lock); - return 1; - } - OPAL_THREAD_UNLOCK(&common_cuda_htod_lock); - return OPAL_ERR_RESOURCE_BUSY; -} - -/** - * Need to make sure the handle we are retrieving from the cache is still - * valid. Compare the cached handle to the one received. - */ -int mca_common_cuda_memhandle_matches(mca_rcache_common_cuda_reg_t *new_reg, - mca_rcache_common_cuda_reg_t *old_reg) -{ - - if (0 - == memcmp(new_reg->data.memHandle, old_reg->data.memHandle, - sizeof(new_reg->data.memHandle))) { - return 1; - } else { - return 0; - } -} - -/* - * Function to dump memory handle information. 
This is based on - * definitions from cuiinterprocess_private.h. - */ -static void cuda_dump_memhandle(int verbose, void *memHandle, char *str) -{ - - struct InterprocessMemHandleInternal { - /* The first two entries are the CUinterprocessCtxHandle */ - int64_t ctxId; /* unique (within a process) id of the sharing context */ - int pid; /* pid of sharing context */ - - int64_t size; - int64_t blocksize; - int64_t offset; - int gpuId; - int subDeviceIndex; - int64_t serial; - } memH; - - if (NULL == str) { - str = "CUDA"; - } - memcpy(&memH, memHandle, sizeof(memH)); - opal_output_verbose(verbose, mca_common_cuda_output, - "%s:ctxId=0x%" PRIx64 ", pid=%d, size=%" PRIu64 ", blocksize=%" PRIu64 - ", offset=%" PRIu64 ", gpuId=%d, subDeviceIndex=%d, serial=%" PRIu64, - str, memH.ctxId, memH.pid, memH.size, memH.blocksize, memH.offset, - memH.gpuId, memH.subDeviceIndex, memH.serial); -} - -/* - * Function to dump memory handle information. This is based on - * definitions from cuiinterprocess_private.h. - */ -static void cuda_dump_evthandle(int verbose, void *evtHandle, char *str) -{ - - struct InterprocessEventHandleInternal { - unsigned long pid; - unsigned long serial; - int index; - } evtH; - - if (NULL == str) { - str = "CUDA"; - } - memcpy(&evtH, evtHandle, sizeof(evtH)); - opal_output_verbose(verbose, mca_common_cuda_output, "CUDA: %s:pid=%lu, serial=%lu, index=%d", - str, evtH.pid, evtH.serial, evtH.index); -} - -/* Return microseconds of elapsed time. Microseconds are relevant when - * trying to understand the fixed overhead of the communication. Used - * when trying to time various functions. - * - * Cut and past the following to get timings where wanted. 
- * - * clock_gettime(CLOCK_MONOTONIC, &ts_start); - * FUNCTION OF INTEREST - * clock_gettime(CLOCK_MONOTONIC, &ts_end); - * accum = mydifftime(ts_start, ts_end); - * opal_output(0, "Function took %7.2f usecs\n", accum); - * - */ -#if OPAL_ENABLE_DEBUG -static float mydifftime(opal_timer_t ts_start, opal_timer_t ts_end) -{ - return (ts_end - ts_start); -} -#endif /* OPAL_ENABLE_DEBUG */ - -/* Routines that get plugged into the opal datatype code */ -static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t *convertor) -{ - int res; - CUmemorytype memType = 0; - CUdeviceptr dbuf = (CUdeviceptr) pUserBuf; - CUcontext ctx = NULL, memCtx = NULL; -#if OPAL_CUDA_GET_ATTRIBUTES - uint32_t isManaged = 0; - /* With CUDA 7.0, we can get multiple attributes with a single call */ - CUpointer_attribute attributes[3] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - CU_POINTER_ATTRIBUTE_CONTEXT, - CU_POINTER_ATTRIBUTE_IS_MANAGED}; - void *attrdata[] = {(void *) &memType, (void *) &memCtx, (void *) &isManaged}; - - res = cuFunc.cuPointerGetAttributes(3, attributes, attrdata, dbuf); - OPAL_OUTPUT_VERBOSE((101, mca_common_cuda_output, - "dbuf=%p, memType=%d, memCtx=%p, isManaged=%d, res=%d", (void *) dbuf, - (int) memType, (void *) memCtx, isManaged, res)); - - /* Mark unified memory buffers with a flag. This will allow all unified - * memory to be forced through host buffers. Note that this memory can - * be either host or device so we need to set this flag prior to that check. */ - if (1 == isManaged) { - if (NULL != convertor) { - convertor->flags |= CONVERTOR_ACCELERATOR_UNIFIED; - } - } - if (res != CUDA_SUCCESS) { - /* If we cannot determine it is device pointer, - * just assume it is not. 
*/ - return 0; - } else if (memType == CU_MEMORYTYPE_HOST) { - /* Host memory, nothing to do here */ - return 0; - } else if (memType == 0) { - /* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */ - return 0; - } - /* Must be a device pointer */ - assert(memType == CU_MEMORYTYPE_DEVICE); -#else /* OPAL_CUDA_GET_ATTRIBUTES */ - res = cuFunc.cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf); - if (res != CUDA_SUCCESS) { - /* If we cannot determine it is device pointer, - * just assume it is not. */ - return 0; - } else if (memType == CU_MEMORYTYPE_HOST) { - /* Host memory, nothing to do here */ - return 0; - } - /* Must be a device pointer */ - assert(memType == CU_MEMORYTYPE_DEVICE); -#endif /* OPAL_CUDA_GET_ATTRIBUTES */ - - /* This piece of code was added in to handle in a case involving - * OMP threads. The user had initialized CUDA and then spawned - * two threads. The first thread had the CUDA context, but the - * second thread did not. We therefore had no context to act upon - * and future CUDA driver calls would fail. Therefore, if we have - * GPU memory, but no context, get the context from the GPU memory - * and set the current context to that. It is rare that we will not - * have a context. 
*/ - res = cuFunc.cuCtxGetCurrent(&ctx); - if (OPAL_UNLIKELY(NULL == ctx)) { - if (CUDA_SUCCESS == res) { -#if !OPAL_CUDA_GET_ATTRIBUTES - res = cuFunc.cuPointerGetAttribute(&memCtx, CU_POINTER_ATTRIBUTE_CONTEXT, dbuf); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_output(0, - "CUDA: error calling cuPointerGetAttribute: " - "res=%d, ptr=%p aborting...", - res, pUserBuf); - return OPAL_ERROR; - } -#endif /* OPAL_CUDA_GET_ATTRIBUTES */ - res = cuFunc.cuCtxSetCurrent(memCtx); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_output(0, - "CUDA: error calling cuCtxSetCurrent: " - "res=%d, ptr=%p aborting...", - res, pUserBuf); - return OPAL_ERROR; - } else { - OPAL_OUTPUT_VERBOSE( - (10, mca_common_cuda_output, "CUDA: cuCtxSetCurrent passed: ptr=%p", pUserBuf)); - } - } else { - /* Print error and proceed */ - opal_output(0, - "CUDA: error calling cuCtxGetCurrent: " - "res=%d, ptr=%p aborting...", - res, pUserBuf); - return OPAL_ERROR; - } - } - - /* WORKAROUND - They are times when the above code determines a piece of memory - * is GPU memory, but it actually is not. That has been seen on multi-GPU systems - * with 6 or 8 GPUs on them. Therefore, we will do this extra check. Note if we - * made it this far, then the assumption at this point is we have GPU memory. - * Unfortunately, this extra call is costing us another 100 ns almost doubling - * the cost of this entire function. */ - if (OPAL_LIKELY(mca_common_cuda_gpu_mem_check_workaround)) { - CUdeviceptr pbase; - size_t psize; - res = cuFunc.cuMemGetAddressRange(&pbase, &psize, dbuf); - if (CUDA_SUCCESS != res) { - opal_output_verbose(5, mca_common_cuda_output, - "CUDA: cuMemGetAddressRange failed on this pointer: res=%d, buf=%p " - "Overriding check and setting to host pointer. ", - res, (void *) dbuf); - /* This cannot be GPU memory if the previous call failed */ - return 0; - } - } - - /* First access on a device pointer finalizes CUDA support initialization. - * If initialization fails, disable support. 
*/ - if (!stage_three_init_complete) { - if (0 != mca_common_cuda_stage_three_init()) { - opal_cuda_support = 0; - } - } - - return 1; -} - -static int mca_common_cuda_cu_memcpy_async(void *dest, const void *src, size_t size, - opal_convertor_t *convertor) -{ - return cuFunc.cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, - (CUstream) convertor->stream); -} - -/** - * This function is plugged into various areas where a cuMemcpy would be called. - * This is a synchronous operation that will not return until the copy is complete. - */ -static int mca_common_cuda_cu_memcpy(void *dest, const void *src, size_t size) -{ - CUresult result; -#if OPAL_ENABLE_DEBUG - CUmemorytype memTypeSrc, memTypeDst; - if (OPAL_UNLIKELY(mca_common_cuda_cumemcpy_timing)) { - /* Nice to know type of source and destination for timing output. Do - * not care about return code as memory type will just be set to 0 */ - result = cuFunc.cuPointerGetAttribute(&memTypeDst, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - (CUdeviceptr) dest); - result = cuFunc.cuPointerGetAttribute(&memTypeSrc, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - (CUdeviceptr) src); - ts_start = opal_timer_base_get_usec(); - } -#endif - if (mca_common_cuda_cumemcpy_async) { - result = cuFunc.cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, memcpyStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed", true, dest, src, - size, result); - return OPAL_ERROR; - } - result = cuFunc.cuStreamSynchronize(memcpyStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuStreamSynchronize failed", true, - OPAL_PROC_MY_HOSTNAME, result); - return OPAL_ERROR; - } - } else { - result = cuFunc.cuMemcpy((CUdeviceptr) dest, (CUdeviceptr) src, size); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuMemcpy failed", true, - OPAL_PROC_MY_HOSTNAME, result); - return OPAL_ERROR; - } - } -#if 
OPAL_ENABLE_DEBUG - if (OPAL_UNLIKELY(mca_common_cuda_cumemcpy_timing)) { - ts_end = opal_timer_base_get_usec(); - accum = mydifftime(ts_start, ts_end); - if (mca_common_cuda_cumemcpy_async) { - opal_output(0, - "cuMemcpyAsync took %7.2f usecs, size=%d, (src=%p (%d), dst=%p (%d))\n", - accum, (int) size, src, memTypeSrc, dest, memTypeDst); - } else { - opal_output(0, "cuMemcpy took %7.2f usecs, size=%d, (src=%p (%d), dst=%p (%d))\n", - accum, (int) size, src, memTypeSrc, dest, memTypeDst); - } - } -#endif - return OPAL_SUCCESS; -} - -int mca_common_cuda_malloc(void **dptr, size_t size) -{ - int res, count = 0; - if (size > 0) { - res = cuFunc.cuMemAlloc((CUdeviceptr *) dptr, size); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_output(0, "CUDA: cuMemAlloc failed: res=%d", res); - return res; - } - } - return 0; -} - -int mca_common_cuda_free(void *dptr) -{ - int res; - if (NULL != dptr) { - res = cuFunc.cuMemFree((CUdeviceptr) dptr); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_output(0, "CUDA: cuMemFree failed: res=%d", res); - return res; - } - } - return 0; -} - -static int mca_common_cuda_memmove(void *dest, void *src, size_t size) -{ - CUdeviceptr tmp; - int result; - - result = cuFunc.cuMemAlloc(&tmp, size); - if (mca_common_cuda_cumemcpy_async) { - result = cuFunc.cuMemcpyAsync(tmp, (CUdeviceptr) src, size, memcpyStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed", true, tmp, src, size, - result); - return OPAL_ERROR; - } - result = cuFunc.cuMemcpyAsync((CUdeviceptr) dest, tmp, size, memcpyStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed", true, dest, tmp, - size, result); - return OPAL_ERROR; - } - result = cuFunc.cuStreamSynchronize(memcpyStream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuStreamSynchronize failed", true, - OPAL_PROC_MY_HOSTNAME, 
result); - return OPAL_ERROR; - } - } else { - result = cuFunc.cuMemcpy(tmp, (CUdeviceptr) src, size); - if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { - opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", - result, (void *) tmp, src, (int) size); - return OPAL_ERROR; - } - result = cuFunc.cuMemcpy((CUdeviceptr) dest, tmp, size); - if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { - opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", - result, dest, (void *) tmp, (int) size); - return OPAL_ERROR; - } - } - cuFunc.cuMemFree(tmp); - return OPAL_SUCCESS; -} - -int mca_common_cuda_get_device(int *devicenum) -{ - CUdevice cuDev; - int res; - - res = cuFunc.cuCtxGetDevice(&cuDev); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_output(0, "CUDA: cuCtxGetDevice failed: res=%d", res); - return res; - } - *devicenum = cuDev; - return 0; -} - -int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2) -{ - int res; - res = cuFunc.cuDeviceCanAccessPeer(access, (CUdevice) dev1, (CUdevice) dev2); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_output(0, "CUDA: cuDeviceCanAccessPeer failed: res=%d", res); - return res; - } - return 0; -} - -int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base) -{ - CUresult result; - result = cuFunc.cuMemGetAddressRange((CUdeviceptr *) pbase, psize, (CUdeviceptr) base); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-mpi-common-cuda.txt", "cuMemGetAddressRange failed 2", true, - OPAL_PROC_MY_HOSTNAME, result, base); - return OPAL_ERROR; - } else { - opal_output_verbose(50, mca_common_cuda_output, - "CUDA: cuMemGetAddressRange passed: addr=%p, pbase=%p, psize=%lu ", - base, *(char **) pbase, *psize); - } - return 0; -} - -#if OPAL_CUDA_GDR_SUPPORT -/* Check to see if the memory was freed between the time it was stored in - * the registration cache and now. Return true if the memory was previously - * freed. 
This is indicated by the BUFFER_ID value in the registration cache - * not matching the BUFFER_ID of the buffer we are checking. Return false - * if the registration is still good. - */ -bool mca_common_cuda_previously_freed_memory(mca_rcache_base_registration_t *reg) -{ - int res; - unsigned long long bufID; - unsigned char *dbuf = reg->base; - - res = cuFunc.cuPointerGetAttribute(&bufID, CU_POINTER_ATTRIBUTE_BUFFER_ID, (CUdeviceptr) dbuf); - /* If we cannot determine the BUFFER_ID, then print a message and default - * to forcing the registration to be kicked out. */ - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_show_help("help-mpi-common-cuda.txt", "bufferID failed", true, OPAL_PROC_MY_HOSTNAME, - res); - return true; - } - opal_output_verbose(50, mca_common_cuda_output, - "CUDA: base=%p, bufID=%llu, reg->gpu_bufID=%llu, %s", dbuf, bufID, - reg->gpu_bufID, - (reg->gpu_bufID == bufID ? "BUFFER_ID match" : "BUFFER_ID do not match")); - if (bufID != reg->gpu_bufID) { - return true; - } else { - return false; - } -} - -/* - * Get the buffer ID from the memory and store it in the registration. - * This is needed to ensure the cached registration is not stale. If - * we fail to get buffer ID, print an error and set buffer ID to 0. - * Also set SYNC_MEMOPS on any GPU registration to ensure that - * synchronous copies complete before the buffer is accessed. 
- */ -void mca_common_cuda_get_buffer_id(mca_rcache_base_registration_t *reg) -{ - int res; - unsigned long long bufID = 0; - unsigned char *dbuf = reg->base; - int enable = 1; - - res = cuFunc.cuPointerGetAttribute(&bufID, CU_POINTER_ATTRIBUTE_BUFFER_ID, (CUdeviceptr) dbuf); - if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { - opal_show_help("help-mpi-common-cuda.txt", "bufferID failed", true, OPAL_PROC_MY_HOSTNAME, - res); - } - reg->gpu_bufID = bufID; - - res = cuFunc.cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, - (CUdeviceptr) dbuf); - if (OPAL_UNLIKELY(CUDA_SUCCESS != res)) { - opal_show_help("help-mpi-common-cuda.txt", "cuPointerSetAttribute failed", true, - OPAL_PROC_MY_HOSTNAME, res, dbuf); - } -} - -static bool initialized = false; -int opal_cuda_verbose = 0; -static int opal_cuda_enabled = 0; /* Starts out disabled */ -static int opal_cuda_output = 0; -static void opal_cuda_support_init(void); -static int (*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL; -static opal_common_cuda_function_table_t ftable; - -/* This function allows the common cuda code to register an - * initialization function that gets called the first time an attempt - * is made to send or receive a GPU pointer. This allows us to delay - * some CUDA initialization until after MPI_Init(). - */ -void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)) -{ - common_cuda_initialization_function = fptr; -} - -/** - * This function is called when a convertor is instantiated. It has to call - * the opal_cuda_support_init() function once to figure out if CUDA support - * is enabled or not. If CUDA is not enabled, then short circuit out - * for all future calls. 
- */ -void mca_cuda_convertor_init(opal_convertor_t *convertor, const void *pUserBuf) -{ - /* Only do the initialization on the first GPU access */ - if (!initialized) { - opal_cuda_support_init(); - } - - /* This is needed to handle case where convertor is not fully initialized - * like when trying to do a sendi with convertor on the statck */ - convertor->cbmemcpy = (memcpy_fct_t) &opal_cuda_memcpy; - - /* If not enabled, then nothing else to do */ - if (!opal_cuda_enabled) { - return; - } - - if (ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) { - convertor->flags |= CONVERTOR_ACCELERATOR; - } -} - -/* Checks the type of pointer - * - * @param dest One pointer to check - * @param source Another pointer to check - */ -bool opal_cuda_check_bufs(char *dest, char *src) -{ - /* Only do the initialization on the first GPU access */ - if (!initialized) { - opal_cuda_support_init(); - } - - if (!opal_cuda_enabled) { - return false; - } - - if (ftable.gpu_is_gpu_buffer(dest, NULL) || ftable.gpu_is_gpu_buffer(src, NULL)) { - return true; - } else { - return false; - } -} - -/* - * With CUDA enabled, all contiguous copies will pass through this function. - * Therefore, the first check is to see if the convertor is a GPU buffer. - * Note that if there is an error with any of the CUDA calls, the program - * aborts as there is no recovering. - */ - -/* Checks the type of pointer - * - * @param buf check one pointer providing a convertor. - * Provides additional information, e.g. managed vs. unmanaged GPU buffer - */ -bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor) -{ - /* Only do the initialization on the first GPU access */ - if (!initialized) { - opal_cuda_support_init(); - } - - if (!opal_cuda_enabled) { - return false; - } - - return (ftable.gpu_is_gpu_buffer(buf, convertor)); -} - -/* - * This function allocates a buffer using either cuMemAlloc - * or malloc, depending on if the convertor flag CONVERTOR_CUDA - * is set. 
- * - * @param size Size of buffer to be allocated - * @param convertor The convertor with flags describing if the buf - * should be a Host or Cuda buffer. - * - * @returns void * A pointer to the newly allocated buffer. - */ -void *opal_cuda_malloc(size_t size, opal_convertor_t *convertor) -{ - int res; - void *buffer; - if (!(convertor->flags & CONVERTOR_ACCELERATOR)) { - return malloc(size); - } - res = ftable.gpu_malloc(buffer, size); - if (res != 0) { - opal_output(0, "CUDA: Error in cuMemAlloc: size=%d", (int) size); - abort(); - } else { - return buffer; - } -} - -/* - * This function frees a buffer using either cuMemFree() or free(), - * depending on if the convertor flag CONVERTOR_CUDA is set. - * - * @param buffer Pointer to buffer to be freed - * @param convertor The convertor with flags describing if the buf - * should be a Host or Cuda buffer. - * - */ -void opal_cuda_free(void *buffer, opal_convertor_t *convertor) -{ - int res; - if (!(convertor->flags & CONVERTOR_ACCELERATOR)) { - free(buffer); - return; - } - res = ftable.gpu_free(buffer); - if (res != 0) { - opal_output(0, "CUDA: Error in cuMemFree: ptr=%p", buffer); - abort(); - } - return; -} - -/* - * With CUDA enabled, all contiguous copies will pass through this function. - * Therefore, the first check is to see if the convertor is a GPU buffer. - * Note that if there is an error with any of the CUDA calls, the program - * aborts as there is no recovering. 
- */ - -void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t *convertor) -{ - int res; - - if (!(convertor->flags & CONVERTOR_ACCELERATOR)) { - return memcpy(dest, src, size); - } - - if (convertor->flags & CONVERTOR_ACCELERATOR_ASYNC) { - res = ftable.gpu_cu_memcpy_async(dest, (void *) src, size, convertor); - } else { - res = ftable.gpu_cu_memcpy(dest, (void *) src, size); - } - - if (res != 0) { - opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, - (int) size); - abort(); - } else { - return dest; - } -} - -/* - * This function is needed in cases where we do not have contiguous - * datatypes. The current code has macros that cannot handle a convertor - * argument to the memcpy call. - */ -void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size) -{ - int res; - res = ftable.gpu_cu_memcpy(dest, src, size); - if (res != 0) { - opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, - (int) size); - abort(); - } else { - return dest; - } -} - -/* - * In some cases, need an implementation of memmove. This is not fast, but - * it is not often needed. - */ -void *opal_cuda_memmove(void *dest, void *src, size_t size) -{ - int res; - - res = ftable.gpu_memmove(dest, src, size); - if (res != 0) { - opal_output(0, "CUDA: Error in gpu memmove: res=%d, dest=%p, src=%p, size=%d", res, dest, - src, (int) size); - abort(); - } - return dest; -} - -/** - * This function gets called once to check if the program is running in a cuda - * environment. - */ -static void opal_cuda_support_init(void) -{ - if (initialized) { - return; - } - - /* Set different levels of verbosity in the cuda related code. */ - opal_cuda_output = opal_output_open(NULL); - opal_output_set_verbosity(opal_cuda_output, opal_cuda_verbose); - - /* Callback into the common cuda initialization routine. 
This is only - * set if some work had been done already in the common cuda code.*/ - if (NULL != common_cuda_initialization_function) { - if (0 == common_cuda_initialization_function(&ftable)) { - opal_cuda_enabled = 1; - } - } - - if (1 == opal_cuda_enabled) { - opal_output_verbose(10, opal_cuda_output, - "CUDA: enabled successfully, CUDA device pointers will work"); - } else { - opal_output_verbose(10, opal_cuda_output, - "CUDA: not enabled, CUDA device pointers will not work"); - } - - initialized = true; -} - -/** - * Tell the convertor that copies will be asynchronous CUDA copies. The - * flags are cleared when the convertor is reinitialized. - */ -void opal_cuda_set_copy_function_async(opal_convertor_t *convertor, void *stream) -{ - convertor->flags |= CONVERTOR_ACCELERATOR_ASYNC; - convertor->stream = stream; -} -#endif /* OPAL_CUDA_GDR_SUPPORT */ diff --git a/opal/cuda/common_cuda.h b/opal/cuda/common_cuda.h deleted file mode 100644 index 431fff6daa7..00000000000 --- a/opal/cuda/common_cuda.h +++ /dev/null @@ -1,139 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef OPAL_MCA_COMMON_CUDA_H -#define OPAL_MCA_COMMON_CUDA_H -#include "opal/datatype/opal_convertor.h" -#include "opal/mca/btl/btl.h" - -#define MEMHANDLE_SIZE 8 -#define EVTHANDLE_SIZE 8 - -struct mca_rcache_common_cuda_reg_data_t { - uint64_t memHandle[MEMHANDLE_SIZE]; - uint64_t evtHandle[EVTHANDLE_SIZE]; - uint64_t event; - opal_ptr_t memh_seg_addr; - size_t memh_seg_len; -}; -typedef struct mca_rcache_common_cuda_reg_data_t mca_rcache_common_cuda_reg_data_t; - -struct mca_rcache_common_cuda_reg_t { - mca_rcache_base_registration_t base; - mca_rcache_common_cuda_reg_data_t data; -}; -typedef struct mca_rcache_common_cuda_reg_t mca_rcache_common_cuda_reg_t; -extern bool mca_common_cuda_enabled; - -OPAL_DECLSPEC void mca_common_cuda_register_mca_variables(void); - -OPAL_DECLSPEC void mca_common_cuda_register(void *ptr, size_t amount, char *msg); - -OPAL_DECLSPEC void mca_common_cuda_unregister(void *ptr, char *msg); - -OPAL_DECLSPEC void mca_common_wait_stream_synchronize(mca_rcache_common_cuda_reg_t *rget_reg); - -OPAL_DECLSPEC int mca_common_cuda_malloc(void **buffer, size_t size); -OPAL_DECLSPEC int mca_common_cuda_free(void *buffer); - -OPAL_DECLSPEC int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg, - struct mca_btl_base_descriptor_t *, int *done); - -OPAL_DECLSPEC int mca_common_cuda_record_ipc_event(char *msg, - struct mca_btl_base_descriptor_t *frag); -OPAL_DECLSPEC int mca_common_cuda_record_dtoh_event(char *msg, - struct mca_btl_base_descriptor_t *frag); -OPAL_DECLSPEC int mca_common_cuda_record_htod_event(char *msg, - struct mca_btl_base_descriptor_t *frag); - -OPAL_DECLSPEC void *mca_common_cuda_get_dtoh_stream(void); -OPAL_DECLSPEC void *mca_common_cuda_get_htod_stream(void); - -OPAL_DECLSPEC int progress_one_cuda_ipc_event(struct mca_btl_base_descriptor_t **); -OPAL_DECLSPEC int progress_one_cuda_dtoh_event(struct mca_btl_base_descriptor_t 
**); -OPAL_DECLSPEC int progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t **); - -OPAL_DECLSPEC int mca_common_cuda_memhandle_matches(mca_rcache_common_cuda_reg_t *new_reg, - mca_rcache_common_cuda_reg_t *old_reg); - -OPAL_DECLSPEC void mca_common_cuda_construct_event_and_handle(uintptr_t *event, void *handle); -OPAL_DECLSPEC void mca_common_cuda_destruct_event(uintptr_t event); - -OPAL_DECLSPEC int cuda_getmemhandle(void *base, size_t, mca_rcache_base_registration_t *newreg, - mca_rcache_base_registration_t *hdrreg); -OPAL_DECLSPEC int cuda_ungetmemhandle(void *reg_data, mca_rcache_base_registration_t *reg); -OPAL_DECLSPEC int cuda_openmemhandle(void *base, size_t size, - mca_rcache_base_registration_t *newreg, - mca_rcache_base_registration_t *hdrreg); -OPAL_DECLSPEC int cuda_closememhandle(void *reg_data, mca_rcache_base_registration_t *reg); -OPAL_DECLSPEC int mca_common_cuda_get_device(int *devicenum); -OPAL_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2); -OPAL_DECLSPEC int mca_common_cuda_stage_one_init(void); -OPAL_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base); -OPAL_DECLSPEC void mca_common_cuda_fini(void); -#if OPAL_CUDA_GDR_SUPPORT -OPAL_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_rcache_base_registration_t *reg); -OPAL_DECLSPEC void mca_common_cuda_get_buffer_id(mca_rcache_base_registration_t *reg); -#endif /* OPAL_CUDA_GDR_SUPPORT */ -/** - * Return: 0 if no packing is required for sending (the upper layer - * can use directly the pointer to the contiguous user - * buffer). - * 1 if data does need to be packed, i.e. heterogeneous peers - * (source arch != dest arch) or non contiguous memory - * layout. 
- */ -static inline int32_t opal_convertor_cuda_need_buffers(opal_convertor_t *pConvertor) -{ - int32_t retval; - uint32_t cudaflag = pConvertor->flags & CONVERTOR_ACCELERATOR; /* Save CUDA flag */ - pConvertor->flags &= ~CONVERTOR_ACCELERATOR; /* Clear CUDA flag if it exists */ - retval = opal_convertor_need_buffers(pConvertor); - pConvertor->flags |= cudaflag; /* Restore CUDA flag */ - return retval; -} - -/* Structure to hold CUDA support functions that gets filled in when the - * common cuda code is initialized. This removes any dependency on - * in the opal cuda datatype code. */ -struct opal_common_cuda_function_table { - int (*gpu_is_gpu_buffer)(const void *, opal_convertor_t *); - int (*gpu_cu_memcpy_async)(void *, const void *, size_t, opal_convertor_t *); - int (*gpu_cu_memcpy)(void *, const void *, size_t); - int (*gpu_memmove)(void *, void *, size_t); - int (*gpu_malloc)(void *, size_t); - int (*gpu_free)(void *); -}; -typedef struct opal_common_cuda_function_table opal_common_cuda_function_table_t; - -void mca_cuda_convertor_init(opal_convertor_t *convertor, const void *pUserBuf); -bool opal_cuda_check_bufs(char *dest, char *src); -bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor); -void *opal_cuda_malloc(size_t size, opal_convertor_t *convertor); -void opal_cuda_free(void *buffer, opal_convertor_t *convertor); -void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t *convertor); -void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size); -void *opal_cuda_memmove(void *dest, void *src, size_t size); -void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)); -void opal_cuda_set_copy_function_async(opal_convertor_t *convertor, void *stream); - -#endif /* OPAL_MCA_COMMON_CUDA_H */ diff --git a/opal/cuda/help-mpi-common-cuda.txt b/opal/cuda/help-mpi-common-cuda.txt deleted file mode 100644 index e6f7913316b..00000000000 --- a/opal/cuda/help-mpi-common-cuda.txt +++ 
/dev/null @@ -1,212 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2011-2015 NVIDIA. All rights reserved. -# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -[cuCtxGetCurrent failed not initialized] -WARNING: The call to cuCtxGetCurrent() failed while attempting to register -internal memory with the CUDA environment. The program will continue to run, -but the performance of GPU memory transfers may be reduced. This failure -indicates that the CUDA environment is not yet initialized. To eliminate -this warning, ensure that CUDA is initialized prior to calling MPI_Init. - -NOTE: You can turn off this warning by setting the MCA parameter - mpi_common_cuda_warning to 0. -# -[cuCtxGetCurrent failed] -WARNING: The call to cuCtxGetCurrent() failed while attempting to register -internal memory with the CUDA environment. The program will continue to run, -but the performance of GPU memory transfers may be reduced. - cuCtxGetCurrent return value: %d - -NOTE: You can turn off this warning by setting the MCA parameter - mpi_common_cuda_warning to 0. -# -[cuCtxGetCurrent returned NULL] -WARNING: The call to cuCtxGetCurrent() failed while attempting to register -internal memory with the CUDA environment. The program will continue to run, -but the performance of GPU memory transfers may be reduced. This failure -indicates that there is no CUDA context yet. To eliminate this warning, -ensure that there is a CUDA context prior to calling MPI_Init. - -NOTE: You can turn off this warning by setting the MCA parameter - mpi_common_cuda_warning to 0. -# -[cuMemHostRegister during init failed] -The call to cuMemHostRegister(%p, %d, 0) failed. - Host: %s - cuMemHostRegister return value: %d - Registration cache: %s -# -[cuMemHostRegister failed] -The call to cuMemHostRegister(%p, %d, 0) failed. 
- Host: %s - cuMemHostRegister return value: %d - Registration cache: %s -# -[cuIpcGetMemHandle failed] -The call to cuIpcGetMemHandle failed. This means the GPU RDMA protocol -cannot be used. - cuIpcGetMemHandle return value: %d - address: %p -Check the cuda.h file for what the return value means. Perhaps a reboot -of the node will clear the problem. -# -[cuMemGetAddressRange failed] -The call to cuMemGetAddressRange failed. This means the GPU RDMA protocol -cannot be used. - cuMemGetAddressRange return value: %d - address: %p -Check the cuda.h file for what the return value means. Perhaps a reboot -of the node will clear the problem. -# -[cuMemGetAddressRange failed 2] -The call to cuMemGetAddressRange failed during the GPU RDMA protocol. - Host: %s - cuMemGetAddressRange return value: %d - address: %p -Check the cuda.h file for what the return value means. This is highly -unusual and should not happen. The program will probably abort. -# -[Out of cuEvent handles] -The library has exceeded its number of outstanding event handles. -For better performance, this number should be increased. - Current maximum handles: %4d - Suggested new maximum: %4d -Rerun with --mca mpi_common_cuda_event_max %d -# -[cuIpcOpenMemHandle failed] -The call to cuIpcOpenMemHandle failed. This is an unrecoverable error -and will cause the program to abort. - Hostname: %s - cuIpcOpenMemHandle return value: %d - address: %p -Check the cuda.h file for what the return value means. A possible cause -for this is not enough free device memory. Try to reduce the device -memory footprint of your application. -# -[cuIpcCloseMemHandle failed] -The call to cuIpcCloseMemHandle failed. This is a warning and the program -will continue to run. - cuIpcCloseMemHandle return value: %d - address: %p -Check the cuda.h file for what the return value means. Perhaps a reboot -of the node will clear the problem. -# -[cuMemcpyAsync failed] -The call to cuMemcpyAsync failed. 
This is a unrecoverable error and will -cause the program to abort. - cuMemcpyAsync(%p, %p, %d) returned value %d -Check the cuda.h file for what the return value means. -# -[cuEventCreate failed] -The call to cuEventCreate failed. This is a unrecoverable error and will -cause the program to abort. - Hostname: %s - cuEventCreate return value: %d -Check the cuda.h file for what the return value means. -# -[cuEventRecord failed] -The call to cuEventRecord failed. This is a unrecoverable error and will -cause the program to abort. - Hostname: %s - cuEventRecord return value: %d -Check the cuda.h file for what the return value means. -# -[cuEventQuery failed] -The call to cuEventQuery failed. This is a unrecoverable error and will -cause the program to abort. - cuEventQuery return value: %d -Check the cuda.h file for what the return value means. -# -[cuIpcGetEventHandle failed] -The call to cuIpcGetEventHandle failed. This is a unrecoverable error and will -cause the program to abort. - cuIpcGetEventHandle return value: %d -Check the cuda.h file for what the return value means. -# -[cuIpcOpenEventHandle failed] -The call to cuIpcOpenEventHandle failed. This is a unrecoverable error and will -cause the program to abort. - cuIpcOpenEventHandle return value: %d -Check the cuda.h file for what the return value means. -# -[cuStreamWaitEvent failed] -The call to cuStreamWaitEvent failed. This is a unrecoverable error and will -cause the program to abort. - cuStreamWaitEvent return value: %d -Check the cuda.h file for what the return value means. -# -[cuEventDestroy failed] -The call to cuEventDestory failed. This is a unrecoverable error and will -cause the program to abort. - cuEventDestory return value: %d -Check the cuda.h file for what the return value means. -# -[cuStreamCreate failed] -The call to cuStreamCreate failed. This is a unrecoverable error and will -cause the program to abort. 
- Hostname: %s - cuStreamCreate return value: %d -Check the cuda.h file for what the return vale means. -# -[dlopen disabled] -Open MPI was compiled without dynamic library support (e.g., with the - --disable-dlopen flag), and therefore cannot utilize CUDA support. - -If you need CUDA support, reconfigure Open MPI with dynamic library support enabled. -# -[dlopen failed] -The library attempted to open the following supporting CUDA libraries, -but each of them failed. CUDA-aware support is disabled. -%s -If you do not require CUDA-aware support, then run with ---mca opal_warn_on_missing_libcuda 0 to suppress this message. If you do -require CUDA-aware support, then try setting LD_LIBRARY_PATH to the location -of libcuda.so.1 to resolve this issue. -# -[dlsym failed] -An error occurred while trying to map in the address of a function. - Function Name: %s - Error string: %s -CUDA-aware support is disabled. -# -[bufferID failed] -An error occurred while trying to get the BUFFER_ID of a GPU memory -region. This could cause incorrect results. Turn of GPU Direct RDMA -support by running with --mca btl_openib_cuda_want_gdr_support 0. - Hostname: %s - cuPointerGetAttribute return value: %d -Check the cuda.h file for what the return value means. -[cuPointerSetAttribute failed] -The call to cuPointerSetAttribute with CU_POINTER_ATTRIBUTE_SYNC_MEMOPS -failed. This is highly unusual and should not happen. The program will -continue, but report this error to the Open MPI developers. - Hostname: %s - cuPointerSetAttribute return value: %d - Address: %p -Check the cuda.h file for what the return value means. -# -[cuStreamSynchronize failed] -The call to cuStreamSynchronize failed. This is highly unusual and should -not happen. Please report this error to the Open MPI developers. - Hostname: %s - cuStreamSynchronize return value: %d -Check the cuda.h file for what the return value means. -# -[cuMemcpy failed] -The call to cuMemcpy failed. 
This is highly unusual and should -not happen. Please report this error to the Open MPI developers. - Hostname: %s - cuMemcpy return value: %d -Check the cuda.h file for what the return value means. -# -[No memory] -A call to allocate memory within the CUDA support failed. This is -an unrecoverable error and will cause the program to abort. - Hostname: %s diff --git a/opal/cuda/owner.txt b/opal/cuda/owner.txt deleted file mode 100644 index 9a3b6b5a6d4..00000000000 --- a/opal/cuda/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: NVIDIA -status:active diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index fc9573eef70..527bb310bb1 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -48,8 +48,6 @@ bool opal_ddt_copy_debug = false; bool opal_ddt_raw_debug = false; int opal_ddt_verbose = -1; /* Has the datatype verbose it's own output stream */ -extern int opal_cuda_verbose; - /* Using this macro implies that at this point _all_ information needed * to fill up the datatype are known. * We fill all the static information, the pointer to desc.desc is setup @@ -226,16 +224,6 @@ int opal_datatype_register_params(void) if (0 > ret) { return ret; } -# if OPAL_CUDA_SUPPORT - /* Set different levels of verbosity in the cuda related code. 
*/ - ret = mca_base_var_register("opal", "opal", NULL, "cuda_verbose", - "Set level of opal cuda verbosity", MCA_BASE_VAR_TYPE_INT, NULL, 0, - MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, - MCA_BASE_VAR_SCOPE_LOCAL, &opal_cuda_verbose); - if (0 > ret) { - return ret; - } -# endif #endif /* OPAL_ENABLE_DEBUG */ diff --git a/opal/include/opal/Makefile.am b/opal/include/opal/Makefile.am index ed657307caf..baf470529eb 100644 --- a/opal/include/opal/Makefile.am +++ b/opal/include/opal/Makefile.am @@ -29,7 +29,8 @@ headers += \ opal/hash_string.h \ opal/frameworks.h \ opal/opal_portable_platform.h \ - opal/opal_portable_platform_real.h + opal/opal_portable_platform_real.h \ + opal/opal_cuda.h nodist_headers += \ opal/version.h diff --git a/opal/include/opal/opal_cuda.h b/opal/include/opal/opal_cuda.h new file mode 100644 index 00000000000..5c91716cc2d --- /dev/null +++ b/opal/include/opal/opal_cuda.h @@ -0,0 +1,50 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2013 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2006 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + * This file is intended only to carry shared types. If actual cuda + * symbols are required, they need to be added to a new common cuda + * component. 
+ */ + +#ifndef OPAL_CUDA_H +#define OPAL_CUDA_H +#include "opal/mca/rcache/rcache.h" + +#define MEMHANDLE_SIZE 8 +#define EVTHANDLE_SIZE 8 + +struct mca_opal_cuda_reg_data_t { + uint64_t memHandle[MEMHANDLE_SIZE]; + uint64_t evtHandle[EVTHANDLE_SIZE]; + uint64_t event; + opal_ptr_t memh_seg_addr; + size_t memh_seg_len; +}; +typedef struct mca_opal_cuda_reg_data_t mca_opal_cuda_reg_data_t; + +struct mca_opal_cuda_reg_t { + mca_rcache_base_registration_t base; + mca_opal_cuda_reg_data_t data; +}; +typedef struct mca_opal_cuda_reg_t mca_opal_cuda_reg_t; +#endif /* OPAL_CUDA_H */ diff --git a/opal/mca/accelerator/cuda/Makefile.am b/opal/mca/accelerator/cuda/Makefile.am index a2463d729ef..5646890bab3 100644 --- a/opal/mca/accelerator/cuda/Makefile.am +++ b/opal/mca/accelerator/cuda/Makefile.am @@ -32,10 +32,13 @@ endif mcacomponentdir = $(opallibdir) mcacomponent_LTLIBRARIES = $(component_install) + mca_accelerator_cuda_la_SOURCES = $(sources) mca_accelerator_cuda_la_LDFLAGS = -module -avoid-version -mca_accelerator_cuda_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la +mca_accelerator_cuda_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \ + $(accelerator_cuda_LIBS) noinst_LTLIBRARIES = $(component_noinst) libmca_accelerator_cuda_la_SOURCES =$(sources) libmca_accelerator_cuda_la_LDFLAGS = -module -avoid-version +libmca_accelerator_cuda_la_LIBADD = $(accelerator_cuda_LIBS) diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index 5369680839c..9a955bac596 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -97,7 +97,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * CU_POINTER_ATTRIBUTE_IS_MANAGED}; void *attrdata[] = {(void *) &mem_type, (void *) &mem_ctx, (void *) &is_managed}; - result = opal_accelerator_cuda_func.cuPointerGetAttributes(3, attributes, attrdata, dbuf); + result = cuPointerGetAttributes(3, 
attributes, attrdata, dbuf); OPAL_OUTPUT_VERBOSE((101, opal_accelerator_base_framework.framework_output, "dbuf=%p, mem_type=%d, mem_ctx=%p, is_managed=%d, result=%d", (void *) dbuf, (int) mem_type, (void *) mem_ctx, is_managed, result)); @@ -121,7 +121,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * /* Must be a device pointer */ assert(CU_MEMORYTYPE_DEVICE == mem_type); #else /* OPAL_CUDA_GET_ATTRIBUTES */ - result = opal_accelerator_cuda_func.cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf); + result = cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf); if (CUDA_SUCCESS != result) { /* If we cannot determine it is device pointer, * just assume it is not. */ @@ -142,11 +142,11 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * * GPU memory, but no context, get the context from the GPU memory * and set the current context to that. It is rare that we will not * have a context. */ - result = opal_accelerator_cuda_func.cuCtxGetCurrent(&ctx); + result = cuCtxGetCurrent(&ctx); if (OPAL_UNLIKELY(NULL == ctx)) { if (CUDA_SUCCESS == result) { #if !OPAL_CUDA_GET_ATTRIBUTES - result = opal_accelerator_cuda_func.cuPointerGetAttribute(&mem_ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dbuf); + result = cuPointerGetAttribute(&mem_ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dbuf); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_output(0, "CUDA: error calling cuPointerGetAttribute: " @@ -155,7 +155,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * return OPAL_ERROR; } #endif /* OPAL_CUDA_GET_ATTRIBUTES */ - result = opal_accelerator_cuda_func.cuCtxSetCurrent(mem_ctx); + result = cuCtxSetCurrent(mem_ctx); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_output(0, "CUDA: error calling cuCtxSetCurrent: " @@ -185,7 +185,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * if (OPAL_LIKELY(((CUDA_VERSION > 7000) ? 
0 : 1))) { CUdeviceptr pbase; size_t psize; - result = opal_accelerator_cuda_func.cuMemGetAddressRange(&pbase, &psize, dbuf); + result = cuMemGetAddressRange(&pbase, &psize, dbuf); if (CUDA_SUCCESS != result) { opal_output_verbose(5, opal_accelerator_base_framework.framework_output, "CUDA: cuMemGetAddressRange failed on this pointer: result=%d, buf=%p " @@ -214,7 +214,7 @@ static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t return OPAL_ERR_OUT_OF_RESOURCE; } - result = opal_accelerator_cuda_func.cuStreamCreate((*stream)->stream, 0); + result = cuStreamCreate((*stream)->stream, 0); if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true, OPAL_PROC_MY_HOSTNAME, result); @@ -230,7 +230,7 @@ static void opal_accelerator_cuda_stream_destruct(opal_accelerator_cuda_stream_t CUresult result; if (NULL != stream->base.stream) { - result = opal_accelerator_cuda_func.cuStreamDestroy(*(CUstream *)stream->base.stream); + result = cuStreamDestroy(*(CUstream *)stream->base.stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamDestroy failed", true, result); @@ -259,7 +259,7 @@ static int accelerator_cuda_create_event(int dev_id, opal_accelerator_event_t ** OBJ_RELEASE(*event); return OPAL_ERR_OUT_OF_RESOURCE; } - result = opal_accelerator_cuda_func.cuEventCreate((*event)->event, CU_EVENT_DISABLE_TIMING); + result = cuEventCreate((*event)->event, CU_EVENT_DISABLE_TIMING); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuEventCreate failed", true, OPAL_PROC_MY_HOSTNAME, result); @@ -274,7 +274,7 @@ static void opal_accelerator_cuda_event_destruct(opal_accelerator_cuda_event_t * { CUresult result; if (NULL != event->base.event) { - result = opal_accelerator_cuda_func.cuEventDestroy(*(CUevent *)event->base.event); + result = cuEventDestroy(*(CUevent *)event->base.event); if (OPAL_UNLIKELY(CUDA_SUCCESS 
!= result)) { opal_show_help("help-accelerator-cuda.txt", "cuEventDestroy failed", true, result); @@ -297,7 +297,7 @@ static int accelerator_cuda_record_event(int dev_id, opal_accelerator_event_t *e return OPAL_ERR_BAD_PARAM; } - result = opal_accelerator_cuda_func.cuEventRecord(*(CUevent *)event->event, *(CUstream *)stream->stream); + result = cuEventRecord(*(CUevent *)event->event, *(CUstream *)stream->stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuEventRecord failed", true, OPAL_PROC_MY_HOSTNAME, result); @@ -314,7 +314,7 @@ static int accelerator_cuda_query_event(int dev_id, opal_accelerator_event_t *ev return OPAL_ERR_BAD_PARAM; } - result = opal_accelerator_cuda_func.cuEventQuery(*(CUevent *)event->event); + result = cuEventQuery(*(CUevent *)event->event); switch (result) { case CUDA_SUCCESS: { @@ -344,7 +344,7 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void * return OPAL_ERR_BAD_PARAM; } - result = opal_accelerator_cuda_func.cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, *(CUstream *)stream->stream); + result = cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, *(CUstream *)stream->stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, src, size, result); @@ -370,13 +370,13 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, * Additionally, cuMemcpy is not necessarily always synchronous. 
See: * https://docs.nvidia.com/cuda/cuda-driver-api/api-sync-behavior.html * TODO: Add optimizations for type field */ - result = opal_accelerator_cuda_func.cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream); + result = cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, src, size, result); return OPAL_ERROR; } - result = opal_accelerator_cuda_func.cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream); + result = cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, OPAL_PROC_MY_HOSTNAME, result); @@ -395,29 +395,29 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, return OPAL_ERR_BAD_PARAM; } - result = opal_accelerator_cuda_func.cuMemAlloc(&tmp, size); + result = cuMemAlloc(&tmp, size); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { return OPAL_ERROR; } - result = opal_accelerator_cuda_func.cuMemcpyAsync(tmp, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream); + result = cuMemcpyAsync(tmp, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, tmp, src, size, result); return OPAL_ERROR; } - result = opal_accelerator_cuda_func.cuMemcpyAsync((CUdeviceptr) dest, tmp, size, opal_accelerator_cuda_memcpy_stream); + result = cuMemcpyAsync((CUdeviceptr) dest, tmp, size, opal_accelerator_cuda_memcpy_stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, tmp, size, result); return OPAL_ERROR; } - result = opal_accelerator_cuda_func.cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream); 
+ result = cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, OPAL_PROC_MY_HOSTNAME, result); return OPAL_ERROR; } - opal_accelerator_cuda_func.cuMemFree(tmp); + cuMemFree(tmp); return OPAL_SUCCESS; } @@ -430,7 +430,7 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size) } if (size > 0) { - result = opal_accelerator_cuda_func.cuMemAlloc((CUdeviceptr *) ptr, size); + result = cuMemAlloc((CUdeviceptr *) ptr, size); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true, OPAL_PROC_MY_HOSTNAME, result); @@ -444,7 +444,7 @@ static int accelerator_cuda_mem_release(int dev_id, void *ptr) { CUresult result; if (NULL != ptr) { - result = opal_accelerator_cuda_func.cuMemFree((CUdeviceptr) ptr); + result = cuMemFree((CUdeviceptr) ptr); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemFree failed", true, OPAL_PROC_MY_HOSTNAME, result); @@ -463,7 +463,7 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void return OPAL_ERR_BAD_PARAM; } - result = opal_accelerator_cuda_func.cuMemGetAddressRange((CUdeviceptr *) base, size, (CUdeviceptr) ptr); + result = cuMemGetAddressRange((CUdeviceptr *) base, size, (CUdeviceptr) ptr); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemGetAddressRange failed 2", true, OPAL_PROC_MY_HOSTNAME, result, ptr); @@ -483,7 +483,7 @@ static int accelerator_cuda_host_register(int dev_id, void *ptr, size_t size) return OPAL_ERR_BAD_PARAM; } - result = opal_accelerator_cuda_func.cuMemHostRegister(ptr, size, 0); + result = cuMemHostRegister(ptr, size, 0); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemHostRegister failed", true, ptr, size, OPAL_PROC_MY_HOSTNAME, result); @@ -497,7 
+497,7 @@ static int accelerator_cuda_host_unregister(int dev_id, void *ptr) { CUresult result; if (NULL != ptr) { - result = opal_accelerator_cuda_func.cuMemHostUnregister(ptr); + result = cuMemHostUnregister(ptr); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemHostUnregister failed", true, ptr, OPAL_PROC_MY_HOSTNAME, result); @@ -516,7 +516,7 @@ static int accelerator_cuda_get_device(int *dev_id) return OPAL_ERR_BAD_PARAM; } - result = opal_accelerator_cuda_func.cuCtxGetDevice(&cuDev); + result = cuCtxGetDevice(&cuDev); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuCtxGetDevice failed", true, result); @@ -534,7 +534,7 @@ static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int de return OPAL_ERR_BAD_PARAM; } - result = opal_accelerator_cuda_func.cuDeviceCanAccessPeer(access, (CUdevice) dev1, (CUdevice) dev2); + result = cuDeviceCanAccessPeer(access, (CUdevice) dev1, (CUdevice) dev2); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuDeviceCanAccessPeer failed", true, OPAL_PROC_MY_HOSTNAME, result); @@ -554,13 +554,13 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc { CUresult result; int enable = 1; - result = opal_accelerator_cuda_func.cuPointerGetAttribute((unsigned long long *)buf_id, CU_POINTER_ATTRIBUTE_BUFFER_ID, (CUdeviceptr) addr); + result = cuPointerGetAttribute((unsigned long long *)buf_id, CU_POINTER_ATTRIBUTE_BUFFER_ID, (CUdeviceptr) addr); if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { opal_show_help("help-accelerator-cuda.txt", "bufferID failed", true, OPAL_PROC_MY_HOSTNAME, result); return result; } - result = opal_accelerator_cuda_func.cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, + result = cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr) addr); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { 
opal_show_help("help-accelerator-cuda.txt", "cuPointerSetAttribute failed", true, diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.h b/opal/mca/accelerator/cuda/accelerator_cuda.h index 4646029ce06..8efde778761 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.h +++ b/opal/mca/accelerator/cuda/accelerator_cuda.h @@ -25,41 +25,6 @@ typedef struct { opal_accelerator_base_component_t super; } opal_accelerator_cuda_component_t; -/* Structure to hold CUDA function pointers that get dynamically loaded. */ -struct accelerator_cuda_func_table { - int (*cuPointerGetAttribute)(void *, CUpointer_attribute, CUdeviceptr); - int (*cuMemcpyAsync)(CUdeviceptr, CUdeviceptr, size_t, CUstream); - int (*cuMemcpy)(CUdeviceptr, CUdeviceptr, size_t); - int (*cuMemcpy2D)(const CUDA_MEMCPY2D* pCopy); - int (*cuMemAlloc)(CUdeviceptr *, size_t); - int (*cuMemFree)(CUdeviceptr buf); - int (*cuCtxGetCurrent)(void *cuContext); - int (*cuStreamCreate)(CUstream *, int); - int (*cuEventCreate)(CUevent *, int); - int (*cuEventRecord)(CUevent, CUstream); - int (*cuEventQuery)(CUevent); - int (*cuEventDestroy)(CUevent); - int (*cuMemHostRegister)(void *, size_t, unsigned int); - int (*cuMemHostUnregister)(void *); - int (*cuMemGetAddressRange)(CUdeviceptr *, size_t *, CUdeviceptr); - int (*cuIpcGetEventHandle)(CUipcEventHandle *, CUevent); - int (*cuIpcOpenEventHandle)(CUevent *, CUipcEventHandle); - int (*cuIpcOpenMemHandle)(CUdeviceptr *, CUipcMemHandle, unsigned int); - int (*cuIpcCloseMemHandle)(CUdeviceptr); - int (*cuIpcGetMemHandle)(CUipcMemHandle *, CUdeviceptr); - int (*cuCtxGetDevice)(CUdevice *); - int (*cuDeviceCanAccessPeer)(int *, CUdevice, CUdevice); - int (*cuCtxSetCurrent)(CUcontext); - int (*cuEventSynchronize)(CUevent); - int (*cuStreamSynchronize)(CUstream); - int (*cuStreamDestroy)(CUstream); - int (*cuPointerSetAttribute)(const void *, CUpointer_attribute, CUdeviceptr); -#if OPAL_CUDA_GET_ATTRIBUTES - int (*cuPointerGetAttributes)(unsigned int, CUpointer_attribute 
*, void **, CUdeviceptr); -#endif /* OPAL_CUDA_GET_ATTRIBUTES */ -}; -typedef struct accelerator_cuda_func_table accelerator_cuda_func_table_t; - struct opal_accelerator_cuda_stream_t { opal_accelerator_stream_t base; }; @@ -73,7 +38,6 @@ typedef struct opal_accelerator_cuda_event_t opal_accelerator_cuda_event_t; OBJ_CLASS_DECLARATION(opal_accelerator_cuda_event_t); /* Declare extern variables, defined in accelerator_cuda_component.c */ -OPAL_DECLSPEC extern accelerator_cuda_func_table_t opal_accelerator_cuda_func; OPAL_DECLSPEC extern CUstream opal_accelerator_cuda_memcpy_stream; OPAL_DECLSPEC extern opal_mutex_t opal_accelerator_cuda_stream_lock; diff --git a/opal/mca/accelerator/cuda/accelerator_cuda_component.c b/opal/mca/accelerator/cuda/accelerator_cuda_component.c index dd3f9aade7f..2ffeebafd00 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda_component.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda_component.c @@ -34,26 +34,11 @@ /* Define global variables, used in accelerator_cuda.c */ -accelerator_cuda_func_table_t opal_accelerator_cuda_func = {0}; CUstream opal_accelerator_cuda_memcpy_stream = NULL; opal_mutex_t opal_accelerator_cuda_stream_lock = {0}; #define STRINGIFY2(x) #x #define STRINGIFY(x) STRINGIFY2(x) -#define OPAL_CUDA_DLSYM(libhandle, func_name) \ - do { \ - char *err_msg; \ - void *ptr; \ - if (OPAL_SUCCESS != opal_dl_lookup(libhandle, STRINGIFY(func_name), &ptr, &err_msg)) { \ - opal_show_help("help-mpi-accelerator-cuda.txt", "dlsym failed", true, STRINGIFY(func_name), \ - err_msg); \ - return -1; \ - } else { \ - *(void **) (&opal_accelerator_cuda_func.func_name) = ptr; \ - opal_output_verbose(15, opal_accelerator_base_framework.framework_output, "CUDA: successful dlsym of %s", \ - STRINGIFY(funcName)); \ - } \ - } while (0) /* Unused variable that we register at init time and unregister at fini time. * This is used to detect if user has done a device reset prior to MPI_Finalize. 
@@ -76,7 +61,6 @@ static int accelerator_cuda_close(void); static int accelerator_cuda_component_register(void); static opal_accelerator_base_module_t* accelerator_cuda_init(void); static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module); -static int accelerator_cuda_populate_func_table(opal_dl_handle_t *libcuda_handle); /* * Instantiate the public struct with all of our public information * and pointers to our public functions in it @@ -134,13 +118,6 @@ static int accelerator_cuda_component_register(void) static opal_accelerator_base_module_t* accelerator_cuda_init(void) { int retval, i, j; - char *cudalibs[] = {"libcuda.so.1", "libcuda.dylib", NULL}; - char *searchpaths[] = {"", "/usr/lib64", NULL}; - char **errmsgs = NULL; - char *errmsg = NULL; - int errsize; - bool found_libraries = false; - opal_dl_handle_t *libcuda_handle = NULL; CUresult result; CUcontext cuContext; @@ -153,95 +130,9 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void) return NULL; } - if (!OPAL_HAVE_DL_SUPPORT) { - opal_show_help("help-accelerator-cuda.txt", "dlopen disabled", true); - return NULL; - } - - /* Now walk through all the potential names libcuda and find one - * that works. If it does, all is good. If not, print out all - * the messages about why things failed. This code was careful - * to try and save away all error messages if the loading ultimately - * failed to help with debugging. - * - * NOTE: On the first loop we just utilize the default loading - * paths from the system. For the second loop, set /usr/lib64 to - * the search path and try again. This is done to handle the case - * where we have both 32 and 64 bit libcuda.so libraries - * installed. Even when running in 64-bit mode, the /usr/lib - * directory is searched first and we may find a 32-bit - * libcuda.so.1 library. Loading of this library will fail as the - * OPAL DL framework does not handle having the wrong ABI in the - * search path (unlike ld or ld.so). 
Note that we only set this - * search path after the original search. This is so that - * LD_LIBRARY_PATH and run path settings are respected. Setting - * this search path overrides them (rather then being - * appended). */ - - j = 0; - while (searchpaths[j] != NULL) { - i = 0; - while (cudalibs[i] != NULL) { - char *filename = NULL; - char *str = NULL; - - /* If there's a non-empty search path, prepend it - * to the library filename */ - if (strlen(searchpaths[j]) > 0) { - opal_asprintf(&filename, "%s/%s", searchpaths[j], cudalibs[i]); - } else { - filename = strdup(cudalibs[i]); - } - if (NULL == filename) { - opal_show_help("help-accelerator-cuda.txt", "No memory", true, - OPAL_PROC_MY_HOSTNAME); - return NULL; - } - - retval = opal_dl_open(filename, false, false, &libcuda_handle, &str); - if (OPAL_SUCCESS != retval || NULL == libcuda_handle) { - if (NULL != str) { - opal_argv_append(&errsize, &errmsgs, str); - } else { - opal_argv_append(&errsize, &errmsgs, "opal_dl_open() returned NULL."); - } - opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "CUDA: Library open error: %s", - errmsgs[errsize - 1]); - } else { - opal_output_verbose(10, opal_accelerator_base_framework.framework_output, - "CUDA: Library successfully opened %s", cudalibs[i]); - found_libraries = true; - break; - } - i++; - free(filename); - } - if (true == found_libraries) { - break; /* Break out of outer loop */ - } - j++; - } - - if (true != found_libraries) { - errmsg = opal_argv_join(errmsgs, '\n'); - if (opal_warn_on_missing_libcuda) { - opal_show_help("help-accelerator-cuda.txt", "dlopen failed", true, errmsg); - } - } - opal_argv_free(errmsgs); - free(errmsg); - - if (true != found_libraries) { - return NULL; - } - - if (OPAL_SUCCESS != accelerator_cuda_populate_func_table(libcuda_handle)) { - return NULL; - } - /* Check to see if this process is running in a CUDA context. If * so, all is good. If not, then disable registration of memory. 
*/ - result = opal_accelerator_cuda_func.cuCtxGetCurrent(&cuContext); + result = cuCtxGetCurrent(&cuContext); if (CUDA_SUCCESS != result) { opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent failed"); return NULL; @@ -253,14 +144,14 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void) } /* Create stream for use in cuMemcpyAsync synchronous copies */ - result = opal_accelerator_cuda_func.cuStreamCreate(&opal_accelerator_cuda_memcpy_stream, 0); + result = cuStreamCreate(&opal_accelerator_cuda_memcpy_stream, 0); if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true, OPAL_PROC_MY_HOSTNAME, result); return NULL; } - result = opal_accelerator_cuda_func.cuMemHostRegister(&checkmem, sizeof(int), 0); + result = cuMemHostRegister(&checkmem, sizeof(int), 0); if (result != CUDA_SUCCESS) { /* If registering the memory fails, print a message and continue. * This is not a fatal error. */ @@ -283,51 +174,14 @@ static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module) * while calling into the CUDA library. This check will detect if * a user has called cudaDeviceReset prior to MPI_Finalize. If so, * then this call will fail and we skip cleaning up CUDA resources. */ - result = opal_accelerator_cuda_func.cuMemHostUnregister(&checkmem); + result = cuMemHostUnregister(&checkmem); if (CUDA_SUCCESS != result) { ctx_ok = 0; } if ((NULL != opal_accelerator_cuda_memcpy_stream) && ctx_ok) { - opal_accelerator_cuda_func.cuStreamDestroy(opal_accelerator_cuda_memcpy_stream); + cuStreamDestroy(opal_accelerator_cuda_memcpy_stream); } OBJ_DESTRUCT(&opal_accelerator_cuda_stream_lock); return; } - -static int accelerator_cuda_populate_func_table(opal_dl_handle_t *libcuda_handle) -{ - /* Map in the functions that we need. Note that if there is an error - * the macro OPAL_CUDA_DLSYM will print an error and call return. 
*/ - OPAL_CUDA_DLSYM(libcuda_handle, cuStreamCreate); - OPAL_CUDA_DLSYM(libcuda_handle, cuCtxGetCurrent); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventCreate); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventRecord); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventQuery); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventSynchronize); - OPAL_CUDA_DLSYM(libcuda_handle, cuEventDestroy); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemHostRegister); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemHostUnregister); - OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttribute); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemcpyAsync); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemcpy); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemcpy2D); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemFree); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemAlloc); - OPAL_CUDA_DLSYM(libcuda_handle, cuMemGetAddressRange); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcGetEventHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcOpenEventHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcOpenMemHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcCloseMemHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuIpcGetMemHandle); - OPAL_CUDA_DLSYM(libcuda_handle, cuCtxGetDevice); - OPAL_CUDA_DLSYM(libcuda_handle, cuDeviceCanAccessPeer); - OPAL_CUDA_DLSYM(libcuda_handle, cuCtxSetCurrent); - OPAL_CUDA_DLSYM(libcuda_handle, cuStreamSynchronize); - OPAL_CUDA_DLSYM(libcuda_handle, cuStreamDestroy); - OPAL_CUDA_DLSYM(libcuda_handle, cuPointerSetAttribute); -#if OPAL_CUDA_GET_ATTRIBUTES - OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttributes); -#endif /* OPAL_CUDA_GET_ATTRIBUTES */ - return OPAL_SUCCESS; -} diff --git a/opal/mca/accelerator/cuda/configure.m4 b/opal/mca/accelerator/cuda/configure.m4 index 499801c34ca..aa67623c8b2 100644 --- a/opal/mca/accelerator/cuda/configure.m4 +++ b/opal/mca/accelerator/cuda/configure.m4 @@ -15,18 +15,22 @@ # # If CUDA support was requested, then build the CUDA support library. -# This code checks just makes sure the check was done earlier by the -# opal_check_cuda.m4 code. 
-# +# This code checks to make sure the check was done earlier by the +# opal_check_cuda.m4 code. It also copies the flags and libs under +# accelerator_cuda_CPPFLAGS, accelerator_cuda_LDFLAGS, and accelerator_cuda_LIBS AC_DEFUN([MCA_opal_accelerator_cuda_CONFIG],[ + AC_CONFIG_FILES([opal/mca/accelerator/cuda/Makefile]) - # make sure that CUDA-aware checks have been done - AC_REQUIRE([OPAL_CHECK_CUDA]) + OPAL_CHECK_CUDA([accelerator_cuda]) AS_IF([test "x$CUDA_SUPPORT" = "x1"], [$1], [$2]) + AC_SUBST([accelerator_cuda_CPPFLAGS]) + AC_SUBST([accelerator_cuda_LDFLAGS]) + AC_SUBST([accelerator_cuda_LIBS]) + ])dnl diff --git a/opal/mca/btl/smcuda/Makefile.am b/opal/mca/btl/smcuda/Makefile.am index 3b465af577e..f1a89df8dce 100644 --- a/opal/mca/btl/smcuda/Makefile.am +++ b/opal/mca/btl/smcuda/Makefile.am @@ -53,10 +53,12 @@ mcacomponent_LTLIBRARIES = $(component_install) mca_btl_smcuda_la_SOURCES = $(libmca_btl_smcuda_la_sources) mca_btl_smcuda_la_LDFLAGS = -module -avoid-version mca_btl_smcuda_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \ - $(OPAL_TOP_BUILDDIR)/opal/mca/common/sm/lib@OPAL_LIB_NAME@mca_common_sm.la + $(OPAL_TOP_BUILDDIR)/opal/mca/common/sm/lib@OPAL_LIB_NAME@mca_common_sm.la \ + $(btl_smcuda_LIBS) mca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS) noinst_LTLIBRARIES = $(component_noinst) libmca_btl_smcuda_la_SOURCES = $(libmca_btl_smcuda_la_sources) libmca_btl_smcuda_la_LDFLAGS = -module -avoid-version libmca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS) +libmca_btl_smcuda_la_LIBADD = $(btl_smcuda_LIBS) diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 7d73892950f..748568c289b 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -68,7 +68,8 @@ #include "btl_smcuda_frag.h" #include "btl_smcuda_accelerator.h" -#include "opal/cuda/common_cuda.h" + +#include "opal/include/opal/opal_cuda.h" static struct mca_btl_base_registration_handle_t * mca_btl_smcuda_register_mem(struct mca_btl_base_module_t 
*btl, @@ -1000,7 +1001,7 @@ mca_btl_smcuda_register_mem(struct mca_btl_base_module_t *btl, uint32_t flags) { mca_btl_smcuda_t *smcuda_module = (mca_btl_smcuda_t *) btl; - mca_rcache_common_cuda_reg_t *reg; + mca_opal_cuda_reg_t *reg; int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY; int rcache_flags = 0; @@ -1023,15 +1024,64 @@ static int mca_btl_smcuda_deregister_mem(struct mca_btl_base_module_t *btl, struct mca_btl_base_registration_handle_t *handle) { mca_btl_smcuda_t *smcuda_module = (mca_btl_smcuda_t *) btl; - mca_rcache_common_cuda_reg_t *reg = (mca_rcache_common_cuda_reg_t + mca_opal_cuda_reg_t *reg = (mca_opal_cuda_reg_t *) ((intptr_t) handle - - offsetof(mca_rcache_common_cuda_reg_t, data)); + - offsetof(mca_opal_cuda_reg_t, data)); smcuda_module->rcache->rcache_deregister(smcuda_module->rcache, &reg->base); return OPAL_SUCCESS; } +/* + * Put remote event on stream to ensure that the start of the + * copy does not start until the completion of the event. + */ +static void mca_btl_smcuda_wait_stream_synchronize(mca_opal_cuda_reg_t *rget_reg) +{ +#if OPAL_CUDA_SYNC_MEMOPS + /* No need for any of this with SYNC_MEMOPS feature */ + return; +#else /* OPAL_CUDA_SYNC_MEMOPS */ + CUipcEventHandle evtHandle; + CUevent event; + CUresult result; + + memcpy(&evtHandle, rget_reg->data.evtHandle, sizeof(evtHandle)); + + result = cuIpcOpenEventHandle(&event, evtHandle); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output, + "cuIpcOpenEventHandle failed"); + } + + /* BEGIN of Workaround - There is a bug in CUDA 4.1 RC2 and earlier + * versions. Need to record an event on the stream, even though + * it is not used, to make sure we do not short circuit our way + * out of the cuStreamWaitEvent test. 
+ */ + result = cuEventRecord(event, 0); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output, + "cuEventRecord failed"); + } + /* END of Workaround */ + + result = cuStreamWaitEvent(0, event, 0); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output, + "cuStreamWaitEvent failed"); + } + + /* All done with this event. */ + result = cuEventDestroy(event); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output, + "cuEventDestroy failed"); + } +#endif /* OPAL_CUDA_SYNC_MEMOPS */ +} + int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address, uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, @@ -1039,8 +1089,8 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t *btl, struct mca_btl_ba int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_rcache_common_cuda_reg_t rget_reg; - mca_rcache_common_cuda_reg_t *reg_ptr = &rget_reg; + mca_opal_cuda_reg_t rget_reg; + mca_opal_cuda_reg_t *reg_ptr = &rget_reg; int rc, done; void *remote_memory_address; size_t offset; @@ -1111,7 +1161,7 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t *btl, struct mca_btl_ba * is available in the sender's GPU buffer. Therefore, do a stream synchronize * on the IPC event that we received. Note that we pull it from * rget_reg, not reg_ptr, as we do not cache the event. 
*/ - mca_common_wait_stream_synchronize(&rget_reg); + mca_btl_smcuda_wait_stream_synchronize(&rget_reg); rc = mca_btl_smcuda_memcpy(local_address, remote_memory_address, size, "mca_btl_smcuda_get", (mca_btl_base_descriptor_t *) frag); diff --git a/opal/mca/btl/smcuda/btl_smcuda_frag.h b/opal/mca/btl/smcuda/btl_smcuda_frag.h index 53ececfe6e8..886dd1490ab 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_frag.h +++ b/opal/mca/btl/smcuda/btl_smcuda_frag.h @@ -31,7 +31,7 @@ #include "opal_config.h" #include "btl_smcuda.h" -#include "opal/cuda/common_cuda.h" +#include "opal/include/opal/opal_cuda.h" #define MCA_BTL_SMCUDA_FRAG_TYPE_MASK ((uintptr_t) 0x3) #define MCA_BTL_SMCUDA_FRAG_SEND ((uintptr_t) 0x0) @@ -52,7 +52,7 @@ struct mca_btl_smcuda_hdr_t { typedef struct mca_btl_smcuda_hdr_t mca_btl_smcuda_hdr_t; struct mca_btl_base_registration_handle_t { - mca_rcache_common_cuda_reg_data_t reg_data; + mca_opal_cuda_reg_data_t reg_data; }; struct mca_btl_smcuda_segment_t { diff --git a/opal/mca/btl/smcuda/configure.m4 b/opal/mca/btl/smcuda/configure.m4 index 82b71aa858b..10b3721022c 100644 --- a/opal/mca/btl/smcuda/configure.m4 +++ b/opal/mca/btl/smcuda/configure.m4 @@ -19,12 +19,15 @@ AC_DEFUN([MCA_opal_btl_smcuda_CONFIG],[ AC_CONFIG_FILES([opal/mca/btl/smcuda/Makefile]) - # make sure that CUDA-aware checks have been done - AC_REQUIRE([OPAL_CHECK_CUDA]) + OPAL_CHECK_CUDA([btl_smcuda]) # Only build if CUDA support is available AS_IF([test "x$CUDA_SUPPORT" = "x1"], [$1 OPAL_MCA_CHECK_DEPENDENCY([opal], [btl], [smcuda], [opal], [common], [sm])], [$2]) + + AC_SUBST([btl_smcuda_CPPFLAGS]) + AC_SUBST([btl_smcuda_LDFLAGS]) + AC_SUBST([btl_smcuda_LIBS]) ])dnl diff --git a/opal/mca/rcache/gpusm/configure.m4 b/opal/mca/rcache/gpusm/configure.m4 index 2b792d7cc8c..65dd94811c5 100644 --- a/opal/mca/rcache/gpusm/configure.m4 +++ b/opal/mca/rcache/gpusm/configure.m4 @@ -19,9 +19,14 @@ AC_DEFUN([MCA_opal_rcache_gpusm_CONFIG],[ AC_CONFIG_FILES([opal/mca/rcache/gpusm/Makefile]) + 
OPAL_CHECK_CUDA([rcache_gpusm]) + # Use CUDA_SUPPORT which was filled in by the opal configure code. AS_IF([test "x$CUDA_SUPPORT" = "x1"], [$1], [$2]) + AC_SUBST([rcache_gpusm_CPPFLAGS]) + AC_SUBST([rcache_gpusm_LDFLAGS]) + AC_SUBST([rcache_gpusm_LIBS]) ])dnl diff --git a/opal/mca/rcache/gpusm/rcache_gpusm_module.c b/opal/mca/rcache/gpusm/rcache_gpusm_module.c index 37aa6696a3f..a38ef3e89b6 100644 --- a/opal/mca/rcache/gpusm/rcache_gpusm_module.c +++ b/opal/mca/rcache/gpusm/rcache_gpusm_module.c @@ -41,7 +41,8 @@ #include "opal_config.h" #include "opal/mca/rcache/base/base.h" #include "opal/mca/rcache/gpusm/rcache_gpusm.h" -#include "opal/cuda/common_cuda.h" +#include "opal/include/opal/opal_cuda.h" +#include <cuda.h> /** * Called when the registration free list is created. An event is created @@ -49,7 +50,20 @@ */ static void mca_rcache_gpusm_registration_constructor(mca_rcache_gpusm_registration_t *item) { - mca_common_cuda_construct_event_and_handle(&item->event, (void *) &item->evtHandle); + uintptr_t *event = &item->event; + void *handle = (void *) &item->evtHandle; + CUresult result; + + result = cuEventCreate((CUevent *) event, + CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_output(0, "cuEventCreate failed\n"); + } + + result = cuIpcGetEventHandle((CUipcEventHandle *) handle, (CUevent) *event); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_output(0, "cuIpcGetEventHandle failed\n"); + } } /** @@ -57,8 +71,13 @@ static void mca_rcache_gpusm_registrat */ static void mca_rcache_gpusm_registration_destructor(mca_rcache_gpusm_registration_t *item) { - mca_common_cuda_destruct_event(item->event); + uintptr_t event = item->event; + CUresult result; + result = cuEventDestroy((CUevent) event); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_output(0, "cuEventDestroy failed"); + } } OBJ_CLASS_INSTANCE(mca_rcache_gpusm_registration_t, mca_rcache_base_registration_t, 
@@ -81,7 +100,7 @@ void mca_rcache_gpusm_module_init(mca_rcache_gpusm_module_t *rcache) /* Start with 0 entries in the free list since CUDA may not have * been initialized when this free list is created and there is * some CUDA specific activities that need to be done. */ - opal_free_list_init(&rcache->reg_list, sizeof(struct mca_rcache_common_cuda_reg_t), + opal_free_list_init(&rcache->reg_list, sizeof(struct mca_opal_cuda_reg_t), opal_cache_line_size, OBJ_CLASS(mca_rcache_gpusm_registration_t), 0, opal_cache_line_size, 0, -1, 64, NULL, 0, NULL, NULL, NULL); } @@ -96,6 +115,77 @@ int mca_rcache_gpusm_find(mca_rcache_base_module_t *rcache, void *addr, size_t s return mca_rcache_gpusm_register(rcache, addr, size, 0, 0, reg); } +/* + * Get the memory handle of a local section of memory that can be sent + * to the remote size so it can access the memory. This is the + * registration function for the sending side of a message transfer. + */ +static int mca_rcache_gpusm_get_mem_handle(void *base, size_t size, mca_rcache_base_registration_t *newreg) +{ + CUmemorytype memType; + CUresult result; + CUipcMemHandle *memHandle; + CUdeviceptr pbase; + size_t psize; + + mca_opal_cuda_reg_t *cuda_reg = (mca_opal_cuda_reg_t *) newreg; + memHandle = (CUipcMemHandle *) cuda_reg->data.memHandle; + + /* We should only be there if this is a CUDA device pointer */ + result = cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + (CUdeviceptr) base); + assert(CUDA_SUCCESS == result); + assert(CU_MEMORYTYPE_DEVICE == memType); + + /* Get the memory handle so we can send it to the remote process. */ + result = cuIpcGetMemHandle(memHandle, (CUdeviceptr) base); + + if (CUDA_SUCCESS != result) { + return OPAL_ERROR; + } + + /* Need to get the real base and size of the memory handle. This is + * how the remote side saves the handles in a cache. 
*/ + result = cuMemGetAddressRange(&pbase, &psize, (CUdeviceptr) base); + if (CUDA_SUCCESS != result) { + return OPAL_ERROR; + } + + /* Store all the information in the registration */ + cuda_reg->base.base = (void *) pbase; + cuda_reg->base.bound = (unsigned char *) pbase + psize - 1; + cuda_reg->data.memh_seg_addr.pval = (void *) pbase; + cuda_reg->data.memh_seg_len = psize; + +#if OPAL_CUDA_SYNC_MEMOPS + /* With CUDA 6.0, we can set an attribute on the memory pointer that will + * ensure any synchronous copies are completed prior to any other access + * of the memory region. This means we do not need to record an event + * and send to the remote side. + */ + memType = 1; /* Just use this variable since we already have it */ + result = cuPointerSetAttribute(&memType, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, + (CUdeviceptr) base); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + return OPAL_ERROR; + } +#else + /* Need to record the event to ensure that any memcopies into the + * device memory have completed. The event handle associated with + * this event is sent to the remote process so that it will wait + * on this event prior to copying data out of the device memory. + * Note that this needs to be the NULL stream to make since it is + * unknown what stream any copies into the device memory were done + * with. */ + result = cuEventRecord((CUevent) cuda_reg->data.event, 0); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + return OPAL_ERROR; + } +#endif /* OPAL_CUDA_SYNC_MEMOPS */ + + return OPAL_SUCCESS; +} + /* * This is the one function that does all the work. 
It will call into * the register function to get the memory handle for the sending @@ -133,7 +223,7 @@ int mca_rcache_gpusm_register(mca_rcache_base_module_t *rcache, void *addr, size gpusm_reg->flags = flags; gpusm_reg->access_flags = access_flags; - rc = cuda_getmemhandle(base, size, gpusm_reg, NULL); + rc = mca_rcache_gpusm_get_mem_handle(base, size, gpusm_reg); if (rc != OPAL_SUCCESS) { opal_free_list_return(&rcache_gpusm->reg_list, item); diff --git a/opal/mca/rcache/rgpusm/configure.m4 b/opal/mca/rcache/rgpusm/configure.m4 index a9bce3c39dd..f76c27b8c35 100644 --- a/opal/mca/rcache/rgpusm/configure.m4 +++ b/opal/mca/rcache/rgpusm/configure.m4 @@ -19,9 +19,14 @@ AC_DEFUN([MCA_opal_rcache_rgpusm_CONFIG],[ AC_CONFIG_FILES([opal/mca/rcache/rgpusm/Makefile]) + OPAL_CHECK_CUDA([rcache_rgpusm]) + # Use CUDA_SUPPORT which was filled in by the opal configure code. AS_IF([test "x$CUDA_SUPPORT" = "x1"], [$1], [$2]) + AC_SUBST([rcache_rgpusm_CPPFLAGS]) + AC_SUBST([rcache_rgpusm_LDFLAGS]) + AC_SUBST([rcache_rgpusm_LIBS]) ])dnl diff --git a/opal/mca/rcache/rgpusm/rcache_rgpusm_module.c b/opal/mca/rcache/rgpusm/rcache_rgpusm_module.c index 2859a14c7be..92287055dc3 100644 --- a/opal/mca/rcache/rgpusm/rcache_rgpusm_module.c +++ b/opal/mca/rcache/rgpusm/rcache_rgpusm_module.c @@ -86,10 +86,80 @@ #ifdef HAVE_MALLOC_H # include <malloc.h> #endif -#include "opal/cuda/common_cuda.h" +#include "opal/include/opal/opal_cuda.h" #include "opal/mca/rcache/base/base.h" #include "opal/mca/rcache/rcache.h" #include "opal/util/proc.h" +#include <cuda.h> + +/* + * Open a memory handle that refers to remote memory so we can get an address + * that works on the local side. This is the registration function for the + * remote side of a transfer. newreg contains the new registration and carries + * the memory handle that was received from the remote side. 
+ */ +static int mca_rcache_rgpusm_open_mem_handle(void *base, size_t size, mca_rcache_base_registration_t *newreg) +{ + CUresult result; + CUipcMemHandle *memHandle; + mca_opal_cuda_reg_t *cuda_newreg = (mca_opal_cuda_reg_t *) newreg; + + /* Save in local variable to avoid ugly casting */ + memHandle = (CUipcMemHandle *) cuda_newreg->data.memHandle; + + /* Open the memory handle and store it into the registration structure. */ + result = cuIpcOpenMemHandle((CUdeviceptr *) &newreg->alloc_base, *memHandle, + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + + /* If there are some stale entries in the cache, they can cause other + * registrations to fail. Let the caller know that so that can attempt + * to clear them out. */ + if (CUDA_ERROR_ALREADY_MAPPED == result) { + opal_output_verbose(10, mca_rcache_rgpusm_component.output, + "CUDA: cuIpcOpenMemHandle returned CUDA_ERROR_ALREADY_MAPPED for " + "p=%p,size=%d: notify memory pool\n", + base, (int) size); + return OPAL_ERR_WOULD_BLOCK; + } + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_output_verbose(10, mca_rcache_rgpusm_component.output, + "CUDA: cuIpcOpenMemHandle failed: base=%p (remote base=%p,size=%d)", + newreg->alloc_base, base, (int) size); + /* Currently, this is a non-recoverable error */ + return OPAL_ERROR; + } else { + opal_output_verbose(10, mca_rcache_rgpusm_component.output, + "CUDA: cuIpcOpenMemHandle passed: base=%p (remote base=%p,size=%d)", + newreg->alloc_base, base, (int) size); + } + + return OPAL_SUCCESS; +} + +/* + * Close a memory handle that refers to remote memory. 
+ */ +static int mca_rcache_rgpusm_close_mem_handle(void *reg_data, mca_rcache_base_registration_t *reg) +{ + CUresult result; + mca_opal_cuda_reg_t *cuda_reg = (mca_opal_cuda_reg_t *) reg; + + result = cuIpcCloseMemHandle((CUdeviceptr) cuda_reg->base.alloc_base); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + if (CUDA_ERROR_DEINITIALIZED != result) { + opal_output_verbose(10, mca_rcache_rgpusm_component.output, + "CUDA: cuIpcCloseMemHandle failed: base=%p", + cuda_reg->base.alloc_base); + } + /* We will just continue on and hope things continue to work. */ + } else { + opal_output_verbose(10, mca_rcache_rgpusm_component.output, + "CUDA: cuIpcCloseMemHandle passed: base=%p", + cuda_reg->base.alloc_base); + } + + return OPAL_SUCCESS; +} static int mca_rcache_rgpusm_deregister_no_lock(struct mca_rcache_base_module_t *, mca_rcache_base_registration_t *); @@ -113,7 +183,7 @@ static inline bool mca_rcache_rgpusm_deregister_lru(mca_rcache_base_module_t *rc /* Drop the rcache lock while we deregister the memory */ OPAL_THREAD_UNLOCK(&rcache->lock); assert(old_reg->ref_count == 0); - rc = cuda_closememhandle(NULL, old_reg); + rc = mca_rcache_rgpusm_close_mem_handle(NULL, old_reg); OPAL_THREAD_LOCK(&rcache->lock); /* This introduces a potential leak of registrations if @@ -145,7 +215,7 @@ void mca_rcache_rgpusm_module_init(mca_rcache_rgpusm_module_t *rcache) rcache->vma_module = mca_rcache_base_vma_module_alloc(); OBJ_CONSTRUCT(&rcache->reg_list, opal_free_list_t); - opal_free_list_init(&rcache->reg_list, sizeof(struct mca_rcache_common_cuda_reg_t), + opal_free_list_init(&rcache->reg_list, sizeof(struct mca_opal_cuda_reg_t), opal_cache_line_size, OBJ_CLASS(mca_rcache_base_registration_t), 0, opal_cache_line_size, 0, -1, 32, NULL, 0, NULL, NULL, NULL); OBJ_CONSTRUCT(&rcache->lru_list, opal_list_t); @@ -164,8 +234,8 @@ int mca_rcache_rgpusm_register(mca_rcache_base_module_t *rcache, void *addr, siz mca_rcache_base_registration_t **reg) { mca_rcache_rgpusm_module_t 
*rcache_rgpusm = (mca_rcache_rgpusm_module_t *) rcache; - mca_rcache_common_cuda_reg_t *rgpusm_reg; - mca_rcache_common_cuda_reg_t *rget_reg; + mca_opal_cuda_reg_t *rgpusm_reg; + mca_opal_cuda_reg_t *rget_reg; opal_free_list_item_t *item; int rc; int mypeer; /* just for debugging */ @@ -174,7 +244,7 @@ int mca_rcache_rgpusm_register(mca_rcache_base_module_t *rcache, void *addr, siz * function, we are using the **reg variable to not only get back the * registration information, but to hand in the memory handle received * from the remote side. */ - rget_reg = (mca_rcache_common_cuda_reg_t *) *reg; + rget_reg = (mca_opal_cuda_reg_t *) *reg; mypeer = flags; flags = 0; @@ -193,7 +263,7 @@ int mca_rcache_rgpusm_register(mca_rcache_base_module_t *rcache, void *addr, siz if (NULL == item) { return OPAL_ERR_OUT_OF_RESOURCE; } - rgpusm_reg = (mca_rcache_common_cuda_reg_t *) item; + rgpusm_reg = (mca_opal_cuda_reg_t *) item; rgpusm_reg->base.rcache = rcache; rgpusm_reg->base.base = addr; rgpusm_reg->base.bound = (unsigned char *) addr + size - 1; @@ -207,8 +277,7 @@ int mca_rcache_rgpusm_register(mca_rcache_base_module_t *rcache, void *addr, siz /* The rget_reg registration is holding the memory handle needed * to register the remote memory. This was received from the remote * process. A pointer to the memory is returned in the alloc_base field. */ - rc = cuda_openmemhandle(addr, size, (mca_rcache_base_registration_t *) rgpusm_reg, - (mca_rcache_base_registration_t *) rget_reg); + rc = mca_rcache_rgpusm_open_mem_handle(addr, size, (mca_rcache_base_registration_t *) rgpusm_reg); /* This error should not happen with no cache in use. 
*/ assert(OPAL_ERR_WOULD_BLOCK != rc); @@ -240,8 +309,8 @@ int mca_rcache_rgpusm_register(mca_rcache_base_module_t *rcache, void *addr, siz (int) size, (*reg)->base, (int) ((*reg)->bound - (*reg)->base)); if (0 == - memcmp(((mca_rcache_common_cuda_reg_t *)*reg)->data.memHandle, rget_reg->data.memHandle, - sizeof(((mca_rcache_common_cuda_reg_t *)*reg)->data.memHandle))) { + memcmp(((mca_opal_cuda_reg_t *)*reg)->data.memHandle, rget_reg->data.memHandle, + sizeof(((mca_opal_cuda_reg_t *)*reg)->data.memHandle))) { /* Registration matches what was requested. All is good. */ rcache_rgpusm->stat_cache_valid++; } else { @@ -306,7 +375,7 @@ int mca_rcache_rgpusm_register(mca_rcache_base_module_t *rcache, void *addr, siz OPAL_THREAD_UNLOCK(&rcache->lock); return OPAL_ERR_OUT_OF_RESOURCE; } - rgpusm_reg = (mca_rcache_common_cuda_reg_t *) item; + rgpusm_reg = (mca_opal_cuda_reg_t *) item; rgpusm_reg->base.rcache = rcache; rgpusm_reg->base.base = addr; @@ -321,8 +390,7 @@ int mca_rcache_rgpusm_register(mca_rcache_base_module_t *rcache, void *addr, siz * bound values may be changed by the registration. The memory * associated with the handle comes back in the alloc_base * value. */ - rc = cuda_openmemhandle(addr, size, (mca_rcache_base_registration_t *) rgpusm_reg, - (mca_rcache_base_registration_t *) rget_reg); + rc = mca_rcache_rgpusm_open_mem_handle(addr, size, (mca_rcache_base_registration_t *) rgpusm_reg); /* There is a chance we can get the OPAL_ERR_WOULD_BLOCK from the * CUDA codes attempt to register the memory. The case that this * can happen is as follows. A block of memory is registered. @@ -360,8 +428,7 @@ int mca_rcache_rgpusm_register(mca_rcache_base_module_t *rcache, void *addr, siz rcache_rgpusm->stat_evicted++; /* And try again. This one usually works. 
*/ - rc = cuda_openmemhandle(addr, size, (mca_rcache_base_registration_t *) rgpusm_reg, - (mca_rcache_base_registration_t *) rget_reg); + rc = mca_rcache_rgpusm_open_mem_handle(addr, size, (mca_rcache_base_registration_t *) rgpusm_reg); } /* There is a chance that another registration is blocking our @@ -373,8 +440,7 @@ int mca_rcache_rgpusm_register(mca_rcache_base_module_t *rcache, void *addr, siz break; } /* Clear out one registration. */ - rc = cuda_openmemhandle(addr, size, (mca_rcache_base_registration_t *) rgpusm_reg, - (mca_rcache_base_registration_t *) rget_reg); + rc = mca_rcache_rgpusm_open_mem_handle(addr, size, (mca_rcache_base_registration_t *) rgpusm_reg); } } @@ -507,7 +573,7 @@ int mca_rcache_rgpusm_deregister(struct mca_rcache_base_module_t *rcache, { assert(reg->ref_count == 0); - rc = cuda_closememhandle(NULL, reg); + rc = mca_rcache_rgpusm_close_mem_handle(NULL, reg); } OPAL_THREAD_LOCK(&rcache->lock); @@ -543,7 +609,7 @@ int mca_rcache_rgpusm_deregister_no_lock(struct mca_rcache_base_module_t *rcache mca_rcache_base_vma_delete(rcache_rgpusm->vma_module, reg); assert(reg->ref_count == 0); - rc = cuda_closememhandle(NULL, reg); + rc = mca_rcache_rgpusm_close_mem_handle(NULL, reg); if (OPAL_SUCCESS == rc) { opal_free_list_return(&rcache_rgpusm->reg_list, (opal_free_list_item_t *) reg); @@ -594,7 +660,7 @@ void mca_rcache_rgpusm_finalize(struct mca_rcache_base_module_t *rcache) /* Drop lock before deregistering memory */ OPAL_THREAD_UNLOCK(&rcache->lock); assert(reg->ref_count == 0); - rc = cuda_closememhandle(NULL, reg); + rc = mca_rcache_rgpusm_close_mem_handle(NULL, reg); OPAL_THREAD_LOCK(&rcache->lock); if (rc != OPAL_SUCCESS) {