btl/smcuda,rcache/rgpusm,rcache/gpusm Add direct cuda dependency

wckzhang · wckzhang · commit 8ed9056789b8 · 2022-09-28T16:31:29.000Z
Signed-off-by: William Zhang <wilzhang@amazon.com>
diff --git a/opal/include/opal/Makefile.am b/opal/include/opal/Makefile.am
@@ -29,7 +29,8 @@ headers += \
         opal/hash_string.h \
 	opal/frameworks.h \
 	opal/opal_portable_platform.h \
-	opal/opal_portable_platform_real.h
+	opal/opal_portable_platform_real.h \
+	opal/opal_cuda.h
 
 nodist_headers += \
 	opal/version.h
diff --git a/opal/include/opal/opal_cuda.h b/opal/include/opal/opal_cuda.h
@@ -0,0 +1,50 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2013 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2006 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2011-2015 NVIDIA Corporation.  All rights reserved.
+ * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
+ *                         reserved.
+ * Copyright (c) 2022      Amazon.com, Inc. or its affiliates.
+ *                         All Rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *
+ * This file is intended only to carry shared types. If actual cuda
+ * symbols are required, they need to be added to a new common cuda
+ * component.
+ */
+
+#ifndef OPAL_CUDA_H
+#define OPAL_CUDA_H
+#include "opal/mca/rcache/rcache.h"
+
+#define MEMHANDLE_SIZE 8
+#define EVTHANDLE_SIZE 8
+
+struct mca_opal_cuda_reg_data_t {
+    uint64_t memHandle[MEMHANDLE_SIZE]; /* raw storage that callers cast to CUipcMemHandle */
+    uint64_t evtHandle[EVTHANDLE_SIZE]; /* raw storage that callers copy into a CUipcEventHandle */
+    uint64_t event;                     /* CUevent value kept as an integer */
+    opal_ptr_t memh_seg_addr;           /* base address reported by cuMemGetAddressRange */
+    size_t memh_seg_len;                /* size reported by cuMemGetAddressRange */
+};
+typedef struct mca_opal_cuda_reg_data_t mca_opal_cuda_reg_data_t;
+
+struct mca_opal_cuda_reg_t {
+    mca_rcache_base_registration_t base; /* generic rcache registration header */
+    mca_opal_cuda_reg_data_t data;       /* smcuda recovers the registration from this via offsetof */
+};
+typedef struct mca_opal_cuda_reg_t mca_opal_cuda_reg_t;
+#endif /* OPAL_CUDA_H */
diff --git a/opal/mca/btl/smcuda/Makefile.am b/opal/mca/btl/smcuda/Makefile.am
@@ -53,10 +53,12 @@ mcacomponent_LTLIBRARIES = $(component_install)
 mca_btl_smcuda_la_SOURCES = $(libmca_btl_smcuda_la_sources)
 mca_btl_smcuda_la_LDFLAGS = -module -avoid-version
 mca_btl_smcuda_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \
-    $(OPAL_TOP_BUILDDIR)/opal/mca/common/sm/lib@OPAL_LIB_NAME@mca_common_sm.la
+    $(OPAL_TOP_BUILDDIR)/opal/mca/common/sm/lib@OPAL_LIB_NAME@mca_common_sm.la \
+    $(btl_smcuda_LIBS)
 mca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS)
 
 noinst_LTLIBRARIES = $(component_noinst)
 libmca_btl_smcuda_la_SOURCES = $(libmca_btl_smcuda_la_sources)
 libmca_btl_smcuda_la_LDFLAGS = -module -avoid-version
 libmca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS)
+libmca_btl_smcuda_la_LIBADD = $(btl_smcuda_LIBS)
diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c
@@ -68,7 +68,8 @@
 #include "btl_smcuda_frag.h"
 #include "btl_smcuda_accelerator.h"
 
-#include "opal/cuda/common_cuda.h"
+
+#include "opal/include/opal/opal_cuda.h"
 
 static struct mca_btl_base_registration_handle_t *
 mca_btl_smcuda_register_mem(struct mca_btl_base_module_t *btl,
@@ -1000,7 +1001,7 @@ mca_btl_smcuda_register_mem(struct mca_btl_base_module_t *btl,
                             uint32_t flags)
 {
     mca_btl_smcuda_t *smcuda_module = (mca_btl_smcuda_t *) btl;
-    mca_rcache_common_cuda_reg_t *reg;
+    mca_opal_cuda_reg_t *reg;
     int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY;
     int rcache_flags = 0;
 
@@ -1023,24 +1024,73 @@ static int mca_btl_smcuda_deregister_mem(struct mca_btl_base_module_t *btl,
                                          struct mca_btl_base_registration_handle_t *handle)
 {
     mca_btl_smcuda_t *smcuda_module = (mca_btl_smcuda_t *) btl;
-    mca_rcache_common_cuda_reg_t *reg = (mca_rcache_common_cuda_reg_t
+    mca_opal_cuda_reg_t *reg = (mca_opal_cuda_reg_t
                                              *) ((intptr_t) handle
-                                                 - offsetof(mca_rcache_common_cuda_reg_t, data));
+                                                 - offsetof(mca_opal_cuda_reg_t, data));
 
     smcuda_module->rcache->rcache_deregister(smcuda_module->rcache, &reg->base);
 
     return OPAL_SUCCESS;
 }
 
+/*
+ * Make the NULL stream wait on the remote IPC event carried in rget_reg so that
+ * the copy does not start until that event completes.  Failures are only logged.
+ */
+static void mca_btl_smcuda_wait_stream_synchronize(mca_opal_cuda_reg_t *rget_reg)
+{
+#if OPAL_CUDA_SYNC_MEMOPS
+    /* No need for any of this with SYNC_MEMOPS feature */
+    return;
+#else /* OPAL_CUDA_SYNC_MEMOPS */
+    CUipcEventHandle evtHandle;
+    CUevent event = NULL; /* known value for the calls below even if the open fails */
+    CUresult result;
+
+    memcpy(&evtHandle, rget_reg->data.evtHandle, sizeof(evtHandle));
+
+    result = cuIpcOpenEventHandle(&event, evtHandle);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
+                            "cuIpcOpenEventHandle failed");
+    }
+
+    /* BEGIN of Workaround - There is a bug in CUDA 4.1 RC2 and earlier
+     * versions.  Need to record an event on the stream, even though
+     * it is not used, to make sure we do not short circuit our way
+     * out of the cuStreamWaitEvent test.
+     */
+    result = cuEventRecord(event, 0);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
+                            "cuEventRecord failed");
+    }
+    /* END of Workaround */
+
+    result = cuStreamWaitEvent(0, event, 0);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
+                            "cuStreamWaitEvent failed");
+    }
+
+    /* All done with this event. */
+    result = cuEventDestroy(event);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
+                            "cuEventDestroy failed");
+    }
+#endif /* OPAL_CUDA_SYNC_MEMOPS */
+}
+
 int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
                             void *local_address, uint64_t remote_address,
                             struct mca_btl_base_registration_handle_t *local_handle,
                             struct mca_btl_base_registration_handle_t *remote_handle, size_t size,
                             int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
                             void *cbcontext, void *cbdata)
 {
-    mca_rcache_common_cuda_reg_t rget_reg;
-    mca_rcache_common_cuda_reg_t *reg_ptr = &rget_reg;
+    mca_opal_cuda_reg_t rget_reg;
+    mca_opal_cuda_reg_t *reg_ptr = &rget_reg;
     int rc, done;
     void *remote_memory_address;
     size_t offset;
@@ -1111,7 +1161,7 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t *btl, struct mca_btl_ba
      * is available in the sender's GPU buffer.  Therefore, do a stream synchronize
      * on the IPC event that we received.  Note that we pull it from
      * rget_reg, not reg_ptr, as we do not cache the event. */
-    mca_common_wait_stream_synchronize(&rget_reg);
+    mca_btl_smcuda_wait_stream_synchronize(&rget_reg);
 
     rc = mca_btl_smcuda_memcpy(local_address, remote_memory_address, size, "mca_btl_smcuda_get",
                                 (mca_btl_base_descriptor_t *) frag);
diff --git a/opal/mca/btl/smcuda/btl_smcuda_frag.h b/opal/mca/btl/smcuda/btl_smcuda_frag.h
@@ -31,7 +31,7 @@
 #include "opal_config.h"
 #include "btl_smcuda.h"
 
-#include "opal/cuda/common_cuda.h"
+#include "opal/include/opal/opal_cuda.h"
 
 #define MCA_BTL_SMCUDA_FRAG_TYPE_MASK ((uintptr_t) 0x3)
 #define MCA_BTL_SMCUDA_FRAG_SEND      ((uintptr_t) 0x0)
@@ -52,7 +52,7 @@ struct mca_btl_smcuda_hdr_t {
 typedef struct mca_btl_smcuda_hdr_t mca_btl_smcuda_hdr_t;
 
 struct mca_btl_base_registration_handle_t {
-    mca_rcache_common_cuda_reg_data_t reg_data;
+    mca_opal_cuda_reg_data_t reg_data;
 };
 
 struct mca_btl_smcuda_segment_t {
diff --git a/opal/mca/btl/smcuda/configure.m4 b/opal/mca/btl/smcuda/configure.m4
@@ -19,12 +19,15 @@
 AC_DEFUN([MCA_opal_btl_smcuda_CONFIG],[
     AC_CONFIG_FILES([opal/mca/btl/smcuda/Makefile])
 
-    # make sure that CUDA-aware checks have been done
-    AC_REQUIRE([OPAL_CHECK_CUDA])
+    OPAL_CHECK_CUDA([btl_smcuda])
 
     # Only build if CUDA support is available
     AS_IF([test "x$CUDA_SUPPORT" = "x1"],
           [$1
            OPAL_MCA_CHECK_DEPENDENCY([opal], [btl], [smcuda], [opal], [common], [sm])],
           [$2])
+
+    AC_SUBST([btl_smcuda_CPPFLAGS])
+    AC_SUBST([btl_smcuda_LDFLAGS])
+    AC_SUBST([btl_smcuda_LIBS])
 ])dnl
diff --git a/opal/mca/rcache/gpusm/configure.m4 b/opal/mca/rcache/gpusm/configure.m4
@@ -19,9 +19,14 @@
 AC_DEFUN([MCA_opal_rcache_gpusm_CONFIG],[
     AC_CONFIG_FILES([opal/mca/rcache/gpusm/Makefile])
 
+    OPAL_CHECK_CUDA([rcache_gpusm])
+
     # Use CUDA_SUPPORT which was filled in by the opal configure code.
     AS_IF([test "x$CUDA_SUPPORT" = "x1"],
           [$1],
           [$2])
 
+    AC_SUBST([rcache_gpusm_CPPFLAGS])
+    AC_SUBST([rcache_gpusm_LDFLAGS])
+    AC_SUBST([rcache_gpusm_LIBS])
 ])dnl
diff --git a/opal/mca/rcache/gpusm/rcache_gpusm_module.c b/opal/mca/rcache/gpusm/rcache_gpusm_module.c
@@ -41,24 +41,43 @@
 #include "opal_config.h"
 #include "opal/mca/rcache/base/base.h"
 #include "opal/mca/rcache/gpusm/rcache_gpusm.h"
-#include "opal/cuda/common_cuda.h"
+#include "opal/include/opal/opal_cuda.h"
+#include <cuda.h>
 
 /**
  * Called when the registration free list is created.  An event is created
  * for each entry.
  */
 static void mca_rcache_gpusm_registration_constructor(mca_rcache_gpusm_registration_t *item)
 {
-    mca_common_cuda_construct_event_and_handle(&item->event, (void *) &item->evtHandle);
+    uintptr_t *event = &item->event; /* slot that receives the CUevent created below */
+    void *handle = (void *) &item->evtHandle; /* slot that receives the event's IPC handle */
+    CUresult result;
+
+    result = cuEventCreate((CUevent *) event,
+                                  CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_output(0, "cuEventCreate failed\n"); /* logged only; execution continues */
+    }
+
+    result = cuIpcGetEventHandle((CUipcEventHandle *) handle, (CUevent) *event);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_output(0, "cuIpcGetEventHandle failed\n"); /* logged only; no error is returned */
+    }
 }
 
 /**
  * Called when the program is exiting.  This destroys the events.
  */
 static void mca_rcache_gpusm_registration_destructor(mca_rcache_gpusm_registration_t *item)
 {
-    mca_common_cuda_destruct_event(item->event);
+    uintptr_t event = item->event; /* CUevent value kept as an integer in the registration */
+    CUresult result;
 
+    result = cuEventDestroy((CUevent) event);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_output(0, "cuEventDestroy failed"); /* logged only; nothing else to undo here */
+    }
 }
 
 OBJ_CLASS_INSTANCE(mca_rcache_gpusm_registration_t, mca_rcache_base_registration_t,
@@ -81,7 +100,7 @@ void mca_rcache_gpusm_module_init(mca_rcache_gpusm_module_t *rcache)
     /* Start with 0 entries in the free list since CUDA may not have
      * been initialized when this free list is created and there is
      * some CUDA specific activities that need to be done. */
-    opal_free_list_init(&rcache->reg_list, sizeof(struct mca_rcache_common_cuda_reg_t),
+    opal_free_list_init(&rcache->reg_list, sizeof(struct mca_opal_cuda_reg_t),
                         opal_cache_line_size, OBJ_CLASS(mca_rcache_gpusm_registration_t), 0,
                         opal_cache_line_size, 0, -1, 64, NULL, 0, NULL, NULL, NULL);
 }
@@ -96,6 +115,77 @@ int mca_rcache_gpusm_find(mca_rcache_base_module_t *rcache, void *addr, size_t s
     return mca_rcache_gpusm_register(rcache, addr, size, 0, 0, reg);
 }
 
+/*
+ * Sender-side registration: store in newreg an IPC memory handle for base that
+ * can be sent to the remote side so it can access the memory.  size is unused.
+ * Returns OPAL_SUCCESS, or OPAL_ERROR when one of the checked CUDA calls fails.
+ */
+static int mca_rcache_gpusm_get_mem_handle(void *base, size_t size, mca_rcache_base_registration_t *newreg)
+{
+    CUmemorytype memType;
+    CUresult result;
+    CUipcMemHandle *memHandle;
+    CUdeviceptr pbase;
+    size_t psize;
+
+    mca_opal_cuda_reg_t *cuda_reg = (mca_opal_cuda_reg_t *) newreg;
+    memHandle = (CUipcMemHandle *) cuda_reg->data.memHandle;
+
+    /* We should only be here if this is a CUDA device pointer */
+    result = cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+                                          (CUdeviceptr) base);
+    assert(CUDA_SUCCESS == result); /* enforced only when assertions are compiled in */
+    assert(CU_MEMORYTYPE_DEVICE == memType);
+
+    /* Get the memory handle so we can send it to the remote process. */
+    result = cuIpcGetMemHandle(memHandle, (CUdeviceptr) base);
+
+    if (CUDA_SUCCESS != result) {
+        return OPAL_ERROR;
+    }
+
+    /* Need to get the real base and size of the memory handle.  This is
+     * how the remote side saves the handles in a cache. */
+    result = cuMemGetAddressRange(&pbase, &psize, (CUdeviceptr) base);
+    if (CUDA_SUCCESS != result) {
+        return OPAL_ERROR;
+    }
+
+    /* Store all the information in the registration */
+    cuda_reg->base.base = (void *) pbase;
+    cuda_reg->base.bound = (unsigned char *) pbase + psize - 1; /* inclusive: last byte of range */
+    cuda_reg->data.memh_seg_addr.pval = (void *) pbase;
+    cuda_reg->data.memh_seg_len = psize;
+
+#if OPAL_CUDA_SYNC_MEMOPS
+    /* With CUDA 6.0, we can set an attribute on the memory pointer that will
+     * ensure any synchronous copies are completed prior to any other access
+     * of the memory region.  This means we do not need to record an event
+     * and send to the remote side.
+     */
+    memType = 1; /* Just use this variable since we already have it */
+    result = cuPointerSetAttribute(&memType, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+                                          (CUdeviceptr) base);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        return OPAL_ERROR;
+    }
+#else
+    /* Need to record the event to ensure that any memcopies into the
+     * device memory have completed.  The event handle associated with
+     * this event is sent to the remote process so that it will wait
+     * on this event prior to copying data out of the device memory.
+     * Note that this needs to be the NULL stream, since it is
+     * unknown what stream any copies into the device memory were done
+     * with. */
+    result = cuEventRecord((CUevent) cuda_reg->data.event, 0);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        return OPAL_ERROR;
+    }
+#endif /* OPAL_CUDA_SYNC_MEMOPS */
+
+    return OPAL_SUCCESS;
+}
+
 /*
  * This is the one function that does all the work.  It will call into
  * the register function to get the memory handle for the sending
@@ -133,7 +223,7 @@ int mca_rcache_gpusm_register(mca_rcache_base_module_t *rcache, void *addr, size
     gpusm_reg->flags = flags;
     gpusm_reg->access_flags = access_flags;
 
-    rc = cuda_getmemhandle(base, size, gpusm_reg, NULL);
+    rc = mca_rcache_gpusm_get_mem_handle(base, size, gpusm_reg);
 
     if (rc != OPAL_SUCCESS) {
         opal_free_list_return(&rcache_gpusm->reg_list, item);
diff --git a/opal/mca/rcache/rgpusm/configure.m4 b/opal/mca/rcache/rgpusm/configure.m4
@@ -19,9 +19,14 @@
 AC_DEFUN([MCA_opal_rcache_rgpusm_CONFIG],[
     AC_CONFIG_FILES([opal/mca/rcache/rgpusm/Makefile])
 
+    OPAL_CHECK_CUDA([rcache_rgpusm])
+
     # Use CUDA_SUPPORT which was filled in by the opal configure code.
     AS_IF([test "x$CUDA_SUPPORT" = "x1"],
           [$1],
           [$2])
 
+    AC_SUBST([rcache_rgpusm_CPPFLAGS])
+    AC_SUBST([rcache_rgpusm_LDFLAGS])
+    AC_SUBST([rcache_rgpusm_LIBS])
 ])dnl
diff --git a/opal/mca/rcache/rgpusm/rcache_rgpusm_module.c b/opal/mca/rcache/rgpusm/rcache_rgpusm_module.c