Skip to content

Commit 8ed9056

Browse files
committed
btl/smcuda,rcache/rgpusm,rcache/gpusm Add direct cuda dependency
Signed-off-by: William Zhang <[email protected]>
1 parent 26e244c commit 8ed9056

File tree

10 files changed

+311
-39
lines changed

10 files changed

+311
-39
lines changed

opal/include/opal/Makefile.am

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@ headers += \
2929
opal/hash_string.h \
3030
opal/frameworks.h \
3131
opal/opal_portable_platform.h \
32-
opal/opal_portable_platform_real.h
32+
opal/opal_portable_platform_real.h \
33+
opal/opal_cuda.h
3334

3435
nodist_headers += \
3536
opal/version.h

opal/include/opal/opal_cuda.h

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2+
/*
3+
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
4+
* University Research and Technology
5+
* Corporation. All rights reserved.
6+
* Copyright (c) 2004-2013 The University of Tennessee and The University
7+
* of Tennessee Research Foundation. All rights
8+
* reserved.
9+
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10+
* University of Stuttgart. All rights reserved.
11+
* Copyright (c) 2004-2006 The Regents of the University of California.
12+
* All rights reserved.
13+
* Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved.
14+
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
15+
* reserved.
16+
* Copyright (c) 2022 Amazon.com, Inc. or its affiliates.
17+
* All Rights reserved.
18+
* $COPYRIGHT$
19+
*
20+
* Additional copyrights may follow
21+
*
22+
* $HEADER$
23+
*
24+
* This file is intended only to carry shared types. If actual cuda
25+
* symbols are required, they need to be added to a new common cuda
26+
* component.
27+
*/
28+
29+
#ifndef OPAL_CUDA_H
30+
#define OPAL_CUDA_H
31+
#include "opal/mca/rcache/rcache.h"
32+
33+
#define MEMHANDLE_SIZE 8
34+
#define EVTHANDLE_SIZE 8
35+
36+
struct mca_opal_cuda_reg_data_t {
37+
uint64_t memHandle[MEMHANDLE_SIZE];
38+
uint64_t evtHandle[EVTHANDLE_SIZE];
39+
uint64_t event;
40+
opal_ptr_t memh_seg_addr;
41+
size_t memh_seg_len;
42+
};
43+
typedef struct mca_opal_cuda_reg_data_t mca_opal_cuda_reg_data_t;
44+
45+
struct mca_opal_cuda_reg_t {
46+
mca_rcache_base_registration_t base;
47+
mca_opal_cuda_reg_data_t data;
48+
};
49+
typedef struct mca_opal_cuda_reg_t mca_opal_cuda_reg_t;
50+
#endif /* OPAL_CUDA_H */

opal/mca/btl/smcuda/Makefile.am

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,12 @@ mcacomponent_LTLIBRARIES = $(component_install)
5353
mca_btl_smcuda_la_SOURCES = $(libmca_btl_smcuda_la_sources)
5454
mca_btl_smcuda_la_LDFLAGS = -module -avoid-version
5555
mca_btl_smcuda_la_LIBADD = $(top_builddir)/opal/lib@[email protected] \
56-
$(OPAL_TOP_BUILDDIR)/opal/mca/common/sm/lib@OPAL_LIB_NAME@mca_common_sm.la
56+
$(OPAL_TOP_BUILDDIR)/opal/mca/common/sm/lib@OPAL_LIB_NAME@mca_common_sm.la \
57+
$(btl_smcuda_LIBS)
5758
mca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS)
5859

5960
noinst_LTLIBRARIES = $(component_noinst)
6061
libmca_btl_smcuda_la_SOURCES = $(libmca_btl_smcuda_la_sources)
6162
libmca_btl_smcuda_la_LDFLAGS = -module -avoid-version
6263
libmca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS)
64+
libmca_btl_smcuda_la_LIBADD = $(btl_smcuda_LIBS)

opal/mca/btl/smcuda/btl_smcuda.c

Lines changed: 57 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,8 @@
6868
#include "btl_smcuda_frag.h"
6969
#include "btl_smcuda_accelerator.h"
7070

71-
#include "opal/cuda/common_cuda.h"
71+
72+
#include "opal/include/opal/opal_cuda.h"
7273

7374
static struct mca_btl_base_registration_handle_t *
7475
mca_btl_smcuda_register_mem(struct mca_btl_base_module_t *btl,
@@ -1000,7 +1001,7 @@ mca_btl_smcuda_register_mem(struct mca_btl_base_module_t *btl,
10001001
uint32_t flags)
10011002
{
10021003
mca_btl_smcuda_t *smcuda_module = (mca_btl_smcuda_t *) btl;
1003-
mca_rcache_common_cuda_reg_t *reg;
1004+
mca_opal_cuda_reg_t *reg;
10041005
int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY;
10051006
int rcache_flags = 0;
10061007

@@ -1023,24 +1024,73 @@ static int mca_btl_smcuda_deregister_mem(struct mca_btl_base_module_t *btl,
10231024
struct mca_btl_base_registration_handle_t *handle)
10241025
{
10251026
mca_btl_smcuda_t *smcuda_module = (mca_btl_smcuda_t *) btl;
1026-
mca_rcache_common_cuda_reg_t *reg = (mca_rcache_common_cuda_reg_t
1027+
mca_opal_cuda_reg_t *reg = (mca_opal_cuda_reg_t
10271028
*) ((intptr_t) handle
1028-
- offsetof(mca_rcache_common_cuda_reg_t, data));
1029+
- offsetof(mca_opal_cuda_reg_t, data));
10291030

10301031
smcuda_module->rcache->rcache_deregister(smcuda_module->rcache, &reg->base);
10311032

10321033
return OPAL_SUCCESS;
10331034
}
10341035

1036+
/*
1037+
* Put remote event on stream to ensure that the the start of the
1038+
* copy does not start until the completion of the event.
1039+
*/
1040+
static void mca_btl_smcuda_wait_stream_synchronize(mca_opal_cuda_reg_t *rget_reg)
1041+
{
1042+
#if OPAL_CUDA_SYNC_MEMOPS
1043+
/* No need for any of this with SYNC_MEMOPS feature */
1044+
return;
1045+
#else /* OPAL_CUDA_SYNC_MEMOPS */
1046+
CUipcEventHandle evtHandle;
1047+
CUevent event;
1048+
CUresult result;
1049+
1050+
memcpy(&evtHandle, rget_reg->data.evtHandle, sizeof(evtHandle));
1051+
1052+
result = cuIpcOpenEventHandle(&event, evtHandle);
1053+
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1054+
opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
1055+
"cuIpcOpenEventHandle failed");
1056+
}
1057+
1058+
/* BEGIN of Workaround - There is a bug in CUDA 4.1 RC2 and earlier
1059+
* versions. Need to record an event on the stream, even though
1060+
* it is not used, to make sure we do not short circuit our way
1061+
* out of the cuStreamWaitEvent test.
1062+
*/
1063+
result = cuEventRecord(event, 0);
1064+
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1065+
opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
1066+
"cuEventRecord failed");
1067+
}
1068+
/* END of Workaround */
1069+
1070+
result = cuStreamWaitEvent(0, event, 0);
1071+
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1072+
opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
1073+
"cuStreamWaitEvent failed");
1074+
}
1075+
1076+
/* All done with this event. */
1077+
result = cuEventDestroy(event);
1078+
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1079+
opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
1080+
"cuStreamWaitEvent failed");
1081+
}
1082+
#endif /* OPAL_CUDA_SYNC_MEMOPS */
1083+
}
1084+
10351085
int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
10361086
void *local_address, uint64_t remote_address,
10371087
struct mca_btl_base_registration_handle_t *local_handle,
10381088
struct mca_btl_base_registration_handle_t *remote_handle, size_t size,
10391089
int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
10401090
void *cbcontext, void *cbdata)
10411091
{
1042-
mca_rcache_common_cuda_reg_t rget_reg;
1043-
mca_rcache_common_cuda_reg_t *reg_ptr = &rget_reg;
1092+
mca_opal_cuda_reg_t rget_reg;
1093+
mca_opal_cuda_reg_t *reg_ptr = &rget_reg;
10441094
int rc, done;
10451095
void *remote_memory_address;
10461096
size_t offset;
@@ -1111,7 +1161,7 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t *btl, struct mca_btl_ba
11111161
* is available in the sender's GPU buffer. Therefore, do a stream synchronize
11121162
* on the IPC event that we received. Note that we pull it from
11131163
* rget_reg, not reg_ptr, as we do not cache the event. */
1114-
mca_common_wait_stream_synchronize(&rget_reg);
1164+
mca_btl_smcuda_wait_stream_synchronize(&rget_reg);
11151165

11161166
rc = mca_btl_smcuda_memcpy(local_address, remote_memory_address, size, "mca_btl_smcuda_get",
11171167
(mca_btl_base_descriptor_t *) frag);

opal/mca/btl/smcuda/btl_smcuda_frag.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
#include "opal_config.h"
3232
#include "btl_smcuda.h"
3333

34-
#include "opal/cuda/common_cuda.h"
34+
#include "opal/include/opal/opal_cuda.h"
3535

3636
#define MCA_BTL_SMCUDA_FRAG_TYPE_MASK ((uintptr_t) 0x3)
3737
#define MCA_BTL_SMCUDA_FRAG_SEND ((uintptr_t) 0x0)
@@ -52,7 +52,7 @@ struct mca_btl_smcuda_hdr_t {
5252
typedef struct mca_btl_smcuda_hdr_t mca_btl_smcuda_hdr_t;
5353

5454
struct mca_btl_base_registration_handle_t {
55-
mca_rcache_common_cuda_reg_data_t reg_data;
55+
mca_opal_cuda_reg_data_t reg_data;
5656
};
5757

5858
struct mca_btl_smcuda_segment_t {

opal/mca/btl/smcuda/configure.m4

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,15 @@
1919
AC_DEFUN([MCA_opal_btl_smcuda_CONFIG],[
2020
AC_CONFIG_FILES([opal/mca/btl/smcuda/Makefile])
2121

22-
# make sure that CUDA-aware checks have been done
23-
AC_REQUIRE([OPAL_CHECK_CUDA])
22+
OPAL_CHECK_CUDA([btl_smcuda])
2423

2524
# Only build if CUDA support is available
2625
AS_IF([test "x$CUDA_SUPPORT" = "x1"],
2726
[$1
2827
OPAL_MCA_CHECK_DEPENDENCY([opal], [btl], [smcuda], [opal], [common], [sm])],
2928
[$2])
29+
30+
AC_SUBST([btl_smcuda_CPPFLAGS])
31+
AC_SUBST([btl_smcuda_LDFLAGS])
32+
AC_SUBST([btl_smcuda_LIBS])
3033
])dnl

opal/mca/rcache/gpusm/configure.m4

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,14 @@
1919
AC_DEFUN([MCA_opal_rcache_gpusm_CONFIG],[
2020
AC_CONFIG_FILES([opal/mca/rcache/gpusm/Makefile])
2121

22+
OPAL_CHECK_CUDA([rcache_gpusm])
23+
2224
# Use CUDA_SUPPORT which was filled in by the opal configure code.
2325
AS_IF([test "x$CUDA_SUPPORT" = "x1"],
2426
[$1],
2527
[$2])
2628

29+
AC_SUBST([rcache_gpusm_CPPFLAGS])
30+
AC_SUBST([rcache_gpusm_LDFLAGS])
31+
AC_SUBST([rcache_gpusm_LIBS])
2732
])dnl

opal/mca/rcache/gpusm/rcache_gpusm_module.c

Lines changed: 95 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,24 +41,43 @@
4141
#include "opal_config.h"
4242
#include "opal/mca/rcache/base/base.h"
4343
#include "opal/mca/rcache/gpusm/rcache_gpusm.h"
44-
#include "opal/cuda/common_cuda.h"
44+
#include "opal/include/opal/opal_cuda.h"
45+
#include <cuda.h>
4546

4647
/**
4748
* Called when the registration free list is created. An event is created
4849
* for each entry.
4950
*/
5051
static void mca_rcache_gpusm_registration_constructor(mca_rcache_gpusm_registration_t *item)
5152
{
52-
mca_common_cuda_construct_event_and_handle(&item->event, (void *) &item->evtHandle);
53+
uintptr_t *event = &item->event; /* slot that receives the CUevent created below */
54+
void *handle = (void *) &item->evtHandle; /* slot that receives the IPC event handle */
55+
CUresult result;
56+
57+
result = cuEventCreate((CUevent *) event,
58+
CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING); /* interprocess-capable event, timing disabled */
59+
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
60+
opal_output(0, "cuEventCreate failed\n"); /* failure is only logged; execution continues */
61+
}
62+
63+
result = cuIpcGetEventHandle((CUipcEventHandle *) handle, (CUevent) *event); /* export the event as an IPC handle into item->evtHandle */
64+
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
65+
opal_output(0, "cuIpcGetEventHandle failed\n"); /* failure is only logged; the constructor has no return value */
66+
}
5367
}
5468

5569
/**
5670
* Called when the program is exiting. This destroys the events.
5771
*/
5872
static void mca_rcache_gpusm_registration_destructor(mca_rcache_gpusm_registration_t *item)
5973
{
60-
mca_common_cuda_destruct_event(item->event);
74+
uintptr_t event = item->event; /* CUevent stored as an integer by the constructor */
75+
CUresult result;
6176

77+
result = cuEventDestroy((CUevent) event); /* release the event held by this registration */
78+
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
79+
opal_output(0, "cuEventDestroy failed"); /* failure is only logged; the destructor has no return value */
80+
}
6281
}
6382

6483
OBJ_CLASS_INSTANCE(mca_rcache_gpusm_registration_t, mca_rcache_base_registration_t,
@@ -81,7 +100,7 @@ void mca_rcache_gpusm_module_init(mca_rcache_gpusm_module_t *rcache)
81100
/* Start with 0 entries in the free list since CUDA may not have
82101
* been initialized when this free list is created and there is
83102
* some CUDA specific activities that need to be done. */
84-
opal_free_list_init(&rcache->reg_list, sizeof(struct mca_rcache_common_cuda_reg_t),
103+
opal_free_list_init(&rcache->reg_list, sizeof(struct mca_opal_cuda_reg_t),
85104
opal_cache_line_size, OBJ_CLASS(mca_rcache_gpusm_registration_t), 0,
86105
opal_cache_line_size, 0, -1, 64, NULL, 0, NULL, NULL, NULL);
87106
}
@@ -96,6 +115,77 @@ int mca_rcache_gpusm_find(mca_rcache_base_module_t *rcache, void *addr, size_t s
96115
return mca_rcache_gpusm_register(rcache, addr, size, 0, 0, reg);
97116
}
98117

118+
/*
119+
* Get the memory handle of a local section of memory that can be sent
120+
* to the remote size so it can access the memory. This is the
121+
* registration function for the sending side of a message transfer.
122+
*/
123+
static int mca_rcache_gpusm_get_mem_handle(void *base, size_t size, mca_rcache_base_registration_t *newreg)
124+
{
125+
CUmemorytype memType;
126+
CUresult result;
127+
CUipcMemHandle *memHandle;
128+
CUdeviceptr pbase;
129+
size_t psize;
130+
131+
mca_opal_cuda_reg_t *cuda_reg = (mca_opal_cuda_reg_t *) newreg;
132+
memHandle = (CUipcMemHandle *) cuda_reg->data.memHandle;
133+
134+
/* We should only be there if this is a CUDA device pointer */
135+
result = cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
136+
(CUdeviceptr) base);
137+
assert(CUDA_SUCCESS == result);
138+
assert(CU_MEMORYTYPE_DEVICE == memType);
139+
140+
/* Get the memory handle so we can send it to the remote process. */
141+
result = cuIpcGetMemHandle(memHandle, (CUdeviceptr) base);
142+
143+
if (CUDA_SUCCESS != result) {
144+
return OPAL_ERROR;
145+
}
146+
147+
/* Need to get the real base and size of the memory handle. This is
148+
* how the remote side saves the handles in a cache. */
149+
result = cuMemGetAddressRange(&pbase, &psize, (CUdeviceptr) base);
150+
if (CUDA_SUCCESS != result) {
151+
return OPAL_ERROR;
152+
}
153+
154+
/* Store all the information in the registration */
155+
cuda_reg->base.base = (void *) pbase;
156+
cuda_reg->base.bound = (unsigned char *) pbase + psize - 1;
157+
cuda_reg->data.memh_seg_addr.pval = (void *) pbase;
158+
cuda_reg->data.memh_seg_len = psize;
159+
160+
#if OPAL_CUDA_SYNC_MEMOPS
161+
/* With CUDA 6.0, we can set an attribute on the memory pointer that will
162+
* ensure any synchronous copies are completed prior to any other access
163+
* of the memory region. This means we do not need to record an event
164+
* and send to the remote side.
165+
*/
166+
memType = 1; /* Just use this variable since we already have it */
167+
result = cuPointerSetAttribute(&memType, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
168+
(CUdeviceptr) base);
169+
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
170+
return OPAL_ERROR;
171+
}
172+
#else
173+
/* Need to record the event to ensure that any memcopies into the
174+
* device memory have completed. The event handle associated with
175+
* this event is sent to the remote process so that it will wait
176+
* on this event prior to copying data out of the device memory.
177+
* Note that this needs to be the NULL stream to make since it is
178+
* unknown what stream any copies into the device memory were done
179+
* with. */
180+
result = cuEventRecord((CUevent) cuda_reg->data.event, 0);
181+
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
182+
return OPAL_ERROR;
183+
}
184+
#endif /* OPAL_CUDA_SYNC_MEMOPS */
185+
186+
return OPAL_SUCCESS;
187+
}
188+
99189
/*
100190
* This is the one function that does all the work. It will call into
101191
* the register function to get the memory handle for the sending
@@ -133,7 +223,7 @@ int mca_rcache_gpusm_register(mca_rcache_base_module_t *rcache, void *addr, size
133223
gpusm_reg->flags = flags;
134224
gpusm_reg->access_flags = access_flags;
135225

136-
rc = cuda_getmemhandle(base, size, gpusm_reg, NULL);
226+
rc = mca_rcache_gpusm_get_mem_handle(base, size, gpusm_reg);
137227

138228
if (rc != OPAL_SUCCESS) {
139229
opal_free_list_return(&rcache_gpusm->reg_list, item);

opal/mca/rcache/rgpusm/configure.m4

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,14 @@
1919
AC_DEFUN([MCA_opal_rcache_rgpusm_CONFIG],[
2020
AC_CONFIG_FILES([opal/mca/rcache/rgpusm/Makefile])
2121

22+
OPAL_CHECK_CUDA([rcache_rgpusm])
23+
2224
# Use CUDA_SUPPORT which was filled in by the opal configure code.
2325
AS_IF([test "x$CUDA_SUPPORT" = "x1"],
2426
[$1],
2527
[$2])
2628

29+
AC_SUBST([rcache_rgpusm_CPPFLAGS])
30+
AC_SUBST([rcache_rgpusm_LDFLAGS])
31+
AC_SUBST([rcache_rgpusm_LIBS])
2732
])dnl

0 commit comments

Comments
 (0)