Skip to content

Commit 58fecda

Browse files
committed
common/ucx: Fix mca string var registration
bugfix #11632 Signed-off-by: Roie Danino <[email protected]>
1 parent 16e0752 commit 58fecda

File tree

2 files changed

+32
-19
lines changed

2 files changed

+32
-19
lines changed

opal/mca/common/ucx/common_ucx.c

+29-17
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ opal_common_ucx_module_t opal_common_ucx =
3838
{
3939
.progress_iterations = 100,
4040
.opal_mem_hooks = 1,
41+
.tls = NULL,
42+
.devices = NULL,
4143
};
4244

4345
static opal_mutex_t opal_common_ucx_mutex = OPAL_MUTEX_STATIC_INIT;
@@ -80,28 +82,38 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *
8082
&opal_common_ucx.opal_mem_hooks);
8183

8284
if (NULL == opal_common_ucx.tls) {
83-
opal_common_ucx.tls = default_tls;
85+
// Extra level of string indirection needed to make ompi_info
86+
// happy since it will unload this library before the MCA base
87+
// cleans up the MCA vars. This will cause the string to go
88+
// out of scope unless we place the pointer to it on the heap.
89+
opal_common_ucx.tls = (char **) malloc(sizeof(char *));
90+
*opal_common_ucx.tls = strdup(default_tls);
8491
}
8592

86-
tls_index = mca_base_var_register(
87-
"opal", "opal_common", "ucx", "tls",
93+
tls_index = mca_base_component_var_register(
94+
component, "tls",
8895
"List of UCX transports which should be supported on the system, to enable "
8996
"selecting the UCX component. Special values: any (any available). "
9097
"A '^' prefix negates the list. "
9198
"For example, in order to exclude on shared memory and TCP transports, "
9299
"please set to '^posix,sysv,self,tcp,cma,knem,xpmem'.",
93-
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
94-
&opal_common_ucx.tls);
100+
MCA_BASE_VAR_TYPE_STRING, NULL, 0,
101+
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
102+
MCA_BASE_VAR_SCOPE_LOCAL,
103+
opal_common_ucx.tls);
95104

96105
if (NULL == opal_common_ucx.devices) {
97-
opal_common_ucx.devices = default_devices;
106+
opal_common_ucx.devices = (char **) malloc(sizeof(char *));
107+
*opal_common_ucx.devices = strdup(default_devices);
98108
}
99-
devices_index = mca_base_var_register(
100-
"opal", "opal_common", "ucx", "devices",
109+
devices_index = mca_base_component_var_register(
110+
component, "devices",
101111
"List of device driver pattern names, which, if supported by UCX, will "
102112
"bump its priority above ob1. Special values: any (any available)",
103-
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
104-
&opal_common_ucx.devices);
113+
MCA_BASE_VAR_TYPE_STRING, NULL, 0,
114+
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
115+
MCA_BASE_VAR_SCOPE_LOCAL,
116+
opal_common_ucx.devices);
105117

106118
if (component) {
107119
mca_base_var_register_synonym(verbose_index, component->mca_project_name,
@@ -233,8 +245,8 @@ OPAL_DECLSPEC opal_common_ucx_support_level_t opal_common_ucx_support_level(ucp_
233245
int ret;
234246
#endif
235247

236-
is_any_tl = !strcmp(opal_common_ucx.tls, "any");
237-
is_any_device = !strcmp(opal_common_ucx.devices, "any");
248+
is_any_tl = !strcmp(*opal_common_ucx.tls, "any");
249+
is_any_device = !strcmp(*opal_common_ucx.devices, "any");
238250

239251
/* Check for special value "any" */
240252
if (is_any_tl && is_any_device) {
@@ -245,19 +257,19 @@ OPAL_DECLSPEC opal_common_ucx_support_level_t opal_common_ucx_support_level(ucp_
245257

246258
#if HAVE_DECL_OPEN_MEMSTREAM
247259
/* Split transports list */
248-
negate = ('^' == (opal_common_ucx.tls)[0]);
249-
tl_list = opal_argv_split(opal_common_ucx.tls + (negate ? 1 : 0), ',');
260+
negate = ('^' == (*opal_common_ucx.tls)[0]);
261+
tl_list = opal_argv_split(*opal_common_ucx.tls + (negate ? 1 : 0), ',');
250262
if (tl_list == NULL) {
251263
MCA_COMMON_UCX_VERBOSE(1, "failed to split tl list '%s', ucx is disabled",
252-
opal_common_ucx.tls);
264+
*opal_common_ucx.tls);
253265
goto out;
254266
}
255267

256268
/* Split devices list */
257-
device_list = opal_argv_split(opal_common_ucx.devices, ',');
269+
device_list = opal_argv_split(*opal_common_ucx.devices, ',');
258270
if (device_list == NULL) {
259271
MCA_COMMON_UCX_VERBOSE(1, "failed to split devices list '%s', ucx is disabled",
260-
opal_common_ucx.devices);
272+
*opal_common_ucx.devices);
261273
goto out_free_tl_list;
262274
}
263275

opal/mca/common/ucx/common_ucx.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* and Technology (RIST). All rights reserved.
66
* Copyright (c) 2019-2020 High Performance Computing Center Stuttgart,
77
* University of Stuttgart. All rights reserved.
8+
* Copyright (c) 2023 NVIDIA Corporation. All rights reserved.
89
* $COPYRIGHT$
910
*
1011
* Additional copyrights may follow
@@ -90,8 +91,8 @@ typedef struct opal_common_ucx_module {
9091
int progress_iterations;
9192
int registered;
9293
bool opal_mem_hooks;
93-
char *tls;
94-
char *devices;
94+
char **tls;
95+
char **devices;
9596
} opal_common_ucx_module_t;
9697

9798
typedef struct opal_common_ucx_del_proc {

0 commit comments

Comments
 (0)