Skip to content

Commit 2c86b87

Browse files
authored
Merge pull request #4510 from matcabral/mtl_psm2_shadow_vars
New flag for MCA parameters that allows them to behave as if their default value were "unset".
2 parents b160cf6 + 1fad594 commit 2c86b87

File tree

4 files changed

+54
-19
lines changed

4 files changed

+54
-19
lines changed

ompi/mca/mtl/psm2/mtl_psm2_component.c

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -92,29 +92,33 @@ struct ompi_mtl_psm2_shadow_variable {
9292
mca_base_var_info_lvl_t info_level;
9393
const char *mca_name;
9494
const char *description;
95+
mca_base_var_flag_t flags;
9596
};
9697

9798
struct ompi_mtl_psm2_shadow_variable ompi_mtl_psm2_shadow_variables[] = {
9899
{MCA_BASE_VAR_TYPE_STRING, &ompi_mtl_psm2.psm2_devices, {.stringval = "self,shm,hfi"}, "PSM2_DEVICES", OPAL_INFO_LVL_3,
99-
"devices", "Comma-delimited list of PSM2 devices. Valid values: self, shm, hfi (default: self,shm,hfi)"},
100+
"devices",
101+
"Comma-delimited list of PSM2 devices. Valid values: self, shm, hfi (default: self,shm,hfi. Reduced to self,shm in single node jobs)",0},
100102
{MCA_BASE_VAR_TYPE_STRING, &ompi_mtl_psm2.psm2_memory, {.stringval = "normal"}, "PSM2_MEMORY", OPAL_INFO_LVL_9,
101-
"memory_model", "PSM2 memory usage mode (default: normal)"},
102-
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_sendreqs_max, {.ulval = 1048576}, "PSM2_MQ_SENDREQS_MAX", OPAL_INFO_LVL_3,
103-
"mq_sendreqs_max", "PSM2 maximum number of isend requests in flight (default: 1M)"},
104-
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_recvreqs_max, {.ulval = 1048576}, "PSM2_MQ_RECVREQS_MAX", OPAL_INFO_LVL_3,
105-
"mq_recvreqs_max", "PSM2 maximum number of irecv requests in flight (default: 1M)"},
106-
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_rndv_hfi_threshold, {.ulval = 64000}, "PSM2_MQ_RNDV_HFI_THRESH", OPAL_INFO_LVL_3,
107-
"hfi_eager_limit", "PSM2 eager to rendezvous threshold (default: 64000)"},
108-
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_rndv_shm_threshold, {.ulval = 16000}, "PSM2_MQ_RNDV_SHM_THRESH", OPAL_INFO_LVL_3,
109-
"shm_eager_limit", "PSM2 shared memory eager to rendezvous threshold (default: 16000)"},
103+
"memory_model", "PSM2 memory usage mode. Valid values: min, normal, large (default: normal)", 0},
104+
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_sendreqs_max, {.ulval = 0}, "PSM2_MQ_SENDREQS_MAX", OPAL_INFO_LVL_3,
105+
"mq_sendreqs_max", "PSM2 maximum number of isend requests in flight (default: unset, let libpsm2 use its default)", MCA_BASE_VAR_FLAG_DEF_UNSET},
106+
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_recvreqs_max, {.ulval = 0}, "PSM2_MQ_RECVREQS_MAX", OPAL_INFO_LVL_3,
107+
"mq_recvreqs_max", "PSM2 maximum number of irecv requests in flight (default: unset, let libpsm2 use its default)", MCA_BASE_VAR_FLAG_DEF_UNSET},
108+
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_rndv_hfi_threshold, {.ulval = 0}, "PSM2_MQ_RNDV_HFI_THRESH", OPAL_INFO_LVL_3,
109+
"hfi_eager_limit", "PSM2 eager to rendezvous threshold (default: unset, let libpsm2 use its defaults)", MCA_BASE_VAR_FLAG_DEF_UNSET},
110+
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_rndv_shm_threshold, {.ulval = 0}, "PSM2_MQ_RNDV_SHM_THRESH", OPAL_INFO_LVL_3,
111+
"shm_eager_limit", "PSM2 shared memory eager to rendezvous threshold (default: unset, let libpsm2 use its default)", MCA_BASE_VAR_FLAG_DEF_UNSET},
110112
{MCA_BASE_VAR_TYPE_BOOL, &ompi_mtl_psm2.psm2_recvthread, {.boolval = true}, "PSM2_RCVTHREAD", OPAL_INFO_LVL_3,
111113
"use_receive_thread", "Use PSM2 progress thread (default: true)"},
112114
{MCA_BASE_VAR_TYPE_BOOL, &ompi_mtl_psm2.psm2_shared_contexts, {.boolval = true}, "PSM2_SHAREDCONTEXTS", OPAL_INFO_LVL_6,
113115
"use_shared_contexts", "Share PSM contexts between MPI processes (default: true)"},
114-
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_shared_contexts_max, {.ulval = 8}, "PSM2_SHAREDCONTEXTS_MAX", OPAL_INFO_LVL_9,
115-
"max_shared_contexts", "Maximum number of contexts available on a node (default: 8, max: 8)"},
116+
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_max_contexts_per_job, {.ulval = 0}, "PSM2_MAX_CONTEXTS_PER_JOB", OPAL_INFO_LVL_9,
117+
"max_contexts_per_job", "Maximum number of contexts available on a node (default: unset, let libpsm2 use its default)", MCA_BASE_VAR_FLAG_DEF_UNSET},
116118
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_tracemask, {.ulval = 1}, "PSM2_TRACEMASK", OPAL_INFO_LVL_9,
117-
"trace_mask", "PSM2 tracemask value. See PSM2 documentation for accepted values (default: 1)"},
119+
"trace_mask", "PSM2 tracemask value. See PSM2 documentation for accepted values in 0x (default: 1)"},
120+
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_opa_sl, {.ulval = 0}, "HFI_SL", OPAL_INFO_LVL_9,
121+
"opa_service_level", "HFI Service Level (default: unset, let libpsm2 use its defaults)", MCA_BASE_VAR_FLAG_DEF_UNSET},
118122
{-1},
119123
};
120124

@@ -123,10 +127,27 @@ static void ompi_mtl_psm2_set_shadow_env (struct ompi_mtl_psm2_shadow_variable *
123127
mca_base_var_storage_t *storage = variable->storage;
124128
char *env_value;
125129
int ret = 0;
130+
int var_index = 0;
131+
const mca_base_var_t *mca_base_var;
132+
133+
var_index = mca_base_var_find("ompi", "mtl", "psm2", variable->mca_name);
134+
ret = mca_base_var_get (var_index,&mca_base_var);
135+
/* Something is fundamentally broken if registered variables are
136+
* not found */
137+
if (OPAL_SUCCESS != ret) {
138+
fprintf (stderr, "ERROR setting PSM2 environment variable: %s\n", variable->env_name);
139+
return;
140+
}
141+
142+
/** Skip setting variables for which the default behavior is "unset" */
143+
if ((mca_base_var->mbv_flags & MCA_BASE_VAR_FLAG_DEF_UNSET) &&
144+
(MCA_BASE_VAR_SOURCE_DEFAULT == mca_base_var->mbv_source)){
145+
return ;
146+
}
126147

127148
switch (variable->variable_type) {
128149
case MCA_BASE_VAR_TYPE_BOOL:
129-
ret = asprintf (&env_value, "%s=%s", variable->env_name, storage->boolval ? "YES" : "NO");
150+
ret = asprintf (&env_value, "%s=%d", variable->env_name, storage->boolval ? 1 : 0);
130151
break;
131152
case MCA_BASE_VAR_TYPE_UNSIGNED_LONG:
132153
if (0 == strcmp (variable->env_name, "PSM2_TRACEMASK")) {
@@ -182,7 +203,7 @@ static void ompi_mtl_psm2_register_shadow_env (struct ompi_mtl_psm2_shadow_varia
182203
}
183204

184205
(void) mca_base_component_var_register (&mca_mtl_psm2_component.super.mtl_version, variable->mca_name, variable->description,
185-
variable->variable_type, NULL, 0, 0, variable->info_level, MCA_BASE_VAR_SCOPE_READONLY,
206+
variable->variable_type, NULL, 0, variable->flags, variable->info_level, MCA_BASE_VAR_SCOPE_READONLY,
186207
variable->storage);
187208
}
188209

@@ -221,8 +242,10 @@ ompi_mtl_psm2_component_register(void)
221242
(void) get_num_total_procs(&num_total_procs);
222243

223244
/* set priority high enough to beat ob1's default (also set higher than psm) */
224-
if (num_local_procs == num_total_procs) {
225-
/* disable hfi if all processes are local */
245+
if ((num_local_procs == num_total_procs) && (1 < num_total_procs)) {
246+
/* Disable hfi if all processes are local. However, if running only one
247+
* process assume it is ompi_info or this is most likely going to spawn, for
248+
* which all PSM2 devices are needed */
226249
setenv("PSM2_DEVICES", "self,shm", 0);
227250
/* ob1 is much faster than psm2 with shared memory */
228251
param_priority = 10;

ompi/mca/mtl/psm2/mtl_psm2_types.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,11 @@ struct mca_mtl_psm2_module_t {
5656
unsigned long psm2_mq_recvreqs_max;
5757
unsigned long psm2_mq_rndv_hfi_threshold;
5858
unsigned long psm2_mq_rndv_shm_threshold;
59-
unsigned long psm2_shared_contexts_max;
59+
unsigned long psm2_max_contexts_per_job;
6060
unsigned long psm2_tracemask;
6161
bool psm2_recvthread;
6262
bool psm2_shared_contexts;
63+
unsigned long psm2_opa_sl;
6364
};
6465

6566
typedef struct mca_mtl_psm2_module_t mca_mtl_psm2_module_t;

opal/mca/base/mca_base_var.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1896,6 +1896,14 @@ static int var_value_string (mca_base_var_t *var, char **value_string)
18961896

18971897
assert (MCA_BASE_VAR_TYPE_MAX > var->mbv_type);
18981898

1899+
/** Parameters with MCA_BASE_VAR_FLAG_DEF_UNSET flag should be shown
1900+
* as "unset" by default. */
1901+
if ((var->mbv_flags & MCA_BASE_VAR_FLAG_DEF_UNSET) &&
1902+
(MCA_BASE_VAR_SOURCE_DEFAULT == var->mbv_source)){
1903+
asprintf (value_string, "%s", "unset");
1904+
return OPAL_SUCCESS;
1905+
}
1906+
18991907
ret = mca_base_var_get_value(var->mbv_index, &value, NULL, NULL);
19001908
if (OPAL_SUCCESS != ret || NULL == value) {
19011909
return ret;

opal/mca/base/mca_base_var.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,10 @@ typedef enum {
191191
manually when you register a variable with
192192
mca_base_var_register(). Analogous to the
193193
MCA_BASE_PVAR_FLAG_IWG. */
194-
MCA_BASE_VAR_FLAG_DWG = 0x0040
194+
MCA_BASE_VAR_FLAG_DWG = 0x0040,
195+
/** Variable has a default value of "unset". Meaning to only
196+
* be set when the user explicitly asks for it */
197+
MCA_BASE_VAR_FLAG_DEF_UNSET = 0x0080,
195198
} mca_base_var_flag_t;
196199

197200

0 commit comments

Comments
 (0)