@@ -92,29 +92,33 @@ struct ompi_mtl_psm2_shadow_variable {
92
92
mca_base_var_info_lvl_t info_level ;
93
93
const char * mca_name ;
94
94
const char * description ;
95
+ mca_base_var_flag_t flags ;
95
96
};
96
97
97
98
struct ompi_mtl_psm2_shadow_variable ompi_mtl_psm2_shadow_variables [] = {
98
99
{MCA_BASE_VAR_TYPE_STRING , & ompi_mtl_psm2 .psm2_devices , {.stringval = "self,shm,hfi" }, "PSM2_DEVICES" , OPAL_INFO_LVL_3 ,
99
- "devices" , "Comma-delimited list of PSM2 devices. Valid values: self, shm, hfi (default: self,shm,hfi)" },
100
+ "devices" ,
101
+ "Comma-delimited list of PSM2 devices. Valid values: self, shm, hfi (default: self,shm,hfi. Reduced to self,shm in single node jobs)" ,0 },
100
102
{MCA_BASE_VAR_TYPE_STRING , & ompi_mtl_psm2 .psm2_memory , {.stringval = "normal" }, "PSM2_MEMORY" , OPAL_INFO_LVL_9 ,
101
- "memory_model" , "PSM2 memory usage mode (default: normal)" },
102
- {MCA_BASE_VAR_TYPE_UNSIGNED_LONG , & ompi_mtl_psm2 .psm2_mq_sendreqs_max , {.ulval = 1048576 }, "PSM2_MQ_SENDREQS_MAX" , OPAL_INFO_LVL_3 ,
103
- "mq_sendreqs_max" , "PSM2 maximum number of isend requests in flight (default: 1M)" },
104
- {MCA_BASE_VAR_TYPE_UNSIGNED_LONG , & ompi_mtl_psm2 .psm2_mq_recvreqs_max , {.ulval = 1048576 }, "PSM2_MQ_RECVREQS_MAX" , OPAL_INFO_LVL_3 ,
105
- "mq_recvreqs_max" , "PSM2 maximum number of irecv requests in flight (default: 1M)" },
106
- {MCA_BASE_VAR_TYPE_UNSIGNED_LONG , & ompi_mtl_psm2 .psm2_mq_rndv_hfi_threshold , {.ulval = 64000 }, "PSM2_MQ_RNDV_HFI_THRESH" , OPAL_INFO_LVL_3 ,
107
- "hfi_eager_limit" , "PSM2 eager to rendezvous threshold (default: 64000)" },
108
- {MCA_BASE_VAR_TYPE_UNSIGNED_LONG , & ompi_mtl_psm2 .psm2_mq_rndv_shm_threshold , {.ulval = 16000 }, "PSM2_MQ_RNDV_SHM_THRESH" , OPAL_INFO_LVL_3 ,
109
- "shm_eager_limit" , "PSM2 shared memory eager to rendezvous threshold (default: 16000)" },
103
+ "memory_model" , "PSM2 memory usage mode. Valid values: min, normal, large (default: normal)" , 0 },
104
+ {MCA_BASE_VAR_TYPE_UNSIGNED_LONG , & ompi_mtl_psm2 .psm2_mq_sendreqs_max , {.ulval = 0 }, "PSM2_MQ_SENDREQS_MAX" , OPAL_INFO_LVL_3 ,
105
+ "mq_sendreqs_max" , "PSM2 maximum number of isend requests in flight (default: unset, let libpsm2 use its default)" , MCA_BASE_VAR_FLAG_DEF_UNSET },
106
+ {MCA_BASE_VAR_TYPE_UNSIGNED_LONG , & ompi_mtl_psm2 .psm2_mq_recvreqs_max , {.ulval = 0 }, "PSM2_MQ_RECVREQS_MAX" , OPAL_INFO_LVL_3 ,
107
+ "mq_recvreqs_max" , "PSM2 maximum number of irecv requests in flight (default: unset, let libpsm2 use its default)" , MCA_BASE_VAR_FLAG_DEF_UNSET },
108
+ {MCA_BASE_VAR_TYPE_UNSIGNED_LONG , & ompi_mtl_psm2 .psm2_mq_rndv_hfi_threshold , {.ulval = 0 }, "PSM2_MQ_RNDV_HFI_THRESH" , OPAL_INFO_LVL_3 ,
109
+ "hfi_eager_limit" , "PSM2 eager to rendezvous threshold (default: unset, let libpsm2 use its defaults)" , MCA_BASE_VAR_FLAG_DEF_UNSET },
110
+ {MCA_BASE_VAR_TYPE_UNSIGNED_LONG , & ompi_mtl_psm2 .psm2_mq_rndv_shm_threshold , {.ulval = 0 }, "PSM2_MQ_RNDV_SHM_THRESH" , OPAL_INFO_LVL_3 ,
111
+ "shm_eager_limit" , "PSM2 shared memory eager to rendezvous threshold (default: unset, let libpsm2 use its default)" , MCA_BASE_VAR_FLAG_DEF_UNSET },
110
112
{MCA_BASE_VAR_TYPE_BOOL , & ompi_mtl_psm2 .psm2_recvthread , {.boolval = true}, "PSM2_RCVTHREAD" , OPAL_INFO_LVL_3 ,
111
113
"use_receive_thread" , "Use PSM2 progress thread (default: true)" },
112
114
{MCA_BASE_VAR_TYPE_BOOL , & ompi_mtl_psm2 .psm2_shared_contexts , {.boolval = true}, "PSM2_SHAREDCONTEXTS" , OPAL_INFO_LVL_6 ,
113
115
"use_shared_contexts" , "Share PSM contexts between MPI processes (default: true)" },
114
- {MCA_BASE_VAR_TYPE_UNSIGNED_LONG , & ompi_mtl_psm2 .psm2_shared_contexts_max , {.ulval = 8 }, "PSM2_SHAREDCONTEXTS_MAX " , OPAL_INFO_LVL_9 ,
115
- "max_shared_contexts " , "Maximum number of contexts available on a node (default: 8, max: 8)" },
116
+ {MCA_BASE_VAR_TYPE_UNSIGNED_LONG , & ompi_mtl_psm2 .psm2_max_contexts_per_job , {.ulval = 0 }, "PSM2_MAX_CONTEXTS_PER_JOB " , OPAL_INFO_LVL_9 ,
117
+ "max_contexts_per_job " , "Maximum number of contexts available on a node (default: unset, let libpsm2 use its default)" , MCA_BASE_VAR_FLAG_DEF_UNSET },
116
118
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG , & ompi_mtl_psm2 .psm2_tracemask , {.ulval = 1 }, "PSM2_TRACEMASK" , OPAL_INFO_LVL_9 ,
117
- "trace_mask" , "PSM2 tracemask value. See PSM2 documentation for accepted values (default: 1)" },
119
+ "trace_mask" , "PSM2 tracemask value. See PSM2 documentation for accepted values in 0x (default: 1)" },
120
+ {MCA_BASE_VAR_TYPE_UNSIGNED_LONG , & ompi_mtl_psm2 .psm2_opa_sl , {.ulval = 0 }, "HFI_SL" , OPAL_INFO_LVL_9 ,
121
+ "opa_service_level" , "HFI Service Level (default: unset, let libpsm2 use its defaults)" , MCA_BASE_VAR_FLAG_DEF_UNSET },
118
122
{-1 },
119
123
};
120
124
@@ -123,10 +127,27 @@ static void ompi_mtl_psm2_set_shadow_env (struct ompi_mtl_psm2_shadow_variable *
123
127
mca_base_var_storage_t * storage = variable -> storage ;
124
128
char * env_value ;
125
129
int ret = 0 ;
130
+ int var_index = 0 ;
131
+ const mca_base_var_t * mca_base_var ;
132
+
133
+ var_index = mca_base_var_find ("ompi" , "mtl" , "psm2" , variable -> mca_name );
134
+ ret = mca_base_var_get (var_index ,& mca_base_var );
135
+ /* Something is fundamentally broken if registered variables are
136
+ * not found */
137
+ if (OPAL_SUCCESS != ret ) {
138
+ fprintf (stderr , "ERROR setting PSM2 environment variable: %s\n" , variable -> env_name );
139
+ return ;
140
+ }
141
+
142
+ /** Skip setting variables for which the default behavior is "unset" */
143
+ if ((mca_base_var -> mbv_flags & MCA_BASE_VAR_FLAG_DEF_UNSET ) &&
144
+ (MCA_BASE_VAR_SOURCE_DEFAULT == mca_base_var -> mbv_source )){
145
+ return ;
146
+ }
126
147
127
148
switch (variable -> variable_type ) {
128
149
case MCA_BASE_VAR_TYPE_BOOL :
129
- ret = asprintf (& env_value , "%s=%s " , variable -> env_name , storage -> boolval ? "YES" : "NO" );
150
+ ret = asprintf (& env_value , "%s=%d " , variable -> env_name , storage -> boolval ? 1 : 0 );
130
151
break ;
131
152
case MCA_BASE_VAR_TYPE_UNSIGNED_LONG :
132
153
if (0 == strcmp (variable -> env_name , "PSM2_TRACEMASK" )) {
@@ -182,7 +203,7 @@ static void ompi_mtl_psm2_register_shadow_env (struct ompi_mtl_psm2_shadow_varia
182
203
}
183
204
184
205
(void ) mca_base_component_var_register (& mca_mtl_psm2_component .super .mtl_version , variable -> mca_name , variable -> description ,
185
- variable -> variable_type , NULL , 0 , 0 , variable -> info_level , MCA_BASE_VAR_SCOPE_READONLY ,
206
+ variable -> variable_type , NULL , 0 , variable -> flags , variable -> info_level , MCA_BASE_VAR_SCOPE_READONLY ,
186
207
variable -> storage );
187
208
}
188
209
@@ -221,8 +242,10 @@ ompi_mtl_psm2_component_register(void)
221
242
(void ) get_num_total_procs (& num_total_procs );
222
243
223
244
/* set priority high enough to beat ob1's default (also set higher than psm) */
224
- if (num_local_procs == num_total_procs ) {
225
- /* disable hfi if all processes are local */
245
+ if ((num_local_procs == num_total_procs ) && (1 < num_total_procs )) {
246
+ /* Disable hfi if all processes are local. However, when only one process
247
+ * is running, assume it is either ompi_info or a process that is most likely
248
+ * going to spawn others; in both cases all PSM2 devices are needed. */
226
249
setenv ("PSM2_DEVICES" , "self,shm" , 0 );
227
250
/* ob1 is much faster than psm2 with shared memory */
228
251
param_priority = 10 ;
0 commit comments