From 48c228f6d8f0d297573c5a9a581c882c4e72e976 Mon Sep 17 00:00:00 2001 From: guserav Date: Mon, 3 Jun 2019 16:27:54 -0700 Subject: [PATCH 1/2] Add BTL_OFI_BLACKLIST Co-authored-by: jlbyrne-hpe Signed-off-by: guserav --- opal/mca/btl/ofi/btl_ofi_component.c | 66 +++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index c57b5040b6b..e506f8ee883 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -43,7 +43,11 @@ #define MCA_BTL_OFI_REQUESTED_MR_MODE (FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR) +#define MCA_BTL_OFI_PROV_EXCLUDE_DEFAULT \ + "shm,sockets,tcp,udp,rstream,psm2" + static char *prov_include; +static char *prov_exclude; static char *ofi_progress_mode; static bool disable_sep; static int mca_btl_ofi_init_device(struct fi_info *info); @@ -117,13 +121,22 @@ static int mca_btl_ofi_component_register(void) "provider_include", "OFI provider that ofi btl will query for. This parameter only " "accept ONE provider name. " - "(e.g., \"psm2\"; an empty value means that all providers will " + "(e.g., \"gni\"; an empty value means that all providers will " "be considered.", MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_4, MCA_BASE_VAR_SCOPE_READONLY, &prov_include); + prov_exclude = MCA_BTL_OFI_PROV_EXCLUDE_DEFAULT; + (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, + "provider_exclude", + "Comma-delimited list of OFI providers that are not considered for use (default: \"" MCA_BTL_OFI_PROV_EXCLUDE_DEFAULT "\"; empty value means that all providers will be considered). Mutually exclusive with btl_ofi_provider_include.", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &prov_exclude); + mca_btl_ofi_component.num_cqe_read = MCA_BTL_OFI_NUM_CQE_READ; (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, "num_cq_read", @@ -213,6 +226,20 @@ void mca_btl_ofi_exit(void) exit(1); } +static int is_in_list(char * const *list, const char *item) +{ + int i; + + if (!list) + return 0; + for (i = 0; list[i]; i++) { + if (!strcmp(list[i], item)) + return 1; + } + + return 0; +} + /* * OFI component initialization: * read interface list from kernel and compare against component parameters @@ -325,19 +352,38 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules, } BTL_VERBOSE(("ofi btl found %d possible resources.", resource_count)); - info = info_list; - - while(info) { + char **exclude_list = NULL; + char *prov_name; + + if (prov_exclude) + exclude_list = opal_argv_split(prov_exclude, ','); + for (info = info_list; info; info = info->next) { + prov_name = info->fabric_attr->prov_name; + opal_output_verbose(1, opal_btl_base_framework.framework_output, + "%s:%d: btl:ofi: trying \"%s\"\n", + __FILE__, __LINE__, prov_name); + /* prov_include wins; same behavior as mtl_ofi */ + if ((exclude_list && is_in_list(exclude_list, prov_name)) && + !(prov_include && !strcmp(prov_include, prov_name))) { + opal_output_verbose(1, opal_btl_base_framework.framework_output, + "%s:%d: btl:ofi: \"%s\" in exclude list\n", + __FILE__, __LINE__, prov_name); + continue; + } rc = validate_info(info, required_caps); + if (OPAL_SUCCESS != rc) + continue; + /* Device passed sanity check, let's make a module. + * We only pick the first device we found valid */ + rc = mca_btl_ofi_init_device(info); if (OPAL_SUCCESS == rc) { - /* Device passed sanity check, let's make a module. - * We only pick the first device we found valid */ - rc = mca_btl_ofi_init_device(info); - if (OPAL_SUCCESS == rc) - break; + opal_output_verbose(1, opal_btl_base_framework.framework_output, + "%s:%d: btl:ofi: using \"%s\"\n", + __FILE__, __LINE__, prov_name); + break; } - info = info->next; } + free(exclude_list); /* We are done with the returned info. */ fi_freeinfo(info_list); From 36fa4641507bb370e9199c634bdf46e149234345 Mon Sep 17 00:00:00 2001 From: guserav Date: Tue, 4 Jun 2019 09:59:43 -0700 Subject: [PATCH 2/2] Add requested changes of @jsquyres Mainly to satisfy the coding style. Signed-off-by: guserav --- opal/mca/btl/ofi/btl_ofi_component.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index e506f8ee883..0d003097a5f 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -226,18 +226,20 @@ void mca_btl_ofi_exit(void) exit(1); } -static int is_in_list(char * const *list, const char *item) +static bool is_in_list(char * const *list, const char *item) { int i; - if (!list) - return 0; + if (!list) { + return false; + } for (i = 0; list[i]; i++) { - if (!strcmp(list[i], item)) - return 1; + if (!strcmp(list[i], item)) { + return true; + } } - return 0; + return false; } /* @@ -355,8 +357,9 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules, char **exclude_list = NULL; char *prov_name; - if (prov_exclude) + if (prov_exclude) { exclude_list = opal_argv_split(prov_exclude, ','); + } for (info = info_list; info; info = info->next) { prov_name = info->fabric_attr->prov_name; opal_output_verbose(1, opal_btl_base_framework.framework_output, @@ -371,8 +374,9 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules, continue; } rc = validate_info(info, required_caps); - if (OPAL_SUCCESS != rc) + if (OPAL_SUCCESS != rc) { continue; + } /* Device passed sanity check, let's make a module. * We only pick the first device we found valid */ rc = mca_btl_ofi_init_device(info);