From acb098463b244e030b8fbdc8c15ae43e5d81201b Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 18 Oct 2022 15:16:45 -0400 Subject: [PATCH] coll/han: set as default except if ranks are consecutive across nodes coll/han provides better latency than coll/tuned if processes are mapped to nodes nonconsecutively, e.g., using --rank-by node. In that case coll/han reduces the amount of cross-node traffic. Its benefits are less clear with linear process placements. We try to detect linear process placement and if found reduce the priority of coll/han to below coll/tuned. A new mca parameter coll_han_priority_penalty is used to control the adjustment (10 by default). Signed-off-by: Joseph Schuchart --- ompi/mca/coll/han/coll_han.h | 2 ++ ompi/mca/coll/han/coll_han_component.c | 10 +++++- ompi/mca/coll/han/coll_han_module.c | 43 ++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) diff --git a/ompi/mca/coll/han/coll_han.h b/ompi/mca/coll/han/coll_han.h index 837734aaf00..6177d84140b 100644 --- a/ompi/mca/coll/han/coll_han.h +++ b/ompi/mca/coll/han/coll_han.h @@ -204,6 +204,8 @@ typedef struct mca_coll_han_component_t { /** MCA parameter: Priority of this component */ int han_priority; + /** MCA parameter: Priority penalty for sequential process distribution */ + int han_priority_penalty; /* whether output the log message */ int han_output; int han_output_verbose; /* activation level of coll han verbosity */ diff --git a/ompi/mca/coll/han/coll_han_component.c b/ompi/mca/coll/han/coll_han_component.c index 0f5c1a85d2c..6eb616e3173 100644 --- a/ompi/mca/coll/han/coll_han_component.c +++ b/ompi/mca/coll/han/coll_han_component.c @@ -249,7 +249,7 @@ static int han_register(void) TOPO_LVL_T topo_lvl; COMPONENT_T component; - cs->han_priority = 0; + cs->han_priority = 35; (void) mca_base_component_var_register(c, "priority", "Priority of the HAN coll component", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, @@ -261,6 +261,14 @@ static int 
han_register(void) OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_output_verbose); + cs->han_priority_penalty = 10; + (void) mca_base_component_var_register(c, "priority_penalty", + "Priority reduction of the HAN component " + "for linear process distributions", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_priority_penalty); + cs->han_bcast_segsize = 65536; (void) mca_base_component_var_register(c, "bcast_segsize", "segment size for bcast", diff --git a/ompi/mca/coll/han/coll_han_module.c b/ompi/mca/coll/han/coll_han_module.c index bd24d5ec1a1..27ec3de2440 100644 --- a/ompi/mca/coll/han/coll_han_module.c +++ b/ompi/mca/coll/han/coll_han_module.c @@ -173,6 +173,45 @@ int mca_coll_han_init_query(bool enable_progress_threads, return OMPI_SUCCESS; } +/** + * Count the jumps in the sequence of comm ranks located on our node and take the maximum over all processes of comm + * (allreduce on comm, so every process of comm must call this); returns true iff that maximum is at most one jump + */ +static bool proc_ranks_consecutive(struct ompi_communicator_t * comm) +{ + int last_rank = -1; + int num_jumps = 0; + for (int i = 0 ; i < comm->c_local_group->grp_proc_count ; ++i) { + ompi_proc_t *proc = NULL; + proc = ompi_group_get_proc_ptr_raw (comm->c_local_group, i); + if (ompi_proc_is_sentinel (proc)) { + /* non-local proc (sentinel entry, must not be dereferenced below) */ + continue; + } + if (!OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) { + /* non-local proc (flags say it is not on our node) */ + continue; + } + if ((last_rank + 1) != i) { + num_jumps++; + } + last_rank = i; + } + + /* the module is not used in the recursive-doubling implementation, so NULL is passed; NOTE(review): the return code of this call is ignored */ + ompi_coll_base_allreduce_intra_recursivedoubling(MPI_IN_PLACE, + &num_jumps, 1, MPI_INT, + MPI_MAX, comm, + NULL /* module */); + + /* num_jumps now holds the maximum over comm: more than one jump in the rank sequence means the ranks are not consecutive; + * a single jump is tolerated because it may stem from the first rank on the node (on all but the first node) */ + if (num_jumps > 1) { + return false; + } + + return true; +} /* * Invoked when there's a new communicator that has been created. 
@@ -203,6 +242,10 @@ mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) /* Get the priority level attached to this module. If priority is less * than or equal to 0, then the module is unavailable. */ *priority = mca_coll_han_component.han_priority; + /* reduce priority if the rank distribution is linear across nodes */ + if (proc_ranks_consecutive(comm)) { + *priority -= mca_coll_han_component.han_priority_penalty; + } if (mca_coll_han_component.han_priority < 0) { opal_output_verbose(10, ompi_coll_base_framework.framework_output, "coll:han:comm_query (%s/%s): priority too low; disqualifying myself",