diff --git a/ompi/mca/coll/han/Makefile.am b/ompi/mca/coll/han/Makefile.am index dae2fbb1b0a..acaaab5c749 100644 --- a/ompi/mca/coll/han/Makefile.am +++ b/ompi/mca/coll/han/Makefile.am @@ -3,6 +3,7 @@ # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved. +# Copyright (c) 2022 BULL S.A.S. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -13,6 +14,7 @@ sources = \ coll_han.h \ coll_han_trigger.h \ +coll_han_algorithms.h \ coll_han_dynamic.h \ coll_han_dynamic_file.h \ coll_han_barrier.c \ @@ -25,6 +27,7 @@ coll_han_allgather.c \ coll_han_component.c \ coll_han_module.c \ coll_han_trigger.c \ +coll_han_algorithms.c \ coll_han_dynamic.c \ coll_han_dynamic_file.c \ coll_han_topo.c \ diff --git a/ompi/mca/coll/han/coll_han.h b/ompi/mca/coll/han/coll_han.h index b0c16a34140..837734aaf00 100644 --- a/ompi/mca/coll/han/coll_han.h +++ b/ompi/mca/coll/han/coll_han.h @@ -2,8 +2,8 @@ * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2020 Bull S.A.S. All rights reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved + * Copyright (c) 2020-2022 Bull S.A.S. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -40,6 +40,7 @@ #include "ompi/mca/coll/base/coll_base_functions.h" #include "coll_han_trigger.h" #include "ompi/mca/coll/han/coll_han_dynamic.h" +#include "coll_han_algorithms.h" /* * Today; @@ -205,6 +206,7 @@ typedef struct mca_coll_han_component_t { int han_priority; /* whether output the log message */ int han_output; + int han_output_verbose; /* activation level of coll han verbosity */ /* segment size for bcast */ uint32_t han_bcast_segsize; /* up level module for bcast */ @@ -242,6 +244,8 @@ typedef struct mca_coll_han_component_t { */ bool han_reproducible; bool use_simple_algorithm[COLLCOUNT]; + int use_algorithm[COLLCOUNT]; + int use_algorithm_param[COLLCOUNT]; // MCA parmeter id for algo, to know if user provided /* Dynamic configuration rules */ bool use_dynamic_file_rules; @@ -250,7 +254,11 @@ typedef struct mca_coll_han_component_t { /* Dynamic rules from file */ mca_coll_han_dynamic_rules_t dynamic_rules; /* Dynamic rules from mca parameter */ - COMPONENT_T mca_rules[COLLCOUNT][NB_TOPO_LVL]; + COMPONENT_T mca_sub_components[COLLCOUNT][NB_TOPO_LVL]; + + int num_available_algorithms[COLLCOUNT]; // not counting "default" behaviour + /* to show algorithms in ompi_info */ + mca_base_var_enum_value_t* algorithm_enumerator[COLLCOUNT]; /* Define maximum dynamic errors printed by rank 0 with a 0 verbosity level */ int max_dynamic_errors; @@ -469,109 +477,7 @@ mca_coll_han_scatter_intra_dynamic(SCATTER_BASE_ARGS, int mca_coll_han_barrier_intra_simple(struct ompi_communicator_t *comm, mca_coll_base_module_t *module); -/* Bcast */ -int mca_coll_han_bcast_intra_simple(void *buff, - int count, - struct ompi_datatype_t *dtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); -int mca_coll_han_bcast_intra(void *buff, int count, struct ompi_datatype_t *dtype, int root, - struct ompi_communicator_t *comm, mca_coll_base_module_t * module); - -/* Reduce */ -int -mca_coll_han_reduce_intra_simple(const void *sbuf, - void* rbuf, - int count, - struct ompi_datatype_t *dtype, - ompi_op_t *op, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); -int -mca_coll_han_reduce_reproducible_decision(struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); -int -mca_coll_han_reduce_reproducible(const void *sbuf, - void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); -int mca_coll_han_reduce_intra(const void *sbuf, - void *rbuf, - int count, - struct ompi_datatype_t *dtype, - ompi_op_t* op, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t * module); - -/* Allreduce */ -int -mca_coll_han_allreduce_intra_simple(const void *sbuf, - void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); -int -mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); -int -mca_coll_han_allreduce_reproducible(const void *sbuf, - void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -int mca_coll_han_allreduce_intra(const void *sbuf, - void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, mca_coll_base_module_t * module); - -/* Scatter */ -int -mca_coll_han_scatter_intra(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int -mca_coll_han_scatter_intra_simple(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t * module); - -/* Gather */ -int -mca_coll_han_gather_intra(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int -mca_coll_han_gather_intra_simple(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); /* reordering after gather, for unordered ranks */ void ompi_coll_han_reorder_gather(const void *sbuf, @@ -580,21 +486,4 @@ ompi_coll_han_reorder_gather(const void *sbuf, struct ompi_communicator_t *comm, int * topo); - - -/* Allgather */ -int -mca_coll_han_allgather_intra(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int -mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - #endif /* MCA_COLL_HAN_EXPORT_H */ diff --git a/ompi/mca/coll/han/coll_han_algorithms.c b/ompi/mca/coll/han/coll_han_algorithms.c new file mode 100644 index 00000000000..bc2bd5ebade --- /dev/null +++ b/ompi/mca/coll/han/coll_han_algorithms.c @@ -0,0 +1,227 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2020-2022 Bull S.A.S. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/* + * @file + * This files contains helps for algorithm selection + * + */ + +#include "coll_han.h" +#include "coll_han_algorithms.h" + +/* default algo names, formatted for MCA var_enum_create */ +mca_base_var_enum_value_t han_default_algorithms_enum[] = { + { 0, "default" }, // algo #0 is called "defaut" + { 0, NULL } +}; + +/** + * all available algorithms must be added here to be selectable in + * the component. + * + * note: algorithm 'default' / #0 is automatically added to each + * implemented collective, it is the default behaviour in + * coll_han_dynamic.c + */ +mca_coll_han_algorithm_value_t* mca_coll_han_available_algorithms[COLLCOUNT] = { + [BARRIER] = (mca_coll_han_algorithm_value_t[]) { + {"simple", (fnptr_t) &mca_coll_han_barrier_intra_simple}, // 2-level + { 0 } + }, + [BCAST] = (mca_coll_han_algorithm_value_t[]){ + {"intra", (fnptr_t) &mca_coll_han_bcast_intra}, // 2-level + {"simple", (fnptr_t) &mca_coll_han_bcast_intra_simple}, // 2-level + { 0 } + }, + [REDUCE] = (mca_coll_han_algorithm_value_t[]){ + {"intra", (fnptr_t) &mca_coll_han_reduce_intra}, // 2-level + {"simple", (fnptr_t) &mca_coll_han_reduce_intra_simple}, // 2-level + {"reproducible", (fnptr_t) &mca_coll_han_reduce_reproducible}, // fallback + { 0 } + }, + [ALLREDUCE] = (mca_coll_han_algorithm_value_t[]){ + {"intra", (fnptr_t) &mca_coll_han_allreduce_intra}, // 2-level + {"simple", (fnptr_t) &mca_coll_han_allreduce_intra_simple}, // 2-level + {"reproducible", (fnptr_t) &mca_coll_han_allreduce_reproducible}, // fallback + { 0 } + }, + [SCATTER] = (mca_coll_han_algorithm_value_t[]){ + {"intra", (fnptr_t) &mca_coll_han_scatter_intra}, // 2-level + {"simple", (fnptr_t) &mca_coll_han_scatter_intra_simple}, // 2-level + { 0 } + }, + [GATHER] = (mca_coll_han_algorithm_value_t[]){ + {"intra", (fnptr_t) &mca_coll_han_gather_intra}, // 2-level + {"simple", (fnptr_t) &mca_coll_han_gather_intra_simple}, // 2-level + { 0 } + }, + [ALLGATHER] = (mca_coll_han_algorithm_value_t[]){ + {"intra", (fnptr_t)&mca_coll_han_allgather_intra}, // 2-level + {"simple", (fnptr_t)&mca_coll_han_allgather_intra_simple}, // 2-level + { 0 } + }, +}; + +int +mca_coll_han_algorithm_id_is_valid(int coll_id, int algorithm_id) +{ + if (!mca_coll_han_is_coll_dynamic_implemented(coll_id)) { + return false; + } + if (0 > algorithm_id) { + return false; + } + + /* + * user provided algorithms + 'default' algorithm #0 + */ + return algorithm_id < mca_coll_han_component.num_available_algorithms[coll_id] + 1; +} + +fnptr_t +mca_coll_han_algorithm_id_to_fn(int coll_id, int algorithm_id) +{ + if (algorithm_id == 0 || !mca_coll_han_algorithm_id_is_valid(coll_id, algorithm_id)) { + return NULL; + } + /* algorithm 0 is not included here */ + return mca_coll_han_available_algorithms[coll_id][algorithm_id - 1].fn; +} + +char* +mca_coll_han_algorithm_id_to_name(int coll_id, int algorithm_id) +{ + if (!mca_coll_han_algorithm_id_is_valid(coll_id, algorithm_id)) { + return NULL; + } + if (0 == algorithm_id) { + return "default"; + } + /* algorithm 0 is not included here */ + return mca_coll_han_available_algorithms[coll_id][algorithm_id - 1].name; +} + +int +mca_coll_han_algorithm_name_to_id(COLLTYPE_T coll_id, const char* algorithm_name) +{ + // shortcut for default + if (0 == strcmp(algorithm_name, "default")) { + return 0; + } + if (0 > mca_coll_han_component.num_available_algorithms[coll_id]) { + return -1; + } + + const mca_base_var_enum_value_t* algorithms_values + = mca_coll_han_component.algorithm_enumerator[coll_id]; + for (int i = 0; algorithms_values[i].string != NULL; i++) { + if (0 == strcmp(algorithm_name, algorithms_values[i].string)) + return i; + } + return -1; +} + +/** + * count algorithms for this collective (other than default) + */ +static int +mca_han_algorithm_count(mca_coll_han_algorithm_value_t* algorithm_values) +{ + int n = 0; + if (NULL != algorithm_values) { + while (algorithm_values[n].name != NULL) { + n++; + } + } + return n; +} + +/** + * Initializes algorithm_enumerator, used to show algorithm id/name in ompi_info + */ +static int +mca_han_algorithm_enumerator_create(mca_base_var_enum_value_t** algorithm_enumerator, + mca_coll_han_algorithm_value_t* algorithm_values) +{ + *algorithm_enumerator = NULL; + + int n_algorithm_values = mca_han_algorithm_count(algorithm_values); + if (0 == n_algorithm_values) { + return OMPI_SUCCESS; + } + + // n_algorithm_values+2 because of 'default' and termination by 0 + mca_base_var_enum_value_t* enum_values = + malloc((n_algorithm_values + 2) * sizeof(mca_base_var_enum_value_t)); + if (NULL == enum_values) { + goto alloc_error; + } + + // always add "default" algorithm + enum_values[0].value = 0; + enum_values[0].string = "default"; + // fill data for other algorithms + for (int i = 0; i < n_algorithm_values; i++) { + enum_values[i + 1].value = i + 1; + enum_values[i + 1].string = algorithm_values[i].name; + } + // last value of enum must be zeroed + enum_values[n_algorithm_values + 1] = (mca_base_var_enum_value_t){ 0 }; + *algorithm_enumerator = enum_values; + return OMPI_SUCCESS; + +alloc_error: + opal_output(0, "coll/han failed to initialize available algorithms " + "(allocation error)"); + return OMPI_ERROR; +} + + +/** + * initializes componnent algorithm_info + */ +int +mca_coll_han_init_algorithms(void) +{ + memset(mca_coll_han_component.num_available_algorithms, 0, + COLLCOUNT * sizeof(int)); + memset(mca_coll_han_component.algorithm_enumerator, 0, + COLLCOUNT * sizeof(mca_base_var_enum_value_t*)); + for (int coll=0; coll_algorithm:enumerator' + * + */ + +#ifndef MCA_COLL_HAN_ALGORITHMS_H +#define MCA_COLL_HAN_ALGORITHMS_H + +/* use this pointer type instead of void* to avoid warnings as it is + * not legal to convert function pointers to void* + * + * note: alternatively we could use a union of function types, but + * then it is heavy to declare available algorithms as nested + * datastructure + */ +typedef void (*fnptr_t)(void); +// Han algorithms, data declarations per collective +// structure used to declare an array of algorithms: {name, fn} +typedef struct mca_coll_han_algorithm_value_s { + char* name; + fnptr_t fn; +} mca_coll_han_algorithm_value_t; + +// datastructure generated from previous by mca_han_init_algorithm_info() +typedef struct mca_coll_han_collective_algorithm_info_s { + mca_base_var_enum_value_t* enum_values; +} mca_coll_han_collective_algorithm_info_t; + +// initialise before using algorithms id name fn var_enum_value +int +mca_coll_han_init_algorithms(void); +int +mca_coll_han_free_algorithms(void); + +int +mca_coll_han_algorithm_name_to_id(COLLTYPE_T coll_id, const char* algorithm_name); +int +mca_coll_han_algorithm_id_is_valid(int coll_id, int algorithm_id); +fnptr_t +mca_coll_han_algorithm_id_to_fn(int coll_id, int algorithm_id); +char* +mca_coll_han_algorithm_id_to_name(int coll_id, int algorithm_id); + +/** + * Available han algorithms + * + * They must be added to 'mca_coll_han_available_algorithms' + * in coll_han_algorithms.c + */ + +int mca_coll_han_barrier_intra_simple(struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +/* Bcast */ +int mca_coll_han_bcast_intra_simple(void *buff, + int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +int mca_coll_han_bcast_intra(void *buff, int count, struct ompi_datatype_t *dtype, int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); + +/* Reduce */ +int +mca_coll_han_reduce_intra_simple(const void *sbuf, + void* rbuf, + int count, + struct ompi_datatype_t *dtype, + ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +int +mca_coll_han_reduce_reproducible_decision(struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +int +mca_coll_han_reduce_reproducible(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); + +int mca_coll_han_reduce_intra(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + ompi_op_t* op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +/* Allreduce */ +int +mca_coll_han_allreduce_intra_simple(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +int +mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +int +mca_coll_han_allreduce_reproducible(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); + +int mca_coll_han_allreduce_intra(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); + +/* Scatter */ +int +mca_coll_han_scatter_intra(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); +int +mca_coll_han_scatter_intra_simple(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +/* Gather */ +int +mca_coll_han_gather_intra(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); +int +mca_coll_han_gather_intra_simple(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); + +/* Allgather */ +int +mca_coll_han_allgather_intra(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); +int +mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); + +#endif diff --git a/ompi/mca/coll/han/coll_han_component.c b/ompi/mca/coll/han/coll_han_component.c index 82344f5b450..656c27abe10 100644 --- a/ompi/mca/coll/han/coll_han_component.c +++ b/ompi/mca/coll/han/coll_han_component.c @@ -2,8 +2,8 @@ * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2020 Bull S.A.S. All rights reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved + * Copyright (c) 2020-2022 Bull S.A.S. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -21,11 +21,13 @@ #include "ompi_config.h" #include "opal/util/show_help.h" +#include "opal/util/argv.h" #include "ompi/constants.h" #include "ompi/mca/coll/coll.h" #include "coll_han.h" #include "coll_han_dynamic.h" #include "coll_han_dynamic_file.h" +#include "coll_han_algorithms.h" #include "ompi/mca/coll/base/coll_base_util.h" /* @@ -97,7 +99,15 @@ mca_coll_han_component_t mca_coll_han_component = { static int han_open(void) { /* Get the global coll verbosity: it will be ours */ - mca_coll_han_component.han_output = ompi_coll_base_framework.framework_output; + if (mca_coll_han_component.han_output_verbose) { + mca_coll_han_component.han_output = opal_output_open(NULL); + opal_output_set_verbosity(mca_coll_han_component.han_output, + mca_coll_han_component.han_output_verbose); + } else { + mca_coll_han_component.han_output = ompi_coll_base_framework.framework_output; + } + + return mca_coll_han_init_dynamic_rules(); } @@ -160,22 +170,37 @@ static bool is_simple_implemented(COLLTYPE_T coll) } } +/** + * topo level conversions both ways; str <-> id + * An enum is used for conversions. + */ +static mca_base_var_enum_value_t level_enumerator[] = { + { INTRA_NODE, "intra_node" }, + { INTER_NODE, "inter_node" }, + { GLOBAL_COMMUNICATOR, "global_communicator" }, + { 0 } +}; + /* * Stringifier for topological level */ -const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl) +const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl_id) { - switch(topo_lvl) { - case INTRA_NODE: - return "intra_node"; - case INTER_NODE: - return "inter_node"; - case GLOBAL_COMMUNICATOR: - return "global_communicator"; - case NB_TOPO_LVL: - default: - return "invalid topologic level"; + for (int i = 0; level_enumerator[i].string != NULL; i++) { + if (topo_lvl_id == (TOPO_LVL_T) level_enumerator[i].value) { + return level_enumerator[i].string; + } + } + return "invalid topologic level"; +} +int mca_coll_han_topo_lvl_name_to_id(const char *topo_level_name) +{ + for (int i = 0; level_enumerator[i].string != NULL; i++) { + if (0 == strcmp(topo_level_name, level_enumerator[i].string)) { + return i; + } } + return -1; } static int @@ -231,6 +256,12 @@ static int han_register(void) OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_priority); + cs->han_output_verbose = 0; + (void) mca_base_component_var_register(c, "verbose", "Verbosity of the HAN coll component (use coll base verbosity if not set)", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_output_verbose); + cs->han_bcast_segsize = 65536; (void) mca_base_component_var_register(c, "bcast_segsize", "segment size for bcast", @@ -333,6 +364,38 @@ static int han_register(void) MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reproducible); + /* + * Han algorithms MCA parameters for each collective. + * Shows algorithms thanks to enumerator + */ + if (OMPI_ERROR == mca_coll_han_init_algorithms()) { // needs to be initialised here to show available algorithms + return OMPI_ERROR; + } + + mca_base_var_enum_t *new_enum; + for(coll = 0 ; coll < COLLCOUNT ; coll++) { + if (!mca_coll_han_is_coll_dynamic_implemented(coll) + || (0 == mca_coll_han_component.num_available_algorithms[coll])) { + continue; + } + cs->use_algorithm[coll] = 0; // default algorithm is 0 + snprintf(param_name, sizeof(param_name), "use_%s_algorithm", + mca_coll_base_colltype_to_str(coll)); + snprintf(param_desc, sizeof(param_desc), "which han algorithm is used for %s", + mca_coll_base_colltype_to_str(coll)); + // note: the enumerator is create in mca_coll_han_init_algorithms() + (void) mca_base_var_enum_create(param_name, + mca_coll_han_component.algorithm_enumerator[coll], + &new_enum); + cs->use_algorithm_param[coll] = mca_base_component_var_register(c, + param_name, + param_desc, + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &(cs->use_algorithm[coll])); + OBJ_RELEASE(new_enum); + } /* * Simple algorithms MCA parameters : @@ -341,15 +404,22 @@ static int han_register(void) * to handle thread noise */ for(coll = 0 ; coll < COLLCOUNT ; coll++) { - cs->use_simple_algorithm[coll] = false; + if (coll != GATHER) { + cs->use_simple_algorithm[coll] = false; + } else { + cs->use_simple_algorithm[coll] = true; + } if(is_simple_implemented(coll)) { + const char *collstr = mca_coll_base_colltype_to_str(coll); snprintf(param_name, sizeof(param_name), "use_simple_%s", - mca_coll_base_colltype_to_str(coll)); - snprintf(param_desc, sizeof(param_desc), "whether to enable simple algo for %s", - mca_coll_base_colltype_to_str(coll)); + collstr); + snprintf(param_desc, sizeof(param_desc), "whether to enable simple algorithm for %s. " + "Prefer use_%s_algorithm=simple or configuration file instead.", + collstr, collstr); mca_base_component_var_register(c, param_name, param_desc, - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, + MCA_BASE_VAR_FLAG_DEPRECATED, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, &(cs->use_simple_algorithm[coll])); @@ -357,7 +427,7 @@ static int han_register(void) } /* Dynamic rules MCA parameters */ - memset(cs->mca_rules, 0, + memset(cs->mca_sub_components, 0, COLLCOUNT * (GLOBAL_COMMUNICATOR+1) * sizeof(COMPONENT_T)); for(coll = 0; coll < COLLCOUNT; coll++) { @@ -367,12 +437,12 @@ static int han_register(void) /* * Default values */ - cs->mca_rules[coll][INTRA_NODE] = TUNED; - cs->mca_rules[coll][INTER_NODE] = BASIC; - cs->mca_rules[coll][GLOBAL_COMMUNICATOR] = HAN; + for (int topo_lvl = 0 ; topo_lvl < GLOBAL_COMMUNICATOR ; topo_lvl++) { + cs->mca_sub_components[coll][topo_lvl] = TUNED; + } + cs->mca_sub_components[coll][GLOBAL_COMMUNICATOR] = HAN; } /* Specific default values */ - cs->mca_rules[BARRIER][INTER_NODE] = TUNED; /* Dynamic rule MCA var registration */ for(coll = 0; coll < COLLCOUNT; coll++) { @@ -409,7 +479,7 @@ static int han_register(void) MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &(cs->mca_rules[coll][topo_lvl])); + &(cs->mca_sub_components[coll][topo_lvl])); } } diff --git a/ompi/mca/coll/han/coll_han_dynamic.c b/ompi/mca/coll/han/coll_han_dynamic.c index fe22efb4ca3..12eb3494848 100644 --- a/ompi/mca/coll/han/coll_han_dynamic.c +++ b/ompi/mca/coll/han/coll_han_dynamic.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * Copyright (c) 2020-2022 Bull S.A.S. All rights reserved. * Copyright (c) 2021 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved @@ -23,6 +23,7 @@ #include "opal/class/opal_list.h" #include "ompi/mca/coll/han/coll_han.h" #include "ompi/mca/coll/han/coll_han_dynamic.h" +#include "ompi/mca/coll/han/coll_han_algorithms.h" #include "ompi/mca/coll/base/coll_base_util.h" /* @@ -272,7 +273,7 @@ get_module(COLLTYPE_T coll_id, COMPONENT_T mca_rule_component; topo_lvl = han_module->topologic_level; - mca_rule_component = mca_coll_han_component.mca_rules[coll_id][topo_lvl]; + mca_rule_component = mca_coll_han_component.mca_sub_components[coll_id][topo_lvl]; mca_coll_han_get_all_coll_modules(comm, han_module); @@ -305,6 +306,60 @@ get_module(COLLTYPE_T coll_id, return han_module->modules_storage.modules[mca_rule_component].module_handler; } +/* + * whether OMPI_MCA_coll_han_use_xxx_algorithm was set by user + */ +static bool +han_algorithm_is_user_provided(COLLTYPE_T coll_id) { + const int *value = NULL; + mca_base_var_source_t source; + mca_base_var_get_value(mca_coll_han_component.use_algorithm_param[coll_id], + &value, &source, NULL); + return (MCA_BASE_VAR_SOURCE_DEFAULT != source); +} + +/* + * Return the algorithm to use for the collective coll_id + * for a msg_size sized message on the comm communicator + * following the dynamic rules + */ +static int +get_algorithm(COLLTYPE_T coll_id, + size_t msg_size, + struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) +{ + int algorithm_id = -1; + int rank = ompi_comm_rank(comm); + algorithm_id = mca_coll_han_component.use_algorithm[coll_id]; + if (!han_algorithm_is_user_provided(coll_id)) { + /* find the correct dynamic rule to check */ + const msg_size_rule_t *dynamic_rule = get_dynamic_rule(coll_id, + msg_size, + comm, + han_module); + if(NULL != dynamic_rule && dynamic_rule->algorithm_id >= 0) { + /* Use dynamic rule from file */ + algorithm_id = dynamic_rule->algorithm_id; + } else { + /* + * No dynamic rule from file + * Use default behaviour + */ + algorithm_id = 0; + } + } + if ( 0 == rank ) { + opal_output_verbose(1, mca_coll_han_component.han_output, + "coll:han:get_algorithm %s size:%ld algorithm:%d %s", + mca_coll_base_colltype_to_str(coll_id), + msg_size, + algorithm_id, + mca_coll_han_algorithm_id_to_name(coll_id, algorithm_id)); + } + return algorithm_id; +} + /* * Allgather selector: @@ -390,10 +445,17 @@ mca_coll_han_allgather_intra_dynamic(const void *sbuf, int scount, * sub_module->coll_allgather is valid and point to this function * Call han topological collective algorithm */ - if(mca_coll_han_component.use_simple_algorithm[ALLGATHER]) { - allgather = mca_coll_han_allgather_intra_simple; - } else { - allgather = mca_coll_han_allgather_intra; + int algorithm_id = get_algorithm(ALLGATHER, + dtype_size, + comm, + han_module); + allgather = (mca_coll_base_module_allgather_fn_t)mca_coll_han_algorithm_id_to_fn(ALLGATHER, algorithm_id); + if (NULL == allgather) { /* default behaviour */ + if(mca_coll_han_component.use_simple_algorithm[ALLGATHER]) { + allgather = mca_coll_han_allgather_intra_simple; + } else { + allgather = mca_coll_han_allgather_intra; + } } } else { /* @@ -608,7 +670,7 @@ mca_coll_han_allreduce_intra_dynamic(const void *sbuf, allreduce = han_module->previous_allreduce; sub_module = han_module->previous_allreduce_module; } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { - /* Reproducibility: fallback on reproducible algo */ + /* Reproducibility: fallback on reproducible algorithm */ if (mca_coll_han_component.han_reproducible) { allreduce = mca_coll_han_allreduce_reproducible; } else { @@ -618,13 +680,17 @@ mca_coll_han_allreduce_intra_dynamic(const void *sbuf, * sub_module->coll_allreduce is valid and point to this function * Call han topological collective algorithm */ - if(mca_coll_han_component.use_simple_algorithm[ALLREDUCE]) { - allreduce = mca_coll_han_allreduce_intra_simple; - } else { - allreduce = mca_coll_han_allreduce_intra; + int algorithm_id = get_algorithm(ALLREDUCE, dtype_size, comm, han_module); + allreduce = (mca_coll_base_module_allreduce_fn_t) mca_coll_han_algorithm_id_to_fn(ALLREDUCE, algorithm_id); + + if (NULL == allreduce) { /* default behaviour */ + if(mca_coll_han_component.use_simple_algorithm[ALLREDUCE]) { + allreduce = mca_coll_han_allreduce_intra_simple; + } else { + allreduce = mca_coll_han_allreduce_intra; + } } } - sub_module = module; } else { /* * If we get here: @@ -716,7 +782,11 @@ mca_coll_han_barrier_intra_dynamic(struct ompi_communicator_t *comm, * sub_module->coll_barrier is valid and point to this function * Call han topological collective algorithm */ - barrier = mca_coll_han_barrier_intra_simple; + int algorithm_id = get_algorithm(BARRIER, 0, comm, han_module); + barrier = (mca_coll_base_module_barrier_fn_t) mca_coll_han_algorithm_id_to_fn(BARRIER, algorithm_id); + if (NULL == barrier) { /* default behaviour*/ + barrier = mca_coll_han_barrier_intra_simple; + } } else { /* * If we get here: @@ -813,12 +883,18 @@ mca_coll_han_bcast_intra_dynamic(void *buff, * sub_module->coll_bcast is valid and point to this function * Call han topological collective algorithm */ - if(mca_coll_han_component.use_simple_algorithm[BCAST]) { - bcast = mca_coll_han_bcast_intra_simple; - } else { - bcast = mca_coll_han_bcast_intra; + int algorithm_id = get_algorithm(BCAST, + dtype_size, + comm, + han_module); + bcast = (mca_coll_base_module_bcast_fn_t)mca_coll_han_algorithm_id_to_fn(BCAST, algorithm_id); + if (NULL == bcast) { /* default behaviour */ + if(mca_coll_han_component.use_simple_algorithm[BCAST]) { + bcast = mca_coll_han_bcast_intra_simple; + } else { + bcast = mca_coll_han_bcast_intra; + } } - sub_module = module; } else { /* * If we get here: @@ -923,10 +999,17 @@ mca_coll_han_gather_intra_dynamic(const void *sbuf, int scount, * sub_module->coll_gather is valid and point to this function * Call han topological collective algorithm */ - if(mca_coll_han_component.use_simple_algorithm[GATHER]) { - gather = mca_coll_han_gather_intra_simple; - } else { - gather = mca_coll_han_gather_intra; + int algorithm_id = get_algorithm(GATHER, + dtype_size, + comm, + han_module); + gather = (mca_coll_base_module_gather_fn_t) mca_coll_han_algorithm_id_to_fn(GATHER, algorithm_id); + if (NULL == gather) { /* default behaviour */ + if(mca_coll_han_component.use_simple_algorithm[GATHER]) { + gather = mca_coll_han_gather_intra_simple; + } else { + gather = mca_coll_han_gather_intra; + } } } else { /* @@ -1024,7 +1107,7 @@ mca_coll_han_reduce_intra_dynamic(const void *sbuf, reduce = han_module->previous_reduce; sub_module = han_module->previous_reduce_module; } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { - /* Reproducibility: fallback on reproducible algo */ + /* Reproducibility: fallback on reproducible algorithm */ if (mca_coll_han_component.han_reproducible) { reduce = mca_coll_han_reduce_reproducible; } else { @@ -1034,13 +1117,19 @@ mca_coll_han_reduce_intra_dynamic(const void *sbuf, * sub_module->coll_reduce is valid and point to this function * Call han topological collective algorithm */ - if(mca_coll_han_component.use_simple_algorithm[REDUCE]) { - reduce = mca_coll_han_reduce_intra_simple; - } else { - reduce = mca_coll_han_reduce_intra; + int algorithm_id = get_algorithm(REDUCE, + dtype_size, + comm, + han_module); + reduce = (mca_coll_base_module_reduce_fn_t)mca_coll_han_algorithm_id_to_fn(REDUCE, algorithm_id); + if (NULL == reduce) { /* default behaviour */ + if(mca_coll_han_component.use_simple_algorithm[REDUCE]) { + reduce = mca_coll_han_reduce_intra_simple; + } else { + reduce = mca_coll_han_reduce_intra; + } } } - sub_module = module; } else { /* * If we get here: @@ -1145,10 +1234,17 @@ mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount, * sub_module->coll_scatter is valid and point to this function * Call han topological collective algorithm */ - if(mca_coll_han_component.use_simple_algorithm[SCATTER]) { - scatter = mca_coll_han_scatter_intra_simple; - } else { - scatter = mca_coll_han_scatter_intra; + int algorithm_id = get_algorithm(SCATTER, + dtype_size, + comm, + han_module); + scatter = (mca_coll_base_module_scatter_fn_t)mca_coll_han_algorithm_id_to_fn(SCATTER, algorithm_id); + if (NULL == scatter) { /* default behaviour */ + if(mca_coll_han_component.use_simple_algorithm[SCATTER]) { + scatter = mca_coll_han_scatter_intra_simple; + } else { + scatter = mca_coll_han_scatter_intra; + } } } else { /* diff --git a/ompi/mca/coll/han/coll_han_dynamic.h b/ompi/mca/coll/han/coll_han_dynamic.h index 3343a5bce7d..661238baf94 100644 --- a/ompi/mca/coll/han/coll_han_dynamic.h +++ b/ompi/mca/coll/han/coll_han_dynamic.h @@ -22,6 +22,7 @@ #include "ompi/mca/mca.h" #include "opal/util/output.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/han/coll_han.h" /* * @file @@ -53,7 +54,7 @@ * - MCA parameter defined rules * - File defined rules * - * MCA parameter defined rules are stored in mca_coll_han_component.mca_rules. + * MCA parameter defined rules are stored in mca_coll_han_component.mca_sub_components. * This is a double indexed table. The first index is the corresponding collective * communication and the second index is the topological level aimed by the rule. * These parameters define the collective component to use for a specific @@ -139,6 +140,7 @@ typedef struct msg_size_rule_s { /* Component to use on this specific configuration * and message size */ COMPONENT_T component; + int algorithm_id; } msg_size_rule_t; /* Rule for a specific configuration @@ -209,5 +211,6 @@ typedef struct mca_coll_han_collective_modules_storage_s { /* Tests if a dynamic collective is implemented */ bool mca_coll_han_is_coll_dynamic_implemented(COLLTYPE_T coll_id); COMPONENT_T mca_coll_han_component_name_to_id(const char* name); +int mca_coll_han_topo_lvl_name_to_id(const char *topo_level_str); #endif diff --git a/ompi/mca/coll/han/coll_han_dynamic_file.c b/ompi/mca/coll/han/coll_han_dynamic_file.c index 5736a58e96d..6d3a7c245f7 100644 --- a/ompi/mca/coll/han/coll_han_dynamic_file.c +++ b/ompi/mca/coll/han/coll_han_dynamic_file.c @@ -3,8 +3,8 @@ * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2020 Bull S.A.S. All rights reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved + * Copyright (c) 2020-2022 Bull S.A.S. All rights reserved. * * $COPYRIGHT$ * @@ -29,6 +29,7 @@ #include "coll_han.h" #include "coll_han_dynamic.h" #include "coll_han_dynamic_file.h" +#include "coll_han_algorithms.h" #include "ompi/mca/coll/base/coll_base_util.h" @@ -59,8 +60,11 @@ mca_coll_han_init_dynamic_rules(void) int i, j, k, l; /* Collective information */ - long nb_coll, coll_id; + long nb_coll; + COLLTYPE_T coll_id; + int algorithm_id; char * coll_name = NULL; + char * algorithm_name = NULL; collective_rule_t *coll_rules; /* Topo information */ @@ -92,7 +96,8 @@ mca_coll_han_init_dynamic_rules(void) return OMPI_SUCCESS; } - if( NULL == (fptr = fopen(fname, "r")) ) { + fptr = fopen(fname, "r"); + if( NULL == fptr ) { opal_output_verbose(5, mca_coll_han_component.han_output, "coll:han:mca_coll_han_init_dynamic_rules cannot open dynamic file provided by " "coll_han_dynamic_rules_filename=%s. Make sure it provides the full path and " @@ -126,6 +131,7 @@ mca_coll_han_init_dynamic_rules(void) /* Iterates on collective rules */ for( i = 0 ; i < nb_coll ; i++ ) { coll_rules[i].nb_topologic_levels = 0; + coll_rules[i].topologic_rules = NULL; mca_coll_han_component.dynamic_rules.nb_collectives = i+1; /* Get the collective identifier */ @@ -158,7 +164,7 @@ mca_coll_han_init_dynamic_rules(void) if(!mca_coll_han_is_coll_dynamic_implemented(coll_id)) { opal_output_verbose(5, mca_coll_han_component.han_output, "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " - "read collective id %ld at line %d but this collective is not implemented yet. " + "read collective id %d at line %d but this collective is not implemented yet. " "This is not an error but this set of rules will not be used\n", fname, coll_id, fileline); } @@ -199,17 +205,31 @@ mca_coll_han_init_dynamic_rules(void) /* Iterates on topologic rules */ for( j = 0 ; j < nb_topo ; j++ ) { topo_rules[j].nb_rules = 0; + topo_rules[j].configuration_rules = NULL; coll_rules[i].nb_topologic_levels = j+1; /* Get the topologic level identifier */ - if( (getnext_long(fptr, &topo_lvl) < 0) || (topo_lvl < INTRA_NODE) || (topo_lvl >= NB_TOPO_LVL) ) { + char *topo_lvl_name = NULL; + if( getnext_string(fptr, &topo_lvl_name) < 0 ) { opal_output_verbose(5, mca_coll_han_component.han_output, "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " - "at line %d: an invalid topo level %ld is given or the reader encountered an unexpected EOF. " - "Topologic level must be at least %d and less than %d\n", - fname, fileline, topo_lvl, INTRA_NODE, NB_TOPO_LVL); + "at line %d: cannot read the name/id of a topo level\n", + fname, fileline); goto file_reading_error; } + topo_lvl = mca_coll_han_topo_lvl_name_to_id(topo_lvl_name); + if (topo_lvl < 0) { + char *endp; + topo_lvl = (int)strtol(topo_lvl_name, &endp, 10); + if (('\0' != *endp ) || (topo_lvl < INTRA_NODE) || (topo_lvl >= NB_TOPO_LVL)) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: unknown topo level '%s'\n", + fname, fileline, topo_lvl_name); + goto file_reading_error; + } + } + free (topo_lvl_name); /* * The first information of a topologic rule @@ -249,6 +269,7 @@ mca_coll_han_init_dynamic_rules(void) /* Iterate on configuration rules */ for( k = 0; k < nb_rules; k++ ) { conf_rules[k].nb_msg_size = 0; + conf_rules[k].msg_size_rules = NULL; topo_rules[j].nb_rules = k+1; /* Get the configuration size */ @@ -334,12 +355,42 @@ mca_coll_han_init_dynamic_rules(void) goto file_reading_error; } + /* Get the optionnal algorithm for han */ + algorithm_id = 0; // default for all collectives + if ((component == HAN) && (1 == ompi_coll_base_file_peek_next_char_is(fptr, &fileline, '@')) ) { + + if( getnext_string(fptr, &algorithm_name) < 0 ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: cannot read the name/id of an algorithm\n", + fname, fileline); + goto file_reading_error; + } + algorithm_id = mca_coll_han_algorithm_name_to_id(coll_id, algorithm_name); + if (algorithm_id < 0) { + char *endp; + algorithm_id = (int)strtol(algorithm_name, &endp, 10); + if (('\0' != *endp ) || !mca_coll_han_algorithm_id_is_valid(coll_id, algorithm_id)) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: unknown algorithm '%s' for %s\n", + fname, fileline, algorithm_name, coll_name); + goto file_reading_error; + } + } + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found for coll=%s msg_size=%ld : algorithm '%s' %d\n", + coll_name, msg_size, algorithm_name, algorithm_id); + } + + /* Store message size rule information */ msg_size_rules[l].collective_id = coll_id; msg_size_rules[l].topologic_level = topo_lvl; msg_size_rules[l].configuration_size = conf_size; msg_size_rules[l].msg_size = msg_size; msg_size_rules[l].component = (COMPONENT_T)component; + msg_size_rules[l].algorithm_id = algorithm_id; nb_entries++; /* do we have the optional segment length */