diff --git a/contrib/coll_tuned_rulefile_converter.py b/contrib/coll_tuned_rulefile_converter.py new file mode 100755 index 00000000000..f9709653d70 --- /dev/null +++ b/contrib/coll_tuned_rulefile_converter.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2024-2025 Amazon.com, Inc. or its affiliates. +# All Rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow + +import re +import json +from collections import OrderedDict + +coll_dict = { + 'allgather' : 0, + 'allgatherv' : 1, + 'allreduce' : 2, + 'alltoall' : 3, + 'alltoallv' : 4, + 'alltoallw' : 5, + 'barrier' : 6, + 'bcast' : 7, + 'exscan' : 8, + 'gather' : 9, + 'gatherv' : 10, + 'reduce' : 11, + 'reducescatter' : 12, + 'reducescatterblock' : 13, + 'scan' : 14, + 'scatter' : 15, + 'scatterv' : 16, + 'neighbor_allgather' : 17, + 'neighbor_allgatherv' : 18, + 'neighbor_alltoall' : 19, + 'neighbor_alltoallv' : 20, + 'neighbor_alltoallw' : 21 } +coll_dict_rev = { v:k for k,v in coll_dict.items() } + +han_component_dict = { + "self" : 0, + "basic" : 1, + "libnbc" : 2, + "tuned" : 3, + "sm" : 4, + "adapt" : 5, + "han" : 6, +} + +han_topo_level_dict = { + 'intra_node' : 0, + 'inter_node' : 1, + 'global_communicator' : 2, +} + + +def strip_comments(line): + return re.sub(r"#.*","",line).strip() + +class GenericOpenMPIRuleReader(): + def __init__(self, fp, fname_for_prints=""): + self.fp = fp + # The 1-indexed line number which corresponds to the next byte of fp read. + self.jline = 1 + self.line_start = 0 + def get_next_line(self): + while True: + self.line_start = self.fp.tell() + line = self.fp.readline() + if not line: return None + self.jline += 1 + if strip_comments(line): + return line + + def isnext_digit(self): + # ompi_coll_base_file_peek_next_char_isdigit + tell = self.fp.tell() + while True: + next = self.fp.read(1) + if next in ' \t': + tell += 1 + continue + self.fp.seek(tell) + return next in '0123456789' + + def get_next(self): + # (ompi_coll_base_file_getnext_long) + while True: + line = self.get_next_line() + if not line: return None + UNK = -1 + jnum_start = UNK + jnum_end = UNK + for jc in range(len(line)): + if line[jc] in "#": + break + if line[jc] in '0123456789': + if jnum_start == UNK: + jnum_start = jc + jnum_end = jc + else: + if jnum_end != UNK: + break + if jnum_end != UNK: + self.fp.seek(self.line_start+jnum_end+1) + # decrement the line number, the next read will continue on this line. + self.jline -= 1 + return int(line[jnum_start:jnum_end+1]) + + def read_header(self): + line = self.get_next_line() + match = re.match("rule-file-version-([0-9])", line) + if match: + return int(match.group(1)) + else: + self.jline -= 1 + self.fp.seek(self.line_start) + return 1 + +class TunedRuleReader(GenericOpenMPIRuleReader): + def load_rulefile(self): + json_root = OrderedDict() + file_ver = self.read_header() + json_root['rule_file_version'] = 3 + json_root['module'] = 'tuned' + json_root['collectives'] = OrderedDict() + + ncollectives = self.get_next() + for jcol in range(ncollectives): + coll_id = self.get_next() + coll_name = coll_dict_rev[coll_id] + comm_rules = [] + ncomm_sizes = self.get_next() + for jcomm_size in range(ncomm_sizes): + comm_size = self.get_next() + nmsg_sizes = self.get_next() + comm_rule = OrderedDict() + comm_rule['comm_size_min'] = 0 + if jcomm_size+1 < ncomm_sizes: + comm_rule['comm_size_max'] = max(comm_size-1, 0) + if jcomm_size > 0: + comm_rule['comm_size_min'] = comm_rules[jcomm_size-1]['comm_size_max'] + 1 + msg_rules = [] + for jmsg in range(nmsg_sizes): + msg_size = self.get_next() + result_alg = self.get_next() + result_topo_faninout = self.get_next() + result_segsize = self.get_next() + rule = OrderedDict() + rule['msg_size_min'] = msg_size + if jmsg < nmsg_sizes - 1: + rule['msg_size_max'] = 'Inf' + if jmsg > 0: + msg_rules[jmsg-1]['msg_size_max'] = msg_size - 1 + rule['alg'] = result_alg + if result_topo_faninout != 0: + rule['faninout'] = result_topo_faninout + if result_segsize != 0: + rule['segsize'] = result_segsize + result_maxreq = 0 + if file_ver > 1 and self.isnext_digit(): + result_maxreq = self.get_next() + if result_maxreq != 0: + rule['reqs'] = result_maxreq + msg_rules.append(rule) + comm_rule['rules'] = msg_rules + comm_rules.append(comm_rule) + json_root['collectives'][coll_name] = comm_rules + return json_root + +class TunedRuleWriter(): + def __init__(self): + pass + def to_file(json_rules): + for coll in coll_dict.keys(): + if coll in json_rules['collectives']: + pass + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--input","-i", type=argparse.FileType('r'), required=True) + # parser.add_argument("--output","-o",type=argparse.FileType('w'), required=True) + + args = parser.parse_args() + reader = TunedRuleReader(args.input) + print(json.dumps(reader.load_rulefile(), indent=4)) diff --git a/docs/tuning-apps/coll-tuned.rst b/docs/tuning-apps/coll-tuned.rst index 190626d579f..fa9c7ba7236 100644 --- a/docs/tuning-apps/coll-tuned.rst +++ b/docs/tuning-apps/coll-tuned.rst @@ -92,14 +92,77 @@ after. .. code-block:: sh shell$ mpirun ... --mca coll_tuned_use_dynamic_rules 1 \ - --mca coll_tuned_dynamic_rules_filename /path/to/my_rules.conf ... + --mca coll_tuned_dynamic_rules_filename /path/to/my_rules.json ... The loaded set of rules then are used to select the algorithm to use based on the collective, the communicator size, and the message size. Collectives for which rules have not be specified in the file will make use of the *fixed decision* rules as usual. -Dynamic tuning files are organized in this format: +Starting with Open MPI 6.0, dynamic tuning files can be specified in JSON +format, although the classic format will still be accepted. A converter script +is also available to transfer classic format files into JSON. + +The JSON format can be checked using the schema in +`docs/tuning-apps/tuned_dynamic_file_schema.json`. If your editor supports it, +this schema may provide validation of your file along with helpful tooltips for +each variable. + +An example file is shown here: + +.. code-block:: json + + { + "$schema": "tuned_schema.json", + "rule_file_version" : 3, + "module" : "tuned", + "collectives" : { + "allreduce" : + [ + { + "comm_size_min" : 64, + "comm_size_max" : 128, + "rules" : [ + { + "msg_size_min" : 512, + "msg_size_max" : 511999, + "alg" : 2, + }, + { + "msg_size_min" : 512000, + "msg_size_max" : "inf", + "alg" : "recursive_doubling", + "reqs" : 8 + } + ] + } + ] + } + } + +In this toy example the MPI_Allreduce collective (indicated by the `allreduce` +field) has two algorithms that will only be used on communicators with between +64 and 128 ranks. Additionally, those rules only apply to certain message +sizes. All others communicator sizes or message sizes fall back to the default +set of rules, and collectives other than MPI_Allreduce are not affected. + +Unlike in the classic file format, there is no need to specify a default rule or +specify rules in increasing order. Overlapping message sizes or communicator +sizes are allowed, and won't emit warnings. + +The process for selecting the matching rule is a simple first-match principle. +During communicator creation, the first set of communicator-rules which +satisfies the requirements (`comm_size_min`/`comm_size_max`) is selected. Then, +during each collective call, the message size is used to find the first matching +entry in the "rules" list. + +The algorithm selected is indicated by the `alg` field. It may be either an +integer mapping to the classic file format, or a string. In both cases, the +value is checked against the appropriate coll_tuned__algorithm MCA +parameter, and un-recognized values will cause the rule to be ignored. + + +Classic file format: .. code-block:: sh :linenos: diff --git a/docs/tuning-apps/tuned_dynamic_file_schema.json b/docs/tuning-apps/tuned_dynamic_file_schema.json new file mode 100644 index 00000000000..844bcd2444c --- /dev/null +++ b/docs/tuning-apps/tuned_dynamic_file_schema.json @@ -0,0 +1,130 @@ +{ + "$schema": "https://json-schema.org/draft/2019-09/schema#", + "title": "OpenMPITunedRules", + "description": "Defines configuration for the Open MPI Tuned module to select which collective algorithms will be used depending on comm size, message size, etc.", + "type": "object", + "required": ["rule_file_version","module","collectives"], + "additionalProperties" : false, + "properties": { + "rule_file_version": { + "description": "The version of this configuration file", + "type": "number" + }, + "module": { + "description": "The collective module intended to use these rules (tuned)", + "type": "string" + }, + "$schema": { + "description": "The schema used for validation", + "type": "string" + }, + "collectives" : { + "description": "The collectives, each with their own rules. Each collective is indicated by a lowercase property such as \"allgather\"", + "type": "object", + "additionalProperties" : false, + "patternProperties": { + "^(allgather|allreduce|alltoall|alltoallv|alltoallw|barrier)$": { + "type" : "array", + "items": { "$ref" : "#/$defs/comm_size_rule" } + }, + "^(bcast|exscan|gather|gatherv|reduce|reducescatter|reducescatterblock)$": { + "type" : "array", + "items": { "$ref" : "#/$defs/comm_size_rule" } + }, + "^(scan|scatter|scatterv|neighbor_allgather|neighbor_allgatherv)$": { + "type" : "array", + "items": { "$ref" : "#/$defs/comm_size_rule" } + }, + "^(neighbor_alltoall|neighbor_alltoallv|neighbor_alltoallw)$": { + "type" : "array", + "items": { "$ref" : "#/$defs/comm_size_rule" } + } + } + } + }, + + "$defs": { + "msg_size_rule": { + "type": "object", + "required": ["alg"], + "additionalProperties" : false, + "properties" : { + "msg_size_min" : { + "description" : "The smallest message size in bytes this rule applies to", + "anyOf" : { "$ref" : "#/$defs/int_or_inf" } + }, + "msg_size_max" : { + "description" : "The largest message size (inclusive) in bytes this rule applies to", + "anyOf" : { "$ref" : "#/$defs/int_or_inf" } + }, + "alg" : { + "description" : "The algorithm to use for this collective. Integer or name, see coll_tuned__algorithm for options.", + "type" : [ "string", "integer"] + }, + "reqs" : { + "description" : "Algorithm parameter: Use this many requests. Some algorithms may ignore this option.", + "type" : [ "integer"] + }, + "faninout" : { + "description" : "Algorithm parameter: Fan in and/or out by this much. Some algorithms may ignore this option.", + "type" : [ "integer"] + } + } + }, + + "comm_size_rule": { + "type": "object", + "required": ["rules"], + "additionalProperties" : false, + "properties" : { + "comm_size_min" : { + "description" : "The smallest size communicator these rules apply to", + "anyOf" : { "$ref" : "#/$defs/int_or_inf" } + }, + "comm_size_max" : { + "description" : "The largest (inclusive) size communicator these rules apply to", + "anyOf" : { "$ref" : "#/$defs/int_or_inf" } + }, + "comm_rank_distribution" : { + "description" : "A description of how the ranks are distributed within the communicator", + "enum" : ["any", "one-per-node", "single-node"] + }, + + "rules" : { + "description" : "A list of rules. The first matching rule is selected. If no match is found, defaults are used.", + "type" : "array", + "items": { "$ref" : "#/$defs/msg_size_rule" } + } + } + }, + "collective_identifier": { + "enum" : [ + "allgather", + "allreduce", + "alltoall", + "alltoallv", + "alltoallw", + "barrier", + "bcast", + "exscan", + "gather", + "gatherv", + "reduce", + "reducescatter", + "reducescatterblock", + "scan", + "scatter", + "scatterv", + "neighbor_allgather", + "neighbor_allgatherv", + "neighbor_alltoall", + "neighbor_alltoallv", + "neighbor_alltoallw" + ] + }, + "int_or_inf": [ + { "type" : "integer" }, + { "enum": ["inf","INF","Inf"] } + ] + } +} \ No newline at end of file diff --git a/ompi/mca/coll/tuned/coll_tuned.h b/ompi/mca/coll/tuned/coll_tuned.h index 4fbc086104a..bb5ed3f762a 100644 --- a/ompi/mca/coll/tuned/coll_tuned.h +++ b/ompi/mca/coll/tuned/coll_tuned.h @@ -7,6 +7,8 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2019 Mellanox Technologies. All rights reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -30,6 +32,8 @@ BEGIN_C_DECLS +#define COLL_TUNED_TRACING_VERBOSE 50 + /* these are the same across all modules and are loaded at component query time */ extern int ompi_coll_tuned_stream; extern int ompi_coll_tuned_priority; @@ -216,4 +220,9 @@ struct mca_coll_tuned_module_t { typedef struct mca_coll_tuned_module_t mca_coll_tuned_module_t; OBJ_CLASS_DECLARATION(mca_coll_tuned_module_t); +int coll_tuned_alg_from_str(int collective_id, const char *alg_name, int *alg_index); +int coll_tuned_alg_to_str(int collective_id, int alg_value, char **alg_string); +int coll_tuned_alg_register_options(int collective_id, mca_base_var_enum_t *options); + + #endif /* MCA_COLL_TUNED_EXPORT_H */ diff --git a/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c b/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c index 7ca899accb7..052c1d5f9e4 100644 --- a/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c @@ -5,6 +5,8 @@ * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2021 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -86,6 +88,7 @@ ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, &coll_tuned_allgather_forced_algorithm); + coll_tuned_alg_register_options( ALLGATHER, new_enum ); OBJ_RELEASE(new_enum); if (mca_param_indices->algorithm_param_index < 0) { return mca_param_indices->algorithm_param_index; @@ -132,7 +135,7 @@ int ompi_coll_tuned_allgather_intra_do_this(const void *sbuf, size_t scount, mca_coll_base_module_t *module, int algorithm, int faninout, int segsize) { - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:allgather_intra_do_this selected algorithm %d topo faninout %d segsize %d", algorithm, faninout, segsize)); switch (algorithm) { @@ -173,7 +176,7 @@ int ompi_coll_tuned_allgather_intra_do_this(const void *sbuf, size_t scount, rbuf, rcount, rdtype, comm, module); } /* switch */ - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:allgather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", algorithm, ompi_coll_tuned_forced_max_algorithms[ALLGATHER])); return (MPI_ERR_ARG); diff --git a/ompi/mca/coll/tuned/coll_tuned_allgatherv_decision.c b/ompi/mca/coll/tuned/coll_tuned_allgatherv_decision.c index b13c1765747..5eb9801222c 100644 --- a/ompi/mca/coll/tuned/coll_tuned_allgatherv_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_allgatherv_decision.c @@ -6,6 +6,8 @@ * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2021 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -84,6 +86,7 @@ ompi_coll_tuned_allgatherv_intra_check_forced_init(coll_tuned_force_algorithm_mc OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_CONSTANT, &coll_tuned_allgatherv_forced_algorithm); + coll_tuned_alg_register_options( ALLGATHERV, new_enum ); OBJ_RELEASE(new_enum); if (mca_param_indices->algorithm_param_index < 0) { return mca_param_indices->algorithm_param_index; @@ -132,7 +135,7 @@ int ompi_coll_tuned_allgatherv_intra_do_this(const void *sbuf, size_t scount, int algorithm, int faninout, int segsize) { - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:allgatherv_intra_do_this selected algorithm %d topo faninout %d segsize %d", algorithm, faninout, segsize)); @@ -166,7 +169,7 @@ int ompi_coll_tuned_allgatherv_intra_do_this(const void *sbuf, size_t scount, rbuf, rcounts, rdispls, rdtype, comm, module); } /* switch */ - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:allgatherv_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", algorithm, ompi_coll_tuned_forced_max_algorithms[ALLGATHERV])); return (MPI_ERR_ARG); diff --git a/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c b/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c index 9836317f267..9a63d8c5abb 100644 --- a/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c @@ -5,6 +5,8 @@ * reserved. * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -84,6 +86,7 @@ int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorith OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, &coll_tuned_allreduce_forced_algorithm); + coll_tuned_alg_register_options( ALLREDUCE, new_enum ); OBJ_RELEASE(new_enum); if (mca_param_indices->algorithm_param_index < 0) { return mca_param_indices->algorithm_param_index; @@ -129,8 +132,9 @@ int ompi_coll_tuned_allreduce_intra_do_this(const void *sbuf, void *rbuf, size_t mca_coll_base_module_t *module, int algorithm, int faninout, int segsize) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d", - algorithm, faninout, segsize)); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d", + algorithm, faninout, segsize)); switch (algorithm) { case (0): @@ -150,7 +154,8 @@ int ompi_coll_tuned_allreduce_intra_do_this(const void *sbuf, void *rbuf, size_t case (7): return ompi_coll_base_allreduce_intra_allgather_reduce(sbuf, rbuf, count, dtype, op, comm, module); } /* switch */ - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, ompi_coll_tuned_forced_max_algorithms[ALLREDUCE])); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", + algorithm, ompi_coll_tuned_forced_max_algorithms[ALLREDUCE])); return (MPI_ERR_ARG); } diff --git a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c index 00d564e9501..e3482116c84 100644 --- a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c @@ -6,6 +6,8 @@ * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -75,12 +77,13 @@ int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm mca_param_indices->algorithm_param_index = mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, "alltoall_algorithm", - "Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 pairwise, 3: modified bruck, 4: linear with sync, 5:two proc only. " + "Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 pairwise, 3: modified_bruck, 4: linear_sync, 5:two_proc. " "Only relevant if coll_tuned_use_dynamic_rules is true.", MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, &coll_tuned_alltoall_forced_algorithm); + coll_tuned_alg_register_options( ALLTOALL, new_enum ); OBJ_RELEASE(new_enum); if (mca_param_indices->algorithm_param_index < 0) { return mca_param_indices->algorithm_param_index; @@ -164,8 +167,9 @@ int ompi_coll_tuned_alltoall_intra_do_this(const void *sbuf, size_t scount, int algorithm, int faninout, int segsize, int max_requests) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this selected algorithm %d topo faninout %d segsize %d", - algorithm, faninout, segsize)); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:alltoall_intra_do_this selected algorithm %d topo faninout %d segsize %d", + algorithm, faninout, segsize)); switch (algorithm) { case (0): @@ -181,7 +185,8 @@ int ompi_coll_tuned_alltoall_intra_do_this(const void *sbuf, size_t scount, case (5): return ompi_coll_base_alltoall_intra_two_procs(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module); } /* switch */ - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL])); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:alltoall_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", + algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL])); return (MPI_ERR_ARG); } diff --git a/ompi/mca/coll/tuned/coll_tuned_alltoallv_decision.c b/ompi/mca/coll/tuned/coll_tuned_alltoallv_decision.c index f0066a966bb..60a8fb10c85 100644 --- a/ompi/mca/coll/tuned/coll_tuned_alltoallv_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_alltoallv_decision.c @@ -5,6 +5,8 @@ * reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -77,7 +79,7 @@ int ompi_coll_tuned_alltoallv_intra_check_forced_init(coll_tuned_force_algorithm OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, &coll_tuned_alltoallv_forced_algorithm); - + coll_tuned_alg_register_options( ALLTOALLV, new_enum ); OBJ_RELEASE(new_enum); if (mca_param_indices->algorithm_param_index < 0) { return mca_param_indices->algorithm_param_index; @@ -96,7 +98,7 @@ int ompi_coll_tuned_alltoallv_intra_do_this(const void *sbuf, ompi_count_array_t mca_coll_base_module_t *module, int algorithm) { - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:alltoallv_intra_do_this selected algorithm %d ", algorithm)); @@ -114,7 +116,7 @@ int ompi_coll_tuned_alltoallv_intra_do_this(const void *sbuf, ompi_count_array_t rbuf, rcounts, rdisps, rdtype, comm, module); } /* switch */ - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:alltoall_intra_do_this attempt to select " "algorithm %d when only 0-%d is valid.", algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALLV])); diff --git a/ompi/mca/coll/tuned/coll_tuned_barrier_decision.c b/ompi/mca/coll/tuned/coll_tuned_barrier_decision.c index dca24ad27d2..fa2c3f2bf29 100644 --- a/ompi/mca/coll/tuned/coll_tuned_barrier_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_barrier_decision.c @@ -3,6 +3,8 @@ * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -78,6 +80,7 @@ int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_ OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, &coll_tuned_barrier_forced_algorithm); + coll_tuned_alg_register_options( BARRIER, new_enum ); OBJ_RELEASE(new_enum); if (mca_param_indices->algorithm_param_index < 0) { return mca_param_indices->algorithm_param_index; @@ -90,7 +93,7 @@ int ompi_coll_tuned_barrier_intra_do_this (struct ompi_communicator_t *comm, mca_coll_base_module_t *module, int algorithm, int faninout, int segsize) { - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:barrier_intra_do_this selected algorithm %d topo fanin/out%d", algorithm, faninout)); @@ -103,7 +106,8 @@ int ompi_coll_tuned_barrier_intra_do_this (struct ompi_communicator_t *comm, case (5): return ompi_coll_base_barrier_intra_two_procs(comm, module); case (6): return ompi_coll_base_barrier_intra_tree(comm, module); } /* switch */ - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER])); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:barrier_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", + algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER])); return (MPI_ERR_ARG); } diff --git a/ompi/mca/coll/tuned/coll_tuned_bcast_decision.c b/ompi/mca/coll/tuned/coll_tuned_bcast_decision.c index 2f14f9c12e1..90ecab634bf 100644 --- a/ompi/mca/coll/tuned/coll_tuned_bcast_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_bcast_decision.c @@ -5,6 +5,8 @@ * reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -86,6 +88,7 @@ int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mc OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, &coll_tuned_bcast_forced_algorithm); + coll_tuned_alg_register_options( BCAST, new_enum ); OBJ_RELEASE(new_enum); if (mca_param_indices->algorithm_param_index < 0) { return mca_param_indices->algorithm_param_index; @@ -139,8 +142,9 @@ int ompi_coll_tuned_bcast_intra_do_this(void *buf, size_t count, mca_coll_base_module_t *module, int algorithm, int faninout, int segsize) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this algorithm %d topo faninout %d segsize %d", - algorithm, faninout, segsize)); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:bcast_intra_do_this algorithm %d topo faninout %d segsize %d", + algorithm, faninout, segsize)); switch (algorithm) { case (0): @@ -165,7 +169,8 @@ int ompi_coll_tuned_bcast_intra_do_this(void *buf, size_t count, case (9): return ompi_coll_base_bcast_intra_scatter_allgather_ring(buf, count, dtype, root, comm, module, segsize); } /* switch */ - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST])); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:bcast_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", + algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST])); return (MPI_ERR_ARG); } diff --git a/ompi/mca/coll/tuned/coll_tuned_component.c b/ompi/mca/coll/tuned/coll_tuned_component.c index e14c6883bfa..55c09d866e7 100644 --- a/ompi/mca/coll/tuned/coll_tuned_component.c +++ b/ompi/mca/coll/tuned/coll_tuned_component.c @@ -18,6 +18,8 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2019 Mellanox Technologies. All rights reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -56,6 +58,7 @@ char* ompi_coll_tuned_dynamic_rules_filename = (char*) NULL; int ompi_coll_tuned_init_tree_fanout = 4; int ompi_coll_tuned_init_chain_fanout = 4; int ompi_coll_tuned_init_max_requests = 128; +int ompi_coll_tuned_verbose = 0; /* Set it to the same value as intermediate msg by default, so it does not affect * default algorithm selection. Changing this value will force using linear with @@ -74,6 +77,10 @@ coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COL /* max algorithm values */ int ompi_coll_tuned_forced_max_algorithms[COLLCOUNT] = {0}; +/* names of each algorithm for each collective */ +mca_base_var_enum_t *coll_tuned_algorithm_enums[COLLCOUNT] = {0}; + + /* * Local function */ @@ -187,6 +194,15 @@ static int tuned_register(void) MCA_BASE_VAR_SCOPE_ALL, &ompi_coll_tuned_dynamic_rules_filename); + ompi_coll_tuned_verbose = 0; + (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, + "verbose", + "Verbosity of the tuned coll component", + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_ALL, + &ompi_coll_tuned_verbose); + /* register forced params */ ompi_coll_tuned_allreduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLREDUCE]); ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALL]); @@ -210,11 +226,10 @@ static int tuned_open(void) { int rc; -#if OPAL_ENABLE_DEBUG - if (ompi_coll_base_framework.framework_verbose) { + if (ompi_coll_tuned_verbose) { ompi_coll_tuned_stream = opal_output_open(NULL); + opal_output_set_verbosity(ompi_coll_tuned_stream, ompi_coll_tuned_verbose); } -#endif /* OPAL_ENABLE_DEBUG */ /* now check that the user hasn't overrode any of the decision functions if dynamic rules are enabled */ /* the user can redo this before every comm dup/create if they like */ @@ -227,20 +242,24 @@ static int tuned_open(void) /* by default DISABLE dynamic rules and instead use fixed [if based] rules */ if (ompi_coll_tuned_use_dynamic_rules) { if( ompi_coll_tuned_dynamic_rules_filename ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:component_open Reading collective rules file [%s]", - ompi_coll_tuned_dynamic_rules_filename)); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:component_open Reading collective rules file [%s]", + ompi_coll_tuned_dynamic_rules_filename)); rc = ompi_coll_tuned_read_rules_config_file( ompi_coll_tuned_dynamic_rules_filename, - &(mca_coll_tuned_component.all_base_rules), COLLCOUNT); - if( rc >= 0 ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_open Read %d valid rules\n", rc)); + &(mca_coll_tuned_component.all_base_rules)); + if( rc == OPAL_SUCCESS ) { + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:module_open Read a valid rules file")); } else { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_open Reading collective rules file failed\n")); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:module_open Reading collective rules file failed\n")); mca_coll_tuned_component.all_base_rules = NULL; } } } - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:component_open: done!")); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:component_open: done!")); return OMPI_SUCCESS; } @@ -249,18 +268,26 @@ static int tuned_open(void) /* i.e. alg table and dynamic changeable rules if allocated etc */ static int tuned_close(void) { - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:component_close: called")); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:component_close: called")); /* dealloc alg table if allocated */ /* dealloc dynamic changeable rules if allocated */ - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:component_close: done!")); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:component_close: done!")); if( NULL != mca_coll_tuned_component.all_base_rules ) { - ompi_coll_tuned_free_all_rules(mca_coll_tuned_component.all_base_rules, COLLCOUNT); + ompi_coll_tuned_free_all_rules(mca_coll_tuned_component.all_base_rules); mca_coll_tuned_component.all_base_rules = NULL; } + for (int i=0; i COLLCOUNT || collective_id < 0) { return OPAL_ERROR; }; + rc = coll_tuned_algorithm_enums[collective_id]->value_from_string( + coll_tuned_algorithm_enums[collective_id], + alg_name, alg_value ); + return rc; +} + +/* return the enum's value and string. caller's responsibility to free alg_string if NULL was not provided. */ +int coll_tuned_alg_to_str(int collective_id, int alg_value, char **alg_string) { + int rc; + if (collective_id > COLLCOUNT || collective_id < 0) { return OPAL_ERROR; }; + rc = coll_tuned_algorithm_enums[collective_id]->string_from_value( + coll_tuned_algorithm_enums[collective_id], + alg_value, alg_string ); + return rc; +} + + +int coll_tuned_alg_register_options(int collective_id, mca_base_var_enum_t *options) { + /* use the same enum used for mca parameters to allow tuning files to use + algorithm names rather than just numbers.*/ + if (!options) { return OPAL_ERROR; } + if (collective_id > COLLCOUNT || collective_id < 0) { + return OPAL_ERROR; + } + + /* retain the enum until tuned_close() */ + OBJ_RETAIN(options); + coll_tuned_algorithm_enums[collective_id] = options; + return OPAL_SUCCESS; +} + + OBJ_CLASS_INSTANCE(mca_coll_tuned_module_t, mca_coll_base_module_t, mca_coll_tuned_module_construct, NULL); diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c b/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c index 2f5144e66a9..5d6a699b301 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c @@ -12,7 +12,7 @@ * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. + * Copyright (c) 2020-2025 Amazon.com, Inc. or its affiliates. * All Rights reserved. * $COPYRIGHT$ * @@ -60,7 +60,8 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (const void *sbuf, void *rbuf, size_ { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allreduce_intra_dec_dynamic")); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "ompi_coll_tuned_allreduce_intra_dec_dynamic")); /* Check first if an algorithm is set explicitly for this collective */ if (tuned_module->user_forced[ALLREDUCE].algorithm) { @@ -111,7 +112,8 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(const void *sbuf, size_t scount, { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_dec_dynamic")); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "ompi_coll_tuned_alltoall_intra_dec_dynamic")); /* Check first if an algorithm is set explicitly for this collective */ if (tuned_module->user_forced[ALLTOALL].algorithm) { @@ -167,7 +169,8 @@ int ompi_coll_tuned_alltoallv_intra_dec_dynamic(const void *sbuf, ompi_count_arr { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoallv_intra_dec_dynamic")); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "ompi_coll_tuned_alltoallv_intra_dec_dynamic")); /* Check first if an algorithm is set explicitly for this collective */ if (tuned_module->user_forced[ALLTOALLV].algorithm) { @@ -215,7 +218,8 @@ int ompi_coll_tuned_barrier_intra_dec_dynamic(struct ompi_communicator_t *comm, { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_barrier_intra_dec_dynamic")); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "ompi_coll_tuned_barrier_intra_dec_dynamic")); /* Check first if an algorithm is set explicitly for this collective */ if (tuned_module->user_forced[BARRIER].algorithm) { @@ -257,7 +261,8 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buf, size_t count, { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:bcast_intra_dec_dynamic")); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:bcast_intra_dec_dynamic")); /* Check first if an algorithm is set explicitly for this collective */ if (tuned_module->user_forced[BCAST].algorithm) { @@ -309,7 +314,8 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( const void *sbuf, void *rbuf, { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_intra_dec_dynamic")); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:reduce_intra_dec_dynamic")); /* Check first if an algorithm is set explicitly for this collective */ if (tuned_module->user_forced[REDUCE].algorithm) { @@ -365,7 +371,8 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(const void *sbuf, void *rbu { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_scatter_intra_dec_dynamic")); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:reduce_scatter_intra_dec_dynamic")); /* Check first if an algorithm is set explicitly for this collective */ if (tuned_module->user_forced[REDUCESCATTER].algorithm) { @@ -420,7 +427,8 @@ int ompi_coll_tuned_reduce_scatter_block_intra_dec_dynamic(const void *sbuf, voi { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_scatter_block_intra_dec_dynamic")); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:reduce_scatter_block_intra_dec_dynamic")); /* Check first if an algorithm is set explicitly for this collective */ if (tuned_module->user_forced[REDUCESCATTERBLOCK].algorithm) { @@ -474,7 +482,7 @@ int ompi_coll_tuned_allgather_intra_dec_dynamic(const void *sbuf, size_t scount, { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "ompi_coll_tuned_allgather_intra_dec_dynamic")); /* Check first if an algorithm is set explicitly for this collective */ @@ -536,7 +544,7 @@ int ompi_coll_tuned_allgatherv_intra_dec_dynamic(const void *sbuf, size_t scount { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "ompi_coll_tuned_allgatherv_intra_dec_dynamic")); /* Check first if an algorithm is set explicitly for this collective */ @@ -593,7 +601,7 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(const void *sbuf, size_t scount, { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "ompi_coll_tuned_gather_intra_dec_dynamic")); /* Check first if an algorithm is set explicitly for this collective */ @@ -643,7 +651,7 @@ int ompi_coll_tuned_scatter_intra_dec_dynamic(const void *sbuf, size_t scount, { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "ompi_coll_tuned_scatter_intra_dec_dynamic")); /* Check first if an algorithm is set explicitly for this collective */ @@ -692,7 +700,7 @@ int ompi_coll_tuned_exscan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_ { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "ompi_coll_tuned_exscan_intra_dec_dynamic")); /* Check first if an algorithm is set explicitly for this collective */ @@ -736,7 +744,7 @@ int ompi_coll_tuned_scan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_t { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "ompi_coll_tuned_scan_intra_dec_dynamic")); /* Check first if an algorithm is set explicitly for this collective */ diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c index 69598aff2bf..fa31aef1860 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c @@ -16,7 +16,7 @@ * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2019 Mellanox Technologies. All rights reserved. - * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. + * Copyright (c) 2020-2025 Amazon.com, Inc. or its affiliates. * All Rights reserved. * $COPYRIGHT$ * @@ -61,7 +61,8 @@ ompi_coll_tuned_allreduce_intra_dec_fixed(const void *sbuf, void *rbuf, size_t c size_t dsize, total_dsize; int communicator_size, alg; communicator_size = ompi_comm_size(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allreduce_intra_dec_fixed")); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "ompi_coll_tuned_allreduce_intra_dec_fixed")); ompi_datatype_type_size(dtype, &dsize); total_dsize = dsize * (ptrdiff_t)count; @@ -425,7 +426,8 @@ int ompi_coll_tuned_alltoallv_intra_dec_fixed(const void *sbuf, ompi_count_array int communicator_size, alg; communicator_size = ompi_comm_size(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoallv_intra_dec_fixed com_size %d", + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "ompi_coll_tuned_alltoallv_intra_dec_fixed com_size %d", communicator_size)); /** Algorithms: * {1, "basic_linear"}, @@ -467,8 +469,9 @@ int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm, int communicator_size, alg; communicator_size = ompi_comm_size(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_barrier_intra_dec_fixed com_size %d", - communicator_size)); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "ompi_coll_tuned_barrier_intra_dec_fixed com_size %d", + communicator_size)); /** Algorithms: * {1, "linear"}, * {2, "double_ring"}, @@ -521,9 +524,9 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, size_t count, ompi_datatype_type_size(datatype, &dsize); total_dsize = dsize * (unsigned long)count; - OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_bcast_intra_dec_fixed" - " root %d rank %d com_size %d", - root, ompi_comm_rank(comm), communicator_size)); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "ompi_coll_tuned_bcast_intra_dec_fixed root %d rank %d com_size %d", + root, ompi_comm_rank(comm), communicator_size)); /** Algorithms: * {1, "basic_linear"}, @@ -670,8 +673,9 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( const void *sendbuf, void *recvbuf, communicator_size = ompi_comm_size(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_intra_dec_fixed " - "root %d rank %d com_size %d", root, ompi_comm_rank(comm), communicator_size)); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "ompi_coll_tuned_reduce_intra_dec_fixed root %d rank %d com_size %d", + root, ompi_comm_rank(comm), communicator_size)); ompi_datatype_type_size(datatype, &dsize); total_dsize = dsize * (ptrdiff_t)count; /* needed for decision */ @@ -832,7 +836,8 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( const void *sbuf, void *rbuf int communicator_size, i, alg; size_t total_dsize, dsize; - OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_scatter_intra_dec_fixed")); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "ompi_coll_tuned_reduce_scatter_intra_dec_fixed")); communicator_size = ompi_comm_size(comm); ompi_datatype_type_size(dtype, &dsize); @@ -980,7 +985,8 @@ int ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed(const void *sbuf, void int communicator_size, alg; size_t dsize, total_dsize; - OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed")); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed")); ompi_datatype_type_size(dtype, &dsize); @@ -1220,8 +1226,9 @@ int ompi_coll_tuned_allgather_intra_dec_fixed(const void *sbuf, size_t scount, } } - OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allgather_intra_dec_fixed" - " rank %d com_size %d", ompi_comm_rank(comm), communicator_size)); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "ompi_coll_tuned_allgather_intra_dec_fixed rank %d com_size %d", + ompi_comm_rank(comm), communicator_size)); int faninout = 2; return ompi_coll_tuned_allgather_intra_do_this(sbuf, scount, sdtype, @@ -1358,7 +1365,7 @@ int ompi_coll_tuned_allgatherv_intra_dec_fixed(const void *sbuf, size_t scount, } } - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "ompi_coll_tuned_allgatherv_intra_dec_fixed" " rank %d com_size %d", ompi_comm_rank(comm), communicator_size)); @@ -1389,7 +1396,7 @@ int ompi_coll_tuned_gather_intra_dec_fixed(const void *sbuf, size_t scount, int communicator_size, alg, rank; size_t dsize, total_dsize; - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "ompi_coll_tuned_gather_intra_dec_fixed")); communicator_size = ompi_comm_size(comm); @@ -1477,7 +1484,7 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(const void *sbuf, size_t scount, int communicator_size, alg, rank; size_t dsize, total_dsize; - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "ompi_coll_tuned_scatter_intra_dec_fixed")); communicator_size = ompi_comm_size(comm); diff --git a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c index 5eb8ef4317e..02156b95688 100644 --- a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c +++ b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c @@ -13,6 +13,8 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -27,6 +29,9 @@ #include "mpi.h" #include "ompi/mca/mca.h" #include "coll_tuned.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "opal/util/json/opal_json.h" + /* need to include our own topo prototypes so we can malloc data on the comm correctly */ #include "ompi/mca/coll/base/coll_base_topo.h" @@ -45,6 +50,322 @@ static int fileline=0; /* used for verbose error messages */ #define getnext(fptr, pval) ompi_coll_base_file_getnext_long(fptr, &fileline, pval) #define isnext_digit(fptr) ompi_coll_base_file_peek_next_char_isdigit(fptr) + +static int coll_tuned_read_alg(const opal_json_t *msg_rule, ompi_coll_msg_rule_t *msg_p, int coll_id) { + const char *RESULT_ALG_FIELD = "alg"; + const opal_json_t *alg_prop = NULL; + int rc, rc_as_str, rc_as_int, rc_validation; + const char* string_buf; + size_t string_len; + char int_as_str[24]; + int64_t int_val; + + rc = opal_json_get_key( msg_rule, RESULT_ALG_FIELD, &alg_prop); + if (rc != OPAL_SUCCESS) { + opal_output_verbose(1, ompi_coll_tuned_stream, + "The \"alg\" field is required, but was not found." ); + return OPAL_ERROR; + } + rc_as_str = opal_json_read_string(alg_prop, &string_buf, &string_len); + rc_as_int = opal_json_read_integer(alg_prop, &int_val); + opal_json_free(&alg_prop); + if (rc_as_str == OPAL_SUCCESS) { + rc_validation = coll_tuned_alg_from_str( coll_id, string_buf, &msg_p->result_alg ); + } else if (rc_as_int == OPAL_SUCCESS) { + rc_validation = coll_tuned_alg_to_str( coll_id, int_val, NULL ); + if (rc_validation != OPAL_SUCCESS) { + snprintf(int_as_str, 23, "%ld", int_val); + int_as_str[23] = '\0'; + string_buf = int_as_str; + } else { + msg_p->result_alg = int_val; + } + } else { + opal_output_verbose(1, ompi_coll_tuned_stream, + "The \"alg\" field must be either a string or an integer, but it is something else." ); + return OPAL_ERROR; + } + if (rc_validation != OPAL_SUCCESS) { + opal_output_verbose(1, ompi_coll_tuned_stream, + "Algorithm (%s) provided for collective \"%s\" is not valid. " + "Check documentation for valid configurations of coll_tuned_%s_algorithm. Ignoring this rule.", + string_buf, mca_coll_base_colltype_to_str(coll_id), mca_coll_base_colltype_to_str(coll_id) ); + /* the rationale for disabling the rule is to allow two ompi versions to use the same file. */ + /* disable the rule by making an impossible condition: */ + msg_p->msg_size_min = COLL_RULES_MESSAGE_SIZE_INF; + msg_p->msg_size_max = 0; + rc = OPAL_SUCCESS; + } + return OPAL_SUCCESS; +} + +/* returns the value in the field. + *rc is set as follows: + OPAL_SUCCESS (value found and parsed) + OPAL_ERROR (value found, but not an integer) + OPAL_ERR_DATA_VALUE_NOT_FOUND (value not found) + + When optional==0, and the field is not found, or when the field is not + an integer, then a warning message is printed. + */ +static int coll_tuned_get_json_integer_field(const opal_json_t *parent, + const char* fieldname, + int optional, + int64_t *val ) { + const opal_json_t *json_val; + int rc; + const char *string_buf; + size_t str_len; + int rc_as_int, rc_as_str; + rc = opal_json_get_key( parent, fieldname, &json_val); + if (rc == OPAL_SUCCESS) { + rc_as_int = opal_json_read_integer(json_val, val); + if ( rc_as_int != OPAL_SUCCESS ) { + rc_as_str = opal_json_read_string(json_val, &string_buf, &str_len); + } + opal_json_free(&json_val); + if (rc_as_int == OPAL_SUCCESS) { + return rc_as_int; + } else if ( rc_as_str == OPAL_SUCCESS) { + if (strncmp("inf",string_buf, 3)==0 || + strncmp("Inf",string_buf, 3)==0 || + strncmp("INF",string_buf, 3)==0 ) { + *val = COLL_RULES_MESSAGE_SIZE_INF; + return rc_as_str; + } + } + opal_output_verbose(1, ompi_coll_tuned_stream, + "Found field \"%s\" as expected, but could not interpret it as an integer or \"inf\".", + fieldname); + return OPAL_ERROR; + } + if (optional == 0) { + opal_output_verbose(1, ompi_coll_tuned_stream, + "Missing the required field \"%s\".", + fieldname); + } + return OPAL_ERR_DATA_VALUE_NOT_FOUND; +} + +static int coll_tuned_read_message_size_rule( ompi_coll_msg_rule_t *msg_p, + const opal_json_t *msg_rule, int coll_id ) { + int rc; + const char *MSG_SIZE_FIELD = "msg_size_min"; + const char *MAX_MSG_SIZE_FIELD = "msg_size_max"; + const char *TOPO_FANINOUT_FIELD = "faninout"; + const char *SEGSIZE_FIELD = "seg_size"; + const char *MAX_REQUESTS_FIELD = "reqs"; + const int OPTIONAL = 1; + // const int REQUIRED = 0; + int64_t int_val; + + +#define OPTIONAL_READ(field, target) \ + rc = coll_tuned_get_json_integer_field(msg_rule, field, OPTIONAL, &int_val); \ + if (rc == OPAL_ERROR) {return rc;} \ + if (rc == OPAL_SUCCESS) {target = int_val;} +#define REQUIRED_READ(field, target) \ + rc = coll_tuned_get_json_integer_field(msg_rule, field, REQUIRED, &int_val); \ + if (rc != OPAL_SUCCESS) {return rc;} \ + if (rc == OPAL_SUCCESS) {target = int_val;} + + msg_p->msg_size_min = 0; + OPTIONAL_READ( MSG_SIZE_FIELD, msg_p->msg_size_min ) + + msg_p->msg_size_max = COLL_RULES_MESSAGE_SIZE_INF; + OPTIONAL_READ( MAX_MSG_SIZE_FIELD, msg_p->msg_size_max ) + + msg_p->result_topo_faninout = 0; + OPTIONAL_READ( TOPO_FANINOUT_FIELD, msg_p->result_topo_faninout ) + + msg_p->result_segsize = 0; + OPTIONAL_READ( SEGSIZE_FIELD, msg_p->result_segsize ) + + msg_p->result_max_requests = 0; + OPTIONAL_READ( MAX_REQUESTS_FIELD, msg_p->result_max_requests ) + + rc = coll_tuned_read_alg( msg_rule, msg_p, coll_id ); + + return OPAL_SUCCESS; + +#undef REQUIRED_READ +#undef OPTIONAL_READ +} + +static int coll_tuned_get_comm_distribution(const opal_json_t *parent, enum comm_rank_distro_t *distro) { + int rc; + const opal_json_t *json_val; + const char *string_buf; + size_t string_len; + const char *comm_rank_distro_name = "comm_rank_distribution"; + + rc = opal_json_get_key( parent, comm_rank_distro_name, &json_val); + if (rc == OPAL_ERROR) { + *distro = COLL_RULES_DISTRO_ANY; + return OPAL_SUCCESS; + } + rc = opal_json_read_string(json_val, &string_buf, &string_len); + opal_json_free(&json_val); + if ( rc == OPAL_SUCCESS) { + rc = coll_rules_comm_rank_distro_from_str(string_buf, distro); + + if (rc == OPAL_ERROR) { + opal_output_verbose(1, ompi_coll_tuned_stream, + "Unrecognized value for field \"%s\". Got \"%s\", assuming \"%s\".", + comm_rank_distro_name, string_buf, + coll_rules_comm_rank_distro_to_str(COLL_RULES_DISTRO_ANY) ); + *distro = COLL_RULES_DISTRO_ANY; + rc = OPAL_SUCCESS; + } + } + return rc; +} + + +static int ompi_coll_tuned_read_rules_json (const opal_json_t *json_root, ompi_coll_alg_rule_t** rules) { + + int rc = OPAL_ERROR; + + /* complete table of rules */ + ompi_coll_alg_rule_t *alg_rules = (ompi_coll_alg_rule_t*) NULL; + + /* individual pointers to sections of rules */ + ompi_coll_alg_rule_t *alg_p = (ompi_coll_alg_rule_t*) NULL; + ompi_coll_com_rule_t *com_p = (ompi_coll_com_rule_t*) NULL; + ompi_coll_msg_rule_t *msg_p = (ompi_coll_msg_rule_t*) NULL; + + size_t jmsg_rule = 0; + size_t jcomm_rule = 0; + size_t jcol = 0; + int64_t int_val; + const char* coll_name = ""; + + alg_rules = ompi_coll_tuned_mk_alg_rules(COLLCOUNT); + + const opal_json_t *collectives_obj = NULL; + const opal_json_t *comm_rule_array = NULL; + + size_t num_collectives = 0; + size_t num_comm_rules; + rc = opal_json_get_key(json_root, "collectives", &collectives_obj); + if (rc != OPAL_SUCCESS) { + opal_output_verbose(1, ompi_coll_tuned_stream, + "Required top-level field \"collectives\" was not found."); + return OPAL_ERROR; + } + + rc = opal_json_get_container_size(collectives_obj, &num_collectives); + if (rc != OPAL_SUCCESS || OPAL_JSON_OBJECT != collectives_obj->type) { + opal_output_verbose(1, ompi_coll_tuned_stream, + "The \"collectives\" field must be a dictionary of collectives."); + opal_json_free(&collectives_obj); + return OPAL_ERROR; + } + + for(jcol = 0; jcol < num_collectives; jcol++ ) { + int coll_id; + rc = opal_json_get_key_by_index( collectives_obj, jcol, &coll_name, &comm_rule_array); + if (rc) { + opal_output_verbose(1, ompi_coll_tuned_stream, + "Internal json error when attempting to parse the collective at index %ld\n", + jcol); + goto error_bad_coll; + } + coll_id = mca_coll_base_name_to_colltype(coll_name); + if (coll_id < 0) { + opal_output_verbose(1, ompi_coll_tuned_stream, + "Unrecognized collective: \"%s\". Use all lowercase such as \"allgather\"", + coll_name); + opal_json_free(&comm_rule_array); + goto error_bad_coll; + } + + alg_p = &alg_rules[coll_id]; + alg_p->coll_id = coll_id; + rc = opal_json_get_container_size(comm_rule_array, &num_comm_rules); + if (rc != OPAL_SUCCESS || OPAL_JSON_ARRAY != comm_rule_array->type) { + opal_output_verbose(1, ompi_coll_tuned_stream, + "Problem parsing the collective at index %ld (for %s). Expected an array of comm-related rules.", + jcol, mca_coll_base_colltype_to_str(coll_id) ); + goto error_bad_coll; + } + alg_p->n_com_sizes = (int)num_comm_rules; + alg_p->com_rules = ompi_coll_tuned_mk_com_rules (num_comm_rules, coll_id); + + for (jcomm_rule=0; jcomm_rule < num_comm_rules; jcomm_rule++) { + const opal_json_t *comm_rule; + const opal_json_t *msg_size_array; + size_t num_msg_rules; + rc = opal_json_get_index(comm_rule_array, jcomm_rule, &comm_rule); + com_p = &(alg_p->com_rules[jcomm_rule]); + + com_p->mpi_comsize_min = 0; + rc = coll_tuned_get_json_integer_field(comm_rule, "comm_size_min", 1, &int_val); + if (rc == OPAL_ERROR) { goto error_bad_comm_rule; } + if (rc == OPAL_SUCCESS) { com_p->mpi_comsize_min = int_val; } + + + com_p->mpi_comsize_max = INT_MAX; + rc = coll_tuned_get_json_integer_field(comm_rule, "comm_size_max", 1, &int_val); + if (rc == OPAL_ERROR) { goto error_bad_comm_rule; } + if (rc == OPAL_SUCCESS) { com_p->mpi_comsize_max = int_val; } + + rc = coll_tuned_get_comm_distribution(comm_rule, &com_p->comm_rank_distribution); + if (rc == OPAL_ERROR) { goto error_bad_comm_rule; } + + rc = opal_json_get_key( comm_rule, "rules", &msg_size_array); + if (rc != OPAL_SUCCESS) { + opal_output_verbose(1, ompi_coll_tuned_stream, + "Expected a set of message rules in this communicator rule"); + goto error_bad_comm_rule; + } + rc = opal_json_get_container_size(msg_size_array, &num_msg_rules); + if (rc != OPAL_SUCCESS) { goto error_bad_comm_rule; } + + com_p->n_rules = num_msg_rules; + com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (num_msg_rules, coll_id, com_p->mpi_comsize_min); + + + for (jmsg_rule=0; jmsg_rule < num_msg_rules; jmsg_rule++) { + const opal_json_t *msg_rule; + msg_p = &(com_p->msg_rules[jmsg_rule]); + rc = opal_json_get_index(msg_size_array, jmsg_rule, &msg_rule); + + rc = coll_tuned_read_message_size_rule( msg_p, msg_rule, coll_id); + opal_json_free(&msg_rule); + if (rc != OPAL_SUCCESS) { + goto error_bad_message_rule; + } + } + + opal_json_free(&msg_size_array); + opal_json_free(&comm_rule); + } + + opal_json_free(&comm_rule_array); + } + *rules = alg_rules; + + opal_json_free(&collectives_obj); + return rc; +error_bad_message_rule: + opal_output_verbose(1, ompi_coll_tuned_stream, + "Problem occurred within collective %s, " + "comm_size_min=%d, comm_size_max=%d (entry number %ld in the comm_size array), " + "message size entry number %ld.", coll_name, com_p->mpi_comsize_min, com_p->mpi_comsize_max, 1+jcomm_rule, 1+jmsg_rule); + opal_json_free(&collectives_obj); + return OMPI_ERROR; +error_bad_comm_rule: + opal_output_verbose(1, ompi_coll_tuned_stream, + "Problem occurred within collective %s, " + "in entry number %ld of the comm_size array", coll_name, 1+jcomm_rule); + opal_json_free(&collectives_obj); + return OMPI_ERROR; +error_bad_coll: + opal_json_free(&collectives_obj); + return OMPI_ERROR; +} + /* * Reads a rule file called fname * The rule file defines a set of sets of rules. The outer set is keyed on @@ -62,12 +383,11 @@ static int fileline=0; /* used for verbose error messages */ * If an error occurs it removes rule table and then exits with a very verbose * error message. this stops the user using a half baked rule table. * - * Returns the number of actual collectives that a rule exists for - * (note 0 is NOT an error) + * Returns OPAL_ERROR or OPAL_SUCCESS * */ -int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** rules, int n_collectives) +static int ompi_coll_tuned_read_rules_config_file_classic (char *fname, ompi_coll_alg_rule_t** rules) { long NCOL = 0, /* number of collectives for which rules are provided */ COLID = 0, /* identifies the collective type to associate the rules with */ @@ -91,51 +411,51 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** /* stats info */ int total_alg_count = 0; -#if OPAL_ENABLE_DEBUG int total_com_count = 0; int total_msg_count = 0; -#endif if (!fname) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Gave NULL as rule table configuration file for tuned collectives... ignoring!\n")); - return (-1); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Gave NULL as rule table configuration file for tuned collectives... ignoring!\n"); + return OPAL_ERROR; } if (!rules) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Gave NULL as rule table result ptr!... ignoring!\n")); - return (-2); - } - - if (n_collectives<1) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Gave %d as max number of collectives in the rule table configuration file for tuned collectives!... ignoring!\n", n_collectives)); - return (-3); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Gave NULL as rule table result ptr!... ignoring!\n"); + return OPAL_ERROR; } fptr = fopen (fname, "r"); if (!fptr) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Cannot read rules file [%s]\n", fname)); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Cannot read rules file [%s]\n", fname); goto on_file_error; } /* make space and init the algorithm rules for each of the n_collectives MPI collectives */ - alg_rules = ompi_coll_tuned_mk_alg_rules (n_collectives); + alg_rules = ompi_coll_tuned_mk_alg_rules(COLLCOUNT); if (NULL == alg_rules) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Cannot allocate rules for file [%s]\n", fname)); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Cannot allocate rules for file [%s]\n", fname); goto on_file_error; } /* consume the optional version identifier */ - if (0 == fscanf(fptr, "rule-file-version-%u", &version)) { + if (0 == fscanf(fptr, "rule-file-version-%d", &version)) { version = 1; } /* get the number of collectives for which rules are provided in the file */ if( (getnext(fptr, &NCOL) < 0) || (NCOL < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of collectives in configuration file around line %d\n", fileline)); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Could not read number of collectives in configuration file around line %d\n", fileline); goto on_file_error; } - if (NCOL>n_collectives) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %ld is greater than number of MPI collectives possible %d ??? error around line %d\n", NCOL, n_collectives, fileline)); + if (NCOL>COLLCOUNT) { + opal_output_verbose(1, ompi_coll_tuned_stream, + "Number of collectives in configuration file %ld is greater than number of MPI collectives possible %d ??? error around line %d\n", + NCOL, COLLCOUNT, fileline); goto on_file_error; } @@ -143,35 +463,42 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** /* get the collective for which rules are being provided */ if( (getnext(fptr, &COLID) < 0) || (COLID < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read next Collective id in configuration file around line %d\n", fileline)); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Could not read next Collective id in configuration file around line %d\n", fileline); goto on_file_error; } - if (COLID>=n_collectives) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %ld is greater than MPI collectives possible %d. Error around line %d\n", COLID, n_collectives, fileline)); + if (COLID>=COLLCOUNT) { + opal_output_verbose(1, ompi_coll_tuned_stream, + "Collective id in configuration file %ld is greater than MPI collectives possible %d. Error around line %d\n", COLID, COLLCOUNT, fileline); goto on_file_error; } - if (alg_rules[COLID].alg_rule_id != COLID) { - OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %ld\n", COLID)); + if (alg_rules[COLID].coll_id != COLID) { + opal_output_verbose(1, ompi_coll_tuned_stream, + "Internal error in handling collective ID %ld\n", COLID); goto on_file_error; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %ld\n", COLID)); + opal_output_verbose(25, ompi_coll_tuned_stream, + "Reading dynamic rule for collective ID %ld\n", COLID); alg_p = &alg_rules[COLID]; - alg_p->alg_rule_id = COLID; + alg_p->coll_id = COLID; alg_p->n_com_sizes = 0; alg_p->com_rules = (ompi_coll_com_rule_t *) NULL; /* get the number of communicator sizes for which a set of rules are to be provided */ if( (getnext (fptr, &NCOMSIZES) < 0) || (NCOMSIZES < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %ld at around line %d\n", COLID, fileline)); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Could not read count of communicators for collective ID %ld at around line %d\n", COLID, fileline); goto on_file_error; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Read communicator count %ld for dynamic rule for collective ID %ld\n", NCOMSIZES, COLID)); + opal_output_verbose(25, ompi_coll_tuned_stream, + "Read communicator count %ld for dynamic rule for collective ID %ld\n", NCOMSIZES, COLID); alg_p->n_com_sizes = NCOMSIZES; alg_p->com_rules = ompi_coll_tuned_mk_com_rules (NCOMSIZES, COLID); if (NULL == alg_p->com_rules) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Cannot allocate com rules for file [%s]\n", fname)); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Cannot allocate com rules for file [%s]\n", fname); goto on_file_error; } @@ -181,23 +508,36 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** /* get the communicator size to associate the set of rules with */ if( (getnext (fptr, &COMSIZE) < 0) || (COMSIZE < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %ld com rule %d at around line %d\n", COLID, ncs, fileline)); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Could not read communicator size for collective ID %ld com rule %d at around line %d\n", COLID, ncs, fileline); goto on_file_error; } - com_p->mpi_comsize = COMSIZE; + if (ncs > 0) { + com_p->mpi_comsize_min = alg_p->com_rules[ncs-1].mpi_comsize_max + 1; + } else { + com_p->mpi_comsize_min = 0; + } + if (ncs == NCOMSIZES-1) { + com_p->mpi_comsize_max = INT_MAX; + } else { + com_p->mpi_comsize_max = COMSIZE - 1; + } /* get the number of message sizes to specify rules for. inner set size */ if( (getnext (fptr, &NMSGSIZES) < 0) || (NMSGSIZES < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %ld com rule %d at around line %d\n", COLID, ncs, fileline)); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Could not read number of message sizes for collective ID %ld com rule %d at around line %d\n", COLID, ncs, fileline); goto on_file_error; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %ld for dynamic rule for collective ID %ld and comm size %ld\n", - NMSGSIZES, COLID, COMSIZE)); - com_p->n_msg_sizes = NMSGSIZES; - com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMSGSIZES, COLID, ncs, COMSIZE); + opal_output_verbose(25, ompi_coll_tuned_stream, + "Read message count %ld for dynamic rule for collective ID %ld and comm size %ld\n", + NMSGSIZES, COLID, COMSIZE); + com_p->n_rules = NMSGSIZES; + com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMSGSIZES, COLID, COMSIZE); if (NULL == com_p->msg_rules) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Cannot allocate msg rules for file [%s]\n", fname)); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Cannot allocate msg rules for file [%s]\n", fname); goto on_file_error; } @@ -209,28 +549,36 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** /* read the message size to associate the rule with */ if( (getnext (fptr, &MSGSIZE) < 0) || (MSGSIZE < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline)); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Could not read message size for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline); goto on_file_error; } - msg_p->msg_size = (size_t)MSGSIZE; + msg_p->msg_size_min = (size_t)MSGSIZE; + msg_p->msg_size_max = COLL_RULES_MESSAGE_SIZE_INF; + if (nms > 0) { + com_p->msg_rules[nms-1].msg_size_max = msg_p->msg_size_min - 1; + } /* read the collective specific algorithm identifier */ if( (getnext (fptr, &ALG) < 0) || (ALG < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline)); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Could not read target algorithm method for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline); goto on_file_error; } msg_p->result_alg = ALG; /* read faninout tuning parameter. required */ if( (getnext (fptr, &FANINOUT) < 0) || (FANINOUT < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline)); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Could not read fan in/out topo for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline); goto on_file_error; } msg_p->result_topo_faninout = FANINOUT; /* read segsize tuning parameter. required */ if( (getnext (fptr, &SEGSIZE) < 0) || (SEGSIZE < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline)); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Could not read target segment size for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline); goto on_file_error; } msg_p->result_segsize = SEGSIZE; @@ -239,7 +587,8 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** msg_p->result_max_requests = ompi_coll_tuned_alltoall_max_requests; if( (version > 1) && isnext_digit(fptr) ) { if( (getnext (fptr, &MAXREQ) < 0) || (MAXREQ < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read max requests for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline)); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Could not read max requests for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline); goto on_file_error; } msg_p->result_max_requests = MAXREQ; @@ -247,41 +596,42 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** /* check the first rule is for 0 size. look-up depends on this */ if (!nms && MSGSIZE) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"All algorithms must specify a rule for message size of zero upwards always first!\n")); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %lu for collective ID %ld com rule %d msg rule %d at around line %d\n", MSGSIZE, COLID, ncs, nms, fileline)); + opal_output_verbose(1, ompi_coll_tuned_stream, + "All algorithms must specify a rule for message size of zero upwards always first!\n"); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Message size was %lu for collective ID %ld com rule %d msg rule %d at around line %d\n", MSGSIZE, COLID, ncs, nms, fileline); goto on_file_error; } - -#if OPAL_ENABLE_DEBUG total_msg_count++; -#endif - } /* msg size */ - -#if OPAL_ENABLE_DEBUG total_com_count++; -#endif - } /* comm size */ total_alg_count++; - OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %ld\n", COLID)); + opal_output_verbose(25, ompi_coll_tuned_stream, + "Done reading dynamic rule for collective ID %ld\n", COLID); } /* per collective */ fclose (fptr); - OPAL_OUTPUT((ompi_coll_tuned_stream,"\nConfigure file Stats\n")); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Version\t\t\t\t\t: %5u\n", version)); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Collectives with rules\t\t\t: %5d\n", total_alg_count)); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Communicator sizes with rules\t\t: %5d\n", total_com_count)); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Message sizes with rules\t\t: %5d\n", total_msg_count)); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Lines in configuration file read\t\t: %5d\n", fileline)); + opal_output_verbose(25, ompi_coll_tuned_stream, + "\nConfigure file Stats\n"); + opal_output_verbose(25, ompi_coll_tuned_stream, + "Version\t\t\t\t\t: %5u\n", version); + opal_output_verbose(25, ompi_coll_tuned_stream, + "Collectives with rules\t\t\t: %5d\n", total_alg_count); + opal_output_verbose(25, ompi_coll_tuned_stream, + "Communicator sizes with rules\t\t: %5d\n", total_com_count); + opal_output_verbose(25, ompi_coll_tuned_stream, + "Message sizes with rules\t\t: %5d\n", total_msg_count); + opal_output_verbose(25, ompi_coll_tuned_stream, + "Lines in configuration file read\t\t: %5d\n", fileline); /* return the rules to the caller */ *rules = alg_rules; - return (total_alg_count); + return OPAL_SUCCESS; on_file_error: @@ -290,18 +640,78 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** /* we return back a verbose message and a count of -1 algorithms read */ /* draconian but its better than having a bad collective decision table */ - OPAL_OUTPUT((ompi_coll_tuned_stream,"read_rules_config_file: bad configure file [%s]. Read afar as line %d\n", fname, fileline)); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Ignoring user supplied tuned collectives configuration decision file.\n")); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Switching back to [compiled in] fixed decision table.\n")); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Fix errors as listed above and try again.\n")); + opal_output_verbose(1, ompi_coll_tuned_stream, + "read_rules_config_file: bad configure file [%s]. Read afar as line %d\n", fname, fileline); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Ignoring user supplied tuned collectives configuration decision file.\n"); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Switching back to [compiled in] fixed decision table.\n"); + opal_output_verbose(1, ompi_coll_tuned_stream, + "Fix errors as listed above and try again.\n"); /* deallocate memory if allocated */ - if (alg_rules) ompi_coll_tuned_free_all_rules (alg_rules, n_collectives); + if (alg_rules) ompi_coll_tuned_free_all_rules (alg_rules); /* close file */ if (fptr) fclose (fptr); *rules = (ompi_coll_alg_rule_t*) NULL; - return (-1); + return OPAL_ERROR; } +/** + * Reads a rule file called fname + * + * In Open MPI 6.0 we introduced json-based rule file, but we continue to read + * the original format for now. + * + * This funtion attempts a json read, and if it fails, falls back to classic + * read. If both fail, we fall back to fixed rules. + * + * Errors will be entirely hidden from the user unless coll_base_verbose is set + * to at least 1. + * + * With coll_base_verbose set to 25, rules file is dumped to output stream. + * + * Returns OPAL_ERROR or OPAL_SUCCESS + */ + +int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** rules) { + if (!fname) { + opal_output_verbose(1, ompi_coll_tuned_stream, + "Gave NULL as rule table configuration file for tuned collectives... ignoring!\n"); + return OPAL_ERROR; + } + + if (!rules) { + opal_output_verbose(1, ompi_coll_tuned_stream, + "Gave NULL as rule table result ptr!... ignoring!\n"); + return OPAL_ERROR; + } + + const opal_json_t *json; + opal_output_verbose(1, ompi_coll_tuned_stream, "Attempting to read tuned rules as JSON...\n"); + int ret = opal_json_load_file(fname, &json, 0); + if (ret == OPAL_SUCCESS) { + ret = ompi_coll_tuned_read_rules_json(json, rules); + opal_json_free(&json); + if (ret != OPAL_SUCCESS) { + opal_output_verbose(1, ompi_coll_tuned_stream, + "ERROR: %s is valid json, but there were errors reading the rules from the file. " + "Falling back to default tuning and ignoring the file.\n", fname); + } + } else { + opal_output_verbose(1, ompi_coll_tuned_stream, "Failed to parse %s as valid json. Assuming classic format.\n",fname); + ret = ompi_coll_tuned_read_rules_config_file_classic(fname, rules); + if (ret != OPAL_SUCCESS) { + opal_output_verbose(1, ompi_coll_tuned_stream, "Failed to load %s in either json or classic readers. Check format.\n",fname); + } + } + + if (ret == OPAL_SUCCESS && opal_output_check_verbosity( 2, ompi_coll_tuned_stream)) { + opal_output_verbose( 2, ompi_coll_tuned_stream, "Dumping rules:\n"); + ompi_coll_tuned_dump_all_rules(*rules); + } + return ret; + +} \ No newline at end of file diff --git a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.h b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.h index 595e436fa49..9efc1839dcf 100644 --- a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.h +++ b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -27,7 +29,7 @@ BEGIN_C_DECLS -int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** rules, int n_collectives); +int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** rules); END_C_DECLS diff --git a/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.c b/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.c index 2c2b4469635..8a681a6bafa 100644 --- a/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.c +++ b/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.c @@ -12,6 +12,8 @@ * Copyright (c) 2011-2012 FUJITSU LIMITED. All rights reserved. * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -24,6 +26,7 @@ #include "mpi.h" #include "ompi/mca/mca.h" #include "ompi/constants.h" +#include "ompi/communicator/communicator.h" #include "coll_tuned.h" /* need to include our own topo prototypes so we can malloc data on the comm correctly */ @@ -48,13 +51,13 @@ ompi_coll_alg_rule_t* ompi_coll_tuned_mk_alg_rules (int n_alg) /* set all we can at this point */ for (i=0;ialg_rule_id, - msg_p->com_rule_id, msg_p->mpi_comsize, msg_p->msg_rule_id)); - - OPAL_OUTPUT((ompi_coll_tuned_stream,"msg_size %10lu -> algorithm %2d\ttopo in/out %2d\tsegsize %5ld\tmax_requests %4d\n", - msg_p->msg_size, msg_p->result_alg, msg_p->result_topo_faninout, msg_p->result_segsize, - msg_p->result_max_requests)); + if (msg_p->msg_size_max == COLL_RULES_MESSAGE_SIZE_INF) { + snprintf(max_msg_size, 23, "%s", "Inf"); + } else { + snprintf(max_msg_size, 23, "%lu", msg_p->msg_size_max); + } + max_msg_size[23] = '\0'; + + if (msg_p->msg_size_max < msg_p->msg_size_min) { + opal_output(ompi_coll_tuned_stream,"\t\t\tIGNORED RULE (check log)\n"); + } else { + rc = coll_tuned_alg_to_str(msg_p->coll_id, msg_p->result_alg, &alg_name); + if (rc != OPAL_SUCCESS) { + alg_name = "ERROR_BAD_ALG_ID"; + } + opal_output(ompi_coll_tuned_stream,"\t\t\tmsg_size %lu:%s -> %s (%2d). Params: topo in/out %2d, segsize %5ld, max_requests %4d\n", + msg_p->msg_size_min, max_msg_size, alg_name, msg_p->result_alg, msg_p->result_topo_faninout, msg_p->result_segsize, + msg_p->result_max_requests); + if (rc == OPAL_SUCCESS) { + free(alg_name); + } + } return (0); } @@ -123,20 +143,20 @@ int ompi_coll_tuned_dump_com_rule (ompi_coll_com_rule_t* com_p) int i; if (!com_p) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Com rule was a NULL ptr?!\n")); + opal_output(ompi_coll_tuned_stream,"\t\tERROR: Com rule was a NULL ptr?!\n"); return (-1); } - OPAL_OUTPUT((ompi_coll_tuned_stream, "alg_id %3d\tcom_id %3d\tcom_size %3d\t", com_p->alg_rule_id, com_p->com_rule_id, com_p->mpi_comsize)); - - if (!com_p->n_msg_sizes) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"no msgsizes defined\n")); + if (!com_p->n_rules) { + opal_output(ompi_coll_tuned_stream,"\t\tno message size rules defined\n"); return (0); } - OPAL_OUTPUT((ompi_coll_tuned_stream,"number of message sizes %3d\n", com_p->n_msg_sizes)); + opal_output(ompi_coll_tuned_stream, "\t\t[%s] comm_size: %d:%d, distro:%s -> %d message-size rules", + mca_coll_base_colltype_to_str(com_p->coll_id), com_p->mpi_comsize_min, com_p->mpi_comsize_max, + coll_rules_comm_rank_distro_to_str(com_p->comm_rank_distribution), com_p->n_rules); - for (i=0;in_msg_sizes;i++) { + for (i=0;in_rules;i++) { ompi_coll_tuned_dump_msg_rule (&(com_p->msg_rules[i])); } @@ -149,18 +169,16 @@ int ompi_coll_tuned_dump_alg_rule (ompi_coll_alg_rule_t* alg_p) int i; if (!alg_p) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Algorithm rule was a NULL ptr?!\n")); + opal_output(ompi_coll_tuned_stream,"ERROR: Algorithm rule was a NULL ptr?!\n"); return (-1); } - OPAL_OUTPUT((ompi_coll_tuned_stream,"alg_id %3d\t", alg_p->alg_rule_id)); - if (!alg_p->n_com_sizes) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"no coms defined\n")); + opal_output(ompi_coll_tuned_stream,"\tno rules defined\n"); return (0); } - OPAL_OUTPUT((ompi_coll_tuned_stream,"number of com sizes %3d\n", alg_p->n_com_sizes)); + opal_output(ompi_coll_tuned_stream,"\tnumber of communicator rules: %d\n", alg_p->n_com_sizes); for (i=0;in_com_sizes;i++) { ompi_coll_tuned_dump_com_rule (&(alg_p->com_rules[i])); @@ -170,19 +188,18 @@ int ompi_coll_tuned_dump_alg_rule (ompi_coll_alg_rule_t* alg_p) } -int ompi_coll_tuned_dump_all_rules (ompi_coll_alg_rule_t* alg_p, int n_rules) +int ompi_coll_tuned_dump_all_rules (ompi_coll_alg_rule_t* alg_p) { int i; if (!alg_p) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Algorithm rule was a NULL ptr?!\n")); + opal_output(ompi_coll_tuned_stream,"Algorithm rule was a NULL ptr?!\n"); return (-1); } - OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of algorithm rules %3d\n", n_rules)); - - for (i=0;in_msg_sizes) { + if (com_p->n_rules) { msg_p = com_p->msg_rules; if (!msg_p) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"attempt to free NULL msg_rules when msg count was %d\n", com_p->n_msg_sizes)); + opal_output_verbose(1, ompi_coll_tuned_stream, + "attempt to free NULL n_rules when msg count was %d\n", com_p->n_rules); rc = -1; /* some error */ } else { @@ -230,7 +249,7 @@ int ompi_coll_tuned_free_coms_in_alg_rule (ompi_coll_alg_rule_t* alg_p) ompi_coll_com_rule_t* com_p; if (!alg_p) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"attempt to free NULL alg_rule ptr\n")); + opal_output_verbose(1, ompi_coll_tuned_stream,"attempt to free NULL alg_rule ptr\n"); return (-1); } @@ -238,7 +257,7 @@ int ompi_coll_tuned_free_coms_in_alg_rule (ompi_coll_alg_rule_t* alg_p) com_p = alg_p->com_rules; if (!com_p) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"attempt to free NULL com_rules when com count was %d\n", alg_p->n_com_sizes)); + opal_output_verbose(1, ompi_coll_tuned_stream,"attempt to free NULL com_rules when com count was %d\n", alg_p->n_com_sizes); } else { /* ok, memory exists for the com rules so free their message rules first */ for( i = 0; i < alg_p->n_com_sizes; i++ ) { @@ -256,12 +275,12 @@ int ompi_coll_tuned_free_coms_in_alg_rule (ompi_coll_alg_rule_t* alg_p) } -int ompi_coll_tuned_free_all_rules (ompi_coll_alg_rule_t* alg_p, int n_algs) +int ompi_coll_tuned_free_all_rules (ompi_coll_alg_rule_t* alg_p) { int i; int rc = 0; - for( i = 0; i < n_algs; i++ ) { + for( i = 0; i < COLLCOUNT; i++ ) { rc += ompi_coll_tuned_free_coms_in_alg_rule (&(alg_p[i])); } @@ -279,50 +298,74 @@ int ompi_coll_tuned_free_all_rules (ompi_coll_alg_rule_t* alg_p, int n_algs) /* * This function is used to get the pointer to the nearest (less than or equal) - * com rule for this MPI collective (alg_id) for a given + * com rule for this MPI collective (coll_id) for a given * MPI communicator size. The complete rule base must be presented. * * If no rule exits returns NULL, else the com rule ptr * (which can be used in the coll_tuned_get_target_method_params() call) * */ -ompi_coll_com_rule_t* ompi_coll_tuned_get_com_rule_ptr (ompi_coll_alg_rule_t* rules, int alg_id, int mpi_comsize) +ompi_coll_com_rule_t* ompi_coll_tuned_get_com_rule_ptr (ompi_coll_alg_rule_t* rules, int coll_id, struct ompi_communicator_t *comm) { ompi_coll_alg_rule_t* alg_p = (ompi_coll_alg_rule_t*) NULL; ompi_coll_com_rule_t* com_p = (ompi_coll_com_rule_t*) NULL; ompi_coll_com_rule_t* best_com_p = (ompi_coll_com_rule_t*) NULL; int i; + int mpi_comsize; if (!rules) { /* no rule base no resulting com rule */ return ((ompi_coll_com_rule_t*)NULL); } - alg_p = &(rules[alg_id]); /* get the algorithm rule pointer */ + alg_p = &(rules[coll_id]); /* get the algorithm rule pointer */ if (!alg_p->n_com_sizes) { /* check for count of communicator sizes */ return ((ompi_coll_com_rule_t*)NULL); /* no com sizes so no rule */ } - /* ok have some com sizes, now to find the one closest to my mpi_comsize */ + if (OMPI_COMM_IS_INTER(comm)) { + mpi_comsize = ompi_comm_remote_size(comm); + } else { + mpi_comsize = ompi_comm_size(comm); + } - /* make a copy of the first com rule */ - best_com_p = com_p = alg_p->com_rules; - i = 0; + for (i=0; in_com_sizes; i++) { + com_p = &alg_p->com_rules[i]; + if (com_p->comm_rank_distribution == COLL_RULES_DISTRO_DISJOINT) { + if (!OMPI_COMM_IS_DISJOINT_SET(comm) || !OMPI_COMM_IS_DISJOINT(comm) ) { + /* rule says disjoint, and comm is not disjoint, skip this rule */ + continue; + } + } + if (com_p->comm_rank_distribution == COLL_RULES_DISTRO_SINGLENODE) { + int local_peers = ompi_group_count_local_peers(comm->c_local_group); + int all_peers = ompi_comm_size(comm); + if (local_peers != all_peers) { + /* rule says single-node, but we have non-local peers, skip this rule */ + continue; + } + } - while( i < alg_p->n_com_sizes ) { - if (com_p->mpi_comsize > mpi_comsize) { + if (com_p->mpi_comsize_min <= mpi_comsize && + mpi_comsize <= com_p->mpi_comsize_max ) { + best_com_p = com_p; break; } - best_com_p = com_p; - /* go to the next entry */ - com_p++; - i++; } - OPAL_OUTPUT((ompi_coll_tuned_stream,"Selected the following com rule id %d\n", best_com_p->com_rule_id)); - ompi_coll_tuned_dump_com_rule (best_com_p); + if ( opal_output_check_verbosity(25, ompi_coll_tuned_stream) ) { + if (!best_com_p) { + opal_output( ompi_coll_tuned_stream, "coll:tuned:dynamic: For %s with communicator size %d " + "no matching rule found. Using fixed defaults.", + mca_coll_base_colltype_to_str(coll_id), mpi_comsize); + } else { + opal_output(ompi_coll_tuned_stream,"coll:tuned:dynamic: For %s with communicator size %d selected rule number %d\n", + mca_coll_base_colltype_to_str(coll_id), mpi_comsize, i+1); + ompi_coll_tuned_dump_com_rule(best_com_p); + } + } - return (best_com_p); + return best_com_p; } /* @@ -348,34 +391,38 @@ int ompi_coll_tuned_get_target_method_params (ompi_coll_com_rule_t* base_com_rul int i; /* No rule or zero rules */ - if( (NULL == base_com_rule) || (0 == base_com_rule->n_msg_sizes)) { + if( (NULL == base_com_rule) || (0 == base_com_rule->n_rules)) { return (0); } - /* ok have some msg sizes, now to find the one closest to my mpi_msgsize */ - - /* make a copy of the first msg rule */ - best_msg_p = msg_p = base_com_rule->msg_rules; - i = 0; - - while (in_msg_sizes) { - /* OPAL_OUTPUT((ompi_coll_tuned_stream,"checking mpi_msgsize %d against com_id %d msg_id %d index %d msg_size %d", */ - /* mpi_msgsize, msg_p->com_rule_id, msg_p->msg_rule_id, i, msg_p->msg_size)); */ - if (msg_p->msg_size <= mpi_msgsize) { + /* search for the first comm rule that matches */ + for(i=0; in_rules; i++) { + msg_p = &base_com_rule->msg_rules[i]; + if (msg_p->msg_size_min <= mpi_msgsize && + mpi_msgsize <= msg_p->msg_size_max ) { best_msg_p = msg_p; - /* OPAL_OUTPUT((ompi_coll_tuned_stream(":ok\n")); */ - } - else { - /* OPAL_OUTPUT((ompi_coll_tuned_stream(":nop\n")); */ break; } - /* go to the next entry */ - msg_p++; - i++; } - OPAL_OUTPUT((ompi_coll_tuned_stream,"Selected the following msg rule id %d\n", best_msg_p->msg_rule_id)); - ompi_coll_tuned_dump_msg_rule (best_msg_p); + if ( opal_output_check_verbosity(25, ompi_coll_tuned_stream) ) { + if (!best_msg_p) { + opal_output(ompi_coll_tuned_stream, + "coll:tuned:dynamic For %s with msg_size=%lu: no matching rule, using fixed defaults.", + mca_coll_base_colltype_to_str(msg_p->coll_id), mpi_msgsize); + } else { + opal_output(ompi_coll_tuned_stream, + "coll:tuned:dynamic For %s with msg_size=%ld: selected rule number %d:", + mca_coll_base_colltype_to_str(msg_p->coll_id), mpi_msgsize, i+1); + ompi_coll_tuned_dump_msg_rule (best_msg_p); + } + } + + if (!best_msg_p) { + /* no match, use defaults */ + return 0; + } + /* return the segment size */ *result_topo_faninout = best_msg_p->result_topo_faninout; @@ -389,3 +436,28 @@ int ompi_coll_tuned_get_target_method_params (ompi_coll_com_rule_t* base_com_rul /* return the algorithm/method to use */ return (best_msg_p->result_alg); } + +static const char* coll_rules_comm_rank_distro_table[] = { + [COLL_RULES_DISTRO_ANY] = "any", + [COLL_RULES_DISTRO_DISJOINT] = "one-per-node", + [COLL_RULES_DISTRO_SINGLENODE] = "single-node", +}; + +const char* coll_rules_comm_rank_distro_to_str(enum comm_rank_distro_t distro) +{ + if( (distro < 0) || (distro >= COLL_RULES_DISTRO_COUNT) ) { + return NULL; + } + return coll_rules_comm_rank_distro_table[distro]; +} + +int coll_rules_comm_rank_distro_from_str(const char *name, enum comm_rank_distro_t *distro) +{ + for (int i=0; ialgorithm_param_index < 0) { return mca_param_indices->algorithm_param_index; @@ -92,8 +95,9 @@ int ompi_coll_tuned_exscan_intra_do_this(const void *sbuf, void* rbuf, size_t co mca_coll_base_module_t *module, int algorithm) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:exscan_intra_do_this selected algorithm %d", - algorithm)); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:exscan_intra_do_this selected algorithm %d", + algorithm)); switch (algorithm) { case (0): @@ -102,7 +106,8 @@ int ompi_coll_tuned_exscan_intra_do_this(const void *sbuf, void* rbuf, size_t co case (2): return ompi_coll_base_exscan_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm, module); } /* switch */ - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:exscan_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, ompi_coll_tuned_forced_max_algorithms[EXSCAN])); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:exscan_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", + algorithm, ompi_coll_tuned_forced_max_algorithms[EXSCAN])); return (MPI_ERR_ARG); } diff --git a/ompi/mca/coll/tuned/coll_tuned_gather_decision.c b/ompi/mca/coll/tuned/coll_tuned_gather_decision.c index 545db644b24..d356202a3bf 100644 --- a/ompi/mca/coll/tuned/coll_tuned_gather_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_gather_decision.c @@ -5,6 +5,8 @@ * reserved. * Copyright (c) 2015-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -81,6 +83,7 @@ ompi_coll_tuned_gather_intra_check_forced_init(coll_tuned_force_algorithm_mca_pa OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, &coll_tuned_gather_forced_algorithm); + coll_tuned_alg_register_options( GATHER, new_enum ); OBJ_RELEASE(new_enum); if (mca_param_indices->algorithm_param_index < 0) { return mca_param_indices->algorithm_param_index; @@ -129,7 +132,7 @@ ompi_coll_tuned_gather_intra_do_this(const void *sbuf, size_t scount, mca_coll_base_module_t *module, int algorithm, int faninout, int segsize) { - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:gather_intra_do_this selected algorithm %d topo faninout %d segsize %d", algorithm, faninout, segsize)); @@ -152,7 +155,7 @@ ompi_coll_tuned_gather_intra_do_this(const void *sbuf, size_t scount, root, comm, module, segsize); } /* switch */ - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:gather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", algorithm, ompi_coll_tuned_forced_max_algorithms[GATHER])); return (MPI_ERR_ARG); diff --git a/ompi/mca/coll/tuned/coll_tuned_module.c b/ompi/mca/coll/tuned/coll_tuned_module.c index f83b3ecd9ea..eb4fb125380 100644 --- a/ompi/mca/coll/tuned/coll_tuned_module.c +++ b/ompi/mca/coll/tuned/coll_tuned_module.c @@ -14,6 +14,8 @@ * Copyright (c) 2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -61,7 +63,8 @@ ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority) { mca_coll_tuned_module_t *tuned_module; - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:module_tuned query called")); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:module_tuned query called")); /** * No support for inter-communicator yet. @@ -176,13 +179,14 @@ ompi_coll_tuned_forced_getvalues( enum COLLTYPE type, if( NULL != mca_coll_tuned_component.all_base_rules ) { \ (TMOD)->com_rules[(TYPE)] \ = ompi_coll_tuned_get_com_rule_ptr( mca_coll_tuned_component.all_base_rules, \ - (TYPE), size ); \ + (TYPE), comm ); \ if( NULL != (TMOD)->com_rules[(TYPE)] ) { \ need_dynamic_decision = 1; \ } \ } \ if( 1 == need_dynamic_decision ) { \ - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned: enable dynamic selection for "#TYPE)); \ + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, \ + "coll:tuned: enable dynamic selection for "#TYPE)); \ EXECUTE; \ } \ } while(0) @@ -194,18 +198,10 @@ static int tuned_module_enable( mca_coll_base_module_t *module, struct ompi_communicator_t *comm ) { - int size; mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t *) module; mca_coll_base_comm_t *data = NULL; - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init called.")); - - /* Allocate the data that hangs off the communicator */ - if (OMPI_COMM_IS_INTER(comm)) { - size = ompi_comm_remote_size(comm); - } else { - size = ompi_comm_size(comm); - } + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:module_init called.")); /** * we still malloc data as it is used by the TUNED modules @@ -225,7 +221,7 @@ tuned_module_enable( mca_coll_base_module_t *module, } if (ompi_coll_tuned_use_dynamic_rules) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init MCW & Dynamic")); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:module_init MCW & Dynamic")); /** * next dynamic state, recheck all forced rules as well @@ -304,7 +300,7 @@ tuned_module_enable( mca_coll_base_module_t *module, /* All done */ tuned_module->super.base_data = data; - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init Tuned is in use")); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,"coll:tuned:module_init Tuned is in use")); return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c b/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c index 8e279cacbee..6ae3c00f7d9 100644 --- a/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c @@ -5,6 +5,8 @@ * reserved. * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -87,6 +89,7 @@ int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_m OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, &coll_tuned_reduce_forced_algorithm); + coll_tuned_alg_register_options( REDUCE, new_enum ); OBJ_RELEASE(new_enum); if (mca_param_indices->algorithm_param_index < 0) { return mca_param_indices->algorithm_param_index; @@ -153,8 +156,9 @@ int ompi_coll_tuned_reduce_intra_do_this(const void *sbuf, void* rbuf, size_t co int algorithm, int faninout, int segsize, int max_requests ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this selected algorithm %d topo faninout %d segsize %d", - algorithm, faninout, segsize)); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:reduce_intra_do_this selected algorithm %d topo faninout %d segsize %d", + algorithm, faninout, segsize)); switch (algorithm) { case (0): return ompi_coll_tuned_reduce_intra_dec_fixed(sbuf, rbuf, count, dtype, @@ -183,7 +187,8 @@ int ompi_coll_tuned_reduce_intra_do_this(const void *sbuf, void* rbuf, size_t co segsize, max_requests, faninout); } /* switch */ - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE])); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", + algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE])); return (MPI_ERR_ARG); } diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c index b01be6fc17e..f4f6bdb7590 100644 --- a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c @@ -7,6 +7,8 @@ * Copyright (c) 2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -86,6 +88,7 @@ int ompi_coll_tuned_reduce_scatter_block_intra_check_forced_init (coll_tuned_for OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, &coll_tuned_reduce_scatter_block_forced_algorithm); + coll_tuned_alg_register_options( REDUCESCATTERBLOCK, new_enum ); OBJ_RELEASE(new_enum); if (mca_param_indices->algorithm_param_index < 0) { return mca_param_indices->algorithm_param_index; @@ -122,8 +125,9 @@ int ompi_coll_tuned_reduce_scatter_block_intra_do_this(const void *sbuf, void *r mca_coll_base_module_t *module, int algorithm, int faninout, int segsize) { - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_scatter_block_intra_do_this selected algorithm %d topo faninout %d segsize %d", - algorithm, faninout, segsize)); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:reduce_scatter_block_intra_do_this selected algorithm %d topo faninout %d segsize %d", + algorithm, faninout, segsize)); switch (algorithm) { case (0): return ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed(sbuf, rbuf, rcount, @@ -137,7 +141,8 @@ int ompi_coll_tuned_reduce_scatter_block_intra_do_this(const void *sbuf, void *r case (4): return ompi_coll_base_reduce_scatter_block_intra_butterfly(sbuf, rbuf, rcount, dtype, op, comm, module); } /* switch */ - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_scatter_block_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTERBLOCK])); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:reduce_scatter_block_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", + algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTERBLOCK])); return (MPI_ERR_ARG); } diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c index 69497fe908d..16747598b6e 100644 --- a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c @@ -5,6 +5,8 @@ * reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -83,6 +85,7 @@ int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_alg OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, &coll_tuned_reduce_scatter_forced_algorithm); + coll_tuned_alg_register_options( REDUCESCATTER, new_enum ); OBJ_RELEASE(new_enum); if (mca_param_indices->algorithm_param_index < 0) { return mca_param_indices->algorithm_param_index; @@ -129,8 +132,9 @@ int ompi_coll_tuned_reduce_scatter_intra_do_this(const void *sbuf, void* rbuf, mca_coll_base_module_t *module, int algorithm, int faninout, int segsize) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d", - algorithm, faninout, segsize)); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:reduce_scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d", + algorithm, faninout, segsize)); switch (algorithm) { case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed(sbuf, rbuf, rcounts, @@ -144,7 +148,8 @@ int ompi_coll_tuned_reduce_scatter_intra_do_this(const void *sbuf, void* rbuf, case (4): return ompi_coll_base_reduce_scatter_intra_butterfly(sbuf, rbuf, rcounts, dtype, op, comm, module); } /* switch */ - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER])); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:reduce_scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", + algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER])); return (MPI_ERR_ARG); } diff --git a/ompi/mca/coll/tuned/coll_tuned_scan_decision.c b/ompi/mca/coll/tuned/coll_tuned_scan_decision.c index 05d34ab948a..903e76c4694 100644 --- a/ompi/mca/coll/tuned/coll_tuned_scan_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_scan_decision.c @@ -5,6 +5,8 @@ * Copyright (c) 2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -77,6 +79,7 @@ int ompi_coll_tuned_scan_intra_check_forced_init (coll_tuned_force_algorithm_mca OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, &coll_tuned_scan_forced_algorithm); + coll_tuned_alg_register_options( SCAN, new_enum ); OBJ_RELEASE(new_enum); if (mca_param_indices->algorithm_param_index < 0) { return mca_param_indices->algorithm_param_index; @@ -92,8 +95,9 @@ int ompi_coll_tuned_scan_intra_do_this(const void *sbuf, void* rbuf, size_t coun mca_coll_base_module_t *module, int algorithm) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:scan_intra_do_this selected algorithm %d", - algorithm)); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:scan_intra_do_this selected algorithm %d", + algorithm)); switch (algorithm) { case (0): @@ -102,7 +106,8 @@ int ompi_coll_tuned_scan_intra_do_this(const void *sbuf, void* rbuf, size_t coun case (2): return ompi_coll_base_scan_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm, module); } /* switch */ - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:scan_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, ompi_coll_tuned_forced_max_algorithms[SCAN])); + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, + "coll:tuned:scan_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", + algorithm, ompi_coll_tuned_forced_max_algorithms[SCAN])); return (MPI_ERR_ARG); } diff --git a/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c b/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c index 83df153cbef..b1449b2955c 100644 --- a/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c @@ -6,6 +6,8 @@ * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2019 Mellanox Technologies. All rights reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -82,6 +84,7 @@ ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_p OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, &coll_tuned_scatter_forced_algorithm); + coll_tuned_alg_register_options( SCATTER, new_enum ); OBJ_RELEASE(new_enum); if (mca_param_indices->algorithm_param_index < 0) { return mca_param_indices->algorithm_param_index; @@ -162,7 +165,7 @@ ompi_coll_tuned_scatter_intra_do_this(const void *sbuf, size_t scount, mca_coll_base_module_t *module, int algorithm, int faninout, int segsize) { - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d", algorithm, faninout, segsize)); @@ -185,7 +188,7 @@ ompi_coll_tuned_scatter_intra_do_this(const void *sbuf, size_t scount, root, comm, module, ompi_coll_tuned_scatter_blocking_send_ratio); } /* switch */ - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", algorithm, ompi_coll_tuned_forced_max_algorithms[SCATTER])); return MPI_ERR_ARG; diff --git a/opal/util/json/opal_json.c b/opal/util/json/opal_json.c index 57109b369a4..a496b5916eb 100644 --- a/opal/util/json/opal_json.c +++ b/opal/util/json/opal_json.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. + * Copyright (c) 2024-2025 Amazon.com, Inc. or its affiliates. * All Rights reserved. * $COPYRIGHT$ * @@ -19,8 +19,6 @@ #define CHECK_OBJ_TYPE(obj, expected) \ do { \ if ((obj)->type != (expected)) { \ - opal_show_help("help-json.txt", "Invalid argument type", true, __func__, #expected, \ - expected, (obj)->type); \ return OPAL_ERROR; \ } \ } while (0) @@ -97,7 +95,6 @@ int opal_json_load(const char *str, const size_t len, const opal_json_t **json) json_value *value = json_parse(str, len); if (!value) { - opal_show_help("help-json.txt", "Invalid JSON string", true); ret = OPAL_ERROR; goto out; } @@ -114,7 +111,7 @@ int opal_json_load(const char *str, const size_t len, const opal_json_t **json) return ret; } -int opal_json_load_file(const char *filename, const opal_json_t **json) +int opal_json_load_file(const char *filename, const opal_json_t **json, int show_errors) { FILE *fp = NULL; size_t file_size; @@ -123,7 +120,9 @@ int opal_json_load_file(const char *filename, const opal_json_t **json) fp = fopen(filename, "r"); if (fp == NULL) { - opal_show_help("help-json.txt", "Unable to open file", true, filename); + if (show_errors) { + opal_show_help("help-json.txt", "Unable to open file", true, filename); + } ret = OPAL_ERROR; goto out; } @@ -134,18 +133,25 @@ int opal_json_load_file(const char *filename, const opal_json_t **json) file_contents = (char *) malloc(file_size); if (!file_contents) { - opal_show_help("help-json.txt", "Memory allocation failure", true); + if (show_errors) { + opal_show_help("help-json.txt", "Memory allocation failure", true); + } ret = OPAL_ERROR; goto out; } if (file_size > fread(file_contents, 1, file_size, fp)) { - opal_show_help("help-json.txt", "Unable to read file", true, filename); + if (show_errors) { + opal_show_help("help-json.txt", "Unable to read file", true, filename); + } ret = OPAL_ERROR; goto out; } ret = opal_json_load(file_contents, file_size, json); + if (ret != OPAL_SUCCESS && show_errors) { + opal_show_help("help-json.txt", "Invalid JSON string", true); + } out: if (fp) { @@ -186,6 +192,33 @@ int opal_json_get_key(const opal_json_t *json, const char *key, const opal_json_ return ret; } +int opal_json_get_key_by_index(const opal_json_t *json, const size_t index, const char **key, const opal_json_t **out) +{ + int ret = OPAL_ERROR; + + CHECK_OBJ_TYPE(json, OPAL_JSON_OBJECT); + + opal_json_internal_t *in = (opal_json_internal_t *) json; + opal_json_internal_t *result = NULL; + + json_object_entry entry = {0}; + + if (index < 0 || index >= in->value->u.object.length) { + opal_show_help("help-json.txt", "Index out of bound", true, index, + in->value->u.array.length); + return ret; + } + entry = in->value->u.object.values[index]; + *key = entry.name; + ret = opal_json_internal_new(entry.value, &result); + if (OPAL_SUCCESS == ret) { + *out = (opal_json_t *) result; + } else if (result) { + opal_json_internal_free(result); + } + return ret; +} + void opal_json_free(const opal_json_t **json) { opal_json_internal_free((struct opal_json_internal_t *) *json); diff --git a/opal/util/json/opal_json.h b/opal/util/json/opal_json.h index a29642e6a9c..d990029e661 100644 --- a/opal/util/json/opal_json.h +++ b/opal/util/json/opal_json.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. + * Copyright (c) 2024-2025 Amazon.com, Inc. or its affiliates. * All Rights reserved. * $COPYRIGHT$ * @@ -21,7 +21,7 @@ const opal_json_t *json = NULL, *item = NULL; char *val; size_t len; -ret = opal_json_load_file("my_file.json", &json); // Parse the file content into json +ret = opal_json_load_file("my_file.json", &json, 1); // Parse the file content into json if (OPAL_SUCCESS != ret) { goto out; } @@ -88,7 +88,7 @@ OPAL_DECLSPEC int opal_json_load(const char *str, const size_t len, const opal_j * @returns OPAL_SUCCESS if the file is read and parsed successfully * OPAL_ERROR otherwise */ -OPAL_DECLSPEC int opal_json_load_file(const char *filename, const opal_json_t **json); +OPAL_DECLSPEC int opal_json_load_file(const char *filename, const opal_json_t **json, int show_errors); /** * Free JSON resources @@ -123,6 +123,22 @@ OPAL_DECLSPEC int opal_json_get_key(const opal_json_t *json, const char *key, OPAL_DECLSPEC int opal_json_get_index(const opal_json_t *json, const size_t index, const opal_json_t **out); +/** + * Get the JSON object and key at index from a JSON object + * + * @param[in] json Parent JSON array + * @param[in] index Index value + * @param[out] key The name of the key at the given index. This pointer + * remains valid for the lifetime of the parent JSON + * object. The caller should not free it. + * @param[out] out Output JSON object at the specified index. This object + * should be freed by caller with opal_json_free. + */ +OPAL_DECLSPEC int +opal_json_get_key_by_index( const opal_json_t *json, const size_t index, + const char **key, const opal_json_t **out); + + /** * Get the number of objects in a container-type value, i.e. object, array. * diff --git a/test/util/opal_json.c b/test/util/opal_json.c index 13611b4ba2e..ed914dfdf12 100644 --- a/test/util/opal_json.c +++ b/test/util/opal_json.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. + * Copyright (c) 2024-2025 Amazon.com, Inc. or its affiliates. * All Rights reserved. * $COPYRIGHT$ * @@ -56,7 +56,7 @@ static int load_json(const char *string, enum INPUT_TYPE input_type, const opal_ fclose(fp); close(fd); /* Load the input string from the temporary file */ - ret = opal_json_load_file(filename, json); + ret = opal_json_load_file(filename, json, 1); /* Remember to delete the file */ (void) remove(filename); break; @@ -129,7 +129,9 @@ static void test_valid_json(void) int ret = 0; size_t size; const opal_json_t *json = NULL, *a = NULL, *b = NULL, *b0 = NULL, *b1 = NULL, *c = NULL, - *d = NULL; + *d = NULL, *dummy = NULL; + const char *found_key; + /** * Human readable form: * { @@ -208,6 +210,21 @@ static void test_valid_json(void) test_json_double_val(d, 3.456); } + ret = opal_json_get_key_by_index(json, 3, &found_key, &d); + if (ret) { + test_failure("opal_json_get_key_by_index failed when called with a valid index"); + } else { + if ( 0 != strcmp(found_key, "d") ) { + test_failure("opal_json_get_key_by_index returned the wrong key value"); + } + test_json_double_val(d, 3.456); + } + + ret = opal_json_get_key_by_index(json, 4, &found_key, &dummy); + if (ret == OPAL_SUCCESS) { + test_failure("opal_json_get_key_by_index returned OPAL_SUCCESS when passed invalid index."); + } + /* JSON objects can be released in any order */ FREE_JSON(a); FREE_JSON(b);