Skip to content

Commit 174b095

Browse files
authored
Merge pull request #10702 from jjhursey/fix-split-guided-undef
comm_split_type HW_GUIDED fix MPI_UNDEFINED handling
2 parents c97c4ee + 6ddcc58 commit 174b095

File tree

2 files changed

+47
-18
lines changed

2 files changed

+47
-18
lines changed

ompi/communicator/comm.c

Lines changed: 42 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
#include "opal/mca/threads/mutex.h"
4747
#include "opal/util/bit_ops.h"
4848
#include "opal/util/output.h"
49+
#include "opal/util/show_help.h"
4950
#include "ompi/mca/topo/topo.h"
5051
#include "ompi/mca/topo/base/base.h"
5152
#include "ompi/dpm/dpm.h"
@@ -79,6 +80,21 @@ static const ompi_comm_split_type_hw_guided_t ompi_comm_split_type_hw_guided_sup
7980
{.info_value = NULL},
8081
};
8182

83+
/*
 * Translate a split_type value into a printable string for diagnostics.
 *
 * Lookup order:
 *   1. the ompi_comm_split_type_hw_guided_support[] table, returning the
 *      info_value of the first entry whose split_type matches;
 *   2. the names of MPI_COMM_TYPE_HW_GUIDED / MPI_COMM_TYPE_HW_UNGUIDED;
 *   3. the literal "Unknown" when nothing matches.
 *
 * The returned pointer is either a table entry's info_value or a string
 * literal; nothing is allocated here.
 */
static const char * ompi_comm_split_type_to_str(int split_type) {
84+
/* The table is terminated by an entry whose info_value is NULL. */
for (int i = 0; NULL != ompi_comm_split_type_hw_guided_support[i].info_value; ++i) {
85+
if (split_type == ompi_comm_split_type_hw_guided_support[i].split_type) {
86+
return ompi_comm_split_type_hw_guided_support[i].info_value;
87+
}
88+
}
89+
/* Not found in the table: fall back to the two HW_* names checked below. */
if (MPI_COMM_TYPE_HW_GUIDED == split_type) {
90+
return "MPI_COMM_TYPE_HW_GUIDED";
91+
}
92+
else if (MPI_COMM_TYPE_HW_UNGUIDED == split_type) {
93+
return "MPI_COMM_TYPE_HW_UNGUIDED";
94+
}
95+
/* No table entry and neither HW_* constant matched. */
return "Unknown";
96+
}
97+
8298
/*
8399
** sort-function for MPI_Comm_split
84100
*/
@@ -793,7 +809,11 @@ static int ompi_comm_split_type_get_part (ompi_group_t *group, const int split_t
793809
* We should not get here as the split type will be changed
794810
* at a higher level.
795811
*/
796-
opal_output(0, "Error: in ompi_comm_split_type_get_part() unexpected split_type=%d", split_type);
812+
opal_show_help("help-comm.txt",
813+
"unexpected-split-type",
814+
true,
815+
ompi_comm_split_type_to_str(split_type),
816+
split_type);
797817
return OMPI_ERR_BAD_PARAM;
798818
}
799819

@@ -868,9 +888,11 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
868888
ompi_communicator_t *newcomp = MPI_COMM_NULL;
869889
int my_size, my_rsize = 0, mode, inter;
870890
int *lranks = NULL, *rranks = NULL;
871-
int global_split_type, global_orig_split_type, ok, tmp[6];
891+
int global_split_type, global_orig_split_type, ok[2], tmp[6];
872892
int rc;
873893
int orig_split_type = split_type;
894+
int flag;
895+
opal_cstring_t *value = NULL;
874896

875897
/* silence clang warning. newcomm should never be NULL */
876898
if (OPAL_UNLIKELY(NULL == newcomm)) {
@@ -881,9 +903,6 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
881903

882904
/* Step 0: Convert MPI_COMM_TYPE_HW_GUIDED to the internal type */
883905
if (MPI_COMM_TYPE_HW_GUIDED == split_type) {
884-
int flag;
885-
opal_cstring_t *value = NULL;
886-
887906
opal_info_get(info, "mpi_hw_resource_type", &value, &flag);
888907
/* If key is not in the 'info', then return MPI_COMM_NULL.
889908
* This is caught at the MPI interface level, but it doesn't hurt to
@@ -941,26 +960,39 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
941960
global_orig_split_type = tmp[0];
942961
global_split_type = tmp[4];
943962

944-
if (tmp[0] != -tmp[1] || inter) {
963+
if (tmp[0] != -tmp[1] || tmp[4] != -tmp[5] || inter) {
945964
/* at least one rank supplied a different split type; check if our split_type is ok */
946-
ok = (MPI_UNDEFINED == orig_split_type) || global_orig_split_type == orig_split_type;
965+
ok[0] = (MPI_UNDEFINED == orig_split_type) || global_orig_split_type == orig_split_type;
966+
ok[1] = (MPI_UNDEFINED == orig_split_type) || global_split_type == split_type;
947967

948-
rc = comm->c_coll->coll_allreduce (MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, comm,
968+
rc = comm->c_coll->coll_allreduce (MPI_IN_PLACE, &ok, 2, MPI_INT, MPI_MIN, comm,
949969
comm->c_coll->coll_allreduce_module);
950970
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
951971
return rc;
952972
}
953973

954974
if (inter) {
955975
/* need an extra allreduce to ensure that all ranks have the same result */
956-
rc = comm->c_coll->coll_allreduce (MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, comm,
976+
rc = comm->c_coll->coll_allreduce (MPI_IN_PLACE, &ok, 2, MPI_INT, MPI_MIN, comm,
957977
comm->c_coll->coll_allreduce_module);
958978
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
959979
return rc;
960980
}
961981
}
962982

963-
if (OPAL_UNLIKELY(!ok)) {
983+
if (OPAL_UNLIKELY(!ok[0] || !ok[1])) {
984+
if (0 == ompi_comm_rank(comm)) {
985+
opal_info_get(info, "mpi_hw_resource_type", &value, &flag);
986+
if (!flag) {
987+
value = NULL;
988+
}
989+
opal_show_help("help-comm.txt",
990+
"mismatched-split_type-values",
991+
true,
992+
ompi_comm_split_type_to_str(orig_split_type),
993+
orig_split_type,
994+
NULL == value ? "" : value->string);
995+
}
964996
return OMPI_ERR_BAD_PARAM;
965997
}
966998

@@ -978,14 +1010,6 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
9781010
return OMPI_SUCCESS;
9791011
}
9801012

981-
/* MPI_COMM_TYPE_HW_GUIDED: Check if 'value' the same at all ranks */
982-
if (tmp[4] != -tmp[5]) {
983-
if (0 == ompi_comm_rank(comm)) {
984-
opal_output(0, "Error: Mismatched info values for MPI_COMM_TYPE_HW_GUIDED");
985-
}
986-
return OMPI_ERR_BAD_PARAM;
987-
}
988-
9891013
/* TODO: Make this better...
9901014
*
9911015
* See Example 7.4 in the MPI 4.0 standard for example usage.

ompi/communicator/help-comm.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,8 @@ Comments
2626
MPI_Info_set warning, key is using a reserved prefix.
2727
Key: %s
2828
Reserved prefix: %s
29+
[mismatched-split_type-values]
30+
Detected mismatched split_type and/or info "mpi_hw_resource_type" values
31+
in a call to MPI_Comm_split_type between peers in the communicator.
32+
split_type: %s (%d)
33+
info: %s

0 commit comments

Comments
 (0)