46
46
#include "opal/mca/threads/mutex.h"
47
47
#include "opal/util/bit_ops.h"
48
48
#include "opal/util/output.h"
49
+ #include "opal/util/show_help.h"
49
50
#include "ompi/mca/topo/topo.h"
50
51
#include "ompi/mca/topo/base/base.h"
51
52
#include "ompi/dpm/dpm.h"
@@ -79,6 +80,21 @@ static const ompi_comm_split_type_hw_guided_t ompi_comm_split_type_hw_guided_sup
79
80
{.info_value = NULL },
80
81
};
81
82
83
+ static const char * ompi_comm_split_type_to_str (int split_type ) {
84
+ for (int i = 0 ; NULL != ompi_comm_split_type_hw_guided_support [i ].info_value ; ++ i ) {
85
+ if (split_type == ompi_comm_split_type_hw_guided_support [i ].split_type ) {
86
+ return ompi_comm_split_type_hw_guided_support [i ].info_value ;
87
+ }
88
+ }
89
+ if (MPI_COMM_TYPE_HW_GUIDED == split_type ) {
90
+ return "MPI_COMM_TYPE_HW_GUIDED" ;
91
+ }
92
+ else if (MPI_COMM_TYPE_HW_UNGUIDED == split_type ) {
93
+ return "MPI_COMM_TYPE_HW_UNGUIDED" ;
94
+ }
95
+ return "Unknown" ;
96
+ }
97
+
82
98
/*
83
99
** sort-function for MPI_Comm_split
84
100
*/
@@ -793,7 +809,11 @@ static int ompi_comm_split_type_get_part (ompi_group_t *group, const int split_t
793
809
* We should not get here as the split type will be changed
794
810
* at a higher level.
795
811
*/
796
- opal_output (0 , "Error: in ompi_comm_split_type_get_part() unexpected split_type=%d" , split_type );
812
+ opal_show_help ("help-comm.txt" ,
813
+ "unexpected-split-type" ,
814
+ true,
815
+ ompi_comm_split_type_to_str (split_type ),
816
+ split_type );
797
817
return OMPI_ERR_BAD_PARAM ;
798
818
}
799
819
@@ -868,9 +888,11 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
868
888
ompi_communicator_t * newcomp = MPI_COMM_NULL ;
869
889
int my_size , my_rsize = 0 , mode , inter ;
870
890
int * lranks = NULL , * rranks = NULL ;
871
- int global_split_type , global_orig_split_type , ok , tmp [6 ];
891
+ int global_split_type , global_orig_split_type , ok [ 2 ] , tmp [6 ];
872
892
int rc ;
873
893
int orig_split_type = split_type ;
894
+ int flag ;
895
+ opal_cstring_t * value = NULL ;
874
896
875
897
/* silence clang warning. newcomm should never be NULL */
876
898
if (OPAL_UNLIKELY (NULL == newcomm )) {
@@ -881,9 +903,6 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
881
903
882
904
/* Step 0: Convert MPI_COMM_TYPE_HW_GUIDED to the internal type */
883
905
if (MPI_COMM_TYPE_HW_GUIDED == split_type ) {
884
- int flag ;
885
- opal_cstring_t * value = NULL ;
886
-
887
906
opal_info_get (info , "mpi_hw_resource_type" , & value , & flag );
888
907
/* If key is not in the 'info', then return MPI_COMM_NULL.
889
908
* This is caught at the MPI interface level, but it doesn't hurt to
@@ -941,26 +960,39 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
941
960
global_orig_split_type = tmp [0 ];
942
961
global_split_type = tmp [4 ];
943
962
944
- if (tmp [0 ] != - tmp [1 ] || inter ) {
963
+ if (tmp [0 ] != - tmp [1 ] || tmp [ 4 ] != - tmp [ 5 ] || inter ) {
945
964
/* at least one rank supplied a different split type check if our split_type is ok */
946
- ok = (MPI_UNDEFINED == orig_split_type ) || global_orig_split_type == orig_split_type ;
965
+ ok [0 ] = (MPI_UNDEFINED == orig_split_type ) || global_orig_split_type == orig_split_type ;
966
+ ok [1 ] = (MPI_UNDEFINED == orig_split_type ) || global_split_type == split_type ;
947
967
948
- rc = comm -> c_coll -> coll_allreduce (MPI_IN_PLACE , & ok , 1 , MPI_INT , MPI_MIN , comm ,
968
+ rc = comm -> c_coll -> coll_allreduce (MPI_IN_PLACE , & ok , 2 , MPI_INT , MPI_MIN , comm ,
949
969
comm -> c_coll -> coll_allreduce_module );
950
970
if (OPAL_UNLIKELY (OMPI_SUCCESS != rc )) {
951
971
return rc ;
952
972
}
953
973
954
974
if (inter ) {
955
975
/* need an extra allreduce to ensure that all ranks have the same result */
956
- rc = comm -> c_coll -> coll_allreduce (MPI_IN_PLACE , & ok , 1 , MPI_INT , MPI_MIN , comm ,
976
+ rc = comm -> c_coll -> coll_allreduce (MPI_IN_PLACE , & ok , 2 , MPI_INT , MPI_MIN , comm ,
957
977
comm -> c_coll -> coll_allreduce_module );
958
978
if (OPAL_UNLIKELY (OMPI_SUCCESS != rc )) {
959
979
return rc ;
960
980
}
961
981
}
962
982
963
- if (OPAL_UNLIKELY (!ok )) {
983
+ if (OPAL_UNLIKELY (!ok [0 ] || !ok [1 ])) {
984
+ if (0 == ompi_comm_rank (comm )) {
985
+ opal_info_get (info , "mpi_hw_resource_type" , & value , & flag );
986
+ if (!flag ) {
987
+ value = NULL ;
988
+ }
989
+ opal_show_help ("help-comm.txt" ,
990
+ "mismatched-split_type-values" ,
991
+ true,
992
+ ompi_comm_split_type_to_str (orig_split_type ),
993
+ orig_split_type ,
994
+ NULL == value ? "" : value -> string );
995
+ }
964
996
return OMPI_ERR_BAD_PARAM ;
965
997
}
966
998
@@ -978,14 +1010,6 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
978
1010
return OMPI_SUCCESS ;
979
1011
}
980
1012
981
- /* MPI_COMM_TYPE_HW_GUIDED: Check if 'value' the same at all ranks */
982
- if (tmp [4 ] != - tmp [5 ]) {
983
- if (0 == ompi_comm_rank (comm )) {
984
- opal_output (0 , "Error: Mismatched info values for MPI_COMM_TYPE_HW_GUIDED" );
985
- }
986
- return OMPI_ERR_BAD_PARAM ;
987
- }
988
-
989
1013
/* TODO: Make this better...
990
1014
*
991
1015
* See Example 7.4 in the MPI 4.0 standard for example usage.
0 commit comments