|
57 | 57 |
|
58 | 58 | #include "ompi/runtime/params.h"
|
59 | 59 |
|
#if MPI_VERSION >= 4
/*
 * One row of the MPI_COMM_TYPE_HW_GUIDED lookup table: pairs a hardware
 * resource type name, as supplied through an info value, with the split
 * type that is used in its place.
 */
typedef struct ompi_comm_split_type_hw_guided_t {
    const char *info_value; /* resource type name; NULL marks the end of the table */
    int split_type;         /* split type substituted when info_value matches */
} ompi_comm_split_type_hw_guided_t;

/*
 * Resource type names accepted for MPI_COMM_TYPE_HW_GUIDED.
 *
 * The array is terminated by a row whose info_value is NULL, so it can be
 * walked without knowing its length. Keep the rows in this order: the
 * lookup scans from the top and takes the first row that matches.
 */
static const ompi_comm_split_type_hw_guided_t ompi_comm_split_type_hw_guided_support[] = {
    {"mpi_shared_memory", MPI_COMM_TYPE_SHARED},
    {"hwthread", OMPI_COMM_TYPE_HWTHREAD},
    {"core", OMPI_COMM_TYPE_CORE},
    {"l1cache", OMPI_COMM_TYPE_L1CACHE},
    {"l2cache", OMPI_COMM_TYPE_L2CACHE},
    {"l3cache", OMPI_COMM_TYPE_L3CACHE},
    {"socket", OMPI_COMM_TYPE_SOCKET},
    {"numanode", OMPI_COMM_TYPE_NUMA},
    {"board", OMPI_COMM_TYPE_BOARD},
    {"host", OMPI_COMM_TYPE_HOST},
    {"cu", OMPI_COMM_TYPE_CU},
    {"cluster", OMPI_COMM_TYPE_CLUSTER},
    {NULL, 0}, /* end-of-table sentinel */
};
#endif
| 82 | + |
60 | 83 | /*
|
61 | 84 | ** sort-function for MPI_Comm_split
|
62 | 85 | */
|
@@ -764,6 +787,17 @@ static int ompi_comm_split_type_get_part (ompi_group_t *group, const int split_t
|
764 | 787 | case OMPI_COMM_TYPE_CLUSTER:
|
765 | 788 | include = OPAL_PROC_ON_LOCAL_CLUSTER(locality);
|
766 | 789 | break;
|
| 790 | +#if MPI_VERSION >= 4 |
| 791 | + case MPI_COMM_TYPE_HW_GUIDED: |
| 792 | + case MPI_COMM_TYPE_HW_UNGUIDED: |
| 793 | + /* |
| 794 | + * MPI_COMM_TYPE_HW_(UN)GUIDED handled in calling function. |
| 795 | + * We should not get here as the split type will be changed |
| 796 | + * at a higher level. |
| 797 | + */ |
| 798 | + opal_output(0, "Error: in ompi_comm_split_type_get_part() unexpected split_type=%d", split_type); |
| 799 | + return OMPI_ERR_BAD_PARAM; |
| 800 | +#endif |
767 | 801 | }
|
768 | 802 |
|
769 | 803 | if (include) {
|
@@ -899,6 +933,75 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
|
899 | 933 | return OMPI_SUCCESS;
|
900 | 934 | }
|
901 | 935 |
|
| 936 | +#if MPI_VERSION >= 4 |
| 937 | + if (MPI_COMM_TYPE_HW_GUIDED == global_split_type) { |
| 938 | + int flag; |
| 939 | + opal_cstring_t *value = NULL; |
| 940 | + |
| 941 | + opal_info_get(info, "mpi_hw_resource_type", &value, &flag); |
| 942 | + /* If key is not in the 'info', then return MPI_COMM_NULL. |
| 943 | + * This is caught at the MPI interface level, but it doesn't hurt to |
| 944 | + * check it again. |
| 945 | + */ |
| 946 | + if (!flag) { |
| 947 | + *newcomm = MPI_COMM_NULL; |
| 948 | + return OMPI_SUCCESS; |
| 949 | + } |
| 950 | + |
| 951 | + /* Verify the value associated with the "mpi_hw_resource_type" key |
| 952 | + * - is supported, and |
| 953 | + * - is the same value at all ranks |
| 954 | + * |
| 955 | + * If not supported, then return MPI_COMM_NULL. |
| 956 | + * If not the same at all ranks, throw an error. |
| 957 | + */ |
| 958 | + flag = 0; |
| 959 | + for (int i = 0; ompi_comm_split_type_hw_guided_support[i].info_value; ++i) { |
| 960 | + if (0 == strncasecmp(value->string, ompi_comm_split_type_hw_guided_support[i].info_value, strlen(ompi_comm_split_type_hw_guided_support[i].info_value))) { |
| 961 | + global_split_type = ompi_comm_split_type_hw_guided_support[i].split_type; |
| 962 | + flag = 1; |
| 963 | + break; |
| 964 | + } |
| 965 | + } |
| 966 | + /* If not supported, then return MPI_COMM_NULL. */ |
| 967 | + if (0 == flag) { |
| 968 | + *newcomm = MPI_COMM_NULL; |
| 969 | + return OMPI_SUCCESS; |
| 970 | + } |
| 971 | + |
| 972 | + /* Verify all ranks have supplied the same info 'value' represented at |
| 973 | + * this point by global_split_type since we swapped out 'MPI_COMM_TYPE_HW_GUIDED' |
| 974 | + * for the specific OMPI_COMM_TYPE_ represented by the info 'value'. |
| 975 | + */ |
| 976 | + tmp[0] = global_split_type; |
| 977 | + tmp[1] = -global_split_type; |
| 978 | + rc = comm->c_coll->coll_allreduce (MPI_IN_PLACE, &tmp, 2, MPI_INT, MPI_MAX, comm, |
| 979 | + comm->c_coll->coll_allreduce_module); |
| 980 | + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { |
| 981 | + return rc; |
| 982 | + } |
| 983 | + /* If not the same at all ranks, throw an error. */ |
| 984 | + if (tmp[0] != -tmp[1]) { |
| 985 | + if (0 == ompi_comm_rank(comm)) { |
| 986 | + opal_output(0, "Error: Mismatched info values for MPI_COMM_TYPE_HW_GUIDED"); |
| 987 | + } |
| 988 | + return OMPI_ERR_BAD_PARAM; |
| 989 | + } |
| 990 | + } |
| 991 | + |
| 992 | + /* TODO: Make this better... |
| 993 | + * |
| 994 | + * See Example 7.4 in the MPI 4.0 standard for example usage. |
| 995 | + * |
| 996 | + * Stage 0: Recognized, but not implemented. |
| 997 | + * Stage 1: Do better than that |
| 998 | + */ |
| 999 | + if (MPI_COMM_TYPE_HW_UNGUIDED == global_split_type) { |
| 1000 | + *newcomm = MPI_COMM_NULL; |
| 1001 | + return OMPI_SUCCESS; |
| 1002 | + } |
| 1003 | +#endif |
| 1004 | + |
902 | 1005 | /* Step 2: Build potential communicator groups. If any ranks will not be part of
|
903 | 1006 | * the ultimate communicator we will drop them later. This saves doing an extra
|
904 | 1007 | * allgather on the whole communicator. By using ompi_comm_split() later only
|
|
0 commit comments