From ddd0c2c09d837aa5fc92d5bfbbcbce61fafb964a Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Wed, 22 Jan 2025 19:33:43 +0000 Subject: [PATCH 1/3] info/info_memkind: add code to handle memkind info add code to handle the memkind info objects defined in MPI 4.1 Signed-off-by: Edgar Gabriel --- ompi/communicator/comm.c | 16 +- ompi/communicator/comm_init.c | 15 +- ompi/file/file.c | 4 +- ompi/info/Makefile.am | 6 +- ompi/info/info.c | 4 + ompi/info/info_memkind.c | 552 ++++++++++++++++++++++++++++++++++ ompi/info/info_memkind.h | 70 +++++ ompi/instance/instance.c | 14 + ompi/mpi/c/intercomm_merge.c | 4 + ompi/win/win.c | 4 +- 10 files changed, 682 insertions(+), 7 deletions(-) create mode 100644 ompi/info/info_memkind.c create mode 100644 ompi/info/info_memkind.h diff --git a/ompi/communicator/comm.c b/ompi/communicator/comm.c index bfb16202e43..079c3146271 100644 --- a/ompi/communicator/comm.c +++ b/ompi/communicator/comm.c @@ -26,7 +26,7 @@ * Copyright (c) 2021 Nanook Consulting. All rights reserved. * Copyright (c) 2018-2024 Triad National Security, LLC. All rights * reserved. - * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -56,6 +56,7 @@ #include "ompi/communicator/communicator.h" #include "ompi/mca/pml/pml.h" #include "ompi/request/request.h" +#include "ompi/info/info_memkind.h" #include "ompi/runtime/params.h" @@ -447,6 +448,7 @@ int ompi_comm_create_w_info (ompi_communicator_t *comm, ompi_group_t *group, opa if (info) { opal_info_dup(info, &(newcomp->super.s_info)); } + ompi_info_memkind_copy_or_set (&comm->instance->super, &newcomp->super, info); /* Set name for debugging purposes */ snprintf(newcomp->c_name, MPI_MAX_OBJECT_NAME, "MPI COMMUNICATOR %s CREATE FROM %s", @@ -699,10 +701,11 @@ int ompi_comm_split_with_info( ompi_communicator_t* comm, int color, int key, ompi_comm_print_cid (newcomp), ompi_comm_print_cid (comm)); /* Copy info if there is one */ + newcomp->super.s_info = OBJ_NEW(opal_info_t); if (info) { - newcomp->super.s_info = OBJ_NEW(opal_info_t); opal_info_dup(info, &(newcomp->super.s_info)); } + ompi_info_memkind_copy_or_set (&comm->instance->super, &newcomp->super, info); /* Activate the communicator and init coll-component */ rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode); @@ -994,6 +997,7 @@ static int ompi_comm_split_type_core(ompi_communicator_t *comm, if (info) { opal_infosubscribe_change_info(&newcomp->super, info); } + ompi_info_memkind_copy_or_set (&comm->instance->super, &newcomp->super, info); /* Activate the communicator and init coll-component */ rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode); @@ -1347,6 +1351,7 @@ int ompi_comm_dup_with_info ( ompi_communicator_t * comm, opal_info_t *info, omp if (info) { opal_infosubscribe_change_info(&newcomp->super, info); } + ompi_info_memkind_copy_or_set (&comm->instance->super, &newcomp->super, info); /* activate communicator and init coll-module */ rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode); @@ -1437,6 +1442,7 @@ static int ompi_comm_idup_internal (ompi_communicator_t *comm, 
ompi_group_t *gro if (info) { opal_info_dup(info, &(newcomp->super.s_info)); } + ompi_info_memkind_copy_or_set (&comm->super, &newcomp->super, info); } ompi_comm_request_schedule_append (request, ompi_comm_idup_getcid, subreq, subreq[0] ? 1 : 0); @@ -1588,6 +1594,7 @@ int ompi_comm_create_from_group (ompi_group_t *group, const char *tag, opal_info if (NULL == newcomp->super.s_info) { return OMPI_ERR_OUT_OF_RESOURCE; } + ompi_info_memkind_copy_or_set (&group->grp_instance->super, &newcomp->super, info); /* activate communicator and init coll-module. use the group allreduce implementation as * no collective module has yet been selected. the tag does not matter as any tag will @@ -1727,6 +1734,10 @@ int ompi_intercomm_create (ompi_communicator_t *local_comm, int local_leader, om return rc; } + // Copy info if there is one. + newcomp->super.s_info = OBJ_NEW(opal_info_t); + ompi_info_memkind_copy_or_set (&local_comm->instance->super, &newcomp->super, &ompi_mpi_info_null.info.super); + *newintercomm = newcomp; return OMPI_SUCCESS; @@ -1888,6 +1899,7 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead if (info) { opal_info_dup(info, &(newcomp->super.s_info)); } + ompi_info_memkind_copy_or_set (&local_group->grp_instance->super, &newcomp->super, info); /* activate communicator and init coll-module */ rc = ompi_comm_activate (&newcomp, local_comm, leader_comm, &local_leader, &leader_comm_remote_leader, diff --git a/ompi/communicator/comm_init.c b/ompi/communicator/comm_init.c index 498bf4a1e70..9f091974e7c 100644 --- a/ompi/communicator/comm_init.c +++ b/ompi/communicator/comm_init.c @@ -25,7 +25,7 @@ * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2018-2024 Triad National Security, LLC. All rights * reserved. - * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. * Copyright (c) 2023 NVIDIA Corporation. 
All rights reserved. * $COPYRIGHT$ * @@ -53,6 +53,7 @@ #include "ompi/dpm/dpm.h" #include "ompi/memchecker.h" #include "ompi/instance/instance.h" +#include "ompi/info/info_memkind.h" /* ** Table for Fortran <-> C communicator handle conversion @@ -266,6 +267,7 @@ int ompi_comm_init_mpi3 (void) free(str); } } + /* Setup MPI_COMM_SELF */ OBJ_CONSTRUCT(&ompi_mpi_comm_self, ompi_communicator_t); assert(ompi_mpi_comm_self.comm.c_f_to_c_index == 1); @@ -300,6 +302,17 @@ int ompi_comm_init_mpi3 (void) MPI_COMM_SELF, the keyhash will automatically be created. */ ompi_mpi_comm_self.comm.c_keyhash = NULL; + char *memkind_requested = getenv ("OMPI_MCA_mpi_memory_alloc_kinds"); + if (NULL != memkind_requested) { + char *memkind_provided; + + ompi_info_memkind_process (memkind_requested, &memkind_provided); + opal_infosubscribe_subscribe (&ompi_mpi_comm_world.comm.super, "mpi_memory_alloc_kinds", memkind_provided, ompi_info_memkind_cb); + opal_infosubscribe_subscribe (&ompi_mpi_comm_self.comm.super, "mpi_memory_alloc_kinds", memkind_provided, ompi_info_memkind_cb); + opal_infosubscribe_subscribe (&ompi_mpi_comm_world.comm.instance->super, "mpi_memory_alloc_kinds", memkind_provided, ompi_info_memkind_cb); + free (memkind_provided); + } + /* * finally here we set the predefined attribute keyvals */ diff --git a/ompi/file/file.c b/ompi/file/file.c index 9026fbea751..11e7a709d4f 100644 --- a/ompi/file/file.c +++ b/ompi/file/file.c @@ -18,6 +18,7 @@ * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2024 Triad National Security, LLC. All rights * reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -34,7 +35,7 @@ #include "ompi/runtime/params.h" #include "ompi/mca/io/base/base.h" #include "ompi/info/info.h" - +#include "ompi/info/info_memkind.h" opal_mutex_t ompi_mpi_file_bootstrap_mutex = OPAL_MUTEX_STATIC_INIT; @@ -123,6 +124,7 @@ int ompi_file_open(struct ompi_communicator_t *comm, const char *filename, if (info) { opal_info_dup(info, &(file->super.s_info)); } + ompi_info_memkind_copy_or_set (&comm->instance->super, &file->super, info); file->f_amode = amode; file->f_filename = strdup(filename); diff --git a/ompi/info/Makefile.am b/ompi/info/Makefile.am index e4af170dcf8..171d7877185 100644 --- a/ompi/info/Makefile.am +++ b/ompi/info/Makefile.am @@ -21,7 +21,9 @@ # This makefile.am does not stand on its own - it is included from ompi/Makefile.am headers += \ - info/info.h + info/info.h \ + info/info_memkind.h lib@OMPI_LIBMPI_NAME@_la_SOURCES += \ - info/info.c + info/info.c \ + info/info_memkind.c diff --git a/ompi/info/info.c b/ompi/info/info.c index 577910da840..15b4d50033f 100644 --- a/ompi/info/info.c +++ b/ompi/info/info.c @@ -51,6 +51,7 @@ #include "opal/util/info.h" #include "ompi/info/info.h" +#include "ompi/info/info_memkind.h" #include "ompi/runtime/mpiruntime.h" #include "ompi/runtime/params.h" #include "ompi/runtime/ompi_rte.h" @@ -351,6 +352,9 @@ int ompi_mpiinfo_finalize(void) } } + /* Release the array of available memkind objects */ + ompi_info_memkind_free_available(); + /* All done -- destroy the table */ OBJ_DESTRUCT(&ompi_info_f_to_c_table); diff --git a/ompi/info/info_memkind.c b/ompi/info/info_memkind.c new file mode 100644 index 00000000000..64b94aa1b4d --- /dev/null +++ b/ompi/info/info_memkind.c @@ -0,0 +1,552 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include +#include +#include +#include + +#include "info/info_memkind.h" +#include "opal/util/argv.h" +#include "opal/mca/accelerator/accelerator.h" +#include "opal/mca/accelerator/base/base.h" +#include "opal/util/printf.h" +#include "ompi/errhandler/errcode.h" +#include "ompi/constants.h" + +static opal_mutex_t ompi_info_memkind_mutex = OPAL_MUTEX_STATIC_INIT; +static ompi_memkind_t *ompi_info_memkind_available; +static int ompi_info_memkind_num_available = 0; + +#if 0 +static void ompi_info_memkind_dump (const char *var_name, int num_memkinds, ompi_memkind_t *memkinds) +{ + for (int i = 0; i < num_memkinds; i++) { + printf("[%d] %s memkind[%d].name: %s ", getpid(), var_name, i, memkinds[i].im_name); + if (memkinds[i].im_num_restrictors > 0) { + printf("restrictors: "); + for (int j = 0; j < memkinds[i].im_num_restrictors; j++) { + printf("%c %s", (j == 0 ? ' ': ','), memkinds[i].im_restrictors[j]); + } + } else { + printf("num restrictors = 0"); + } + printf(" no_restrictors = %s", memkinds[i].im_no_restrictors ? "true" : "false"); + printf("\n"); + } +} +#endif + +static void ompi_info_memkind_extract (const char* memkind_str, int *num_memkinds, ompi_memkind_t **memkinds) +{ + /* The memkind string is a comma-separated list of memkinds, which can have two forms: + ** - standalone memkind type, which implies that all restrictors of the memkind are requested + ** (or looking at it the other way around, no restrictions are imposed on that memory kind) + ** - memkind:restrictor + ** The same memkind type can appear multiple times, e.g. 
+ ** memkind_a:restrictor_1,memkind_a:restrictor_2; + */ + + /* Separate requested_str into an array of individual entries */ + char **memkind_combos = opal_argv_split(memkind_str, ','); + int max_num_memkinds = opal_argv_count(memkind_combos); + + ompi_memkind_t *memkind_arr = NULL; + memkind_arr = (ompi_memkind_t *) malloc(max_num_memkinds * sizeof(ompi_memkind_t)); + if (NULL == memkind_arr) { + goto err_exit; + } + for (int i = 0; i < max_num_memkinds; i++) { + memkind_arr[i].im_num_restrictors = 0; + memkind_arr[i].im_no_restrictors = true; + } + + int iter = 0; + char *m = memkind_combos[iter]; + int current_max = 0; + while (m != NULL) { + bool name_found = false; + char **tmp_str = opal_argv_split (m, ':'); + int pos; + + // Try to remove duplicates of the exact same name:restrictor appearance + for (int i = 0; i < current_max; i++) { + if (!strncmp(tmp_str[0], memkind_arr[i].im_name, strlen(tmp_str[0]))) { + name_found = true; + pos = i; + break; + } + } + + if (name_found) { + // check whether restrictor matches (if any present at either m + // or at memkind_arr[pos] ) + if (NULL != tmp_str[1] ) { + if (memkind_arr[pos].im_num_restrictors > 0) { + if (!strncmp(tmp_str[1], memkind_arr[pos].im_restrictors[0], strlen(tmp_str[1]))) { + // We have seen this exact name:restrictor combination already + m = memkind_combos[++iter]; + continue; + } + } + } else { + if (0 == memkind_arr[pos].im_num_restrictors) { + // neither the memkind that we want to add nor the + // element in the list of known memkinds have a restrictors, so skip + m = memkind_combos[++iter]; + continue; + } + } + + } + pos = current_max; + memkind_arr[pos].im_name = strdup (tmp_str[0]); + if (NULL != tmp_str[1]) { + memkind_arr[pos].im_restrictors[0] = strdup(tmp_str[1]); + memkind_arr[pos].im_num_restrictors = 1; + memkind_arr[pos].im_no_restrictors = false; + } + current_max++; + + opal_argv_free(tmp_str); + m = memkind_combos[++iter]; + } + + err_exit: + *num_memkinds = current_max; + 
*memkinds = memkind_arr; + + return; +} + +static int ompi_info_memkind_get_available(int *num_memkinds, ompi_memkind_t **memkinds) +{ + int ret = OMPI_SUCCESS; + if (ompi_info_memkind_num_available > 0) { + goto exit_no_lock; + } + + OPAL_THREAD_LOCK (&ompi_info_memkind_mutex); + if (ompi_info_memkind_num_available > 0) { + goto exit; + } + + int tmp_num = 2; +#if 0 + if (0 != strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "null")) { + tmp_num++; + } +#endif + + ompi_info_memkind_available = (ompi_memkind_t *) malloc (tmp_num * sizeof(ompi_memkind_t)); + if (NULL == ompi_info_memkind_available) { + *num_memkinds = 0; + *memkinds = NULL; + OPAL_THREAD_UNLOCK(&ompi_info_memkind_mutex); + return OMPI_ERROR; + } + + /* The system and mpi memory kinds are defined in MPI 4.1 section 12.4.3 */ + ompi_info_memkind_available[0].im_name = strdup ("system"); + ompi_info_memkind_available[0].im_num_restrictors = 0; + ompi_info_memkind_available[0].im_no_restrictors = true; + + ompi_info_memkind_available[1].im_name = strdup ("mpi"); + ompi_info_memkind_available[1].im_num_restrictors = 3; + ompi_info_memkind_available[1].im_no_restrictors = false; + ompi_info_memkind_available[1].im_restrictors[0] = strdup ("alloc_mem"); + ompi_info_memkind_available[1].im_restrictors[1] = strdup ("win_allocate"); + ompi_info_memkind_available[1].im_restrictors[2] = strdup ("win_allocate_shared"); + +#if 0 + if (tmp_num > 2) { + opal_accelerator.get_memkind_info (&ompi_info_memkind_available[2]); + } +#endif + ompi_info_memkind_num_available = tmp_num; + + exit: + OPAL_THREAD_UNLOCK(&ompi_info_memkind_mutex); + exit_no_lock: + *num_memkinds = ompi_info_memkind_num_available; + *memkinds = ompi_info_memkind_available; + return ret; +} + +static void ompi_info_memkind_free (int num, ompi_memkind_t *memkind_arr) +{ + for (int i = 0; i < num; i++) { + free (memkind_arr[i].im_name); + for (int j = 0; j < memkind_arr[i].im_num_restrictors; j++) { + free 
(memkind_arr[i].im_restrictors[j]); + } + } + free (memkind_arr); +} + +static void ompi_info_memkind_str_create (int num_memkinds, ompi_memkind_t *memkinds, char** memkind_str) +{ + int num_elems = 0; + + for (int i = 0; i < num_memkinds; i++) { + if (memkinds[i].im_no_restrictors || (memkinds[i].im_num_restrictors == 0)) { + num_elems++; + } else { + num_elems += memkinds[i].im_num_restrictors; + } + } + + char **tmp_str_arr = (char**) malloc ((num_elems+1) * sizeof (char**)); + if (NULL == tmp_str_arr) { + *memkind_str = NULL; + return; + } + + int c = 0; + for (int i = 0; i < num_memkinds; i++) { + if (memkinds[i].im_no_restrictors || (memkinds[i].im_num_restrictors == 0)) { + opal_asprintf(&tmp_str_arr[c++], "%s",memkinds[i].im_name); + } else { + for (int j = 0; j < memkinds[i].im_num_restrictors; j++) { + opal_asprintf(&tmp_str_arr[c++], "%s:%s",memkinds[i].im_name, + memkinds[i].im_restrictors[j]); + } + } + } + tmp_str_arr[num_elems] = NULL; + + char *tmp_str = opal_argv_join(tmp_str_arr, ','); + opal_argv_free(tmp_str_arr); + + *memkind_str = tmp_str; + return; +} + +#define COPY_MEMKIND(_to,_from,_copy_restrictors) { \ + _to.im_name = strdup(_from.im_name); \ + _to.im_no_restrictors = _from.im_no_restrictors; \ + _to.im_num_restrictors = 0; \ + if (_copy_restrictors) { \ + _to.im_num_restrictors = _from.im_num_restrictors; \ + for (int _i = 0; _i < _from.im_num_restrictors; _i++) { \ + _to.im_restrictors[_i] = strdup (_from.im_restrictors[_i]); \ + } \ + } \ +} + +static int ompi_info_memkind_remove_unsupported (int num_requested, ompi_memkind_t *requested_memkinds, + int num_available, ompi_memkind_t *available_memkinds, + int *num_provided, ompi_memkind_t **provided_memkinds) +{ + bool have_system_memkind = false; + bool have_mpi_memkind = false; + int pos = 0; + int *apos = malloc (num_requested *sizeof(int)); + if (NULL == apos) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* + ** Check whether we support the memkinds requested by the user + ** In 
addition, keep track whether user requested "system" and "mpi" + ** memory_alloc_kinds, since we always add those to the list + ** of support memory_alloc_kinds + */ + for (int i = 0; i < num_requested; i++) { + bool found_name = false; + bool found_all_requested_restrictors = true; + int j = -1; + + if (!have_system_memkind && !strncmp(requested_memkinds[i].im_name, "system", strlen("system"))) { + have_system_memkind = true; + } + if ( (!have_mpi_memkind && !strncmp(requested_memkinds[i].im_name, "mpi", strlen("mpi"))) && + (0 == requested_memkinds[i].im_num_restrictors)) { + have_mpi_memkind = true; + } + + // Check for memory_alloc_kind name first + for (j = 0; j < num_available; j++) { + if (!strncmp(requested_memkinds[i].im_name, available_memkinds[j].im_name, + strlen(requested_memkinds[i].im_name))) { + found_name = true; + break; + } + } + if (found_name) { + // Check whether we recognize all restrictors requested by user for + // this memory_alloc_kind + bool found_this_restrictor = false; + for (int k = 0; k < requested_memkinds[i].im_num_restrictors; k++) { + for (int l = 0; l < available_memkinds[j].im_num_restrictors; l++) { + if (!strncmp(requested_memkinds[i].im_restrictors[k], available_memkinds[j].im_restrictors[l], + strlen(requested_memkinds[i].im_restrictors[k]))) { + found_this_restrictor = true; + break; + } + } + if (!found_this_restrictor) { + found_all_requested_restrictors = false; + break; + } + } + if (found_all_requested_restrictors) { + apos[pos++] = i; + } + } + } + + // Add "system" and "mpi" memkinds as supported, even if not requested by user + int total_len = pos; + if (!have_system_memkind) { + total_len++; + } + if (!have_mpi_memkind) { + total_len++; + } + + ompi_memkind_t *final = (ompi_memkind_t*) malloc (total_len * sizeof(ompi_memkind_t)); + if (NULL == final) { + free (apos); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + int offset = 0; + // assert (!strncmp(available_memkinds[0].im_name, "system", strlen("system"))); + 
COPY_MEMKIND(final[0], available_memkinds[0], false); + offset++; + + // assert (!strncmp(available_memkinds[1].im_name, "mpim", strlen("mpi"))); + COPY_MEMKIND(final[offset], available_memkinds[1], false); + offset++; + + for (int i = 0; i < pos; i++) { + if (!strncmp(requested_memkinds[apos[i]].im_name, "system", strlen("system"))) { + continue; + } + if ( (!strncmp(requested_memkinds[apos[i]].im_name, "mpi", strlen("mpi"))) && + (0 == requested_memkinds[apos[i]].im_num_restrictors)) { + continue; + } + COPY_MEMKIND (final[offset], requested_memkinds[apos[i]], true); + offset++; + } + + *num_provided = total_len; + *provided_memkinds = final; + return OMPI_SUCCESS; +} + +static bool ompi_info_memkind_is_subset (int num_subset, ompi_memkind_t *subset, + int num_superset, ompi_memkind_t *superset) +{ + bool ret = true; + + for (int i = 0; i < num_subset; i++) { + bool found_name = false; + int j = -1; + + // Check for memory_alloc_kind name first + for (j = 0; j < num_superset; j++) { + if (!strncmp(subset[i].im_name, superset[j].im_name, + strlen(subset[i].im_name))) { + found_name = true; + break; + } + } + if (found_name) { + /* Check whether we recognize all restrictors requested listed in + ** the subset in the superset. 
Note, that the superset might not + ** have any restrictors set, in which case all restrictors are accepted + */ + if ((0 == superset[j].im_num_restrictors) || superset[j].im_no_restrictors) { + continue; + } + for (int k = 0; k < subset[i].im_num_restrictors; k++) { + bool found_this_restrictor = false; + for (int l = 0; l < superset[j].im_num_restrictors; l++) { + if (!strncmp(subset[i].im_restrictors[k], superset[j].im_restrictors[l], + strlen(subset[i].im_restrictors[k]))) { + found_this_restrictor = true; + break; + } + } + if (!found_this_restrictor) { + ret = false; + goto exit; + } + } + } else { + ret = false; + goto exit; + } + } + + exit: + return ret; +} + +static bool ompi_info_memkind_validate (const char *assert_str, const char *parent_str) +{ + int num_assert_memkinds = 0, num_parent_memkinds = 0; + ompi_memkind_t *assert_memkinds = NULL; + ompi_memkind_t *parent_memkinds = NULL; + bool ret; + + ompi_info_memkind_extract (assert_str, &num_assert_memkinds, &assert_memkinds); + ret = ompi_info_memkind_is_subset (num_assert_memkinds, assert_memkinds, + ompi_info_memkind_num_available, ompi_info_memkind_available); + if (!ret) { + goto exit; + } + + ompi_info_memkind_extract (parent_str, &num_parent_memkinds, &parent_memkinds); + ret = ompi_info_memkind_is_subset (num_assert_memkinds, assert_memkinds, + num_parent_memkinds, parent_memkinds); + + exit: + if (NULL != assert_memkinds) { + ompi_info_memkind_free(num_assert_memkinds, assert_memkinds); + } + if (NULL != parent_memkinds) { + ompi_info_memkind_free(num_parent_memkinds, parent_memkinds); + } + + return ret; +} + + +int ompi_info_memkind_process (const char* requested_str, char **provided_str) +{ + int err; + char *tmp_str = NULL; + + int num_requested_memkinds, num_available_memkinds, num_provided_memkinds; + ompi_memkind_t *requested_memkinds = NULL ; + ompi_memkind_t *available_memkinds = NULL; + ompi_memkind_t *provided_memkinds = NULL; + + if (NULL == requested_str) { + *provided_str = NULL; 
+ return OMPI_SUCCESS; + } + + ompi_info_memkind_extract (requested_str, &num_requested_memkinds, &requested_memkinds); + err = ompi_info_memkind_get_available (&num_available_memkinds, &available_memkinds); + if (OMPI_SUCCESS != err) { + goto exit; + } + + err = ompi_info_memkind_remove_unsupported (num_requested_memkinds, requested_memkinds, + num_available_memkinds, available_memkinds, + &num_provided_memkinds, &provided_memkinds); + if (OMPI_SUCCESS != err) { + goto exit; + } + + ompi_info_memkind_str_create(num_provided_memkinds, provided_memkinds, &tmp_str); + + exit: + if (NULL != requested_memkinds) { + ompi_info_memkind_free(num_requested_memkinds, requested_memkinds); + } + if (NULL != provided_memkinds) { + ompi_info_memkind_free(num_provided_memkinds, provided_memkinds); + } + // Don't free the available_memkinds, they will be released in info_finalize; + + *provided_str = tmp_str; + return err; +} + +/** + * Callback invoked by the info subscriber mechanism. + * Accepts only the first value set. + */ +const char *ompi_info_memkind_cb (opal_infosubscriber_t *obj, const char *key, const char *value) +{ + opal_cstring_t *existing_val; + int flag; + char *ret_string; + + opal_info_get(obj->s_info, key, &existing_val, &flag); + if (0 == flag) { + ret_string = (char *)value; + } else { + ret_string = (char *)existing_val->string; + OBJ_RELEASE(existing_val); + } + return ret_string; +} + +/* +** Algorithm is a bit convoluted: +** +** - retrieve mpi_memory_alloc_kinds from parent instance. +** - if info object passed in as argument to this routine contains +** mpi_assert_memory_alloc_kinds key/value pair: +** - validate that we recognize all memory kinds listed +** - if that is the case, use the value of mpi_assert_memory_alloc_kinds +** when setting mpi_memory_alloc_kinds on the child object +** - else ignore the mpi_assert_memory_alloc_kinds. 
(Quote: +** "If the MPI library does not support one or more of the allocation kinds associated +** with the mpi_assert_memory_alloc_kinds info key, it will ignore this info key". +** So we are supposed to drop the entire key, not just the memory kinds that we did +** recognize.) +** - else use the same memkinds as in mpi_memory_alloc_kinds on the parent object on the +** child object (i.e. we just copy it over) +** +** To summarize, the value of one info key (mpi_assert_memory_alloc_kinds) can influence the +** value of another info key (mpi_memory_alloc_kinds). +*/ +int ompi_info_memkind_copy_or_set (opal_infosubscriber_t *parent, opal_infosubscriber_t *child, + opal_info_t *info) +{ + opal_cstring_t *parent_val; + opal_cstring_t *assert_val; + char *final_str = NULL; + int flag; + + opal_info_get(parent->s_info, "mpi_memory_alloc_kinds", &parent_val, &flag); + if (0 == flag) { + return OMPI_SUCCESS; + } + final_str = (char*) parent_val->string; + + if (NULL != info) { + opal_info_get(info, "mpi_assert_memory_alloc_kinds", &assert_val, &flag); + if (0 == flag) { + // assert_memory_alloc_kinds was not set by code + goto exit; + } + + // Validate asserted memory kind + bool ret = ompi_info_memkind_validate (assert_val->string, parent_val->string); + if (ret) { + final_str = (char*) assert_val->string; + } + OBJ_RELEASE(assert_val); + + opal_infosubscribe_subscribe (child, "mpi_assert_memory_alloc_kinds", final_str, + ompi_info_memkind_cb); + } + + exit: + opal_infosubscribe_subscribe (child, "mpi_memory_alloc_kinds", final_str, + ompi_info_memkind_cb); + OBJ_RELEASE(parent_val); + return OMPI_SUCCESS; +} + +void ompi_info_memkind_free_available (void) +{ + ompi_info_memkind_free (ompi_info_memkind_num_available, ompi_info_memkind_available); +} diff --git a/ompi/info/info_memkind.h b/ompi/info/info_memkind.h new file mode 100644 index 00000000000..42f21177454 --- /dev/null +++ b/ompi/info/info_memkind.h @@ -0,0 +1,70 @@ +/* -*- Mode: C; c-basic-offset:4 ; 
indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OMPI_INFO_MEMKIND_H +#define OMPI_INFO_MEMKIND_H + +#include "ompi_config.h" +#include "opal/util/info_subscriber.h" + +BEGIN_C_DECLS + +#define OMPI_MAX_NUM_MEMKIND_RESTRICTORS 3 +struct ompi_memkind_t { + char *im_name; + bool im_no_restrictors; + int im_num_restrictors; + char *im_restrictors[OMPI_MAX_NUM_MEMKIND_RESTRICTORS]; +}; +typedef struct ompi_memkind_t ompi_memkind_t; + +/* +** Given a string of user requested memory alloc kinds, create +** a string with the memory kinds actually supported by the library. +** +** @param[IN]: requested_str input string +** @param[OUT]: provided_str result string +** +** @return: OMPI_SUCCESS or error on failure +*/ +OMPI_DECLSPEC int ompi_info_memkind_process (const char* requested_str, + char **provided_str); +/* +** Set the memory_alloc_kind info object on the child object, either +** by copying it from the parent object, or adjusting it based +** on the assert_memory_alloc_kind info object provided by the code +** during object creation +** +** @param[IN]: parent parent object (e.g. comm->super, file->super, etc.) +** @param [INOUT]: child child object +** @param[IN]: info info object provided by code during object creation +** (e.g. MPI_Comm_dup_with_info, MPI_File_open, etc.) +** +** @return: OMPI_SUCCESS or error on failure +*/ +OMPI_DECLSPEC int ompi_info_memkind_copy_or_set (opal_infosubscriber_t *parent, + opal_infosubscriber_t *child, + opal_info_t *info); + +/* +** free the array of available memkinds when shutting down the info +** infrastructure. 
+*/ +OMPI_DECLSPEC void ompi_info_memkind_free_available (void); + +/* +** Callback function used when registering memkind info object +*/ +OMPI_DECLSPEC const char *ompi_info_memkind_cb (opal_infosubscriber_t *obj, const char *key, const char *value); + +END_C_DECLS + +#endif /* OMPI_INFO_MEMKIND_H */ + diff --git a/ompi/instance/instance.c b/ompi/instance/instance.c index adf2e8ace89..4c28d7b69a0 100644 --- a/ompi/instance/instance.c +++ b/ompi/instance/instance.c @@ -33,6 +33,7 @@ #include "ompi/errhandler/errcode.h" #include "ompi/message/message.h" #include "ompi/info/info.h" +#include "ompi/info/info_memkind.h" #include "ompi/attribute/attribute.h" #include "ompi/op/op.h" #include "ompi/dpm/dpm.h" @@ -857,7 +858,20 @@ int ompi_mpi_instance_init (int ts_level, opal_info_t *info, ompi_errhandler_t /* Copy info if there is one. */ if (OPAL_UNLIKELY(NULL != info)) { + opal_cstring_t *memkind_requested; + int flag; + new_instance->super.s_info = OBJ_NEW(opal_info_t); + opal_info_get(info, "mpi_memory_alloc_kinds", &memkind_requested, &flag); + if (1 == flag) { + char *memkind_provided; + ompi_info_memkind_process (memkind_requested->string, &memkind_provided); + opal_infosubscribe_subscribe (&new_instance->super, "mpi_memory_alloc_kinds", + memkind_provided, ompi_info_memkind_cb); + free (memkind_provided); + OBJ_RELEASE(memkind_requested); + } + if (info) { opal_info_dup(info, &new_instance->super.s_info); } diff --git a/ompi/mpi/c/intercomm_merge.c b/ompi/mpi/c/intercomm_merge.c index 08c0b6d97b4..7abafcda5f9 100644 --- a/ompi/mpi/c/intercomm_merge.c +++ b/ompi/mpi/c/intercomm_merge.c @@ -140,6 +140,10 @@ int MPI_Intercomm_merge(MPI_Comm intercomm, int high, goto exit; } + newcomp->super.s_info = OBJ_NEW(opal_info_t); + ompi_info_memkind_copy_or_set (&intercomm->instance->super, &newcomp->super, + &ompi_mpi_info_null.info.super); + exit: if ( NULL != procs ) { diff --git a/ompi/win/win.c b/ompi/win/win.c index 2f0974ac016..aba941a5082 100644 --- a/ompi/win/win.c 
+++ b/ompi/win/win.c @@ -19,6 +19,7 @@ * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2018-2019 Triad National Security, LLC. All rights * reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -38,6 +39,7 @@ #include "ompi/attribute/attribute.h" #include "ompi/group/group.h" #include "ompi/info/info.h" +#include "ompi/info/info_memkind.h" #include "ompi/mca/osc/base/base.h" #include "ompi/mca/osc/osc.h" @@ -167,7 +169,7 @@ static int alloc_window(struct ompi_communicator_t *comm, opal_info_t *info, int if (info) { opal_info_dup(info, &(win->super.s_info)); } - + ompi_info_memkind_copy_or_set (&comm->instance->super, &win->super, info); ret = opal_info_get_value_enum (win->super.s_info, "accumulate_ops", &acc_ops, OMPI_WIN_ACCUMULATE_OPS_SAME_OP_NO_OP, From 181c38aacbad6347d5d8d01a65bd1fb78ee48bad Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Sat, 25 Jan 2025 20:45:39 +0000 Subject: [PATCH 2/3] accelerator: add interface to retrieve memkind add an API to the accelerator component to retrieve the memory_alloc_kind information that is supported by the component. 
The values stored/returned are based on the side document that is about to be ratified, see https://github.com/mpi-forum/mem-alloc/blob/main/mem_alloc.tex Signed-off-by: Edgar Gabriel --- ompi/info/info_memkind.c | 6 +---- opal/mca/accelerator/accelerator.h | 10 ++++++++- opal/mca/accelerator/cuda/accelerator_cuda.c | 17 +++++++++++++- .../null/accelerator_null_component.c | 14 +++++++++++- .../rocm/accelerator_rocm_module.c | 22 +++++++++++++++---- .../accelerator/ze/accelerator_ze_module.c | 19 ++++++++++++++-- 6 files changed, 74 insertions(+), 14 deletions(-) diff --git a/ompi/info/info_memkind.c b/ompi/info/info_memkind.c index 64b94aa1b4d..26d8dd99431 100644 --- a/ompi/info/info_memkind.c +++ b/ompi/info/info_memkind.c @@ -140,11 +140,9 @@ static int ompi_info_memkind_get_available(int *num_memkinds, ompi_memkind_t **m } int tmp_num = 2; -#if 0 if (0 != strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "null")) { tmp_num++; } -#endif ompi_info_memkind_available = (ompi_memkind_t *) malloc (tmp_num * sizeof(ompi_memkind_t)); if (NULL == ompi_info_memkind_available) { @@ -166,11 +164,9 @@ static int ompi_info_memkind_get_available(int *num_memkinds, ompi_memkind_t **m ompi_info_memkind_available[1].im_restrictors[1] = strdup ("win_allocate"); ompi_info_memkind_available[1].im_restrictors[2] = strdup ("win_allocate_shared"); -#if 0 if (tmp_num > 2) { - opal_accelerator.get_memkind_info (&ompi_info_memkind_available[2]); + opal_accelerator.get_memkind (&ompi_info_memkind_available[2]); } -#endif ompi_info_memkind_num_available = tmp_num; exit: diff --git a/opal/mca/accelerator/accelerator.h b/opal/mca/accelerator/accelerator.h index 6279b7c615e..12f025f53c2 100644 --- a/opal/mca/accelerator/accelerator.h +++ b/opal/mca/accelerator/accelerator.h @@ -4,7 +4,7 @@ * reserved. * Copyright (c) Amazon.com, Inc. or its affiliates. * All Rights reserved. - * Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights reserved. 
+ * Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All Rights reserved. * Copyright (c) 2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. @@ -81,6 +81,7 @@ #include "opal/class/opal_object.h" #include "opal/mca/mca.h" +#include "ompi/info/info_memkind.h" BEGIN_C_DECLS @@ -654,6 +655,12 @@ typedef int (*opal_accelerator_base_module_get_num_devices_fn_t)(int *num_device */ typedef int (*opal_accelerator_base_module_get_mem_bw_fn_t)(int device, float *bw); +/** + * Get the memkind information of the accelerator component. + * @param[OUT] memkind Memory alloc kind (name and restrictors) supported by the component + * + */ +typedef void (*opal_accelerator_base_module_get_memkind_fn_t)(ompi_memkind_t *memkind); /* * the standard public API data structure @@ -700,6 +707,7 @@ typedef struct { opal_accelerator_base_module_get_num_devices_fn_t num_devices; opal_accelerator_base_module_get_mem_bw_fn_t get_mem_bw; + opal_accelerator_base_module_get_memkind_fn_t get_memkind; } opal_accelerator_base_module_t; /** diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index 45358ef337e..46d646ee99d 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -26,6 +26,7 @@ #include "opal/mca/rcache/rcache.h" #include "opal/util/show_help.h" #include "opal/util/proc.h" +#include "ompi/info/info_memkind.h" /* Accelerator API's */ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags); static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t **stream); @@ -80,6 +81,7 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc static int accelerator_cuda_sync_stream(opal_accelerator_stream_t *stream); static int accelerator_cuda_get_num_devices(int *num_devices); static int accelerator_cuda_get_mem_bw(int device, float *bw); +static void
accelerator_cuda_get_memkind(ompi_memkind_t *memkind); #define GET_STREAM(_stream) \ ((_stream) == MCA_ACCELERATOR_STREAM_DEFAULT ? 0 : *((CUstream *) (_stream)->stream)) @@ -125,7 +127,8 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module = accelerator_cuda_get_buffer_id, accelerator_cuda_get_num_devices, - accelerator_cuda_get_mem_bw + accelerator_cuda_get_mem_bw, + accelerator_cuda_get_memkind }; static inline int opal_accelerator_cuda_delayed_init_check(void) @@ -1218,3 +1221,15 @@ static int accelerator_cuda_get_mem_bw(int device, float *bw) *bw = opal_accelerator_cuda_mem_bw[device]; return OPAL_SUCCESS; } + +static void accelerator_cuda_get_memkind (ompi_memkind_t *memkind) +{ + memkind->im_name = strdup("cuda"); + memkind->im_no_restrictors = false; + memkind->im_num_restrictors = 3; + memkind->im_restrictors[0] = strdup("host"); + memkind->im_restrictors[1] = strdup("device"); + memkind->im_restrictors[2] = strdup("managed"); + + return; +} diff --git a/opal/mca/accelerator/null/accelerator_null_component.c b/opal/mca/accelerator/null/accelerator_null_component.c index 8a6f0f8d810..996e5d59bce 100644 --- a/opal/mca/accelerator/null/accelerator_null_component.c +++ b/opal/mca/accelerator/null/accelerator_null_component.c @@ -23,6 +23,7 @@ #include "accelerator_null_component.h" #include "opal/constants.h" +#include "ompi/info/info_memkind.h" #include /* @@ -94,6 +95,7 @@ static int accelerator_null_sync_stream(opal_accelerator_stream_t *stream); static int accelerator_null_get_num_devices(int *num_devices); static int accelerator_null_get_mem_bw(int device, float *bw); +static void accelerator_null_get_memkind(ompi_memkind_t *memkind); /* * Instantiate the public struct with all of our public information @@ -174,7 +176,8 @@ opal_accelerator_base_module_t opal_accelerator_null_module = accelerator_null_get_buffer_id, accelerator_null_get_num_devices, - accelerator_null_get_mem_bw + accelerator_null_get_mem_bw, + accelerator_null_get_memkind };
static int accelerator_null_open(void) @@ -393,3 +396,12 @@ static int accelerator_null_get_mem_bw(int device, float *bw) *bw = 1.0; // return something that is not 0 return OPAL_SUCCESS; } + +static void accelerator_null_get_memkind (ompi_memkind_t *memkind) +{ + memkind->im_name = NULL; + memkind->im_no_restrictors = false; + memkind->im_num_restrictors = 0; + + return; +} diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_module.c b/opal/mca/accelerator/rocm/accelerator_rocm_module.c index 32b1fc3976a..dff735b325c 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_module.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_module.c @@ -1,10 +1,9 @@ /* - * Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All Rights reserved. - * $COPYRIGHT$ + * Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights reserved. * Copyright (c) 2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * + * $COPYRIGHT$ * Additional copyrights may follow * * $HEADER$ @@ -16,6 +15,7 @@ #include "opal/mca/accelerator/base/base.h" #include "opal/constants.h" #include "opal/util/output.h" +#include "ompi/info/info_memkind.h" /* Accelerator API's */ static int mca_accelerator_rocm_check_addr(const void *addr, int *dev_id, uint64_t *flags); @@ -74,6 +74,7 @@ static int mca_accelerator_rocm_sync_stream(opal_accelerator_stream_t *stream); static int mca_accelerator_rocm_get_num_devices(int *num_devices); static int mca_accelerator_rocm_get_mem_bw(int device, float *bw); +static void mca_accelerator_rocm_get_memkind(ompi_memkind_t *memkind); #define GET_STREAM(_stream) (_stream == MCA_ACCELERATOR_STREAM_DEFAULT ? 
0 : *((hipStream_t *)_stream->stream)) @@ -118,7 +119,8 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module = mca_accelerator_rocm_get_buffer_id, mca_accelerator_rocm_get_num_devices, - mca_accelerator_rocm_get_mem_bw + mca_accelerator_rocm_get_mem_bw, + mca_accelerator_rocm_get_memkind }; @@ -946,3 +948,15 @@ static int mca_accelerator_rocm_get_mem_bw(int device, float *bw) *bw = opal_accelerator_rocm_mem_bw[device]; return OPAL_SUCCESS; } + +static void mca_accelerator_rocm_get_memkind (ompi_memkind_t *memkind) +{ + memkind->im_name = strdup("rocm"); + memkind->im_no_restrictors = false; + memkind->im_num_restrictors = 3; + memkind->im_restrictors[0] = strdup("host"); + memkind->im_restrictors[1] = strdup("device"); + memkind->im_restrictors[2] = strdup("managed"); + + return; +} diff --git a/opal/mca/accelerator/ze/accelerator_ze_module.c b/opal/mca/accelerator/ze/accelerator_ze_module.c index a5f7f37d5ac..cb6cff21fef 100644 --- a/opal/mca/accelerator/ze/accelerator_ze_module.c +++ b/opal/mca/accelerator/ze/accelerator_ze_module.c @@ -18,6 +18,7 @@ #include "opal/util/printf.h" #include "opal/constants.h" #include "opal/util/output.h" +#include "ompi/info/info_memkind.h" /* Accelerator API's */ static int mca_accelerator_ze_check_addr(const void *addr, int *dev_id, uint64_t *flags); @@ -77,6 +78,7 @@ static int mca_accelerator_ze_sync_stream(opal_accelerator_stream_t *stream); static int mca_accelerator_ze_get_num_devices(int *num_devices); static int mca_accelerator_ze_get_mem_bw(int device, float *bw); +static void mca_accelerator_ze_get_memkind(ompi_memkind_t *memkind); opal_accelerator_base_module_t opal_accelerator_ze_module = { @@ -118,7 +120,8 @@ opal_accelerator_base_module_t opal_accelerator_ze_module = .get_buffer_id = mca_accelerator_ze_get_buffer_id, .num_devices = mca_accelerator_ze_get_num_devices, - .get_mem_bw = mca_accelerator_ze_get_mem_bw + .get_mem_bw = mca_accelerator_ze_get_mem_bw, + .get_memkind = mca_accelerator_ze_get_memkind 
}; static int accelerator_ze_dev_handle_to_dev_id(ze_device_handle_t hDevice) @@ -872,4 +875,16 @@ static int mca_accelerator_ze_get_mem_bw(int device, float *bw) * TODO */ return OPAL_ERR_NOT_IMPLEMENTED; -} \ No newline at end of file +} + +static void mca_accelerator_ze_get_memkind (ompi_memkind_t *memkind) +{ + memkind->im_name = strdup("level_zero"); + memkind->im_no_restrictors = false; + memkind->im_num_restrictors = 3; + memkind->im_restrictors[0] = strdup("host"); + memkind->im_restrictors[1] = strdup("device"); + memkind->im_restrictors[2] = strdup("shared"); + + return; +} From e901e6a081f6189aa4c16c7ceccc2284eaa137cc Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Mon, 10 Feb 2025 12:36:48 -0600 Subject: [PATCH 3/3] ompi/communicator: set grp_instance for leader_group in ompi_intercomm_create_from_groups, we need to set the grp_instance value for the leader_group. This issue was exposed with the new memkind code and the mpi4py test suite. Signed-off-by: Edgar Gabriel --- ompi/communicator/comm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ompi/communicator/comm.c b/ompi/communicator/comm.c index 079c3146271..489357f2004 100644 --- a/ompi/communicator/comm.c +++ b/ompi/communicator/comm.c @@ -1802,6 +1802,7 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead ompi_comm_free (&local_comm); return OMPI_ERR_OUT_OF_RESOURCE; } + leader_group->grp_instance = local_group->grp_instance; /* create a unique tag for allocating the leader communicator. we can eliminate this step * if we take a CID from the newly allocated block belonging to local_comm. this is