diff --git a/opal/mca/btl/sm/Makefile.am b/opal/mca/btl/sm/Makefile.am
index 4439a91c598..a4145356f21 100644
--- a/opal/mca/btl/sm/Makefile.am
+++ b/opal/mca/btl/sm/Makefile.am
@@ -13,7 +13,7 @@
 # Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
 # reserved.
 # Copyright (c) 2017 IBM Corporation. All rights reserved.
-# Copyright (c) 2020 Google, LLC. All rights reserved.
+# Copyright (c) 2020-2021 Google, LLC. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -38,10 +38,6 @@ libmca_btl_sm_la_sources = \
         btl_sm_fbox.h \
         btl_sm_get.c \
         btl_sm_put.c \
-        btl_sm_xpmem.c \
-        btl_sm_xpmem.h \
-        btl_sm_knem.c \
-        btl_sm_knem.h \
         btl_sm_types.h \
         btl_sm_virtual.h
diff --git a/opal/mca/btl/sm/btl_sm.h b/opal/mca/btl/sm/btl_sm.h
index 2ffa00b76d6..1de36e0f807 100644
--- a/opal/mca/btl/sm/btl_sm.h
+++ b/opal/mca/btl/sm/btl_sm.h
@@ -61,9 +61,6 @@
 
 #include "opal/mca/pmix/pmix-internal.h"
 
-#include "btl_sm_knem.h"
-#include "btl_sm_xpmem.h"
-
 BEGIN_C_DECLS
 
 #define min(a, b) ((a) < (b) ? (a) : (b))
@@ -121,32 +118,12 @@ int mca_btl_sm_sendi(struct mca_btl_base_module_t *btl, struct mca_btl_base_endp
  * @param endpoint (IN) BTL addressing information
  * @param descriptor (IN) Description of the data to be transferred
  */
-#if OPAL_BTL_SM_HAVE_XPMEM
-int mca_btl_sm_put_xpmem(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
-                         void *local_address, uint64_t remote_address,
-                         mca_btl_base_registration_handle_t *local_handle,
-                         mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
-                         int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
-                         void *cbdata);
-#endif
-
-#if OPAL_BTL_SM_HAVE_CMA
-int mca_btl_sm_put_cma(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
-                       void *local_address, uint64_t remote_address,
-                       mca_btl_base_registration_handle_t *local_handle,
-                       mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
-                       int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
-                       void *cbdata);
-#endif
-
-#if OPAL_BTL_SM_HAVE_KNEM
-int mca_btl_sm_put_knem(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
-                        void *local_address, uint64_t remote_address,
-                        mca_btl_base_registration_handle_t *local_handle,
-                        mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
-                        int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
-                        void *cbdata);
-#endif
+int mca_btl_sm_put(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
+                   void *local_address, uint64_t remote_address,
+                   mca_btl_base_registration_handle_t *local_handle,
+                   mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
+                   int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
+                   void *cbdata);
 
 /**
  * Initiate an synchronous get.
@@ -155,34 +132,12 @@ int mca_btl_sm_put_knem(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *end
  * @param endpoint (IN) BTL addressing information
  * @param descriptor (IN) Description of the data to be transferred
  */
-#if OPAL_BTL_SM_HAVE_XPMEM
-int mca_btl_sm_get_xpmem(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
-                         void *local_address, uint64_t remote_address,
-                         mca_btl_base_registration_handle_t *local_handle,
-                         mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
-                         int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
-                         void *cbdata);
-#endif
-
-#if OPAL_BTL_SM_HAVE_CMA
-int mca_btl_sm_get_cma(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
-                       void *local_address, uint64_t remote_address,
-                       mca_btl_base_registration_handle_t *local_handle,
-                       mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
-                       int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
-                       void *cbdata);
-#endif
-
-#if OPAL_BTL_SM_HAVE_KNEM
-int mca_btl_sm_get_knem(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
-                        void *local_address, uint64_t remote_address,
-                        mca_btl_base_registration_handle_t *local_handle,
-                        mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
-                        int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
-                        void *cbdata);
-#endif
-
-ino_t mca_btl_sm_get_user_ns_id(void);
+int mca_btl_sm_get(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
+                   void *local_address, uint64_t remote_address,
+                   mca_btl_base_registration_handle_t *local_handle,
+                   mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
+                   int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
+                   void *cbdata);
 
 /**
  * Allocate a segment.
diff --git a/opal/mca/btl/sm/btl_sm_component.c b/opal/mca/btl/sm/btl_sm_component.c
index a7040296b62..4c6555e3ed2 100644
--- a/opal/mca/btl/sm/btl_sm_component.c
+++ b/opal/mca/btl/sm/btl_sm_component.c
@@ -21,7 +21,7 @@
  * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
  * Copyright (c) 2018 Triad National Security, LLC. All rights
  * reserved.
- * Copyright (c) 2019-2020 Google, Inc. All rights reserved.
+ * Copyright (c) 2019-2021 Google, Inc. All rights reserved.
  * Copyright (c) 2021 Nanook Consulting. All rights reserved.
  * $COPYRIGHT$
  *
@@ -41,7 +41,8 @@
 #include "opal/mca/btl/sm/btl_sm_fbox.h"
 #include "opal/mca/btl/sm/btl_sm_fifo.h"
 #include "opal/mca/btl/sm/btl_sm_frag.h"
-#include "opal/mca/btl/sm/btl_sm_xpmem.h"
+#include "opal/mca/smsc/base/base.h"
+#include "opal/mca/smsc/smsc.h"
 
 #ifdef HAVE_SYS_STAT_H
 #    include <sys/stat.h>
@@ -66,20 +67,6 @@ static int mca_btl_sm_component_register(void);
 static mca_btl_base_module_t **
 mca_btl_sm_component_init(int *num_btls, bool enable_progress_threads, bool enable_mpi_threads);
 
-/* This enumeration is in order of preference */
-static mca_base_var_enum_value_t single_copy_mechanisms[] = {
-#if OPAL_BTL_SM_HAVE_XPMEM
-    {.value = MCA_BTL_SM_XPMEM, .string = "xpmem"},
-#endif
-#if OPAL_BTL_SM_HAVE_CMA
-    {.value = MCA_BTL_SM_CMA, .string = "cma"},
-#endif
-#if OPAL_BTL_SM_HAVE_KNEM
-    {.value = MCA_BTL_SM_KNEM, .string = "knem"},
-#endif
-    {.value = MCA_BTL_SM_NONE, .string = "none"},
-    {.value = 0, .string = NULL}};
-
 /*
  * Shared Memory (SM) component instance.
 */
@@ -106,8 +93,6 @@ mca_btl_sm_component_t mca_btl_sm_component = {
 
 static int mca_btl_sm_component_register(void)
 {
-    mca_base_var_enum_t *new_enum;
-
     (void) mca_base_var_group_component_register(&mca_btl_sm_component.super.btl_version,
                                                  "Enhanced shared memory byte transport later");
 
@@ -146,25 +131,15 @@ static int mca_btl_sm_component_register(void)
                                            MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
                                            MCA_BASE_VAR_SCOPE_LOCAL,
                                            &mca_btl_sm_component.memcpy_limit);
-#if OPAL_BTL_SM_HAVE_XPMEM
-    mca_btl_sm_component.log_attach_align = 21;
-    (void) mca_base_component_var_register(&mca_btl_sm_component.super.btl_version, "log_align",
-                                           "Log base 2 of the alignment to use for xpmem "
-                                           "segments (default: 21, minimum: 12, maximum: 25)",
-                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
-                                           MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
-                                           MCA_BASE_VAR_SCOPE_LOCAL,
-                                           &mca_btl_sm_component.log_attach_align);
-#endif
 
-#if OPAL_BTL_SM_HAVE_XPMEM && 64 == MCA_BTL_SM_BITNESS
+#if 64 == MCA_BTL_SM_BITNESS
     mca_btl_sm_component.segment_size = 1 << 24;
 #else
     mca_btl_sm_component.segment_size = 1 << 22;
 #endif
     (void) mca_base_component_var_register(&mca_btl_sm_component.super.btl_version, "segment_size",
                                            "Maximum size of all shared "
-#if OPAL_BTL_SM_HAVE_XPMEM && 64 == MCA_BTL_SM_BITNESS
+#if 64 == MCA_BTL_SM_BITNESS
                                            "memory buffers (default: 16M)",
 #else
                                            "memory buffers (default: 4M)",
@@ -212,21 +187,6 @@ static int mca_btl_sm_component_register(void)
                                            MCA_BASE_VAR_SCOPE_LOCAL,
                                            &mca_btl_sm_component.fbox_size);
 
-    (void) mca_base_var_enum_create("btl_sm_single_copy_mechanisms", single_copy_mechanisms,
-                                    &new_enum);
-
-    /* Default to the best available mechanism (see the enumerator for ordering) */
-    mca_btl_sm_component.single_copy_mechanism = single_copy_mechanisms[0].value;
-    (void)
-        mca_base_component_var_register(&mca_btl_sm_component.super.btl_version,
-                                        "single_copy_mechanism",
-                                        "Single copy mechanism to use (defaults to best available)",
-                                        MCA_BASE_VAR_TYPE_INT, new_enum, 0,
-                                        MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
-                                        MCA_BASE_VAR_SCOPE_GROUP,
-                                        &mca_btl_sm_component.single_copy_mechanism);
-    OBJ_RELEASE(new_enum);
-
     if (0 == access("/dev/shm", W_OK)) {
         mca_btl_sm_component.backing_directory = "/dev/shm";
     } else {
@@ -240,45 +200,19 @@ static int mca_btl_sm_component_register(void)
                                            MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &mca_btl_sm_component.backing_directory);
-#if OPAL_BTL_SM_HAVE_KNEM
-    /* Currently disabling DMA mode by default; it's not clear that this is useful in all
-     * applications and architectures.
-     */
-    mca_btl_sm_component.knem_dma_min = 0;
-    (void) mca_base_component_var_register(
-        &mca_btl_sm_component.super.btl_version, "knem_dma_min",
-        "Minimum message size (in bytes) to use the knem DMA mode; "
-        "ignored if knem does not support DMA mode (0 = do not use the "
-        "knem DMA mode, default: 0)",
-        MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
-        &mca_btl_sm_component.knem_dma_min);
-#endif
-
     mca_btl_sm.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;
 
-    if (MCA_BTL_SM_XPMEM == mca_btl_sm_component.single_copy_mechanism) {
-        mca_btl_sm.super.btl_eager_limit = 32 * 1024;
-        mca_btl_sm.super.btl_rndv_eager_limit = mca_btl_sm.super.btl_eager_limit;
-        mca_btl_sm.super.btl_max_send_size = mca_btl_sm.super.btl_eager_limit;
-        mca_btl_sm.super.btl_min_rdma_pipeline_size = INT_MAX;
-    } else {
-        mca_btl_sm.super.btl_eager_limit = 4 * 1024;
-        mca_btl_sm.super.btl_rndv_eager_limit = 32 * 1024;
-        mca_btl_sm.super.btl_max_send_size = 32 * 1024;
-        mca_btl_sm.super.btl_min_rdma_pipeline_size = INT_MAX;
-    }
+    mca_btl_sm.super.btl_eager_limit = 4 * 1024;
+    mca_btl_sm.super.btl_rndv_eager_limit = 32 * 1024;
+    mca_btl_sm.super.btl_max_send_size = 32 * 1024;
+    mca_btl_sm.super.btl_min_rdma_pipeline_size = INT_MAX;
 
     mca_btl_sm.super.btl_rdma_pipeline_send_length = mca_btl_sm.super.btl_eager_limit;
     mca_btl_sm.super.btl_rdma_pipeline_frag_size = mca_btl_sm.super.btl_eager_limit;
 
     mca_btl_sm.super.btl_flags = MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_SEND;
 
-    if (MCA_BTL_SM_NONE != mca_btl_sm_component.single_copy_mechanism) {
-        /* True single copy mechanisms should provide better bandwidth */
-        mca_btl_sm.super.btl_bandwidth = 40000; /* Mbs */
-    } else {
-        mca_btl_sm.super.btl_bandwidth = 10000; /* Mbs */
-    }
-
+    mca_btl_sm.super.btl_bandwidth = 20000; /* Mbs */
     mca_btl_sm.super.btl_latency = 1;       /* Microsecs */
 
     /* Call the BTL based to register its MCA params */
@@ -302,9 +236,6 @@ static int mca_btl_sm_component_open(void)
     OBJ_CONSTRUCT(&mca_btl_sm_component.lock, opal_mutex_t);
     OBJ_CONSTRUCT(&mca_btl_sm_component.pending_endpoints, opal_list_t);
     OBJ_CONSTRUCT(&mca_btl_sm_component.pending_fragments, opal_list_t);
-#if OPAL_BTL_SM_HAVE_KNEM
-    mca_btl_sm.knem_fd = -1;
-#endif
 
     return OPAL_SUCCESS;
 }
@@ -323,17 +254,13 @@ static int mca_btl_sm_component_close(void)
     OBJ_DESTRUCT(&mca_btl_sm_component.pending_endpoints);
     OBJ_DESTRUCT(&mca_btl_sm_component.pending_fragments);
 
-    if (MCA_BTL_SM_XPMEM == mca_btl_sm_component.single_copy_mechanism
+    if (mca_smsc_base_has_feature(MCA_SMSC_FEATURE_CAN_MAP)
         && NULL != mca_btl_sm_component.my_segment) {
         munmap(mca_btl_sm_component.my_segment, mca_btl_sm_component.segment_size);
     }
 
     mca_btl_sm_component.my_segment = NULL;
 
-#if OPAL_BTL_SM_HAVE_KNEM
-    mca_btl_sm_knem_fini();
-#endif
-
     if (mca_btl_sm_component.mpool) {
         mca_btl_sm_component.mpool->mpool_finalize(mca_btl_sm_component.mpool);
         mca_btl_sm_component.mpool = NULL;
@@ -342,157 +269,40 @@ static int mca_btl_sm_component_close(void)
     return OPAL_SUCCESS;
 }
 
-/*
- * mca_btl_sm_parse_proc_ns_user() tries to get the user namespace ID
- * of the current process.
- * Returns the ID of the user namespace. In the case of an error '0' is returned.
- */ -ino_t mca_btl_sm_get_user_ns_id(void) -{ - struct stat buf; - - if (0 > stat("/proc/self/ns/user", &buf)) { - /* - * Something went wrong, probably an old kernel that does not support namespaces - * simply assume all processes are in the same user namespace and return 0 - */ - return 0; - } - - return buf.st_ino; -} static int mca_btl_base_sm_modex_send(void) { - union sm_modex_t modex; - int modex_size, rc; + mca_btl_sm_modex_t modex; + int modex_size; -#if OPAL_BTL_SM_HAVE_XPMEM - if (MCA_BTL_SM_XPMEM == mca_btl_sm_component.single_copy_mechanism) { - modex.xpmem.seg_id = mca_btl_sm_component.my_seg_id; - modex.xpmem.segment_base = mca_btl_sm_component.my_segment; - modex.xpmem.address_max = mca_btl_sm_component.my_address_max; + modex_size = sizeof(modex) - sizeof(modex.seg_ds); - modex_size = sizeof(modex.xpmem); + if (!mca_smsc_base_has_feature(MCA_SMSC_FEATURE_CAN_MAP)) { + modex.seg_ds_size = opal_shmem_sizeof_shmem_ds(&mca_btl_sm_component.seg_ds); + memmove(&modex.seg_ds, &mca_btl_sm_component.seg_ds, modex.seg_ds_size); + modex_size += modex.seg_ds_size; } else { -#endif - modex.other.seg_ds_size = opal_shmem_sizeof_shmem_ds(&mca_btl_sm_component.seg_ds); - memmove(&modex.other.seg_ds, &mca_btl_sm_component.seg_ds, modex.other.seg_ds_size); - modex.other.user_ns_id = mca_btl_sm_get_user_ns_id(); - /* - * If modex.other.user_ns_id is '0' something did not work out - * during user namespace detection. Assuming there are no - * namespaces available it will return '0' for all processes and - * the check later will see '0' everywhere and not disable CMA. - */ - modex_size = sizeof(modex.other); - -#if OPAL_BTL_SM_HAVE_XPMEM + modex.segment_base = (uintptr_t) mca_btl_sm_component.my_segment; + modex.seg_ds_size = 0; } -#endif + int rc; OPAL_MODEX_SEND(rc, PMIX_LOCAL, &mca_btl_sm_component.super.btl_version, &modex, modex_size); return rc; } -#if OPAL_BTL_SM_HAVE_XPMEM || OPAL_BTL_SM_HAVE_CMA || OPAL_BTL_SM_HAVE_KNEM -static void mca_btl_sm_select_next_single_copy_mechanism(void) +static mca_btl_base_registration_handle_t * +mca_btl_sm_register_mem(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *base, size_t size, uint32_t flags) { - for (int i = 0; single_copy_mechanisms[i].value != MCA_BTL_SM_NONE; ++i) { - if (single_copy_mechanisms[i].value == mca_btl_sm_component.single_copy_mechanism) { - mca_btl_sm_component.single_copy_mechanism = single_copy_mechanisms[i + 1].value; - return; - } - } + return (mca_btl_base_registration_handle_t *) MCA_SMSC_CALL(register_region, base, size); } -#endif -static void mca_btl_sm_check_single_copy(void) +static int mca_btl_sm_deregister_mem_knem(struct mca_btl_base_module_t *btl, + struct mca_btl_base_registration_handle_t *handle) { -#if OPAL_BTL_SM_HAVE_XPMEM || OPAL_BTL_SM_HAVE_CMA || OPAL_BTL_SM_HAVE_KNEM - int initial_mechanism = mca_btl_sm_component.single_copy_mechanism; -#endif - -#if OPAL_BTL_SM_HAVE_XPMEM - if (MCA_BTL_SM_XPMEM == mca_btl_sm_component.single_copy_mechanism) { - /* try to create an xpmem segment for the entire address space */ - int rc = mca_btl_sm_xpmem_init(); - if (OPAL_SUCCESS != rc) { - if (MCA_BTL_SM_XPMEM == initial_mechanism) { - opal_show_help("help-btl-sm.txt", "xpmem-make-failed", true, - opal_process_info.nodename, errno, strerror(errno)); - } - - mca_btl_sm_select_next_single_copy_mechanism(); - } - } -#endif - -#if OPAL_BTL_SM_HAVE_CMA - if (MCA_BTL_SM_CMA == mca_btl_sm_component.single_copy_mechanism) { - /* Check if we have the proper permissions for CMA */ - char 
buffer = '0'; - bool cma_happy = false; - int fd; - - /* check system setting for current ptrace scope */ - fd = open("/proc/sys/kernel/yama/ptrace_scope", O_RDONLY); - if (0 <= fd) { - read(fd, &buffer, 1); - close(fd); - } - - /* ptrace scope 0 will allow an attach from any of the process owner's - * processes. ptrace scope 1 limits attachers to the process tree - * starting at the parent of this process. */ - if ('0' != buffer) { -# if defined PR_SET_PTRACER - /* try setting the ptrace scope to allow attach */ - int ret = prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0); - if (0 == ret) { - cma_happy = true; - } -# endif - } else { - cma_happy = true; - } - - if (!cma_happy) { - mca_btl_sm_select_next_single_copy_mechanism(); - - if (MCA_BTL_SM_CMA == initial_mechanism) { - opal_show_help("help-btl-sm.txt", "cma-permission-denied", true, - opal_process_info.nodename); - } - } else { - /* ptrace_scope will allow CMA */ - mca_btl_sm.super.btl_get = mca_btl_sm_get_cma; - mca_btl_sm.super.btl_put = mca_btl_sm_put_cma; - } - } -#endif - -#if OPAL_BTL_SM_HAVE_KNEM - if (MCA_BTL_SM_KNEM == mca_btl_sm_component.single_copy_mechanism) { - /* mca_btl_sm_knem_init will set the appropriate get/put functions */ - int rc = mca_btl_sm_knem_init(); - if (OPAL_SUCCESS != rc) { - if (MCA_BTL_SM_KNEM == initial_mechanism) { - opal_show_help("help-btl-sm.txt", "knem requested but not available", true, - opal_process_info.nodename); - } - - /* disable single copy */ - mca_btl_sm_select_next_single_copy_mechanism(); - } - } -#endif - - if (MCA_BTL_SM_NONE == mca_btl_sm_component.single_copy_mechanism) { - mca_btl_sm.super.btl_flags &= ~MCA_BTL_FLAGS_RDMA; - mca_btl_sm.super.btl_get = NULL; - mca_btl_sm.super.btl_put = NULL; - } + MCA_SMSC_CALL(deregister_region, (void *) handle); + return OPAL_SUCCESS; } /* @@ -513,15 +323,6 @@ mca_btl_sm_component_init(int *num_btls, bool enable_progress_threads, bool enab return NULL; } -#if OPAL_BTL_SM_HAVE_XPMEM - /* limit segment alignment to be between 4k and 16M */ - if (component->log_attach_align < 12) { - component->log_attach_align = 12; - } else if (component->log_attach_align > 25) { - component->log_attach_align = 25; - } -#endif - btls = (mca_btl_base_module_t **) calloc(1, sizeof(mca_btl_base_module_t *)); if (NULL == btls) { return NULL; @@ -542,9 +343,40 @@ mca_btl_sm_component_init(int *num_btls, bool enable_progress_threads, bool enab /* no fast boxes allocated initially */ component->num_fbox_in_endpoints = 0; - mca_btl_sm_check_single_copy(); + rc = mca_smsc_base_select(); + if (OPAL_SUCCESS == rc) { + mca_btl_sm.super.btl_flags |= MCA_BTL_FLAGS_RDMA; + mca_btl_sm.super.btl_get = mca_btl_sm_get; + mca_btl_sm.super.btl_put = mca_btl_sm_put; + + mca_btl_sm.super.btl_bandwidth = 40000; /* Mbs */ + + if (mca_smsc_base_has_feature(MCA_SMSC_FEATURE_CAN_MAP)) { + mca_btl_sm.super.btl_eager_limit = 32 * 1024; + mca_btl_sm.super.btl_rndv_eager_limit = mca_btl_sm.super.btl_eager_limit; + mca_btl_sm.super.btl_max_send_size = mca_btl_sm.super.btl_eager_limit; + mca_btl_sm.super.btl_min_rdma_pipeline_size = INT_MAX; + } + if (mca_smsc_base_has_feature(MCA_SMSC_FEATURE_REQUIRE_REGISTATION)) { + ssize_t handle_size = mca_smsc_base_registration_data_size(); + if (handle_size > 0) { + mca_btl_sm.super.btl_registration_handle_size = (size_t) handle_size; + mca_btl_sm.super.btl_register_mem = mca_btl_sm_register_mem; + mca_btl_sm.super.btl_deregister_mem = mca_btl_sm_deregister_mem_knem; + } else { + BTL_ERROR(("single-copy component requires registration but could not 
provide the " + "registration handle size")); + rc = (int) handle_size; + } + } + } + if (OPAL_SUCCESS != rc) { + mca_btl_sm.super.btl_flags &= ~MCA_BTL_FLAGS_RDMA; + mca_btl_sm.super.btl_get = NULL; + mca_btl_sm.super.btl_put = NULL; + } - if (MCA_BTL_SM_XPMEM != mca_btl_sm_component.single_copy_mechanism) { + if (!mca_smsc_base_has_feature(MCA_SMSC_FEATURE_CAN_MAP)) { char *sm_file; rc = opal_asprintf(&sm_file, "%s" OPAL_PATH_SEP "sm_segment.%s.%u.%x.%d", @@ -570,7 +402,8 @@ mca_btl_sm_component_init(int *num_btls, bool enable_progress_threads, bool enab goto failed; } } else { - /* when using xpmem it is safe to use an anonymous segment */ + /* if the shared-memory single-copy component can map memory (XPMEM) an anonymous segment + * can be used instead */ component->my_segment = mmap(NULL, component->segment_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); if ((void *) -1 == component->my_segment) { @@ -599,12 +432,11 @@ mca_btl_sm_component_init(int *num_btls, bool enable_progress_threads, bool enab return btls; failed: -#if OPAL_BTL_SM_HAVE_XPMEM - if (MCA_BTL_SM_XPMEM == mca_btl_sm_component.single_copy_mechanism) { + if (mca_smsc_base_has_feature(MCA_SMSC_FEATURE_CAN_MAP)) { munmap(component->my_segment, component->segment_size); - } else -#endif + } else { opal_shmem_unlink(&component->seg_ds); + } if (btls) { free(btls); @@ -630,23 +462,17 @@ void mca_btl_sm_poll_handle_frag(mca_btl_sm_hdr_t *hdr, struct mca_btl_base_endp .cbdata = reg->cbdata}; if (hdr->flags & MCA_BTL_SM_FLAG_SINGLE_COPY) { -#if OPAL_BTL_SM_HAVE_XPMEM - mca_rcache_base_registration_t *xpmem_reg; - - xpmem_reg = sm_get_registation(endpoint, hdr->sc_iov.iov_base, hdr->sc_iov.iov_len, 0, - &segments[1].seg_addr.pval); - assert(NULL != xpmem_reg); + void *ctx = MCA_SMSC_CALL(map_peer_region, endpoint->smsc_endpoint, /*flags=*/0, + hdr->sc_iov.iov_base, hdr->sc_iov.iov_len, + &segments[1].seg_addr.pval); + assert(NULL != ctx); segments[1].seg_len = hdr->sc_iov.iov_len; frag.des_segment_count = 2; /* recv upcall */ reg->cbfunc(&mca_btl_sm.super, &frag); - sm_return_registration(xpmem_reg, endpoint); -#else - BTL_ERROR(("illegal flag set in incoming fragment")); - _exit(EXIT_FAILURE); -#endif + MCA_SMSC_CALL(unmap_peer_region, ctx); } else { reg->cbfunc(&mca_btl_sm.super, &frag); } diff --git a/opal/mca/btl/sm/btl_sm_get.c b/opal/mca/btl/sm/btl_sm_get.c index dc52e7ad939..3868cda4f8b 100644 --- a/opal/mca/btl/sm/btl_sm_get.c +++ b/opal/mca/btl/sm/btl_sm_get.c @@ -4,7 +4,7 @@ * reserved. * Copyright (c) 2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2019 Google, Inc. All rights reserved. + * Copyright (c) 2019-2021 Google, Inc. All rights reserved. * Copyright (c) 2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. @@ -20,16 +20,7 @@ #include "opal/mca/btl/sm/btl_sm.h" #include "opal/mca/btl/sm/btl_sm_endpoint.h" #include "opal/mca/btl/sm/btl_sm_frag.h" -#include "opal/mca/btl/sm/btl_sm_xpmem.h" - -#if OPAL_BTL_SM_HAVE_CMA -# include - -# if OPAL_CMA_NEED_SYSCALL_DEFS -# include "opal/sys/cma.h" -# endif /* OPAL_CMA_NEED_SYSCALL_DEFS */ - -#endif +#include "opal/mca/smsc/smsc.h" /** * Initiate an synchronous get. 
@@ -38,131 +29,18 @@ * @param endpoint (IN) BTL addressing information * @param descriptor (IN) Description of the data to be transferred */ -#if OPAL_BTL_SM_HAVE_XPMEM -int mca_btl_sm_get_xpmem(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, - mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, - void *cbdata) -{ - mca_rcache_base_registration_t *reg; - void *rem_ptr; - - /* silence warning about unused arguments */ - (void) local_handle; - (void) remote_handle; - - reg = sm_get_registation(endpoint, (void *) (intptr_t) remote_address, size, 0, &rem_ptr); - if (OPAL_UNLIKELY(NULL == rem_ptr)) { - return OPAL_ERROR; - } - - sm_memmove(local_address, rem_ptr, size); - - sm_return_registration(reg, endpoint); - - /* always call the callback function */ - cbfunc(btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); - - return OPAL_SUCCESS; -} -#endif -#if OPAL_BTL_SM_HAVE_CMA -int mca_btl_sm_get_cma(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, - mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, - void *cbdata) +int mca_btl_sm_get(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, + void *cbdata) { - struct iovec src_iov = {.iov_base = (void *) (intptr_t) remote_address, .iov_len = size}; - struct iovec dst_iov = {.iov_base = local_address, .iov_len = size}; - ssize_t ret; - - /* - * According to the man page : - * "On success, process_vm_readv() returns the number of bytes read and - * process_vm_writev() returns the number of bytes written. This return - * value may be less than the total number of requested bytes, if a - * partial read/write occurred. (Partial transfers apply at the - * granularity of iovec elements. These system calls won't perform a - * partial transfer that splits a single iovec element.)". - * So since we use a single iovec element, the returned size should either - * be 0 or size, and the do loop should not be needed here. - * We tried on various Linux kernels with size > 2 GB, and surprisingly, - * the returned value is always 0x7ffff000 (fwiw, it happens to be the size - * of the larger number of pages that fits a signed 32 bits integer). - * We do not know whether this is a bug from the kernel, the libc or even - * the man page, but for the time being, we do as is process_vm_readv() could - * return any value. 
- */ - do { - ret = process_vm_readv(endpoint->segment_data.other.seg_ds->seg_cpid, &dst_iov, 1, &src_iov, - 1, 0); - if (0 > ret) { - if (ESRCH == errno) { - BTL_PEER_ERROR(NULL, ("CMA read %ld, expected %lu, errno = %d\n", (long) ret, - (unsigned long) size, errno)); - return OPAL_ERROR; - } - BTL_ERROR(("CMA read %ld, expected %lu, errno = %d\n", (long) ret, (unsigned long) size, - errno)); - return OPAL_ERROR; - } - src_iov.iov_base = (void *) ((char *) src_iov.iov_base + ret); - src_iov.iov_len -= ret; - dst_iov.iov_base = (void *) ((char *) dst_iov.iov_base + ret); - dst_iov.iov_len -= ret; - } while (0 < src_iov.iov_len); - - /* always call the callback function */ - cbfunc(btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); - - return OPAL_SUCCESS; -} -#endif - -#if OPAL_BTL_SM_HAVE_KNEM -int mca_btl_sm_get_knem(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, - mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, - void *cbdata) -{ - struct knem_cmd_param_iovec recv_iovec; - struct knem_cmd_inline_copy icopy; - - /* Fill in the ioctl data fields. There's no async completion, so - we don't need to worry about getting a slot, etc. */ - recv_iovec.base = (uintptr_t) local_address; - recv_iovec.len = size; - icopy.local_iovec_array = (uintptr_t) &recv_iovec; - icopy.local_iovec_nr = 1; - icopy.remote_cookie = remote_handle->cookie; - icopy.remote_offset = remote_address - remote_handle->base_addr; - icopy.write = 0; - icopy.flags = 0; - - /* Use the DMA flag if knem supports it *and* the segment length - * is greater than the cutoff. Not that if DMA is not supported - * or the user specified 0 for knem_dma_min the knem_dma_min was - * set to UINT_MAX in mca_btl_sm_knem_init. */ - if (mca_btl_sm_component.knem_dma_min <= size) { - icopy.flags = KNEM_FLAG_DMA; - } - /* synchronous flags only, no need to specify icopy.async_status_index */ - - /* When the ioctl returns, the transfer is done and we can invoke - the btl callback and return the frag */ - if (OPAL_UNLIKELY(0 != ioctl(mca_btl_sm.knem_fd, KNEM_CMD_INLINE_COPY, &icopy))) { - return OPAL_ERROR; - } - - if (KNEM_STATUS_FAILED == icopy.current_status) { - return OPAL_ERROR; + int ret = MCA_SMSC_CALL(copy_from, endpoint->smsc_endpoint, local_address, + (void *) (intptr_t) remote_address, size, remote_handle); + if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { + return ret; } /* always call the callback function */ @@ -170,4 +48,3 @@ int mca_btl_sm_get_knem(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *end return OPAL_SUCCESS; } -#endif diff --git a/opal/mca/btl/sm/btl_sm_knem.c b/opal/mca/btl/sm/btl_sm_knem.c deleted file mode 100644 index a005beeef05..00000000000 --- a/opal/mca/btl/sm/btl_sm_knem.c +++ /dev/null @@ -1,203 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2014-2017 Los Alamos National Security, LLC. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal/mca/btl/sm/btl_sm.h" - -#if OPAL_BTL_SM_HAVE_KNEM - -# include - -# include -# include -# include -# include - -# include "opal/util/show_help.h" - -OBJ_CLASS_INSTANCE(mca_btl_sm_registration_handle_t, mca_rcache_base_registration_t, NULL, NULL); - -static int mca_btl_sm_knem_reg(void *reg_data, void *base, size_t size, - mca_rcache_base_registration_t *reg) -{ - mca_btl_sm_registration_handle_t *knem_reg = (mca_btl_sm_registration_handle_t *) reg; - struct knem_cmd_create_region knem_cr; - struct knem_cmd_param_iovec knem_iov; - - knem_iov.base = (uintptr_t) base; - knem_iov.len = size; - - knem_cr.iovec_array = (uintptr_t) &knem_iov; - knem_cr.iovec_nr = 1; - knem_cr.protection = 0; - - if (reg->access_flags & (MCA_RCACHE_ACCESS_LOCAL_WRITE | MCA_RCACHE_ACCESS_REMOTE_WRITE)) { - knem_cr.protection |= PROT_WRITE; - } - - if (reg->access_flags & MCA_RCACHE_ACCESS_REMOTE_READ) { - knem_cr.protection |= PROT_READ; - } - - /* Sm will explicitly destroy this cookie */ - knem_cr.flags = 0; - if (OPAL_UNLIKELY(ioctl(mca_btl_sm.knem_fd, KNEM_CMD_CREATE_REGION, &knem_cr) < 0)) { - return OPAL_ERROR; - } - - knem_reg->btl_handle.cookie = knem_cr.cookie; - knem_reg->btl_handle.base_addr = (intptr_t) base; - - return OPAL_SUCCESS; -} - -static int mca_btl_sm_knem_dereg(void *reg_data, mca_rcache_base_registration_t *reg) -{ - mca_btl_sm_registration_handle_t *knem_reg = (mca_btl_sm_registration_handle_t *) reg; - - /* NTH: explicity ignore the return code. Don't care about this cookie anymore anyway. */ - (void) ioctl(mca_btl_sm.knem_fd, KNEM_CMD_DESTROY_REGION, &knem_reg->btl_handle.cookie); - - return OPAL_SUCCESS; -} - -static mca_btl_base_registration_handle_t * -mca_btl_sm_register_mem_knem(struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, void *base, size_t size, - uint32_t flags) -{ - mca_btl_sm_t *sm_module = (mca_btl_sm_t *) btl; - mca_btl_sm_registration_handle_t *reg = NULL; - int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY; - int rc; - - rc = sm_module->knem_rcache->rcache_register(sm_module->knem_rcache, base, size, 0, - access_flags, - (mca_rcache_base_registration_t **) ®); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - return NULL; - } - - return ®->btl_handle; -} - -static int mca_btl_sm_deregister_mem_knem(struct mca_btl_base_module_t *btl, - struct mca_btl_base_registration_handle_t *handle) -{ - mca_btl_sm_t *sm_module = (mca_btl_sm_t *) btl; - mca_btl_sm_registration_handle_t *reg = (mca_btl_sm_registration_handle_t - *) ((intptr_t) handle - - offsetof(mca_btl_sm_registration_handle_t, - btl_handle)); - - sm_module->knem_rcache->rcache_deregister(sm_module->knem_rcache, ®->base); - - return OPAL_SUCCESS; -} - -int mca_btl_sm_knem_init(void) -{ - mca_rcache_base_resources_t rcache_resources = {.cache_name = "sm", - .reg_data = NULL, - .sizeof_reg = sizeof( - mca_btl_sm_registration_handle_t), - .register_mem = mca_btl_sm_knem_reg, - .deregister_mem = mca_btl_sm_knem_dereg}; - struct knem_cmd_info knem_info; - int rc; - - /* Open the knem device. Try to print a helpful message if we - fail to open it. 
*/ - mca_btl_sm.knem_fd = open("/dev/knem", O_RDWR); - if (mca_btl_sm.knem_fd < 0) { - if (EACCES == errno) { - struct stat sbuf; - if (0 != stat("/dev/knem", &sbuf)) { - sbuf.st_mode = 0; - } - opal_show_help("help-btl-sm.txt", "knem permission denied", true, - opal_process_info.nodename, sbuf.st_mode); - } else { - opal_show_help("help-btl-sm.txt", "knem fail open", true, opal_process_info.nodename, - errno, strerror(errno)); - } - - return OPAL_ERR_NOT_AVAILABLE; - } - - do { - /* Check that the ABI if kernel module running is the same - * as what we were compiled against. */ - memset(&knem_info, 0, sizeof(knem_info)); - rc = ioctl(mca_btl_sm.knem_fd, KNEM_CMD_GET_INFO, &knem_info); - if (rc < 0) { - opal_show_help("help-btl-sm.txt", "knem get ABI fail", true, opal_process_info.nodename, - errno, strerror(errno)); - break; - } - - if (KNEM_ABI_VERSION != knem_info.abi) { - opal_show_help("help-btl-sm.txt", "knem ABI mismatch", true, opal_process_info.nodename, - KNEM_ABI_VERSION, knem_info.abi); - break; - } - - if (!(mca_btl_sm_component.knem_dma_min && (knem_info.features & KNEM_FEATURE_DMA))) { - /* disable DMA */ - mca_btl_sm_component.knem_dma_min = UINT_MAX; - } - - /* TODO: add async support */ - - /* knem set up successfully */ - mca_btl_sm.super.btl_get = mca_btl_sm_get_knem; - mca_btl_sm.super.btl_put = mca_btl_sm_put_knem; - - /* knem requires registration */ - mca_btl_sm.super.btl_register_mem = mca_btl_sm_register_mem_knem; - mca_btl_sm.super.btl_deregister_mem = mca_btl_sm_deregister_mem_knem; - mca_btl_sm.super.btl_registration_handle_size = sizeof(mca_btl_base_registration_handle_t); - - mca_btl_sm.knem_rcache = mca_rcache_base_module_create("grdma", NULL, &rcache_resources); - if (NULL == mca_btl_sm.knem_rcache) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - return OPAL_SUCCESS; - } while (0); - - mca_btl_sm_knem_fini(); - - return OPAL_ERR_NOT_AVAILABLE; - ; -} - -int mca_btl_sm_knem_fini(void) -{ - if (-1 != mca_btl_sm.knem_fd) { - close(mca_btl_sm.knem_fd); - mca_btl_sm.knem_fd = -1; - } - - if (mca_btl_sm.knem_rcache) { - (void) mca_rcache_base_module_destroy(mca_btl_sm.knem_rcache); - mca_btl_sm.knem_rcache = NULL; - } - - return OPAL_SUCCESS; -} - -int mca_btl_sm_knem_progress(void) -{ - /* NTH: does nothing until async support is added */ - return OPAL_SUCCESS; -} - -#endif diff --git a/opal/mca/btl/sm/btl_sm_knem.h b/opal/mca/btl/sm/btl_sm_knem.h deleted file mode 100644 index bea93f5fc64..00000000000 --- a/opal/mca/btl/sm/btl_sm_knem.h +++ /dev/null @@ -1,25 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef BTL_SM_KNEM_H -#define BTL_SM_KNEM_H - -#if OPAL_BTL_SM_HAVE_KNEM - -# include - -int mca_btl_sm_knem_init(void); -int mca_btl_sm_knem_fini(void); -int mca_btl_sm_knem_progress(void); - -#endif /* OPAL_BTL_SM_HAVE_KNEM */ - -#endif /* BTL_SM_KNEM_H */ diff --git a/opal/mca/btl/sm/btl_sm_module.c b/opal/mca/btl/sm/btl_sm_module.c index 8720f17f317..689965cef0b 100644 --- a/opal/mca/btl/sm/btl_sm_module.c +++ b/opal/mca/btl/sm/btl_sm_module.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2011 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. 
- * Copyright (c) 2004-2009 The University of Tennessee and The University + * Copyright (c) 2004-2021 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart, @@ -36,7 +36,7 @@ #include "opal/mca/btl/sm/btl_sm_fbox.h" #include "opal/mca/btl/sm/btl_sm_fifo.h" #include "opal/mca/btl/sm/btl_sm_frag.h" -#include "opal/mca/btl/sm/btl_sm_xpmem.h" +#include "opal/mca/smsc/smsc.h" #include @@ -64,27 +64,6 @@ mca_btl_sm_t mca_btl_sm = { .btl_prepare_src = sm_prepare_src, .btl_send = mca_btl_sm_send, .btl_sendi = mca_btl_sm_sendi, .btl_dump = mca_btl_base_dump, .btl_register_error = sm_register_error_cb}}; -/* - * Exit function copied from btl_usnic_util.c - * - * The following comment tells Coverity that this function does not return. - * See https://scan.coverity.com/tune. - */ - -/* coverity[+kill] */ -static void sm_btl_exit(mca_btl_sm_t *btl) -{ - if (NULL != btl && NULL != btl->error_cb) { - btl->error_cb(&btl->super, MCA_BTL_ERROR_FLAGS_FATAL, (opal_proc_t *) opal_proc_local_get(), - "The sm BTL is aborting the MPI job (via PML error callback)."); - } - - /* If the PML error callback returns (or if there wasn't one), just exit. Shrug. */ - fprintf(stderr, "*** The Open MPI sm BTL is aborting the MPI job (via exit(3)).\n"); - fflush(stderr); - exit(1); -} - static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) { mca_btl_sm_component_t *component = &mca_btl_sm_component; @@ -147,7 +126,7 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) return rc; } - if (MCA_BTL_SM_XPMEM != mca_btl_sm_component.single_copy_mechanism) { + if (!mca_smsc_base_has_feature(MCA_SMSC_FEATURE_CAN_MAP)) { /* initialize free list for buffered send fragments */ rc = opal_free_list_init(&component->sm_frags_max_send, sizeof(mca_btl_sm_frag_t), opal_cache_line_size, OBJ_CLASS(mca_btl_sm_frag_t), @@ -164,20 +143,13 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) /* set flag indicating btl has been inited */ sm_btl->btl_inited = true; -#if OPAL_BTL_SM_HAVE_XPMEM - if (MCA_BTL_SM_XPMEM == mca_btl_sm_component.single_copy_mechanism) { - mca_btl_sm_component.vma_module = mca_rcache_base_vma_module_alloc(); - } -#endif - return OPAL_SUCCESS; } static int init_sm_endpoint(struct mca_btl_base_endpoint_t **ep_out, struct opal_proc_t *proc) { mca_btl_sm_component_t *component = &mca_btl_sm_component; - union sm_modex_t *modex; - ino_t my_user_ns_id; + mca_btl_sm_modex_t *modex; size_t msg_size; int rc; @@ -204,69 +176,36 @@ static int init_sm_endpoint(struct mca_btl_base_endpoint_t **ep_out, struct opal } /* attach to the remote segment */ -#if OPAL_BTL_SM_HAVE_XPMEM - if (MCA_BTL_SM_XPMEM == mca_btl_sm_component.single_copy_mechanism) { - /* always use xpmem if it is available */ - ep->segment_data.xpmem.apid = xpmem_get(modex->xpmem.seg_id, XPMEM_RDWR, - XPMEM_PERMIT_MODE, (void *) 0666); - ep->segment_data.xpmem.address_max = modex->xpmem.address_max; - (void) sm_get_registation(ep, modex->xpmem.segment_base, - mca_btl_sm_component.segment_size, MCA_RCACHE_FLAGS_PERSIST, - (void **) &ep->segment_base); + ep->smsc_endpoint = NULL; /* assume no one sided support */ + if( NULL != mca_smsc ) { + ep->smsc_endpoint = MCA_SMSC_CALL(get_endpoint, proc); + } + if (NULL == ep->smsc_endpoint) { + /* disable RDMA */ + mca_btl_sm.super.btl_get = NULL; + mca_btl_sm.super.btl_put = NULL; + mca_btl_sm.super.btl_flags &= ~MCA_BTL_FLAGS_RDMA; + } + if 
(mca_smsc_base_has_feature(MCA_SMSC_FEATURE_CAN_MAP)) { + ep->smsc_map_context = MCA_SMSC_CALL(map_peer_region, ep->smsc_endpoint, /*flag=*/0, + (void *) (uintptr_t) modex->segment_base, + mca_btl_sm_component.segment_size, + (void **) &ep->segment_base); } else { -#endif /* store a copy of the segment information for detach */ - ep->segment_data.other.seg_ds = malloc(modex->other.seg_ds_size); - if (NULL == ep->segment_data.other.seg_ds) { + ep->seg_ds = malloc(modex->seg_ds_size); + if (NULL == ep->seg_ds) { return OPAL_ERR_OUT_OF_RESOURCE; } - memcpy(ep->segment_data.other.seg_ds, &modex->other.seg_ds, modex->other.seg_ds_size); + memcpy(ep->seg_ds, &modex->seg_ds, modex->seg_ds_size); - ep->segment_base = opal_shmem_segment_attach(ep->segment_data.other.seg_ds); + ep->segment_base = opal_shmem_segment_attach(ep->seg_ds); if (NULL == ep->segment_base) { return OPAL_ERROR; } - - if (MCA_BTL_SM_CMA == mca_btl_sm_component.single_copy_mechanism) { - my_user_ns_id = mca_btl_sm_get_user_ns_id(); - if (my_user_ns_id != modex->other.user_ns_id) { - mca_base_var_source_t source; - int vari; - rc = mca_base_var_find_by_name("btl_sm_single_copy_mechanism", &vari); - if (OPAL_ERROR == rc) { - return OPAL_ERROR; - } - rc = mca_base_var_get_value(vari, NULL, &source, NULL); - if (OPAL_ERROR == rc) { - return OPAL_ERROR; - } - /* - * CMA is not possible as different user namespaces are in use. - * Currently the kernel does not allow * process_vm_{read,write}v() - * for processes running in different user namespaces even if - * all involved user IDs are mapped to the same user ID. - */ - if (MCA_BASE_VAR_SOURCE_DEFAULT != source) { - /* If CMA has been explicitly selected we want to error out */ - opal_show_help("help-btl-sm.txt", "cma-different-user-namespace-error", - true, opal_process_info.nodename); - sm_btl_exit(&mca_btl_sm); - } - /* - * If CMA has been selected because it is the default or - * some fallback, this falls back even further. 
- */ - opal_show_help("help-btl-sm.txt", "cma-different-user-namespace-warning", true, - opal_process_info.nodename); - mca_btl_sm_component.single_copy_mechanism = MCA_BTL_SM_NONE; - mca_btl_sm.super.btl_get = NULL; - mca_btl_sm.super.btl_put = NULL; - } - } -#if OPAL_BTL_SM_HAVE_XPMEM } -#endif + OBJ_CONSTRUCT(&ep->lock, opal_mutex_t); free(modex); @@ -415,17 +354,11 @@ static int sm_finalize(struct mca_btl_base_module_t *btl) free(component->fbox_in_endpoints); component->fbox_in_endpoints = NULL; - if (MCA_BTL_SM_XPMEM != mca_btl_sm_component.single_copy_mechanism) { + if (!mca_smsc_base_has_feature(MCA_SMSC_FEATURE_CAN_MAP)) { opal_shmem_unlink(&mca_btl_sm_component.seg_ds); opal_shmem_segment_detach(&mca_btl_sm_component.seg_ds); } -#if OPAL_BTL_SM_HAVE_XPMEM - if (NULL != mca_btl_sm_component.vma_module) { - OBJ_RELEASE(mca_btl_sm_component.vma_module); - } -#endif - return OPAL_SUCCESS; } @@ -459,7 +392,7 @@ mca_btl_base_descriptor_t *mca_btl_sm_alloc(struct mca_btl_base_module_t *btl, MCA_BTL_SM_FRAG_ALLOC_USER(frag, endpoint); } else if (size <= mca_btl_sm.super.btl_eager_limit) { MCA_BTL_SM_FRAG_ALLOC_EAGER(frag, endpoint); - } else if (MCA_BTL_SM_XPMEM != mca_btl_sm_component.single_copy_mechanism + } else if (!mca_smsc_base_has_feature(MCA_SMSC_FEATURE_CAN_MAP) && size <= mca_btl_sm.super.btl_max_send_size) { MCA_BTL_SM_FRAG_ALLOC_MAX(frag, endpoint); } @@ -512,7 +445,7 @@ static struct mca_btl_base_descriptor_t *sm_prepare_src(struct mca_btl_base_modu struct iovec iov; /* non-contiguous data requires the convertor */ - if (MCA_BTL_SM_XPMEM != mca_btl_sm_component.single_copy_mechanism + if (!mca_smsc_base_has_feature(MCA_SMSC_FEATURE_CAN_MAP) && total_size > mca_btl_sm.super.btl_eager_limit) { MCA_BTL_SM_FRAG_ALLOC_MAX(frag, endpoint); } else { @@ -534,7 +467,7 @@ static struct mca_btl_base_descriptor_t *sm_prepare_src(struct mca_btl_base_modu frag->segments[0].seg_len = *size + reserve; } else { - if (MCA_BTL_SM_XPMEM != mca_btl_sm_component.single_copy_mechanism) { + if (!mca_smsc_base_has_feature(MCA_SMSC_FEATURE_CAN_MAP)) { if (OPAL_LIKELY(total_size <= mca_btl_sm.super.btl_eager_limit)) { MCA_BTL_SM_FRAG_ALLOC_EAGER(frag, endpoint); } else { @@ -548,10 +481,9 @@ static struct mca_btl_base_descriptor_t *sm_prepare_src(struct mca_btl_base_modu return NULL; } -#if OPAL_BTL_SM_HAVE_XPMEM - /* use xpmem to send this segment if it is above the max inline send size */ - if (OPAL_UNLIKELY(MCA_BTL_SM_XPMEM == mca_btl_sm_component.single_copy_mechanism - && total_size > (size_t) mca_btl_sm_component.max_inline_send)) { + /* use single-copy to send this segment if it is above the max inline send size */ + if (mca_smsc_base_has_feature(MCA_SMSC_FEATURE_CAN_MAP) + && total_size > (size_t) mca_btl_sm_component.max_inline_send) { /* single copy send */ frag->hdr->flags = MCA_BTL_SM_FLAG_SINGLE_COPY; @@ -564,14 +496,11 @@ static struct mca_btl_base_descriptor_t *sm_prepare_src(struct mca_btl_base_modu frag->segments[1].seg_addr.pval = data_ptr; frag->base.des_segment_count = 2; } else { -#endif /* NTH: the covertor adds some latency so we bypass it here */ memcpy((void *) ((uintptr_t) frag->segments[0].seg_addr.pval + reserve), data_ptr, *size); frag->segments[0].seg_len = total_size; -#if OPAL_BTL_SM_HAVE_XPMEM } -#endif } frag->base.order = order; @@ -588,37 +517,38 @@ static void mca_btl_sm_endpoint_constructor(mca_btl_sm_endpoint_t *ep) ep->fbox_out.fbox = NULL; } -#if OPAL_BTL_SM_HAVE_XPMEM -#endif - static void mca_btl_sm_endpoint_destructor(mca_btl_sm_endpoint_t *ep) { 
OBJ_DESTRUCT(&ep->pending_frags); OBJ_DESTRUCT(&ep->pending_frags_lock); -#if OPAL_BTL_SM_HAVE_XPMEM - if (MCA_BTL_SM_XPMEM == mca_btl_sm_component.single_copy_mechanism) { - mca_btl_sm_xpmem_cleanup_endpoint(ep); - } else -#endif - if (ep->segment_data.other.seg_ds) { - opal_shmem_ds_t seg_ds; - - /* opal_shmem_segment_detach expects a opal_shmem_ds_t and will - * stomp past the end of the seg_ds if it is too small (which - * ep->seg_ds probably is) */ - memcpy(&seg_ds, ep->segment_data.other.seg_ds, - opal_shmem_sizeof_shmem_ds(ep->segment_data.other.seg_ds)); - free(ep->segment_data.other.seg_ds); - ep->segment_data.other.seg_ds = NULL; - - /* disconnect from the peer's segment */ - opal_shmem_segment_detach(&seg_ds); + if (!mca_smsc_base_has_feature(MCA_SMSC_FEATURE_CAN_MAP)) { + if (ep->seg_ds) { + opal_shmem_ds_t seg_ds; + + /* opal_shmem_segment_detach expects a opal_shmem_ds_t and will + * stomp past the end of the seg_ds if it is too small (which + * ep->seg_ds probably is) */ + memcpy(&seg_ds, ep->seg_ds, opal_shmem_sizeof_shmem_ds(ep->seg_ds)); + free(ep->seg_ds); + ep->seg_ds = NULL; + + /* disconnect from the peer's segment */ + opal_shmem_segment_detach(&seg_ds); + } + } else if (NULL != ep->smsc_map_context) { + MCA_SMSC_CALL(unmap_peer_region, ep->smsc_map_context); } + if (ep->fbox_out.fbox) { opal_free_list_return(&mca_btl_sm_component.sm_fboxes, ep->fbox_out.fbox); } + if (ep->smsc_endpoint) { + MCA_SMSC_CALL(return_endpoint, ep->smsc_endpoint); + ep->smsc_endpoint = NULL; + } + ep->fbox_in.buffer = ep->fbox_out.buffer = NULL; ep->fbox_out.fbox = NULL; ep->segment_base = NULL; diff --git a/opal/mca/btl/sm/btl_sm_put.c b/opal/mca/btl/sm/btl_sm_put.c index c497999d7a2..99f397eb7ac 100644 --- a/opal/mca/btl/sm/btl_sm_put.c +++ b/opal/mca/btl/sm/btl_sm_put.c @@ -4,7 +4,7 @@ * reserved. * Copyright (c) 2014-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2019 Google, Inc. All rights reserved. + * Copyright (c) 2019-2021 Google, Inc. All rights reserved. * Copyright (c) 2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. @@ -20,16 +20,7 @@ #include "opal/mca/btl/sm/btl_sm.h" #include "opal/mca/btl/sm/btl_sm_endpoint.h" #include "opal/mca/btl/sm/btl_sm_frag.h" -#include "opal/mca/btl/sm/btl_sm_xpmem.h" - -#if OPAL_BTL_SM_HAVE_CMA -# include - -# if OPAL_CMA_NEED_SYSCALL_DEFS -# include "opal/sys/cma.h" -# endif /* OPAL_CMA_NEED_SYSCALL_DEFS */ - -#endif +#include "opal/mca/smsc/smsc.h" /** * Initiate an synchronous put. 
@@ -38,111 +29,18 @@ * @param endpoint (IN) BTL addressing information * @param descriptor (IN) Description of the data to be transferred */ -#if OPAL_BTL_SM_HAVE_XPMEM -int mca_btl_sm_put_xpmem(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, - mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, - void *cbdata) -{ - mca_rcache_base_registration_t *reg; - void *rem_ptr; - - reg = sm_get_registation(endpoint, (void *) (intptr_t) remote_address, size, 0, &rem_ptr); - if (OPAL_UNLIKELY(NULL == reg)) { - return OPAL_ERROR; - } - - sm_memmove(rem_ptr, local_address, size); - - sm_return_registration(reg, endpoint); - /* always call the callback function */ - cbfunc(btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); - - return OPAL_SUCCESS; -} -#endif - -#if OPAL_BTL_SM_HAVE_CMA -int mca_btl_sm_put_cma(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, - mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, - void *cbdata) +int mca_btl_sm_put(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, + void *cbdata) { - struct iovec src_iov = {.iov_base = local_address, .iov_len = size}; - struct iovec dst_iov = {.iov_base = (void *) (intptr_t) remote_address, .iov_len = size}; - ssize_t ret; - - /* This should not be needed, see the rationale in mca_btl_sm_get_cma() */ - do { - ret = process_vm_writev(endpoint->segment_data.other.seg_ds->seg_cpid, &src_iov, 1, - &dst_iov, 1, 0); - if (0 > ret) { - if (ESRCH == errno) { - BTL_PEER_ERROR(NULL, ("CMA wrote %ld, expected %lu, errno = %d\n", (long) ret, - (unsigned long) size, errno)); - return OPAL_ERROR; - } - BTL_ERROR(("CMA wrote %ld, expected %lu, errno = %d\n", (long) ret, - (unsigned long) size, errno)); - return OPAL_ERROR; - } - src_iov.iov_base = (void *) ((char *) src_iov.iov_base + ret); - src_iov.iov_len -= ret; - dst_iov.iov_base = (void *) ((char *) dst_iov.iov_base + ret); - dst_iov.iov_len -= ret; - } while (0 < src_iov.iov_len); - - /* always call the callback function */ - cbfunc(btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); - - return OPAL_SUCCESS; -} -#endif - -#if OPAL_BTL_SM_HAVE_KNEM -int mca_btl_sm_put_knem(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, - mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, - void *cbdata) -{ - struct knem_cmd_param_iovec send_iovec; - struct knem_cmd_inline_copy icopy; - - /* Fill in the ioctl data fields. There's no async completion, so - we don't need to worry about getting a slot, etc. 
*/ - send_iovec.base = (uintptr_t) local_address; - send_iovec.len = size; - icopy.local_iovec_array = (uintptr_t) &send_iovec; - icopy.local_iovec_nr = 1; - icopy.remote_cookie = remote_handle->cookie; - icopy.remote_offset = remote_address - remote_handle->base_addr; - icopy.write = 1; - icopy.flags = 0; - - /* Use the DMA flag if knem supports it *and* the segment length - * is greater than the cutoff. Not that if DMA is not supported - * or the user specified 0 for knem_dma_min the knem_dma_min was - * set to UINT_MAX in mca_btl_sm_knem_init. */ - if (mca_btl_sm_component.knem_dma_min <= size) { - icopy.flags = KNEM_FLAG_DMA; - } - /* synchronous flags only, no need to specify icopy.async_status_index */ - - /* When the ioctl returns, the transfer is done and we can invoke - the btl callback and return the frag */ - if (OPAL_UNLIKELY(0 != ioctl(mca_btl_sm.knem_fd, KNEM_CMD_INLINE_COPY, &icopy))) { - return OPAL_ERROR; - } - - if (KNEM_STATUS_FAILED == icopy.current_status) { - return OPAL_ERROR; + int ret = MCA_SMSC_CALL(copy_to, endpoint->smsc_endpoint, local_address, + (void *) (intptr_t) remote_address, size, remote_handle); + if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { + return ret; } /* always call the callback function */ @@ -150,4 +48,3 @@ int mca_btl_sm_put_knem(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *end return OPAL_SUCCESS; } -#endif diff --git a/opal/mca/btl/sm/btl_sm_types.h b/opal/mca/btl/sm/btl_sm_types.h index 7ca79d0095a..c9e5803c5a7 100644 --- a/opal/mca/btl/sm/btl_sm_types.h +++ b/opal/mca/btl/sm/btl_sm_types.h @@ -35,49 +35,19 @@ #include "opal_config.h" #include "opal/class/opal_free_list.h" #include "opal/mca/btl/btl.h" - -#if OPAL_BTL_SM_HAVE_XPMEM - -# if defined(HAVE_XPMEM_H) -# include - -typedef struct xpmem_addr xpmem_addr_t; -# elif defined(HAVE_SN_XPMEM_H) -# include - -typedef int64_t xpmem_segid_t; -typedef int64_t xpmem_apid_t; -# endif -#endif +#include "opal/mca/smsc/smsc.h" /* * Modex data */ -union sm_modex_t { -#if OPAL_BTL_SM_HAVE_XPMEM - struct sm_modex_xpmem_t { - xpmem_segid_t seg_id; - void *segment_base; - uintptr_t address_max; - } xpmem; -#endif - struct sm_modex_other_t { - ino_t user_ns_id; - int seg_ds_size; - /* seg_ds needs to be the last element */ - opal_shmem_ds_t seg_ds; - } other; +struct mca_btl_sm_modex_t { + uint64_t segment_base; + int seg_ds_size; + /* seg_ds needs to be the last element */ + opal_shmem_ds_t seg_ds; }; -/** - * Single copy mechanisms - */ -enum { - MCA_BTL_SM_XPMEM = 0, - MCA_BTL_SM_CMA = 1, - MCA_BTL_SM_KNEM = 2, - MCA_BTL_SM_NONE = 3, -}; +typedef struct mca_btl_sm_modex_t mca_btl_sm_modex_t; typedef struct mca_btl_base_endpoint_t { opal_list_item_t super; @@ -109,18 +79,9 @@ typedef struct mca_btl_base_endpoint_t { opal_mutex_t lock; /**< lock to protect endpoint structures from concurrent * access */ - union { -#if OPAL_BTL_SM_HAVE_XPMEM - struct { - xpmem_apid_t apid; /**< xpmem apid for remote peer */ - uintptr_t address_max; /**< largest address that can be attached */ - } xpmem; -#endif - struct { - pid_t pid; /**< pid of remote peer (used for CMA) */ - opal_shmem_ds_t *seg_ds; /**< stored segment information for detach */ - } other; - } segment_data; + mca_smsc_endpoint_t *smsc_endpoint; + void *smsc_map_context; + opal_shmem_ds_t *seg_ds; /**< stored segment information for detach */ opal_mutex_t pending_frags_lock; /**< protect pending_frags */ opal_list_t pending_frags; /**< fragments pending fast box space */ @@ -139,11 +100,6 @@ struct mca_btl_sm_component_t { int sm_free_list_num; 
/**< initial size of free lists */ int sm_free_list_max; /**< maximum size of free lists */ int sm_free_list_inc; /**< number of elements to alloc when growing free lists */ -#if OPAL_BTL_SM_HAVE_XPMEM - xpmem_segid_t my_seg_id; /**< this rank's xpmem segment id */ - uintptr_t my_address_max; /**< largest address */ - mca_rcache_base_vma_module_t *vma_module; /**< registration cache for xpmem segments */ -#endif opal_shmem_ds_t seg_ds; /**< this rank's shared memory segment (when not using xpmem) */ opal_mutex_t lock; /**< lock to protect concurrent updates to this structure's members */ @@ -163,7 +119,6 @@ struct mca_btl_sm_component_t { int single_copy_mechanism; /**< single copy mechanism to use */ int memcpy_limit; /**< Limit where we switch from memmove to memcpy */ - int log_attach_align; /**< Log of the alignment for xpmem segments */ unsigned int max_inline_send; /**< Limit for copy-in-copy-out fragments */ mca_btl_base_endpoint_t @@ -177,10 +132,6 @@ struct mca_btl_sm_component_t { char *backing_directory; /**< directory to place shared memory backing files */ - /* knem stuff */ -#if OPAL_BTL_SM_HAVE_KNEM - unsigned int knem_dma_min; /**< minimum size to enable DMA for knem transfers (0 disables) */ -#endif mca_mpool_base_module_t *mpool; }; typedef struct mca_btl_sm_component_t mca_btl_sm_component_t; @@ -192,12 +143,6 @@ struct mca_btl_sm_t { mca_btl_base_module_t super; /**< base BTL interface */ bool btl_inited; /**< flag indicating if btl has been inited */ mca_btl_base_module_error_cb_fn_t error_cb; -#if OPAL_BTL_SM_HAVE_KNEM - int knem_fd; - - /* registration cache */ - mca_rcache_base_module_t *knem_rcache; -#endif }; typedef struct mca_btl_sm_t mca_btl_sm_t; @@ -278,18 +223,6 @@ typedef struct mca_btl_sm_frag_t mca_btl_sm_frag_t; OBJ_CLASS_DECLARATION(mca_btl_sm_frag_t); -/* At this time only knem requires a registration of "RDMA" buffers */ -struct mca_btl_base_registration_handle_t { - uint64_t cookie; - intptr_t base_addr; -}; - -struct mca_btl_sm_registration_handle_t { - mca_rcache_base_registration_t base; - mca_btl_base_registration_handle_t btl_handle; -}; -typedef struct mca_btl_sm_registration_handle_t mca_btl_sm_registration_handle_t; - /** FIFO types **/ typedef opal_atomic_intptr_t atomic_fifo_value_t; typedef intptr_t fifo_value_t; diff --git a/opal/mca/btl/sm/btl_sm_xpmem.c b/opal/mca/btl/sm/btl_sm_xpmem.c deleted file mode 100644 index eda6fbd85a2..00000000000 --- a/opal/mca/btl/sm/btl_sm_xpmem.c +++ /dev/null @@ -1,249 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2020 Google, LLC. All rights reserved. - * Copyright (c) 2021 Nanook Consulting. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal/mca/btl/sm/btl_sm.h" - -#include "opal/include/opal/align.h" -#include "opal/mca/memchecker/base/base.h" - -#if OPAL_BTL_SM_HAVE_XPMEM - -int mca_btl_sm_xpmem_init(void) -{ - /* Any attachment that goes past the Linux TASK_SIZE will always fail. To prevent this we need - * to determine the value of TASK_SIZE. On x86_64 the value was hard-coded in sm to be - * 0x7ffffffffffful but this approach does not work with AARCH64 (and possibly other - * architectures). 
Since there is really no way to directly determine the value we can (in all - * cases?) look through the mapping for this process to determine what the largest address is. - * This should be the top of the stack. No heap allocations should be larger than this value. - * Since the largest address may differ between processes the value must be shared as part of - * the modex and stored in the endpoint. */ - FILE *fh = fopen("/proc/self/maps", "r"); - if (NULL == fh) { - BTL_ERROR(("could not open /proc/self/maps for reading. disabling XPMEM")); - return OPAL_ERR_NOT_AVAILABLE; - } - - char buffer[1024]; - uintptr_t address_max = 0; - while (fgets(buffer, sizeof(buffer), fh)) { - uintptr_t low, high; - char *tmp; - /* each line of /proc/self/maps starts with low-high in hexidecimal (without a 0x) */ - low = strtoul(buffer, &tmp, 16); - high = strtoul(tmp + 1, NULL, 16); - if (address_max < high) { - address_max = high; - } - } - - fclose(fh); - - if (0 == address_max) { - BTL_ERROR(("could not determine the address max")); - return OPAL_ERR_NOT_AVAILABLE; - } - - /* save the calcuated maximum */ - mca_btl_sm_component.my_address_max = address_max - 1; - - /* it is safe to use XPMEM_MAXADDR_SIZE here (which is always (size_t)-1 even though - * it is not safe for attach */ - mca_btl_sm_component.my_seg_id = xpmem_make(0, XPMEM_MAXADDR_SIZE, XPMEM_PERMIT_MODE, - (void *) 0666); - if (-1 == mca_btl_sm_component.my_seg_id) { - return OPAL_ERR_NOT_AVAILABLE; - } - - mca_btl_sm.super.btl_get = mca_btl_sm_get_xpmem; - mca_btl_sm.super.btl_put = mca_btl_sm_put_xpmem; - - return OPAL_SUCCESS; -} - -struct sm_check_reg_ctx_t { - mca_btl_base_endpoint_t *ep; - mca_rcache_base_registration_t **reg; - uintptr_t base; - uintptr_t bound; -}; -typedef struct sm_check_reg_ctx_t sm_check_reg_ctx_t; - -static int sm_check_reg(mca_rcache_base_registration_t *reg, void *ctx) -{ - sm_check_reg_ctx_t *sm_ctx = (sm_check_reg_ctx_t *) ctx; - - if ((intptr_t) reg->alloc_base != sm_ctx->ep->peer_smp_rank) { - /* ignore this registration */ - return OPAL_SUCCESS; - } - - sm_ctx->reg[0] = reg; - - if (sm_ctx->bound <= (uintptr_t) reg->bound && sm_ctx->base >= (uintptr_t) reg->base) { - if (0 == opal_atomic_fetch_add_32(®->ref_count, 1)) { - /* registration is being deleted by a thread in sm_return_registration. the - * VMA tree implementation will block in mca_rcache_delete until we finish - * iterating over the VMA tree so it is safe to just ignore this registration - * and continue. */ - sm_ctx->reg[0] = NULL; - return OPAL_SUCCESS; - } - return 1; - } - - if (MCA_RCACHE_FLAGS_INVALID & opal_atomic_fetch_or_32(®->flags, MCA_RCACHE_FLAGS_INVALID)) { - /* another thread has already marked this registration as invalid. ignore and continue. */ - sm_ctx->reg[0] = NULL; - return OPAL_SUCCESS; - } - - /* let the caller know we found an overlapping registration that can be coalesced into - * the requested interval. the caller will remove the last reference and delete the - * registration. 
*/ - return 2; -} - -void sm_return_registration(mca_rcache_base_registration_t *reg, struct mca_btl_base_endpoint_t *ep) -{ - mca_rcache_base_vma_module_t *vma_module = mca_btl_sm_component.vma_module; - int32_t ref_count; - - ref_count = opal_atomic_add_fetch_32(®->ref_count, -1); - if (OPAL_UNLIKELY(0 == ref_count && !(reg->flags & MCA_RCACHE_FLAGS_PERSIST))) { -# if OPAL_ENABLE_DEBUG - int ret = mca_rcache_base_vma_delete(vma_module, reg); - assert(OPAL_SUCCESS == ret); -# else - (void) mca_rcache_base_vma_delete(vma_module, reg); -# endif - opal_memchecker_base_mem_noaccess(reg->rcache_context, (uintptr_t)(reg->bound - reg->base)); - (void) xpmem_detach(reg->rcache_context); - OBJ_RELEASE(reg); - } -} - -/* look up the remote pointer in the peer rcache and attach if - * necessary */ -mca_rcache_base_registration_t *sm_get_registation(struct mca_btl_base_endpoint_t *ep, - void *rem_ptr, size_t size, int flags, - void **local_ptr) -{ - mca_rcache_base_vma_module_t *vma_module = mca_btl_sm_component.vma_module; - uint64_t attach_align = 1 << mca_btl_sm_component.log_attach_align; - mca_rcache_base_registration_t *reg = NULL; - sm_check_reg_ctx_t check_ctx = {.ep = ep, .reg = ®}; - xpmem_addr_t xpmem_addr; - uintptr_t base, bound; - int rc; - - base = OPAL_DOWN_ALIGN((uintptr_t) rem_ptr, attach_align, uintptr_t); - bound = OPAL_ALIGN((uintptr_t) rem_ptr + size - 1, attach_align, uintptr_t) + 1; - if (OPAL_UNLIKELY(bound > ep->segment_data.xpmem.address_max)) { - bound = ep->segment_data.xpmem.address_max; - } - - check_ctx.base = base; - check_ctx.bound = bound; - - /* several segments may match the base pointer */ - rc = mca_rcache_base_vma_iterate(vma_module, (void *) base, bound - base, true, sm_check_reg, - &check_ctx); - if (2 == rc) { - bound = bound < (uintptr_t) reg->bound ? (uintptr_t) reg->bound : bound; - base = base > (uintptr_t) reg->base ? 
(uintptr_t) reg->base : base; - sm_return_registration(reg, ep); - reg = NULL; - } - - if (NULL == reg) { - reg = OBJ_NEW(mca_rcache_base_registration_t); - if (OPAL_LIKELY(NULL != reg)) { - /* stick around for awhile */ - reg->ref_count = 2; - reg->base = (unsigned char *) base; - reg->bound = (unsigned char *) bound; - reg->flags = flags; - reg->alloc_base = (void *) (intptr_t) ep->peer_smp_rank; - -# if defined(HAVE_SN_XPMEM_H) - xpmem_addr.id = ep->segment_data.xpmem.apid; -# else - xpmem_addr.apid = ep->segment_data.xpmem.apid; -# endif - xpmem_addr.offset = base; - - reg->rcache_context = xpmem_attach(xpmem_addr, bound - base, NULL); - if (OPAL_UNLIKELY((void *) -1 == reg->rcache_context)) { - OBJ_RELEASE(reg); - return NULL; - } - - opal_memchecker_base_mem_defined(reg->rcache_context, bound - base); - - if (!(flags & MCA_RCACHE_FLAGS_PERSIST)) { - mca_rcache_base_vma_insert(vma_module, reg, 0); - } - } - } - - opal_atomic_wmb(); - *local_ptr = (void *) ((uintptr_t) reg->rcache_context - + (ptrdiff_t)((uintptr_t) rem_ptr - (uintptr_t) reg->base)); - - return reg; -} - -struct sm_cleanup_reg_ctx { - mca_btl_sm_endpoint_t *ep; - opal_list_t *registrations; -}; - -static int mca_btl_sm_endpoint_xpmem_rcache_cleanup(mca_rcache_base_registration_t *reg, void *ctx) -{ - struct sm_cleanup_reg_ctx *cleanup_ctx = (struct sm_cleanup_reg_ctx *) ctx; - if ((intptr_t) reg->alloc_base == cleanup_ctx->ep->peer_smp_rank) { - opal_list_append(cleanup_ctx->registrations, ®->super.super); - } - - return OPAL_SUCCESS; -} - -void mca_btl_sm_xpmem_cleanup_endpoint(struct mca_btl_base_endpoint_t *ep) -{ - mca_rcache_base_registration_t *reg; - opal_list_t registrations; - struct sm_cleanup_reg_ctx cleanup_ctx = {.ep = ep, .registrations = ®istrations}; - - OBJ_CONSTRUCT(®istrations, opal_list_t); - - /* clean out the registration cache */ - (void) mca_rcache_base_vma_iterate(mca_btl_sm_component.vma_module, NULL, (size_t) -1, true, - mca_btl_sm_endpoint_xpmem_rcache_cleanup, - (void *) &cleanup_ctx); - while (NULL - != (reg = (mca_rcache_base_registration_t *) opal_list_remove_first(®istrations))) { - sm_return_registration(reg, ep); - } - OBJ_DESTRUCT(®istrations); - - if (ep->segment_base) { - xpmem_release(ep->segment_data.xpmem.apid); - ep->segment_data.xpmem.apid = 0; - } -} - -#endif /* OPAL_BTL_SM_HAVE_XPMEM */ diff --git a/opal/mca/btl/sm/btl_sm_xpmem.h b/opal/mca/btl/sm/btl_sm_xpmem.h deleted file mode 100644 index 42dbb1bca94..00000000000 --- a/opal/mca/btl/sm/btl_sm_xpmem.h +++ /dev/null @@ -1,41 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2016 ARM, Inc. All rights reserved. - * Copyright (c) 2020 Google, LLC. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BTL_SM_XPMEM_H -#define MCA_BTL_SM_XPMEM_H - -#include "opal_config.h" -#include "opal/mca/btl/sm/btl_sm_types.h" -#include "opal/mca/rcache/base/rcache_base_vma.h" -#include "opal/mca/rcache/rcache.h" - -#if OPAL_BTL_SM_HAVE_XPMEM - -/* look up the remote pointer in the peer rcache and attach if - * necessary */ - -struct mca_btl_base_endpoint_t; - -int mca_btl_sm_xpmem_init(void); - -mca_rcache_base_registration_t *sm_get_registation(struct mca_btl_base_endpoint_t *endpoint, - void *rem_ptr, size_t size, int flags, - void **local_ptr); - -void sm_return_registration(mca_rcache_base_registration_t *reg, - struct mca_btl_base_endpoint_t *endpoint); -void mca_btl_sm_xpmem_cleanup_endpoint(struct mca_btl_base_endpoint_t *ep); - -#endif /* OPAL_BTL_SM_HAVE_XPMEM */ - -#endif /* MCA_BTL_SM_XPMEM_H */ diff --git a/opal/mca/btl/sm/configure.m4 b/opal/mca/btl/sm/configure.m4 index f1eea2710dc..2edbdad5d8a 100644 --- a/opal/mca/btl/sm/configure.m4 +++ b/opal/mca/btl/sm/configure.m4 @@ -21,25 +21,6 @@ AC_DEFUN([MCA_opal_btl_sm_CONFIG],[ AC_CONFIG_FILES([opal/mca/btl/sm/Makefile]) - OPAL_VAR_SCOPE_PUSH([btl_sm_xpmem_happy btl_sm_cma_happy btl_sm_knem_happy]) - - # Check for single-copy APIs - - OPAL_CHECK_XPMEM([btl_sm], [btl_sm_xpmem_happy=1], [btl_sm_xpmem_happy=0]) - OPAL_CHECK_KNEM([btl_sm], [btl_sm_knem_happy=1],[btl_sm_knem_happy=0]) - OPAL_CHECK_CMA([btl_sm], [AC_CHECK_HEADER([sys/prctl.h]) btl_sm_cma_happy=1], [btl_sm_cma_happy=0]) - - AC_DEFINE_UNQUOTED([OPAL_BTL_SM_HAVE_XPMEM], [$btl_sm_xpmem_happy], - [If XPMEM support can be enabled within sm]) - - AC_DEFINE_UNQUOTED([OPAL_BTL_SM_HAVE_CMA], [$btl_sm_cma_happy], - [If CMA support can be enabled within sm]) - - AC_DEFINE_UNQUOTED([OPAL_BTL_SM_HAVE_KNEM], [$btl_sm_knem_happy], - [If KNEM support can be enabled within sm]) - - OPAL_VAR_SCOPE_POP - # always happy [$1] diff --git a/opal/mca/smsc/Makefile.am b/opal/mca/smsc/Makefile.am new file mode 100644 index 00000000000..975eee7705f --- /dev/null +++ b/opal/mca/smsc/Makefile.am @@ -0,0 +1,38 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2021 Google, LLC. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# main library setup +noinst_LTLIBRARIES = libmca_smsc.la +libmca_smsc_la_SOURCES = + +# local files +headers = smsc.h +libmca_smsc_la_SOURCES += $(headers) + +# Conditionally install the header files +if WANT_INSTALL_HEADERS +opaldir = $(opalincludedir)/$(subdir) +nobase_opal_HEADERS = $(headers) +endif + +include base/Makefile.am + +distclean-local: + rm -f base/static-components.h diff --git a/opal/mca/smsc/base/Makefile.am b/opal/mca/smsc/base/Makefile.am new file mode 100644 index 00000000000..28a0678cd17 --- /dev/null +++ b/opal/mca/smsc/base/Makefile.am @@ -0,0 +1,26 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. 
All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2021 Google, LLC. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AM_CPPFLAGS = ${smsc_@DIRECT_smsc@_CPPFLAGS} + +headers += \ + base/base.h + +libmca_smsc_la_SOURCES += \ + base/smsc_base_frame.c diff --git a/opal/mca/smsc/base/base.h b/opal/mca/smsc/base/base.h new file mode 100644 index 00000000000..430f855842f --- /dev/null +++ b/opal/mca/smsc/base/base.h @@ -0,0 +1,23 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2021 Google, LLC. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OPAL_MCA_SMSC_BASE_BASE_H +#define OPAL_MCA_SMSC_BASE_BASE_H + +#include "opal/mca/smsc/smsc.h" + +extern mca_base_framework_t opal_smsc_base_framework; +extern mca_smsc_component_t *selected_component; +extern mca_smsc_module_t *selected_module; + +int mca_smsc_base_select(void); +void mca_smsc_base_register_default_params(mca_smsc_component_t *component, int default_priority); + +#endif /* OPAL_MCA_SMSC_BASE_BASE_H */ diff --git a/opal/mca/smsc/base/smsc_base_frame.c b/opal/mca/smsc/base/smsc_base_frame.c new file mode 100644 index 00000000000..db99074b0e0 --- /dev/null +++ b/opal/mca/smsc/base/smsc_base_frame.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2021 Google, LLC. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" +#include +#include + +#include "opal/class/opal_list.h" +#include "opal/mca/base/base.h" +#include "opal/mca/mca.h" +#include "opal/mca/smsc/base/base.h" +#include "opal/mca/smsc/smsc.h" +#include "opal/util/printf.h" + +/* + * The following file was created by configure. It contains extern + * statements and the definition of an array of pointers to each + * component's public mca_base_component_t struct. + */ +#include "opal/mca/smsc/base/static-components.h" + +mca_smsc_component_t *selected_component = NULL; +mca_smsc_module_t *mca_smsc = NULL; + +/* + * Global variables + */ +MCA_BASE_FRAMEWORK_DECLARE(opal, smsc, NULL, NULL, NULL, NULL, mca_smsc_base_static_components, 0); + +static int mca_smsc_compare_components(opal_list_item_t **a, opal_list_item_t **b) +{ + mca_smsc_component_t *componenta + = (mca_smsc_component_t *) ((mca_base_component_list_item_t *) *a)->cli_component; + mca_smsc_component_t *componentb + = (mca_smsc_component_t *) ((mca_base_component_list_item_t *) *b)->cli_component; + + return (componenta->priority > componentb->priority) + ? -1 + : ((componenta->priority < componentb->priority) ? 
1 : 0); +} + +int mca_smsc_base_select(void) +{ + mca_base_component_list_item_t *cli, *next; + + OPAL_LIST_FOREACH_SAFE (cli, next, &opal_smsc_base_framework.framework_components, + mca_base_component_list_item_t) { + mca_smsc_component_t *component = (mca_smsc_component_t *) cli->cli_component; + + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, opal_smsc_base_framework.framework_output, + "mca_smsc_base_select: checking component %s", + component->smsc_version.mca_component_name); + + int ret = component->query(); + if (OPAL_SUCCESS != ret) { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + opal_smsc_base_framework.framework_output, + "mca_smsc_base_select: could not select component %s. query " + "returned error code %d", + component->smsc_version.mca_component_name, ret); + opal_list_remove_item(&opal_smsc_base_framework.framework_components, &cli->super); + OBJ_RELEASE(cli); + mca_base_component_close(&component->smsc_version, + opal_smsc_base_framework.framework_output); + continue; + } + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, opal_smsc_base_framework.framework_output, + "mca_smsc_base_select: component %s priority=%d", + component->smsc_version.mca_component_name, component->priority); + } + + opal_list_sort(&opal_smsc_base_framework.framework_components, mca_smsc_compare_components); + + if (opal_list_get_size(&opal_smsc_base_framework.framework_components) > 0) { + cli = (mca_base_component_list_item_t *) opal_list_get_first( + &opal_smsc_base_framework.framework_components); + + selected_component = (mca_smsc_component_t *) cli->cli_component; + mca_smsc = selected_component->enable(); + + opal_output_verbose( + MCA_BASE_VERBOSE_COMPONENT, opal_smsc_base_framework.framework_output, + "mca_smsc_base_select: selected shared-memory single-copy component: %s", + selected_component->smsc_version.mca_component_name); + } else { + opal_output_verbose( + MCA_BASE_VERBOSE_COMPONENT, opal_smsc_base_framework.framework_output, + "mca_smsc_base_select: no shared-memory single-copy component available for selection"); + } + + return OPAL_SUCCESS; +} + +void mca_smsc_base_register_default_params(mca_smsc_component_t *component, int default_priority) +{ + + char *tmp; + (void) opal_asprintf(&tmp, "Priority of the %s component (default: %d)", + component->smsc_version.mca_component_name, default_priority); + component->priority = default_priority; + (void) mca_base_component_var_register(&component->smsc_version, "priority", /*help_msg=*/tmp, + MCA_BASE_VAR_TYPE_INT, /*enumerator=*/NULL, /*bind=*/0, + MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_ALL_EQ, &component->priority); + free(tmp); +} diff --git a/opal/mca/smsc/cma/Makefile.am b/opal/mca/smsc/cma/Makefile.am new file mode 100644 index 00000000000..02b539f0dcd --- /dev/null +++ b/opal/mca/smsc/cma/Makefile.am @@ -0,0 +1,56 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2009 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights +# reserved. 
+# Copyright (c) 2017 IBM Corporation. All rights reserved. +# Copyright (c) 2020-2021 Google, LLC. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +EXTRA_DIST = post_configure.sh + +AM_CPPFLAGS = $(smsc_cma_CPPFLAGS) + +libmca_smsc_cma_la_sources = \ + smsc_cma_component.c \ + smsc_cma_module.c \ + smsc_cma_internal.h \ + smsc_cma.h + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if MCA_BUILD_opal_smsc_cma_DSO +component_noinst = +component_install = mca_smsc_cma.la +else +component_noinst = libmca_smsc_cma.la +component_install = +endif + +mcacomponentdir = $(opallibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_smsc_cma_la_SOURCES = $(libmca_smsc_cma_la_sources) +mca_smsc_cma_la_LDFLAGS = -module -avoid-version $(smsc_cma_LDFLAGS) +mca_smsc_cma_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \ + $(smsc_cma_LIBS) + +noinst_LTLIBRARIES = $(component_noinst) +libmca_smsc_cma_la_SOURCES = $(libmca_smsc_cma_la_sources) +libmca_smsc_cma_la_LIBADD = $(smsc_cma_LIBS) +libmca_smsc_cma_la_LDFLAGS = -module -avoid-version $(smsc_cma_LDFLAGS) diff --git a/opal/mca/smsc/cma/configure.m4 b/opal/mca/smsc/cma/configure.m4 new file mode 100644 index 00000000000..fc8ff2a09f4 --- /dev/null +++ b/opal/mca/smsc/cma/configure.m4 @@ -0,0 +1,31 @@ +# -*- shell-script -*- +# +# Copyright (c) 2009 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights +# reserved. +# Copyright (c) 2015 Research Organization for Information Science +# and Technology (RIST). All rights reserved. +# Copyright (c) 2021 Google, LLC. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_smsc_cma_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +AC_DEFUN([MCA_opal_smsc_cma_CONFIG],[ + AC_CONFIG_FILES([opal/mca/smsc/cma/Makefile]) + + OPAL_CHECK_CMA([smsc_cma], [AC_CHECK_HEADER([sys/prctl.h]) $1], [$2]) + + AC_SUBST([smsc_cma_CFLAGS]) + AC_SUBST([smsc_cma_CPPFLAGS]) + AC_SUBST([smsc_cma_LDFLAGS]) + AC_SUBST([smsc_cma_LIBS]) +])dnl diff --git a/opal/mca/smsc/cma/post_configure.sh b/opal/mca/smsc/cma/post_configure.sh new file mode 100644 index 00000000000..3059fa936d4 --- /dev/null +++ b/opal/mca/smsc/cma/post_configure.sh @@ -0,0 +1 @@ +DIRECT_CALL_HEADER="opal/mca/smsc/cma/smsc_cma.h" diff --git a/opal/mca/smsc/cma/smsc_cma.h b/opal/mca/smsc/cma/smsc_cma.h new file mode 100644 index 00000000000..64aa9dd9497 --- /dev/null +++ b/opal/mca/smsc/cma/smsc_cma.h @@ -0,0 +1,33 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2021 Google, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OPAL_MCA_SMSC_CMA_SMSC_CMA_H +#define OPAL_MCA_SMSC_CMA_SMSC_CMA_H + +#include "opal_config.h" + +#include "opal/mca/smsc/smsc.h" + +mca_smsc_endpoint_t *mca_smsc_cma_get_endpoint(opal_proc_t *peer_proc); +void mca_smsc_cma_return_endpoint(mca_smsc_endpoint_t *endpoint); + +int mca_smsc_cma_copy_to(mca_smsc_endpoint_t *endpoint, void *local_address, void *remote_address, + size_t size, void *reg_handle); +int mca_smsc_cma_copy_from(mca_smsc_endpoint_t *endpoint, void *local_address, void *remote_address, + size_t size, void *reg_handle); + +/* unsupported interfaces defined to support MCA direct */ +void *mca_smsc_cma_map_peer_region(mca_smsc_endpoint_t *endpoint, uint64_t flags, + void *remote_address, size_t size, void **local_mapping); +void mca_smsc_cma_unmap_peer_region(void *ctx); +void *mca_smsc_cma_register_region(void *local_address, size_t size); +void mca_smsc_cma_deregister_region(void *reg_data); + +#endif /* OPAL_MCA_SMSC_CMA_SMSC_CMA_H */ diff --git a/opal/mca/smsc/cma/smsc_cma_component.c b/opal/mca/smsc/cma/smsc_cma_component.c new file mode 100644 index 00000000000..4ccd731ba24 --- /dev/null +++ b/opal/mca/smsc/cma/smsc_cma_component.c @@ -0,0 +1,147 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2021 Google, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "opal_config.h" + +#include "opal/mca/smsc/base/base.h" +#include "opal/mca/smsc/cma/smsc_cma_internal.h" + +#include +#include +#include +#include +#include +#include + +static int mca_smsc_cma_component_register(void); +static int mca_smsc_cma_component_open(void); +static int mca_smsc_cma_component_close(void); +static int mca_smsc_cma_component_query(void); +static mca_smsc_module_t *mca_smsc_cma_component_enable(void); + +#define MCA_SMSC_CMA_DEFAULT_PRIORITY 37 +static const int mca_smsc_cma_default_priority = MCA_SMSC_CMA_DEFAULT_PRIORITY; + +mca_smsc_component_t mca_smsc_cma_component = { + .smsc_version = { + MCA_SMSC_DEFAULT_VERSION("cma"), + .mca_open_component = mca_smsc_cma_component_open, + .mca_close_component = mca_smsc_cma_component_close, + .mca_register_component_params = mca_smsc_cma_component_register, + }, + .priority = MCA_SMSC_CMA_DEFAULT_PRIORITY, + .query = mca_smsc_cma_component_query, + .enable = mca_smsc_cma_component_enable, +}; + +static int mca_smsc_cma_component_register(void) +{ + mca_smsc_base_register_default_params(&mca_smsc_cma_component, mca_smsc_cma_default_priority); + return OPAL_SUCCESS; +} + +static int mca_smsc_cma_component_open(void) +{ + /* nothing to do */ + return OPAL_SUCCESS; +} + +static int mca_smsc_cma_component_close(void) +{ + /* nothing to do */ + return OPAL_SUCCESS; +} + +/* + * mca_btl_sm_parse_proc_ns_user() tries to get the user namespace ID + * of the current process. + * Returns the ID of the user namespace. In the case of an error '0' is returned. 
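+ * The value returned here is compared in mca_smsc_cma_get_endpoint() below against the
+ * peer's modex-provided user_ns_id; CMA is disabled for that peer when the two IDs differ.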
+ */ +ino_t mca_smsc_cma_get_user_ns_id(void) +{ + struct stat buf; + + if (0 > stat("/proc/self/ns/user", &buf)) { + /* + * Something went wrong, probably an old kernel that does not support namespaces + * simply assume all processes are in the same user namespace and return 0 + */ + return 0; + } + + return buf.st_ino; +} + +static int mca_smsc_cma_send_modex(void) +{ + mca_smsc_cma_modex_t modex; + + modex.pid = getpid(); + modex.user_ns_id = mca_smsc_cma_get_user_ns_id(); + + int rc; + OPAL_MODEX_SEND(rc, PMIX_LOCAL, &mca_smsc_cma_component.smsc_version, &modex, sizeof(modex)); + return rc; +} + +static int mca_smsc_cma_component_query(void) +{ + /* Check if we have the proper permissions for CMA */ + char buffer = '0'; + bool cma_happy = false; + + /* check system setting for current ptrace scope */ + int fd = open("/proc/sys/kernel/yama/ptrace_scope", O_RDONLY); + if (0 <= fd) { + int ret = read(fd, &buffer, 1); + if (ret < 0) { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + opal_smsc_base_framework.framework_output, + "mca_smsc_cma_component_query: could not read ptrace_scope. " + "assuming ptrace scope is 0"); + } + close(fd); + } + + /* ptrace scope 0 will allow an attach from any of the process owner's + * processes. ptrace scope 1 limits attachers to the process tree + * starting at the parent of this process. */ + if ('0' != buffer) { +#if defined PR_SET_PTRACER + /* try setting the ptrace scope to allow attach */ + int ret = prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0); + if (0 == ret) { + cma_happy = true; + } +#endif + } else { + cma_happy = true; + } + + if (!cma_happy) { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, opal_smsc_base_framework.framework_output, + "mca_smsc_cma_component_query: could not select for use. insufficient " + "ptrace permissions."); + mca_smsc_cma_component.priority = -1; + return OPAL_ERR_NOT_AVAILABLE; + } + + mca_smsc_cma_send_modex(); + + return OPAL_SUCCESS; +} + +static mca_smsc_module_t *mca_smsc_cma_component_enable(void) +{ + if (0 > mca_smsc_cma_component.priority) { + return NULL; + } + + return &mca_smsc_cma_module; +} diff --git a/opal/mca/smsc/cma/smsc_cma_internal.h b/opal/mca/smsc/cma/smsc_cma_internal.h new file mode 100644 index 00000000000..7de42b64a0c --- /dev/null +++ b/opal/mca/smsc/cma/smsc_cma_internal.h @@ -0,0 +1,37 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2021 Google, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OPAL_MCA_SMSC_CMA_SMSC_CMA_INTERNAL_H +#define OPAL_MCA_SMSC_CMA_SMSC_CMA_INTERNAL_H + +#include "opal/mca/smsc/cma/smsc_cma.h" + +struct mca_smsc_cma_modex_t { + pid_t pid; + ino_t user_ns_id; +}; + +typedef struct mca_smsc_cma_modex_t mca_smsc_cma_modex_t; + +struct mca_smsc_cma_endpoint_t { + mca_smsc_endpoint_t super; + pid_t pid; +}; + +typedef struct mca_smsc_cma_endpoint_t mca_smsc_cma_endpoint_t; + +OBJ_CLASS_DECLARATION(mca_smsc_cma_endpoint_t); + +extern mca_smsc_module_t mca_smsc_cma_module; +extern mca_smsc_component_t mca_smsc_cma_component; + +ino_t mca_smsc_cma_get_user_ns_id(void); + +#endif /* OPAL_MCA_SMSC_CMA_SMSC_CMA_INTERNAL_H */ diff --git a/opal/mca/smsc/cma/smsc_cma_module.c b/opal/mca/smsc/cma/smsc_cma_module.c new file mode 100644 index 00000000000..4e536090152 --- /dev/null +++ b/opal/mca/smsc/cma/smsc_cma_module.c @@ -0,0 +1,213 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2010-2014 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2014-2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2021 Google, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "opal_config.h" + +#include "opal/mca/pmix/pmix-internal.h" +#include "opal/mca/smsc/base/base.h" +#include "opal/mca/smsc/cma/smsc_cma_internal.h" + +#if OPAL_CMA_NEED_SYSCALL_DEFS +# include "opal/sys/cma.h" +#else +# include +#endif /* OPAL_CMA_NEED_SYSCALL_DEFS */ + +OBJ_CLASS_INSTANCE(mca_smsc_cma_endpoint_t, opal_object_t, NULL, NULL); + +mca_smsc_endpoint_t *mca_smsc_cma_get_endpoint(opal_proc_t *peer_proc) +{ + mca_smsc_cma_endpoint_t *endpoint = OBJ_NEW(mca_smsc_cma_endpoint_t); + if (OPAL_UNLIKELY(NULL == endpoint)) { + return NULL; + } + + endpoint->super.proc = peer_proc; + + int rc; + size_t modex_size; + mca_smsc_cma_modex_t *modex; + OPAL_MODEX_RECV_IMMEDIATE(rc, &mca_smsc_cma_component.smsc_version, &peer_proc->proc_name, + (void **) &modex, &modex_size); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + OBJ_RELEASE(endpoint); + return NULL; + } + + ino_t my_ns_id = mca_smsc_cma_get_user_ns_id(); + if (modex->user_ns_id != my_ns_id) { + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, opal_smsc_base_framework.framework_output, + "mca_smsc_cma_module_get_endpoint: can not proceed. 
processes are in "
+                            "different namespaces");
+        /* can't use CMA with this peer */
+        OBJ_RELEASE(endpoint);
+        free(modex);
+        return NULL;
+    }
+
+    endpoint->pid = modex->pid;
+    return &endpoint->super;
+}
+
+void mca_smsc_cma_return_endpoint(mca_smsc_endpoint_t *endpoint)
+{
+    OBJ_RELEASE(endpoint);
+}
+
+static inline void mca_smsc_cma_iov_advance(struct iovec *iov, ssize_t length)
+{
+    iov->iov_base = (void *) ((uintptr_t) iov->iov_base + length);
+    iov->iov_len -= length;
+}
+
+int mca_smsc_cma_copy_to(mca_smsc_endpoint_t *endpoint, void *local_address, void *remote_address,
+                         size_t size, void *reg_handle)
+{
+    /* ignore the registration handle as it is not used for CMA */
+    (void) reg_handle;
+
+    mca_smsc_cma_endpoint_t *cma_endpoint = (mca_smsc_cma_endpoint_t *) endpoint;
+
+    /*
+     * According to the man page :
+     * "On success, process_vm_readv() returns the number of bytes read and
+     * process_vm_writev() returns the number of bytes written. This return
+     * value may be less than the total number of requested bytes, if a
+     * partial read/write occurred. (Partial transfers apply at the
+     * granularity of iovec elements. These system calls won't perform a
+     * partial transfer that splits a single iovec element.)".
+     * So since we use a single iovec element, the returned size should either
+     * be 0 or size, and the do loop should not be needed here.
+ * We tried on various Linux kernels with size > 2 GB, and surprisingly, + * the returned value is always 0x7ffff000 (fwiw, it happens to be the size + * of the larger number of pages that fits a signed 32 bits integer). + * We do not know whether this is a bug from the kernel, the libc or even + * the man page, but for the time being, we do as is process_vm_readv() could + * return any value. + */ + struct iovec src_iov = { + .iov_base = remote_address, + .iov_len = size, + }; + struct iovec dst_iov = { + .iov_base = local_address, + .iov_len = size, + }; + ssize_t ret; + do { + ret = process_vm_readv(cma_endpoint->pid, &dst_iov, 1, &src_iov, 1, 0); + if (0 > ret) { + if (ESRCH == errno) { + OPAL_OUTPUT_VERBOSE((MCA_BASE_VERBOSE_ERROR, + opal_smsc_base_framework.framework_output, + "CMA read %ld, expected %lu, errno = %d", (long) ret, + (unsigned long) size, errno)); + return OPAL_ERROR; + } + OPAL_OUTPUT_VERBOSE((MCA_BASE_VERBOSE_ERROR, opal_smsc_base_framework.framework_output, + "CMA read %ld, expected %lu, errno = %d\n", (long) ret, + (unsigned long) size, errno)); + return OPAL_ERROR; + } + mca_smsc_cma_iov_advance(&src_iov, ret); + mca_smsc_cma_iov_advance(&dst_iov, ret); + } while (0 < src_iov.iov_len); + + return OPAL_SUCCESS; +} + +/* unsupported interfaces defined to support MCA direct */ +void *mca_smsc_cma_map_peer_region(mca_smsc_endpoint_t *endpoint, uint64_t flags, + void *remote_address, size_t size, void **local_mapping) +{ + return NULL; +} + +void mca_smsc_cma_unmap_peer_region(void *ctx) +{ +} + +void *mca_smsc_cma_register_region(void *local_address, size_t size) +{ + return NULL; +} + +void mca_smsc_cma_deregister_region(void *reg_data) +{ +} + +mca_smsc_module_t mca_smsc_cma_module = { + .get_endpoint = mca_smsc_cma_get_endpoint, + .return_endpoint = mca_smsc_cma_return_endpoint, + .copy_to = mca_smsc_cma_copy_to, + .copy_from = mca_smsc_cma_copy_from, +}; diff --git a/opal/mca/smsc/configure.m4 b/opal/mca/smsc/configure.m4 new file mode 100644 index 00000000000..51544fedc11 --- /dev/null +++ b/opal/mca/smsc/configure.m4 @@ -0,0 +1,19 @@ +# -*- shell-script -*- +# +# Copyright (c) 2013 Sandia National Laboratories. All rights reserved. +# Copyright (c) 2021 Google, LLC. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AC_DEFUN([MCA_opal_smsc_CONFIG],[ + # configure all the components + MCA_CONFIGURE_FRAMEWORK($1, $2, 1) + + # this is a direct callable component, so set that up. + MCA_SETUP_DIRECT_CALL($1, $2) +]) diff --git a/opal/mca/smsc/knem/Makefile.am b/opal/mca/smsc/knem/Makefile.am new file mode 100644 index 00000000000..7f53f45124b --- /dev/null +++ b/opal/mca/smsc/knem/Makefile.am @@ -0,0 +1,58 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2009 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights +# reserved. +# Copyright (c) 2017 IBM Corporation. All rights reserved. +# Copyright (c) 2020-2021 Google, LLC. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +EXTRA_DIST = post_configure.sh + +AM_CPPFLAGS = $(smsc_knem_CPPFLAGS) + +dist_opaldata_DATA = help-smsc-knem.txt + +libmca_smsc_knem_la_sources = \ + smsc_knem_component.c \ + smsc_knem_module.c \ + smsc_knem_internal.h \ + smsc_knem.h + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if MCA_BUILD_opal_smsc_knem_DSO +component_noinst = +component_install = mca_smsc_knem.la +else +component_noinst = libmca_smsc_knem.la +component_install = +endif + +mcacomponentdir = $(opallibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_smsc_knem_la_SOURCES = $(libmca_smsc_knem_la_sources) +mca_smsc_knem_la_LDFLAGS = -module -avoid-version $(smsc_knem_LDFLAGS) +mca_smsc_knem_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \ + $(smsc_knem_LIBS) + +noinst_LTLIBRARIES = $(component_noinst) +libmca_smsc_knem_la_SOURCES = $(libmca_smsc_knem_la_sources) +libmca_smsc_knem_la_LIBADD = $(smsc_knem_LIBS) +libmca_smsc_knem_la_LDFLAGS = -module -avoid-version $(smsc_knem_LDFLAGS) diff --git a/opal/mca/smsc/knem/configure.m4 b/opal/mca/smsc/knem/configure.m4 new file mode 100644 index 00000000000..a6da2a69402 --- /dev/null +++ b/opal/mca/smsc/knem/configure.m4 @@ -0,0 +1,31 @@ +# -*- shell-script -*- +# +# Copyright (c) 2009 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights +# reserved. +# Copyright (c) 2015 Research Organization for Information Science +# and Technology (RIST). All rights reserved. +# Copyright (c) 2021 Google, LLC. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_smsc_knem_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +AC_DEFUN([MCA_opal_smsc_knem_CONFIG],[ + AC_CONFIG_FILES([opal/mca/smsc/knem/Makefile]) + + OPAL_CHECK_KNEM([smsc_knem], [$1], [$2]) + + AC_SUBST([smsc_knem_CFLAGS]) + AC_SUBST([smsc_knem_CPPFLAGS]) + AC_SUBST([smsc_knem_LDFLAGS]) + AC_SUBST([smsc_knem_LIBS]) +])dnl diff --git a/opal/mca/smsc/knem/help-smsc-knem.txt b/opal/mca/smsc/knem/help-smsc-knem.txt new file mode 100644 index 00000000000..84e00c36d1c --- /dev/null +++ b/opal/mca/smsc/knem/help-smsc-knem.txt @@ -0,0 +1,156 @@ +# -*- text -*- +# +# Copyright (c) 2004-2009 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2012-2014 Los Alamos National Security, LLC. +# All rights reserved. +# Copyright (c) 2014 Research Organization for Information Science +# and Technology (RIST). All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English help file for Open MPI's shared memory support. +# +[sys call fail] +A system call failed during sm shared memory BTL initialization +that should not have. It is likely that your MPI job will now either +abort or experience performance degradation. + + System call: %s + Error: %s (errno %d) +# +[no locality] +WARNING: Missing locality information required for sm shared memory +BTL initialization. Continuing without shared memory support. 
+#
+[knem permission denied]
+WARNING: Open MPI failed to open the /dev/knem device due to a
+permissions problem. Please check with your system administrator to
+get the permissions fixed, or set the btl_sm_single_copy_mechanism
+MCA variable to none to silence this warning and run without knem
+support.
+
+  Local host: %s
+  /dev/knem permissions: 0%o
+#
+[knem fail open]
+WARNING: Open MPI failed to open the /dev/knem device due to a local
+error. Please check with your system administrator to get the problem
+fixed, or set the btl_sm_single_copy_mechanism MCA variable to none
+to silence this warning and run without knem support.
+
+The sm shared memory BTL will fall back on another single-copy
+mechanism if one is available. This may result in lower performance.
+
+  Local host: %s
+  Errno: %d (%s)
+#
+[knem get ABI fail]
+WARNING: Open MPI failed to retrieve the ABI version from the
+/dev/knem device due to a local error. This usually indicates an
+error in your knem installation; please check with your system
+administrator, or set the btl_sm_single_copy_mechanism MCA variable
+to none to silence this warning and run without knem support.
+
+The sm shared memory BTL will fall back on another single-copy
+mechanism if one is available. This may result in lower performance.
+
+  Local host: %s
+  Errno: %d (%s)
+#
+[knem ABI mismatch]
+WARNING: Open MPI was compiled with support for one version of the
+knem kernel module, but it discovered a different version running in
+/dev/knem. Open MPI needs to be installed with support for the same
+version of knem as is in the running Linux kernel. Please check with
+your system administrator, or set the btl_sm_single_copy_mechanism
+MCA variable to none to silence this warning and run without knem
+support.
+
+The sm shared memory BTL will fall back on another single-copy
+mechanism if one is available. This may result in lower performance.
+
+  Local host: %s
+  Open MPI's knem version: 0x%x
+  /dev/knem's version: 0x%x
+#
+[knem mmap fail]
+Open MPI failed to map support from the knem Linux kernel module; this
+shouldn't happen. Please check with your system administrator, or set
+the btl_sm_single_copy_mechanism MCA variable to none to silence
+this warning and run without knem support.
+
+The sm shared memory BTL will fall back on another single-copy
+mechanism if one is available. This may result in lower performance.
+
+  Local host: %s
+  System call: mmap()
+  Errno: %d (%s)
+#
+[knem init error]
+Open MPI encountered an error during the knem initialization. Please
+check with your system administrator, or set the
+btl_sm_single_copy_mechanism MCA variable to none to silence this
+warning and run without knem support.
+
+The sm shared memory BTL will fall back on another single-copy
+mechanism if one is available. This may result in lower performance.
+
+  Local host: %s
+  System call: %s
+  Errno: %d (%s)
+#
+[knem requested but not available]
+WARNING: Linux kernel knem support was requested via the
+btl_sm_single_copy_mechanism MCA parameter, but Knem support was either not
+compiled into this Open MPI installation, or Knem support was unable
+to be activated in this process.
+
+The sm BTL will fall back on another single-copy mechanism if one
+is available. This may result in lower performance.
+
+  Local host: %s
+#
+[cma-permission-denied]
+WARNING: Linux kernel CMA support was requested via the
+btl_sm_single_copy_mechanism MCA variable, but CMA support is
+not available due to restrictive ptrace settings.
+ +The sm shared memory BTL will fall back on another single-copy +mechanism if one is available. This may result in lower performance. + + Local host: %s +# +[cma-different-user-namespace-error] +ERROR: Linux kernel CMA support was requested via the +btl_sm_single_copy_mechanism MCA variable, but CMA support is +not available due to different user namespaces. + +Your MPI job will abort now. Please select another value for +btl_sm_single_copy_mechanism. + + Local host: %s +# +[cma-different-user-namespace-warning] +WARNING: The default btl_sm_single_copy_mechanism CMA is +not available due to different user namespaces. + +The sm shared memory BTL will fall back on another single-copy +mechanism if one is available. This may result in lower performance. + + Local host: %s +# +[xpmem-make-failed] +WARNING: Could not generate an xpmem segment id for this process' +address space. + +The sm shared memory BTL will fall back on another single-copy +mechanism if one is available. This may result in lower performance. + + Local host: %s + Error code: %d (%s) diff --git a/opal/mca/smsc/knem/post_configure.sh b/opal/mca/smsc/knem/post_configure.sh new file mode 100644 index 00000000000..fd7a0eb1abe --- /dev/null +++ b/opal/mca/smsc/knem/post_configure.sh @@ -0,0 +1 @@ +DIRECT_CALL_HEADER="opal/mca/smsc/knem/smsc_knem.h" diff --git a/opal/mca/smsc/knem/smsc_knem.h b/opal/mca/smsc/knem/smsc_knem.h new file mode 100644 index 00000000000..824b7ad7b39 --- /dev/null +++ b/opal/mca/smsc/knem/smsc_knem.h @@ -0,0 +1,34 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2021 Google, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OPAL_MCA_SMSC_KNEM_SMSC_KNEM_H +#define OPAL_MCA_SMSC_KNEM_SMSC_KNEM_H + +#include "opal_config.h" + +#include "opal/mca/smsc/smsc.h" + +mca_smsc_endpoint_t *mca_smsc_knem_get_endpoint(opal_proc_t *peer_proc); +void mca_smsc_knem_return_endpoint(mca_smsc_endpoint_t *endpoint); + +int mca_smsc_knem_copy_to(mca_smsc_endpoint_t *endpoint, void *local_address, void *remote_address, + size_t size, void *reg_data); +int mca_smsc_knem_copy_from(mca_smsc_endpoint_t *endpoint, void *local_address, + void *remote_address, size_t size, void *reg_data); + +void *mca_smsc_knem_register_region(void *local_address, size_t size); +void mca_smsc_knem_deregister_region(void *reg_data); + +/* unsupported interfaces defined to support MCA direct */ +void *mca_smsc_knem_map_peer_region(mca_smsc_endpoint_t *endpoint, uint64_t flags, + void *remote_address, size_t size, void **local_mapping); +void mca_smsc_knem_unmap_peer_region(void *ctx); + +#endif /* OPAL_MCA_SMSC_KNEM_SMSC_KNEM_H */ diff --git a/opal/mca/smsc/knem/smsc_knem_component.c b/opal/mca/smsc/knem/smsc_knem_component.c new file mode 100644 index 00000000000..301fc7984db --- /dev/null +++ b/opal/mca/smsc/knem/smsc_knem_component.c @@ -0,0 +1,224 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2014-2017 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2021 Google, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" + +#include "opal/mca/smsc/base/base.h" +#include "opal/mca/smsc/knem/smsc_knem_internal.h" +#include "opal/util/show_help.h" + +#include +#include +#include +#include +#include +#include + +static int mca_smsc_knem_component_register(void); +static int mca_smsc_knem_component_open(void); +static int mca_smsc_knem_component_close(void); +static int mca_smsc_knem_component_query(void); +static mca_smsc_module_t *mca_smsc_knem_component_enable(void); + +static int mca_smsc_knem_fini(void); + +#define MCA_SMSC_KNEM_DEFAULT_PRIORITY 23 +static const int mca_smsc_knem_default_priority = MCA_SMSC_KNEM_DEFAULT_PRIORITY; + +mca_smsc_knem_component_t mca_smsc_knem_component = { + .super = { + .smsc_version = { + MCA_SMSC_DEFAULT_VERSION("knem"), + .mca_open_component = mca_smsc_knem_component_open, + .mca_close_component = mca_smsc_knem_component_close, + .mca_register_component_params = mca_smsc_knem_component_register, + }, + .priority = MCA_SMSC_KNEM_DEFAULT_PRIORITY, + .query = mca_smsc_knem_component_query, + .enable = mca_smsc_knem_component_enable, + }, +}; + +static int mca_smsc_knem_component_register(void) +{ + /* Currently disabling DMA mode by default; it's not clear that this is useful in all + * applications and architectures. */ + mca_smsc_knem_component.dma_min = 0; + (void) mca_base_component_var_register( + &mca_smsc_knem_component.super.smsc_version, "dma_min", + "Minimum message size (in bytes) to use the knem DMA mode; " + "ignored if knem does not support DMA mode (0 = do not use the " + "knem DMA mode, default: 0)", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_smsc_knem_component.dma_min); + + mca_smsc_base_register_default_params(&mca_smsc_knem_component.super, + mca_smsc_knem_default_priority); + return OPAL_SUCCESS; +} + +static int mca_smsc_knem_component_open(void) +{ + /* nothing to do */ + return OPAL_SUCCESS; +} + +static int mca_smsc_knem_component_close(void) +{ + return mca_smsc_knem_fini(); +} + +static int mca_smsc_knem_get_info(struct knem_cmd_info *knem_info) +{ + /* Only show the help message if this is the only component. */ + bool show_help = (1 == opal_list_get_size(&opal_smsc_base_framework.framework_components)); + + /* Check that the ABI if kernel module running is the same + * as what we were compiled against. 
*/ + memset(knem_info, 0, sizeof(*knem_info)); + int rc = ioctl(mca_smsc_knem_component.knem_fd, KNEM_CMD_GET_INFO, knem_info); + if (rc < 0) { + if (show_help) { + opal_show_help("help-smsc-knem.txt", "knem get ABI fail", true, + opal_process_info.nodename, errno, strerror(errno)); + } + return OPAL_ERR_NOT_AVAILABLE; + } + + if (KNEM_ABI_VERSION != knem_info->abi) { + if (show_help) { + opal_show_help("help-smsc-knem.txt", "knem ABI mismatch", true, + opal_process_info.nodename, KNEM_ABI_VERSION, knem_info->abi); + } + return OPAL_ERR_NOT_AVAILABLE; + } + + return OPAL_SUCCESS; +} +static int mca_smsc_knem_reg(void *reg_data, void *base, size_t size, + mca_rcache_base_registration_t *reg) +{ + mca_smsc_knem_registration_handle_t *knem_reg = (mca_smsc_knem_registration_handle_t *) reg; + struct knem_cmd_create_region knem_cr; + struct knem_cmd_param_iovec knem_iov; + + knem_iov.base = (uintptr_t) base; + knem_iov.len = size; + + knem_cr.iovec_array = (uintptr_t) &knem_iov; + knem_cr.iovec_nr = 1; + knem_cr.protection = 0; + + if (reg->access_flags & (MCA_RCACHE_ACCESS_LOCAL_WRITE | MCA_RCACHE_ACCESS_REMOTE_WRITE)) { + knem_cr.protection |= PROT_WRITE; + } + + if (reg->access_flags & MCA_RCACHE_ACCESS_REMOTE_READ) { + knem_cr.protection |= PROT_READ; + } + + /* We will explicitly destroy this cookie. Do not use the single-use flag here. */ + knem_cr.flags = 0; + if (OPAL_UNLIKELY(ioctl(mca_smsc_knem_component.knem_fd, KNEM_CMD_CREATE_REGION, &knem_cr) + < 0)) { + return OPAL_ERROR; + } + + knem_reg->data.cookie = knem_cr.cookie; + knem_reg->data.base_addr = (intptr_t) base; + + return OPAL_SUCCESS; +} + +static int mca_smsc_knem_dereg(void *reg_data, mca_rcache_base_registration_t *reg) +{ + mca_smsc_knem_registration_handle_t *knem_reg = (mca_smsc_knem_registration_handle_t *) reg; + + /* NTH: explicity ignore the return code. Don't care about this cookie anymore anyway. */ + (void) ioctl(mca_smsc_knem_component.knem_fd, KNEM_CMD_DESTROY_REGION, &knem_reg->data.cookie); + + return OPAL_SUCCESS; +} + +static int mca_smsc_knem_fini(void) +{ + if (-1 != mca_smsc_knem_component.knem_fd) { + close(mca_smsc_knem_component.knem_fd); + mca_smsc_knem_component.knem_fd = -1; + } + + if (mca_smsc_knem_module.rcache) { + (void) mca_rcache_base_module_destroy(mca_smsc_knem_module.rcache); + mca_smsc_knem_module.rcache = NULL; + } + + return OPAL_SUCCESS; +} + +static int mca_smsc_knem_component_query(void) +{ + struct knem_cmd_info knem_info; + int rc; + + /* Open the knem device. Try to print a helpful message if we + fail to open it. 
*/ + mca_smsc_knem_component.knem_fd = open("/dev/knem", O_RDWR); + if (mca_smsc_knem_component.knem_fd < 0) { + if (EACCES == errno) { + struct stat sbuf; + if (0 != stat("/dev/knem", &sbuf)) { + sbuf.st_mode = 0; + } + opal_show_help("help-smsc-knem.txt", "knem permission denied", true, + opal_process_info.nodename, sbuf.st_mode); + } else { + opal_show_help("help-smsc-knem.txt", "knem fail open", true, opal_process_info.nodename, + errno, strerror(errno)); + } + + return OPAL_ERR_NOT_AVAILABLE; + } + + rc = mca_smsc_knem_get_info(&knem_info); + if (OPAL_SUCCESS != rc) { + mca_smsc_knem_fini(); + return rc; + } + + if (!(mca_smsc_knem_component.dma_min && (knem_info.features & KNEM_FEATURE_DMA))) { + /* disable DMA */ + mca_smsc_knem_component.dma_min = UINT_MAX; + } + + return OPAL_SUCCESS; +} + +static mca_smsc_module_t *mca_smsc_knem_component_enable(void) +{ + if (0 > mca_smsc_knem_component.super.priority) { + return NULL; + } + + mca_rcache_base_resources_t rcache_resources = {.cache_name = "smsc_knem", + .reg_data = NULL, + .sizeof_reg = sizeof( + mca_smsc_knem_registration_handle_t), + .register_mem = mca_smsc_knem_reg, + .deregister_mem = mca_smsc_knem_dereg}; + + mca_smsc_knem_module.rcache = mca_rcache_base_module_create("grdma", NULL, &rcache_resources); + if (NULL == mca_smsc_knem_module.rcache) { + return NULL; + } + + return &mca_smsc_knem_module.super; +} diff --git a/opal/mca/smsc/knem/smsc_knem_internal.h b/opal/mca/smsc/knem/smsc_knem_internal.h new file mode 100644 index 00000000000..60a8ddbf676 --- /dev/null +++ b/opal/mca/smsc/knem/smsc_knem_internal.h @@ -0,0 +1,74 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2021 Google, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OPAL_MCA_SMSC_KNEM_SMSC_KNEM_INTERNAL_H +#define OPAL_MCA_SMSC_KNEM_SMSC_KNEM_INTERNAL_H + +#include "opal_config.h" + +#include "opal/mca/rcache/base/base.h" +#include "opal/mca/rcache/rcache.h" +#include "opal/mca/smsc/knem/smsc_knem.h" + +#include + +#include + +struct mca_smsc_knem_registration_data_t { + uint64_t cookie; + intptr_t base_addr; +}; + +typedef struct mca_smsc_knem_registration_data_t mca_smsc_knem_registration_data_t; + +struct mca_smsc_knem_registration_handle_t { + mca_rcache_base_registration_t base; + mca_smsc_knem_registration_data_t data; +}; + +typedef struct mca_smsc_knem_registration_handle_t mca_smsc_knem_registration_handle_t; + +#define MCA_SMSC_KNEM_REG_HANDLE_TO_DATA(handle) (&(handle)->data) +#define MCA_SMSC_KNEM_REG_DATA_TO_HANDLE(data_ptr) \ + ((mca_smsc_knem_registration_handle_t *) ((uintptr_t) data_ptr \ + - offsetof(mca_smsc_knem_registration_handle_t, \ + data))) + +struct mca_smsc_knem_endpoint_t { + mca_smsc_endpoint_t super; +}; + +typedef struct mca_smsc_knem_endpoint_t mca_smsc_knem_endpoint_t; + +OBJ_CLASS_DECLARATION(mca_smsc_knem_endpoint_t); + +struct mca_smsc_knem_component_t { + mca_smsc_component_t super; + + int knem_fd; + unsigned int dma_min; +}; + +typedef struct mca_smsc_knem_component_t mca_smsc_knem_component_t; + +struct mca_smsc_knem_module_t { + mca_smsc_module_t super; + + /** cache of knem attachments. this cache holds attachments for all peers. the registrations + * are differentiated by the alloc_base which is set to the endpoint. 
*/ + mca_rcache_base_module_t *rcache; +}; + +typedef struct mca_smsc_knem_module_t mca_smsc_knem_module_t; + +extern mca_smsc_knem_module_t mca_smsc_knem_module; +extern mca_smsc_knem_component_t mca_smsc_knem_component; + +#endif /* OPAL_MCA_SMSC_KNEM_SMSC_KNEM_INTERNAL_H */ diff --git a/opal/mca/smsc/knem/smsc_knem_module.c b/opal/mca/smsc/knem/smsc_knem_module.c new file mode 100644 index 00000000000..745ca014193 --- /dev/null +++ b/opal/mca/smsc/knem/smsc_knem_module.c @@ -0,0 +1,158 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2014 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020-2021 Google, LLC. All rights reserved. + * Copyright (c) 2021 Nanook Consulting. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" + +#include "opal/include/opal/align.h" +#include "opal/mca/memchecker/base/base.h" +#include "opal/mca/pmix/pmix-internal.h" +#include "opal/mca/rcache/rcache.h" +#include "opal/mca/smsc/base/base.h" +#include "opal/mca/smsc/knem/smsc_knem_internal.h" +#include "opal/util/minmax.h" + +OBJ_CLASS_INSTANCE(mca_smsc_knem_endpoint_t, opal_object_t, NULL, NULL); + +mca_smsc_endpoint_t *mca_smsc_knem_get_endpoint(opal_proc_t *peer_proc) +{ + mca_smsc_knem_endpoint_t *endpoint = OBJ_NEW(mca_smsc_knem_endpoint_t); + if (OPAL_UNLIKELY(NULL == endpoint)) { + return NULL; + } + + endpoint->super.proc = peer_proc; + return &endpoint->super; +} + +void mca_smsc_knem_return_endpoint(mca_smsc_endpoint_t *endpoint) +{ + OBJ_RELEASE(endpoint); +} + +void *mca_smsc_knem_register_region(void *local_address, size_t size) +{ + mca_smsc_knem_module_t *knem_module = &mca_smsc_knem_module; + mca_smsc_knem_registration_handle_t *reg = NULL; + int rc; + + rc = knem_module->rcache->rcache_register(knem_module->rcache, local_address, size, + /*flags=*/0, MCA_RCACHE_ACCESS_ANY, + (mca_rcache_base_registration_t **) ®); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + opal_output_verbose( + MCA_BASE_VERBOSE_WARN, opal_smsc_base_framework.framework_output, + "mca_smsc_knem_register_mem: failed to register memory for single-copy"); + return NULL; + } + + return MCA_SMSC_KNEM_REG_HANDLE_TO_DATA(reg); +} + +void mca_smsc_knem_deregister_region(void *reg_data) +{ + mca_smsc_knem_module_t *knem_module = &mca_smsc_knem_module; + mca_smsc_knem_registration_handle_t *reg = MCA_SMSC_KNEM_REG_DATA_TO_HANDLE(reg_data); + + knem_module->rcache->rcache_deregister(knem_module->rcache, ®->base); +} + +static int mca_smsc_knem_module_copy(mca_smsc_endpoint_t *endpoint, void *local_address, + void *remote_address, size_t size, void *reg_data, + bool is_write) +{ + if (OPAL_UNLIKELY(NULL == reg_data)) { + return OPAL_ERR_BAD_PARAM; + } + + struct knem_cmd_param_iovec send_iovec = { + .base = (uintptr_t) local_address, + .len = size, + }; + mca_smsc_knem_registration_data_t *reg = (mca_smsc_knem_registration_data_t *) reg_data; + /* Fill in the ioctl data fields. There's no async completion, so + we don't need to worry about getting a slot, etc. 
*/ + struct knem_cmd_inline_copy icopy = { + .local_iovec_array = (uintptr_t) &send_iovec, + .local_iovec_nr = 1, + .remote_cookie = reg->cookie, + .remote_offset = (uintptr_t) remote_address - reg->base_addr, + .write = is_write, + .flags = 0, + }; + + /* Use the DMA flag if knem supports it *and* the segment length + * is greater than the cutoff. Note that if DMA is not supported + * or the user specified 0 for knem_dma_min, knem_dma_min was + * set to UINT_MAX in mca_smsc_knem_query. */ + if (mca_smsc_knem_component.dma_min <= size) { + icopy.flags = KNEM_FLAG_DMA; + } + /* synchronous flags only, no need to specify icopy.async_status_index */ + + /* When the ioctl returns, the transfer is done and we can report + completion to the caller */ + if (OPAL_UNLIKELY(0 != ioctl(mca_smsc_knem_component.knem_fd, KNEM_CMD_INLINE_COPY, &icopy))) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, opal_smsc_base_framework.framework_output, + "mca_smsc_knem_module_copy: failed to initiate transfer"); + return OPAL_ERROR; + } + + if (KNEM_STATUS_FAILED == icopy.current_status) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, opal_smsc_base_framework.framework_output, + "mca_smsc_knem_module_copy: transfer failed"); + return OPAL_ERROR; + } + + return OPAL_SUCCESS; +} + +int mca_smsc_knem_copy_to(mca_smsc_endpoint_t *endpoint, void *local_address, void *remote_address, + size_t size, void *reg_data) +{ + return mca_smsc_knem_module_copy(endpoint, local_address, remote_address, size, reg_data, + /*is_write=*/true); +} + +int mca_smsc_knem_copy_from(mca_smsc_endpoint_t *endpoint, void *local_address, + void *remote_address, size_t size, void *reg_data) +{ + return mca_smsc_knem_module_copy(endpoint, local_address, remote_address, size, reg_data, + /*is_write=*/false); +} + +/* unsupported interfaces (for MCA direct) */ +void *mca_smsc_knem_map_peer_region(mca_smsc_endpoint_t *endpoint, uint64_t flags, + void *remote_address, size_t size, void **local_mapping) +{ + return NULL; +} + +void mca_smsc_knem_unmap_peer_region(void *ctx) +{ +} + +mca_smsc_knem_module_t mca_smsc_knem_module = { + .super = { + .features = MCA_SMSC_FEATURE_REQUIRE_REGISTATION, + .registration_data_size = sizeof(mca_smsc_knem_registration_data_t), + .get_endpoint = mca_smsc_knem_get_endpoint, + .return_endpoint = mca_smsc_knem_return_endpoint, + .copy_to = mca_smsc_knem_copy_to, + .copy_from = mca_smsc_knem_copy_from, + .register_region = mca_smsc_knem_register_region, + .deregister_region = mca_smsc_knem_deregister_region, + }, +}; diff --git a/opal/mca/smsc/smsc.h b/opal/mca/smsc/smsc.h new file mode 100644 index 00000000000..6d8523b253b --- /dev/null +++ b/opal/mca/smsc/smsc.h @@ -0,0 +1,258 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2021 Google, LLC. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + * + * Shared Memory Single Copy + * + * This framework provides support for copying memory from one process to another on the same host + * system. Components provide read and write access to a peer process's memory, and may also provide a way to map peer memory into + * this process's memory space.
+ */ + +#ifndef OPAL_MCA_SMSC_H +#define OPAL_MCA_SMSC_H + +#include "opal_config.h" +#include "opal/class/opal_object.h" +#include "opal/util/proc.h" + +#define MCA_SMSC_BASE_MAJOR_VERSION 1 +#define MCA_SMSC_BASE_MINOR_VERSION 0 +#define MCA_SMSC_BASE_PATCH_VERSION 0 + +struct mca_smsc_module_t; + +struct mca_smsc_endpoint_t { + opal_object_t super; + /** Opal proc object for this peer. */ + opal_proc_t *proc; +}; + +typedef struct mca_smsc_endpoint_t mca_smsc_endpoint_t; + +OBJ_CLASS_DECLARATION(mca_smsc_endpoint_t); + +/** + * @brief Get an endpoint for a peer proc. + * + * @param(in) module shared-memory single-copy module + * @param(in) peer_proc proc to get an endpoint for + */ +typedef mca_smsc_endpoint_t *(*mca_smsc_module_get_endpoint_fn_t)(opal_proc_t *peer_proc); + +/** + * @brief Return a shared-memory single-copy endpoint. + * + * @param(in) module shared-memory single-copy module + * @param(in) endpoint shared-memory single-copy endpoint + * + * This method returns an endpoint created by get_endpoint. The endpoint should be considered + * invalid and may be freed after this call completes. + */ +typedef void (*mca_smsc_module_return_endpoint_fn_t)(mca_smsc_endpoint_t *endpoint); + +/** + * @brief Copy to/from a peer process. + * + * @param(in) module shared-memory single-copy module + * @param(in) endpoint shared-memory single-copy endpoint + * @param(in) local_address local address to use + * @param(in) remote_address remote address to use + * @param(in) size amount to copy + * @param(in) reg_data pointer to memory containing registration data (if required) + * + * A module must provide both copy_from and copy_to functions. + */ +typedef int (*mca_smsc_module_copy_fn_t)(mca_smsc_endpoint_t *endpoint, void *local_address, + void *remote_address, size_t size, void *reg_data); + +/** + * @brief Map a peer's memory onto local memory. + * + * @param(in) module shared-memory single-copy module + * @param(in) endpoint shared-memory single-copy endpoint + * @param(in) flags flags for this map operation (set to 0) + * @param(in) remote_address pointer valid in peer's address space + * @param(in) size size of region to map + * @param(out) local_mapping local address for peer region + * + * @returns a reference to the mapping + * + * This method, if implemented, provides support for mapping a local peer's memory into this address + * space. The caller is responsible for verifying that the address is valid or access to the region + * may result in an access violation (SEGV). The function returns a reference (if needed) that can + * be used to clear the mapping. It is the caller's responsibility to unmap the region using the + * returned context. + */ +typedef void *(*mca_smsc_module_map_peer_region_fn_t)(mca_smsc_endpoint_t *endpoint, uint64_t flags, + void *remote_address, size_t size, + void **local_mapping); + +/** + * @brief Clear a memory mapping. + * + * @param(in) module shared-memory single-copy module + * @param(in) ctx memory mapping context + */ +typedef void (*mca_smsc_module_unmap_peer_region_fn_t)(void *ctx); + +/** + * @brief Register a memory region for remote access. + * + * @param(in) module shared-memory single-copy module + * @param(in) local_address local address to register (ideally page-aligned) + * @param(in) size size of the memory region (ideally page-aligned) + * + * @returns a pointer to registration data that can be used for copy by a peer process + * + * This method registers a region for access by a local peer.
The returned data can be passed to a + * local peer and used by that peer for either copy_to or copy_from. + */ +typedef void *(*mca_smsc_module_register_region_fn_t)(void *local_address, size_t size); + +/** + * @brief Deregister a registered region. + * + * @param(in) module shared-memory single-copy module + * @param(in) reg_data registration data returned by the registration function + * + * This function deregisters a region from use by a peer's copy_from and copy_to functions. Once a + * region has been deregistered, its registration data is no longer usable by any local peer. + */ +typedef void (*mca_smsc_module_deregister_region_fn_t)(void *reg_data); + +enum { + /** Module requires the local registration of any region that will be used for single-copy + * operations. It is the responsibility of the caller to pass this data with the pointer to the + * peer. */ + MCA_SMSC_FEATURE_REQUIRE_REGISTATION = 1, + /** Module can map peer memory into the local process's address space. */ + MCA_SMSC_FEATURE_CAN_MAP = 2, +}; + +struct mca_smsc_module_t { + /** Module features. */ + uint64_t features; + + /** Ignore if MCA_SMSC_FEATURE_REQUIRE_REGISTATION is not set. */ + size_t registration_data_size; + + /** Get an endpoint for a peer. This function should always return a newly-allocated endpoint. + * The base will be responsible for caching that endpoint. */ + mca_smsc_module_get_endpoint_fn_t get_endpoint; + /** Delete an endpoint and clean up all resources associated with it. */ + mca_smsc_module_return_endpoint_fn_t return_endpoint; + + /* All components must provide an implementation of the copy functions. */ + /** Copy data into a peer's memory space. */ + mca_smsc_module_copy_fn_t copy_to; + /** Copy data from a peer's memory space. */ + mca_smsc_module_copy_fn_t copy_from; + + /* Defined if MCA_SMSC_FEATURE_CAN_MAP is set. */ + /** Map a peer memory region into this process's address space. The module is allowed to cache + * the mapping and return it in subsequent calls. */ + mca_smsc_module_map_peer_region_fn_t map_peer_region; + /** Delete a mapping. This is allowed to leave the mapping in place. */ + mca_smsc_module_unmap_peer_region_fn_t unmap_peer_region; + + /* Defined if MCA_SMSC_FEATURE_REQUIRE_REGISTATION is set. */ + /** Register a memory region for use with single-copy by a remote peer. The module may cache + * this registration for future use. */ + mca_smsc_module_register_region_fn_t register_region; + /** Deregister a memory region for use with single-copy. */ + mca_smsc_module_deregister_region_fn_t deregister_region; +}; + +typedef struct mca_smsc_module_t mca_smsc_module_t; + +/** + * @brief Query if this component can run. + * + * @returns OPAL_SUCCESS if the component can run or an opal error code otherwise + * + * This function is responsible for verifying that the component can run. It should do the minimum amount + * of work necessary, as it may be invoked at any time during execution. This includes sending any modex message if needed. + * It should refrain from allocating resources if possible. + */ +typedef int (*mca_smsc_component_query_fn_t)(void); + +/** + * @brief Enable the use of this component and return a module. + * + * @returns A module on success or NULL otherwise. + * + * This function should do any remaining work (not already done in query) to prepare the component + * for use. It should return a fully initialized module.
+ */ +typedef mca_smsc_module_t *(*mca_smsc_component_enable_fn_t)(void); + +struct mca_smsc_component_1_0_0_t { + mca_base_component_t smsc_version; + mca_base_component_data_t smsc_data; + + /** Priority of this component. Only the winning component will be used. */ + int priority; + + /** Check if this component can be used. */ + mca_smsc_component_query_fn_t query; + /** Enable the use of this component. */ + mca_smsc_component_enable_fn_t enable; +}; + +typedef struct mca_smsc_component_1_0_0_t mca_smsc_component_1_0_0_t; +typedef mca_smsc_component_1_0_0_t mca_smsc_component_t; + +OPAL_DECLSPEC extern mca_smsc_module_t *mca_smsc; + +#if MCA_opal_smsc_DIRECT_CALL +# include MCA_opal_smsc_DIRECT_CALL_HEADER + +# define MCA_SMSC_CALL_STAMP(a, b, ...) mca_smsc_##a##_##b(__VA_ARGS__) +# define MCA_SMSC_CALL_EXPANDER(a, b, ...) MCA_SMSC_CALL_STAMP(a, b, __VA_ARGS__) +# define MCA_SMSC_CALL(a, ...) \ + MCA_SMSC_CALL_EXPANDER(MCA_opal_smsc_DIRECT_CALL_COMPONENT, a, __VA_ARGS__) + +#else + +# define MCA_SMSC_CALL(a, ...) mca_smsc->a(__VA_ARGS__) + +#endif /* MCA_opal_smsc_DIRECT_CALL */ + +/** + * @brief Check if the selected component has a feature. + * + * @param(in) feature feature to check for (see smsc.h for list of features) + */ +static inline bool mca_smsc_base_has_feature(uint64_t feature) +{ + return (NULL != mca_smsc) && !!(mca_smsc->features & feature); +} + +static inline ssize_t mca_smsc_base_registration_data_size(void) +{ + if (NULL == mca_smsc || !mca_smsc_base_has_feature(MCA_SMSC_FEATURE_REQUIRE_REGISTATION)) { + return OPAL_ERR_NOT_AVAILABLE; + } + + return mca_smsc->registration_data_size; +} + +#define MCA_SMSC_BASE_VERSION_1_0_0 \ + OPAL_MCA_BASE_VERSION_2_1_0("smsc", MCA_SMSC_BASE_MAJOR_VERSION, MCA_SMSC_BASE_MINOR_VERSION, \ + MCA_SMSC_BASE_PATCH_VERSION) + +#define MCA_SMSC_DEFAULT_VERSION(name) \ + MCA_SMSC_BASE_VERSION_1_0_0, .mca_component_name = name, \ + MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, \ + OPAL_MINOR_VERSION, OPAL_RELEASE_VERSION) + +#endif /* OPAL_MCA_SMSC_H */ diff --git a/opal/mca/smsc/xpmem/Makefile.am b/opal/mca/smsc/xpmem/Makefile.am new file mode 100644 index 00000000000..1944d3ef24e --- /dev/null +++ b/opal/mca/smsc/xpmem/Makefile.am @@ -0,0 +1,56 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2009 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights +# reserved. +# Copyright (c) 2017 IBM Corporation. All rights reserved. +# Copyright (c) 2020-2021 Google, LLC. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +EXTRA_DIST = post_configure.sh + +AM_CPPFLAGS = $(smsc_xpmem_CPPFLAGS) + +libmca_smsc_xpmem_la_sources = \ + smsc_xpmem_component.c \ + smsc_xpmem_module.c \ + smsc_xpmem_internal.h \ + smsc_xpmem.h + +# Make the output library in this directory, and name it either +# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la +# (for static builds).
+ +if MCA_BUILD_opal_smsc_xpmem_DSO +component_noinst = +component_install = mca_smsc_xpmem.la +else +component_noinst = libmca_smsc_xpmem.la +component_install = +endif + +mcacomponentdir = $(opallibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_smsc_xpmem_la_SOURCES = $(libmca_smsc_xpmem_la_sources) +mca_smsc_xpmem_la_LDFLAGS = -module -avoid-version $(smsc_xpmem_LDFLAGS) +mca_smsc_xpmem_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \ + $(smsc_xpmem_LIBS) + +noinst_LTLIBRARIES = $(component_noinst) +libmca_smsc_xpmem_la_SOURCES = $(libmca_smsc_xpmem_la_sources) +libmca_smsc_xpmem_la_LIBADD = $(smsc_xpmem_LIBS) +libmca_smsc_xpmem_la_LDFLAGS = -module -avoid-version $(smsc_xpmem_LDFLAGS) diff --git a/opal/mca/smsc/xpmem/configure.m4 b/opal/mca/smsc/xpmem/configure.m4 new file mode 100644 index 00000000000..648b514724e --- /dev/null +++ b/opal/mca/smsc/xpmem/configure.m4 @@ -0,0 +1,31 @@ +# -*- shell-script -*- +# +# Copyright (c) 2009 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights +# reserved. +# Copyright (c) 2015 Research Organization for Information Science +# and Technology (RIST). All rights reserved. +# Copyright (c) 2021 Google, LLC. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_smsc_xpmem_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +AC_DEFUN([MCA_opal_smsc_xpmem_CONFIG],[ + AC_CONFIG_FILES([opal/mca/smsc/xpmem/Makefile]) + + OPAL_CHECK_XPMEM([smsc_xpmem], [$1], [$2]) + + AC_SUBST([smsc_xpmem_CFLAGS]) + AC_SUBST([smsc_xpmem_CPPFLAGS]) + AC_SUBST([smsc_xpmem_LDFLAGS]) + AC_SUBST([smsc_xpmem_LIBS]) +])dnl diff --git a/opal/mca/smsc/xpmem/post_configure.sh b/opal/mca/smsc/xpmem/post_configure.sh new file mode 100644 index 00000000000..a4d999a1161 --- /dev/null +++ b/opal/mca/smsc/xpmem/post_configure.sh @@ -0,0 +1 @@ +DIRECT_CALL_HEADER="opal/mca/smsc/xpmem/smsc_xpmem.h" diff --git a/opal/mca/smsc/xpmem/smsc_xpmem.h b/opal/mca/smsc/xpmem/smsc_xpmem.h new file mode 100644 index 00000000000..ae8d33efbb3 --- /dev/null +++ b/opal/mca/smsc/xpmem/smsc_xpmem.h @@ -0,0 +1,43 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2021 Google, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OPAL_MCA_SMSC_XPMEM_SMSC_XPMEM_H +#define OPAL_MCA_SMSC_XPMEM_SMSC_XPMEM_H + +#include "opal_config.h" + +#include "opal/mca/smsc/smsc.h" + +mca_smsc_endpoint_t *mca_smsc_xpmem_get_endpoint(opal_proc_t *peer_proc); +void mca_smsc_xpmem_return_endpoint(mca_smsc_endpoint_t *endpoint); + +int mca_smsc_xpmem_copy_to(mca_smsc_endpoint_t *endpoint, void *local_address, void *remote_address, + size_t size, void *reg_handle); + +int mca_smsc_xpmem_copy_from(mca_smsc_endpoint_t *endpoint, void *local_address, + void *remote_address, size_t size, void *reg_handle); + +/** + * @brief Map a peer memory region into this processes address space. + * + * See the description in smsc.h. + * + * Caveats: XPMEM does not support futex operations within the region. Attempts to wake the + * process owning the mutex will result in an EFAULT error code. 
+ */ +void *mca_smsc_xpmem_map_peer_region(mca_smsc_endpoint_t *endpoint, uint64_t flags, + void *remote_ptr, size_t size, void **local_ptr); +void mca_smsc_xpmem_unmap_peer_region(void *ctx); + +/* unsupported interfaces defined to support MCA direct */ +void *mca_smsc_xpmem_register_region(void *local_address, size_t size); +void mca_smsc_xpmem_deregister_region(void *reg_data); + +#endif /* OPAL_MCA_SMSC_XPMEM_SMSC_XPMEM_H */ diff --git a/opal/mca/smsc/xpmem/smsc_xpmem_component.c b/opal/mca/smsc/xpmem/smsc_xpmem_component.c new file mode 100644 index 00000000000..9ebced1cb76 --- /dev/null +++ b/opal/mca/smsc/xpmem/smsc_xpmem_component.c @@ -0,0 +1,167 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2021 Google, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "opal_config.h" + +#include "opal/mca/smsc/base/base.h" +#include "opal/mca/smsc/xpmem/smsc_xpmem_internal.h" +#include "opal/util/minmax.h" + +#include +#include +#include +#include +#include +#include + +static int mca_smsc_xpmem_component_register(void); +static int mca_smsc_xpmem_component_open(void); +static int mca_smsc_xpmem_component_close(void); +static int mca_smsc_xpmem_component_query(void); +static mca_smsc_module_t *mca_smsc_xpmem_component_enable(void); + +#define MCA_SMSC_XPMEM_DEFAULT_PRIORITY 42 +static const int mca_smsc_xpmem_default_priority = MCA_SMSC_XPMEM_DEFAULT_PRIORITY; + +mca_smsc_xpmem_component_t mca_smsc_xpmem_component = { + .super = { + .smsc_version = { + MCA_SMSC_DEFAULT_VERSION("xpmem"), + .mca_open_component = mca_smsc_xpmem_component_open, + .mca_close_component = mca_smsc_xpmem_component_close, + .mca_register_component_params = mca_smsc_xpmem_component_register, + }, + .priority = MCA_SMSC_XPMEM_DEFAULT_PRIORITY, + .query = mca_smsc_xpmem_component_query, + .enable = mca_smsc_xpmem_component_enable, + }, +}; + +static int mca_smsc_xpmem_component_register(void) +{ + mca_smsc_xpmem_component.log_attach_align = 23; + (void) mca_base_component_var_register(&mca_smsc_xpmem_component.super.smsc_version, + "log_align", + "Log base 2 of the alignment to use for xpmem " + "segments (default: 23, minimum: 12, maximum: 25)", + MCA_BASE_VAR_TYPE_INT, /*enumerator=*/NULL, /*bind=*/0, + MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_LOCAL, + &mca_smsc_xpmem_component.log_attach_align); + + mca_smsc_xpmem_component.memcpy_chunk_size = 262144; + (void) mca_base_component_var_register( + &mca_smsc_xpmem_component.super.smsc_version, "memcpy_chunk_size", + "Maximum size to copy with a single call to memcpy. 
On some systems a smaller or larger " + "number may provide better performance (default: 256k)", + MCA_BASE_VAR_TYPE_UINT64_T, /*enumerator=*/NULL, /*bind=*/0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL, &mca_smsc_xpmem_component.memcpy_chunk_size); + + mca_smsc_base_register_default_params(&mca_smsc_xpmem_component.super, + mca_smsc_xpmem_default_priority); + return OPAL_SUCCESS; +} + +static int mca_smsc_xpmem_component_open(void) +{ + /* nothing to do */ + return OPAL_SUCCESS; +} + +static int mca_smsc_xpmem_component_close(void) +{ + if (mca_smsc_xpmem_module.vma_module) { + OBJ_RELEASE(mca_smsc_xpmem_module.vma_module); + } + + return OPAL_SUCCESS; +} + +static int mca_smsc_xpmem_send_modex(void) +{ + mca_smsc_xpmem_modex_t modex; + + modex.seg_id = mca_smsc_xpmem_component.my_seg_id; + modex.address_max = mca_smsc_xpmem_component.my_address_max; + + int rc; + OPAL_MODEX_SEND(rc, PMIX_LOCAL, &mca_smsc_xpmem_component.super.smsc_version, &modex, + sizeof(modex)); + return rc; +} + +static int mca_smsc_xpmem_component_query(void) +{ + /* Any attachment that goes past the Linux TASK_SIZE will always fail. To prevent this we need + * to determine the value of TASK_SIZE. On x86_64 the value was hard-coded in sm to be + * 0x7ffffffffffful but this approach does not work with AARCH64 (and possibly other + * architectures). Since there is really no way to directly determine the value we can (in all + * cases?) look through the mapping for this process to determine what the largest address is. + * This should be the top of the stack. No heap allocations should be larger than this value. + * Since the largest address may differ between processes the value must be shared as part of + * the modex and stored in the endpoint. */ + FILE *fh = fopen("/proc/self/maps", "r"); + if (NULL == fh) { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, opal_smsc_base_framework.framework_output, + "mca_smsc_xpmem_component_query: could not open /proc/self/maps for " + "reading. 
disabling XPMEM"); + return OPAL_ERR_NOT_AVAILABLE; + } + + char buffer[1024]; + uintptr_t address_max = 0; + while (fgets(buffer, sizeof(buffer), fh)) { + uintptr_t low, high; + char *tmp; + /* each line of /proc/self/maps starts with low-high in hexidecimal (without a 0x) */ + low = strtoul(buffer, &tmp, 16); + high = strtoul(tmp + 1, NULL, 16); + if (address_max < high) { + address_max = high; + } + } + + fclose(fh); + + if (0 == address_max) { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, opal_smsc_base_framework.framework_output, + "mca_smsc_xpmem_component_query: could not determine the address max"); + return OPAL_ERR_NOT_AVAILABLE; + } + + /* save the calcuated maximum */ + mca_smsc_xpmem_component.my_address_max = address_max - 1; + + /* it is safe to use XPMEM_MAXADDR_SIZE here (which is always (size_t)-1 even though + * it is not safe for attach */ + mca_smsc_xpmem_component.my_seg_id = xpmem_make(0, XPMEM_MAXADDR_SIZE, XPMEM_PERMIT_MODE, + (void *) 0666); + if (-1 == mca_smsc_xpmem_component.my_seg_id) { + return OPAL_ERR_NOT_AVAILABLE; + } + + mca_smsc_xpmem_send_modex(); + + return OPAL_SUCCESS; +} + +static mca_smsc_module_t *mca_smsc_xpmem_component_enable(void) +{ + if (0 > mca_smsc_xpmem_component.super.priority) { + return NULL; + } + + /* limit segment alignment to be between 4k and 16M */ + mca_smsc_xpmem_component.log_attach_align + = opal_min(opal_max(mca_smsc_xpmem_component.log_attach_align, 12), 25); + + mca_smsc_xpmem_module.vma_module = mca_rcache_base_vma_module_alloc(); + + return &mca_smsc_xpmem_module.super; +} diff --git a/opal/mca/smsc/xpmem/smsc_xpmem_internal.h b/opal/mca/smsc/xpmem/smsc_xpmem_internal.h new file mode 100644 index 00000000000..42105625c18 --- /dev/null +++ b/opal/mca/smsc/xpmem/smsc_xpmem_internal.h @@ -0,0 +1,81 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2021 Google, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OPAL_MCA_SMSC_XPMEM_SMSC_XPMEM_INTERNAL_H +#define OPAL_MCA_SMSC_XPMEM_SMSC_XPMEM_INTERNAL_H + +#include "opal/mca/smsc/xpmem/smsc_xpmem.h" + +#include "opal/mca/rcache/base/rcache_base_vma.h" +#if defined(HAVE_XPMEM_H) +# include + +typedef struct xpmem_addr xpmem_addr_t; +#elif defined(HAVE_SN_XPMEM_H) +# include + +typedef int64_t xpmem_segid_t; +typedef int64_t xpmem_apid_t; +#endif + +typedef struct xpmem_addr xpmem_addr_t; + +struct mca_smsc_xpmem_modex_t { + /** XPMEM segment id for this peer */ + xpmem_segid_t seg_id; + /** maximum address we can attach to on this peer */ + uintptr_t address_max; +}; + +typedef struct mca_smsc_xpmem_modex_t mca_smsc_xpmem_modex_t; + +struct mca_smsc_xpmem_endpoint_t { + mca_smsc_endpoint_t super; + /** XPMEM apid for this peer */ + xpmem_apid_t apid; + /** maximum address we can attach to on this peer */ + uintptr_t address_max; +}; + +typedef struct mca_smsc_xpmem_endpoint_t mca_smsc_xpmem_endpoint_t; + +OBJ_CLASS_DECLARATION(mca_smsc_xpmem_endpoint_t); + +struct mca_smsc_xpmem_component_t { + mca_smsc_component_t super; + + /** maximum attachment address for this process. attempts to attach past this value may fail. */ + uintptr_t my_address_max; + /** XPMEM segment id for this process */ + xpmem_segid_t my_seg_id; + /** log base 2 of the attachment alignment. this controls how big the smallest attachment is. a + * larger value will produce fewer entries in the cache but will increase attachment time. 
*/ + unsigned int log_attach_align; + /** maximum size that will be used with a single memcpy call. on some systems we see better + * performance if we chunk the copy into multiple memcpy calls. */ + uint64_t memcpy_chunk_size; +}; + +typedef struct mca_smsc_xpmem_component_t mca_smsc_xpmem_component_t; + +struct mca_smsc_xpmem_module_t { + mca_smsc_module_t super; + + /** cache of xpmem attachments. this cache holds attachments for all peers. the registrations + * are differentiated by the alloc_base which is set to the endpoint. */ + mca_rcache_base_vma_module_t *vma_module; +}; + +typedef struct mca_smsc_xpmem_module_t mca_smsc_xpmem_module_t; + +extern mca_smsc_xpmem_module_t mca_smsc_xpmem_module; +extern mca_smsc_xpmem_component_t mca_smsc_xpmem_component; + +#endif /* OPAL_MCA_SMSC_XPMEM_SMSC_XPMEM_INTERNAL_H */ diff --git a/opal/mca/smsc/xpmem/smsc_xpmem_module.c b/opal/mca/smsc/xpmem/smsc_xpmem_module.c new file mode 100644 index 00000000000..d2954c1e31f --- /dev/null +++ b/opal/mca/smsc/xpmem/smsc_xpmem_module.c @@ -0,0 +1,311 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2014 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020-2021 Google, LLC. All rights reserved. + * Copyright (c) 2021 Nanook Consulting. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" + +#include "opal/include/opal/align.h" +#include "opal/mca/memchecker/base/base.h" +#include "opal/mca/pmix/pmix-internal.h" +#include "opal/mca/rcache/rcache.h" +#include "opal/mca/smsc/base/base.h" +#include "opal/mca/smsc/xpmem/smsc_xpmem_internal.h" +#include "opal/util/minmax.h" + +OBJ_CLASS_INSTANCE(mca_smsc_xpmem_endpoint_t, opal_object_t, NULL, NULL); + +mca_smsc_endpoint_t *mca_smsc_xpmem_get_endpoint(opal_proc_t *peer_proc) +{ + mca_smsc_xpmem_endpoint_t *endpoint = OBJ_NEW(mca_smsc_xpmem_endpoint_t); + if (OPAL_UNLIKELY(NULL == endpoint)) { + return NULL; + } + + endpoint->super.proc = peer_proc; + + int rc; + size_t modex_size; + mca_smsc_xpmem_modex_t *modex; + OPAL_MODEX_RECV_IMMEDIATE(rc, &mca_smsc_xpmem_component.super.smsc_version, + &peer_proc->proc_name, (void **) &modex, &modex_size); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + OBJ_RELEASE(endpoint); + return NULL; + } + + endpoint->apid = xpmem_get(modex->seg_id, XPMEM_RDWR, XPMEM_PERMIT_MODE, (void *) 0666); + endpoint->address_max = modex->address_max; + + return &endpoint->super; +} + +struct mca_smsc_xpmem_cleanup_reg_ctx_t { + mca_smsc_xpmem_endpoint_t *endpoint; + opal_list_t *registrations; +}; + +typedef struct mca_smsc_xpmem_cleanup_reg_ctx_t mca_smsc_xpmem_cleanup_reg_ctx_t; + +struct mca_smsc_xpmem_check_reg_ctx_t { + mca_smsc_xpmem_endpoint_t *endpoint; + mca_rcache_base_registration_t **reg; + uintptr_t base; + uintptr_t bound; +}; +typedef struct mca_smsc_xpmem_check_reg_ctx_t mca_smsc_xpmem_check_reg_ctx_t; + +static int mca_smsc_xpmem_check_reg(mca_rcache_base_registration_t *reg, void *ctx) +{ + mca_smsc_xpmem_check_reg_ctx_t *xpmem_ctx = (mca_smsc_xpmem_check_reg_ctx_t *) ctx; + + if (reg->alloc_base != (void *) xpmem_ctx->endpoint) { + /* ignore this registration */ + return OPAL_SUCCESS; + } + + xpmem_ctx->reg[0] = reg; + + if (xpmem_ctx->bound <= (uintptr_t) reg->bound && xpmem_ctx->base >= (uintptr_t) reg->base) { + if (0 ==
opal_atomic_fetch_add_32(&reg->ref_count, 1)) { + /* registration is being deleted by a thread in mca_smsc_xpmem_unmap_peer_region. the + * VMA tree implementation will block in mca_rcache_delete until we finish + * iterating over the VMA tree so it is safe to just ignore this registration + * and continue. */ + xpmem_ctx->reg[0] = NULL; + return OPAL_SUCCESS; + } + return 1; + } + + if (MCA_RCACHE_FLAGS_INVALID & opal_atomic_fetch_or_32(&reg->flags, MCA_RCACHE_FLAGS_INVALID)) { + /* another thread has already marked this registration as invalid. ignore and continue. */ + xpmem_ctx->reg[0] = NULL; + return OPAL_SUCCESS; + } + + /* let the caller know we found an overlapping registration that can be coalesced into + * the requested interval. the caller will remove the last reference and delete the + * registration. */ + return 2; +} + +/* look up the remote pointer in the peer rcache and attach if + * necessary */ +void *mca_smsc_xpmem_map_peer_region(mca_smsc_endpoint_t *endpoint, uint64_t flags, + void *remote_ptr, size_t size, void **local_ptr) +{ + mca_smsc_xpmem_endpoint_t *xpmem_endpoint = (mca_smsc_xpmem_endpoint_t *) endpoint; + mca_rcache_base_vma_module_t *vma_module = mca_smsc_xpmem_module.vma_module; + uint64_t attach_align = 1 << mca_smsc_xpmem_component.log_attach_align; + mca_rcache_base_registration_t *reg = NULL; + mca_smsc_xpmem_check_reg_ctx_t check_ctx = {.endpoint = xpmem_endpoint, .reg = &reg}; + xpmem_addr_t xpmem_addr; + uintptr_t base, bound; + int rc; + + base = OPAL_DOWN_ALIGN((uintptr_t) remote_ptr, attach_align, uintptr_t); + bound = OPAL_ALIGN((uintptr_t) remote_ptr + size - 1, attach_align, uintptr_t) + 1; + if (OPAL_UNLIKELY(bound > xpmem_endpoint->address_max)) { + bound = xpmem_endpoint->address_max; + } + + check_ctx.base = base; + check_ctx.bound = bound; + + /* several segments may match the base pointer */ + rc = mca_rcache_base_vma_iterate(vma_module, (void *) base, bound - base, true, + mca_smsc_xpmem_check_reg, &check_ctx); + if (2 == rc) { + bound = bound < (uintptr_t) reg->bound ? (uintptr_t) reg->bound : bound; + base = base > (uintptr_t) reg->base ?
(uintptr_t) reg->base : base; + mca_smsc_xpmem_unmap_peer_region(reg); + reg = NULL; + } + + if (NULL == reg) { + reg = OBJ_NEW(mca_rcache_base_registration_t); + if (OPAL_LIKELY(NULL != reg)) { + /* stick around for a while */ + reg->ref_count = 2; + reg->base = (unsigned char *) base; + reg->bound = (unsigned char *) bound; + reg->alloc_base = (void *) endpoint; + +#if defined(HAVE_SN_XPMEM_H) + xpmem_addr.id = xpmem_endpoint->apid; +#else + xpmem_addr.apid = xpmem_endpoint->apid; +#endif + xpmem_addr.offset = base; + + opal_output_verbose(MCA_BASE_VERBOSE_INFO, opal_smsc_base_framework.framework_output, + "mca_smsc_xpmem_map_peer_region: creating region mapping " + "for endpoint %p address range %p-%p", + endpoint, reg->base, reg->bound); + + reg->rcache_context = xpmem_attach(xpmem_addr, bound - base, NULL); + if (OPAL_UNLIKELY((void *) -1 == reg->rcache_context)) { + OBJ_RELEASE(reg); + return NULL; + } + + opal_memchecker_base_mem_defined(reg->rcache_context, bound - base); + + mca_rcache_base_vma_insert(vma_module, reg, 0); + } + } + + opal_atomic_wmb(); + *local_ptr = (void *) ((uintptr_t) reg->rcache_context + + (ptrdiff_t)((uintptr_t) remote_ptr - (uintptr_t) reg->base)); + + return (void *) reg; +} + +void mca_smsc_xpmem_unmap_peer_region(void *ctx) +{ + mca_rcache_base_registration_t *reg = (mca_rcache_base_registration_t *) ctx; + mca_rcache_base_vma_module_t *vma_module = mca_smsc_xpmem_module.vma_module; + int32_t ref_count; + + ref_count = opal_atomic_add_fetch_32(&reg->ref_count, -1); + if (OPAL_UNLIKELY(0 == ref_count && !(reg->flags & MCA_RCACHE_FLAGS_PERSIST))) { + opal_output_verbose(MCA_BASE_VERBOSE_INFO, opal_smsc_base_framework.framework_output, + "mca_smsc_xpmem_unmap_peer_region: deleting region mapping for " + "endpoint %p address range %p-%p", + reg->alloc_base, reg->base, reg->bound); +#if OPAL_ENABLE_DEBUG + int ret = mca_rcache_base_vma_delete(vma_module, reg); + assert(OPAL_SUCCESS == ret); +#else + (void) mca_rcache_base_vma_delete(vma_module, reg); +#endif + opal_memchecker_base_mem_noaccess(reg->rcache_context, (uintptr_t)(reg->bound - reg->base)); + (void) xpmem_detach(reg->rcache_context); + OBJ_RELEASE(reg); + } +} + +static int mca_smsc_xpmem_endpoint_rcache_cleanup(mca_rcache_base_registration_t *reg, void *ctx) +{ + mca_smsc_xpmem_cleanup_reg_ctx_t *cleanup_ctx = (mca_smsc_xpmem_cleanup_reg_ctx_t *) ctx; + if (reg->alloc_base == (void *) cleanup_ctx->endpoint) { + opal_list_append(cleanup_ctx->registrations, &reg->super.super); + } + + return OPAL_SUCCESS; +} + +static void mca_smsc_xpmem_cleanup_endpoint(mca_smsc_xpmem_endpoint_t *endpoint) +{ + mca_rcache_base_registration_t *reg; + opal_list_t registrations; + mca_smsc_xpmem_cleanup_reg_ctx_t cleanup_ctx = {.endpoint = endpoint, + .registrations = &registrations}; + + opal_output_verbose(MCA_BASE_VERBOSE_INFO, opal_smsc_base_framework.framework_output, + "mca_smsc_xpmem_cleanup_endpoint: cleaning up endpoint %p", endpoint); + + OBJ_CONSTRUCT(&registrations, opal_list_t); + + /* clean out the registration cache */ + (void) mca_rcache_base_vma_iterate(mca_smsc_xpmem_module.vma_module, NULL, (size_t) -1, true, + mca_smsc_xpmem_endpoint_rcache_cleanup, + (void *) &cleanup_ctx); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, opal_smsc_base_framework.framework_output, + "mca_smsc_xpmem_cleanup_endpoint: deleting %" PRIsize_t " region mappings", + opal_list_get_size(&registrations)); + while (NULL + != (reg = (mca_rcache_base_registration_t *) opal_list_remove_first(&registrations))) { +
mca_smsc_xpmem_unmap_peer_region(reg); + } + OBJ_DESTRUCT(&registrations); + + xpmem_release(endpoint->apid); + endpoint->apid = 0; +} + +void mca_smsc_xpmem_return_endpoint(mca_smsc_endpoint_t *endpoint) +{ + mca_smsc_xpmem_cleanup_endpoint((mca_smsc_xpmem_endpoint_t *) endpoint); + OBJ_RELEASE(endpoint); +} + +/* memcpy is faster at larger sizes but is undefined if the + pointers are aliased (TODO -- re-add alias check) */ +static inline void mca_smsc_xpmem_memmove(void *dst, void *src, size_t size) +{ + while (size > 0) { + size_t copy_size = opal_min(size, mca_smsc_xpmem_component.memcpy_chunk_size); + memcpy(dst, src, copy_size); + dst = (void *) ((uintptr_t) dst + copy_size); + src = (void *) ((uintptr_t) src + copy_size); + size -= copy_size; + } +} + +int mca_smsc_xpmem_copy_to(mca_smsc_endpoint_t *endpoint, void *local_address, void *remote_address, + size_t size, void *reg_handle) +{ + /* ignore the registration handle as it is not used for XPMEM */ + (void) reg_handle; + + void *remote_ptr, *ctx; + ctx = mca_smsc_xpmem_map_peer_region(endpoint, /*flags=*/0, remote_address, size, &remote_ptr); + mca_smsc_xpmem_memmove(remote_ptr, local_address, size); + + mca_smsc_xpmem_unmap_peer_region(ctx); + + return OPAL_SUCCESS; +} + +int mca_smsc_xpmem_copy_from(mca_smsc_endpoint_t *endpoint, void *local_address, + void *remote_address, size_t size, void *reg_handle) +{ + /* ignore the registration handle as it is not used for XPMEM */ + (void) reg_handle; + + void *remote_ptr, *ctx; + + ctx = mca_smsc_xpmem_map_peer_region(endpoint, /*flags=*/0, remote_address, size, &remote_ptr); + mca_smsc_xpmem_memmove(local_address, remote_ptr, size); + + mca_smsc_xpmem_unmap_peer_region(ctx); + + return OPAL_SUCCESS; +} + +/* unsupported interfaces defined to support MCA direct */ +void *mca_smsc_xpmem_register_region(void *local_address, size_t size) +{ + return NULL; +} + +void mca_smsc_xpmem_deregister_region(void *reg_data) +{ +} + +mca_smsc_xpmem_module_t mca_smsc_xpmem_module = { + .super = { + .features = MCA_SMSC_FEATURE_CAN_MAP, + .get_endpoint = mca_smsc_xpmem_get_endpoint, + .return_endpoint = mca_smsc_xpmem_return_endpoint, + .copy_to = mca_smsc_xpmem_copy_to, + .copy_from = mca_smsc_xpmem_copy_from, + .map_peer_region = mca_smsc_xpmem_map_peer_region, + .unmap_peer_region = mca_smsc_xpmem_unmap_peer_region, + }, +}; diff --git a/opal/runtime/opal_init.c b/opal/runtime/opal_init.c index 33e756039f6..169cd89af7a 100644 --- a/opal/runtime/opal_init.c +++ b/opal/runtime/opal_init.c @@ -54,6 +54,7 @@ #include "opal/mca/pmix/base/base.h" #include "opal/mca/reachable/base/base.h" #include "opal/mca/shmem/base/base.h" +#include "opal/mca/smsc/base/base.h" #include "opal/mca/threads/threads.h" #include "opal/mca/threads/tsd.h" #include "opal/mca/timer/base/base.h" @@ -614,11 +615,12 @@ int opal_init_util(int *pargc, char ***pargv) * versions of memcpy correctly configured.
*/ static mca_base_framework_t *opal_init_frameworks[] = { - &opal_threads_base_framework, &opal_hwloc_base_framework, - &opal_memcpy_base_framework, &opal_memchecker_base_framework, + &opal_threads_base_framework, &opal_hwloc_base_framework, + &opal_memcpy_base_framework, &opal_memchecker_base_framework, &opal_backtrace_base_framework, &opal_timer_base_framework, - &opal_shmem_base_framework, &opal_reachable_base_framework, - &opal_pmix_base_framework, NULL, + &opal_shmem_base_framework, &opal_reachable_base_framework, + &opal_pmix_base_framework, &opal_smsc_base_framework, + NULL, }; int opal_init(int *pargc, char ***pargv)
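Below is a minimal usage sketch of the smsc interface introduced in opal/mca/smsc/smsc.h, included to illustrate how a caller (for example, a BTL) is expected to drive the framework. The helper name smsc_read_from_peer and its error handling are illustrative only and are not part of this patch; the mca_smsc module pointer, mca_smsc_base_has_feature(), and the get_endpoint/map_peer_region/unmap_peer_region/copy_from/return_endpoint callbacks are taken from the header added above.

/* Illustrative sketch only -- not part of this patch. Reads "size" bytes from a peer
 * process using whichever smsc component was selected during opal_init. */
#include "opal/constants.h"
#include "opal/mca/smsc/smsc.h"
#include "opal/util/proc.h"

#include <string.h>

static int smsc_read_from_peer(opal_proc_t *peer_proc, void *local_address,
                               void *remote_address, size_t size, void *peer_reg_data)
{
    /* no smsc component was selected on this node */
    if (NULL == mca_smsc) {
        return OPAL_ERR_NOT_AVAILABLE;
    }

    /* endpoints are created per peer; real callers are expected to cache this */
    mca_smsc_endpoint_t *endpoint = mca_smsc->get_endpoint(peer_proc);
    if (NULL == endpoint) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    int rc;
    if (mca_smsc_base_has_feature(MCA_SMSC_FEATURE_CAN_MAP)) {
        /* xpmem-style path: map the peer region into this address space and memcpy */
        void *local_ptr = NULL;
        void *ctx = mca_smsc->map_peer_region(endpoint, /*flags=*/0, remote_address, size,
                                              &local_ptr);
        if (NULL == ctx) {
            rc = OPAL_ERROR;
        } else {
            memcpy(local_address, local_ptr, size);
            mca_smsc->unmap_peer_region(ctx);
            rc = OPAL_SUCCESS;
        }
    } else {
        /* knem/cma-style path: peer_reg_data is the registration data exchanged by the
         * peer when MCA_SMSC_FEATURE_REQUIRE_REGISTATION is set, NULL otherwise */
        rc = mca_smsc->copy_from(endpoint, local_address, remote_address, size, peer_reg_data);
    }

    mca_smsc->return_endpoint(endpoint);
    return rc;
}

When a component is built for MCA direct call, the same operations can be issued through MCA_SMSC_CALL(copy_from, ...), which expands to the component's functions directly instead of the indirect mca_smsc->copy_from call.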