Skip to content

Fix some SM OSC component initialization issues #9991

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,7 @@ test/class/ompi_rb_tree
test/class/ompi_bitmap
test/class/opal_bitmap
test/class/opal_fifo
test/class/opal_cstring
test/class/opal_hash_table
test/class/opal_lifo
test/class/opal_list
Expand Down
4 changes: 0 additions & 4 deletions ompi/mca/osc/base/osc_base_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,6 @@ ompi_osc_base_select(ompi_win_t *win,
priority = component->osc_query(win, base, size, disp_unit, comm,
win->super.s_info, flavor);
if (priority < 0) {
if (MPI_WIN_FLAVOR_SHARED == flavor && OMPI_ERR_RMA_SHARED == priority) {
/* NTH: quick fix to return OMPI_ERR_RMA_SHARED */
return OMPI_ERR_RMA_SHARED;
}
continue;
}

Expand Down
4 changes: 0 additions & 4 deletions ompi/mca/osc/monitoring/osc_monitoring_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,6 @@ static int mca_osc_monitoring_component_select(struct ompi_win_t *win, void **ba

priority = component->osc_query(win, base, size, disp_unit, comm, info, flavor);
if (priority < 0) {
if (MPI_WIN_FLAVOR_SHARED == flavor && OMPI_ERR_RMA_SHARED == priority) {
/* NTH: quick fix to return OMPI_ERR_RMA_SHARED */
return OMPI_ERR_RMA_SHARED;
}
continue;
}

Expand Down
4 changes: 2 additions & 2 deletions ompi/mca/osc/rdma/osc_rdma_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
{

if (MPI_WIN_FLAVOR_SHARED == flavor) {
return OMPI_ERR_RMA_SHARED;
return -1;
}

#if OPAL_CUDA_SUPPORT
Expand All @@ -403,7 +403,7 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
return mca_osc_rdma_component.priority;
}

return OMPI_ERROR;
return -1;
}

static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void **base, size_t size) {
Expand Down
3 changes: 3 additions & 0 deletions ompi/mca/osc/sm/osc_sm.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ typedef struct ompi_osc_sm_node_state_t ompi_osc_sm_node_state_t;
struct ompi_osc_sm_component_t {
ompi_osc_base_component_t super;

/** Priority of the osc/sm component */
unsigned int priority;

char *backing_directory;
};
typedef struct ompi_osc_sm_component_t ompi_osc_sm_component_t;
Expand Down
54 changes: 32 additions & 22 deletions ompi/mca/osc/sm/osc_sm_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ ompi_osc_sm_module_t ompi_osc_sm_module_template = {

static int component_register (void)
{
char *description_str;

if (0 == access ("/dev/shm", W_OK)) {
mca_osc_sm_component.backing_directory = "/dev/shm";
} else {
Expand All @@ -128,6 +130,16 @@ static int component_register (void)
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_READONLY, &mca_osc_sm_component.backing_directory);

mca_osc_sm_component.priority = 100;
opal_asprintf(&description_str, "Priority of the osc/sm component (default: %d)",
mca_osc_sm_component.priority);
(void)mca_base_component_var_register(&mca_osc_sm_component.super.osc_version,
"priority", description_str,
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0,
OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_GROUP,
&mca_osc_sm_component.priority);
free(description_str);

return OPAL_SUCCESS;
}

Expand All @@ -154,36 +166,32 @@ component_finalize(void)
}


static int
check_win_ok(ompi_communicator_t *comm, int flavor)
{
if (! (MPI_WIN_FLAVOR_SHARED == flavor
|| MPI_WIN_FLAVOR_ALLOCATE == flavor) ) {
return OMPI_ERR_NOT_SUPPORTED;
}

if (ompi_group_have_remote_peers (comm->c_local_group)) {
return OMPI_ERR_RMA_SHARED;
}

return OMPI_SUCCESS;
}


static int
component_query(struct ompi_win_t *win, void **base, size_t size, int disp_unit,
struct ompi_communicator_t *comm, struct opal_info_t *info,
int flavor)
{
int ret;
if (OMPI_SUCCESS != (ret = check_win_ok(comm, flavor))) {
if (OMPI_ERR_NOT_SUPPORTED == ret) {

/* component only supports shared or allocate flavors */
if (! (MPI_WIN_FLAVOR_SHARED == flavor ||
MPI_WIN_FLAVOR_ALLOCATE == flavor)) {
return -1;
}

/* If flavor is win_allocate, we can't run if there are remote
* peers in the group. The same check for flavor_shared happens
* in select(), so that we can return an error to the user (since
* we should be able to run for all flavor_shared use cases.
* There's no way to return an error from component_query to the
* user, hence the delayed check. */
if (MPI_WIN_FLAVOR_ALLOCATE == flavor) {
if (ompi_group_have_remote_peers(comm->c_local_group)) {
return -1;
}
return ret;
}

return 100;
return mca_osc_sm_component.priority;
}


Expand All @@ -198,8 +206,10 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
int ret = OMPI_ERROR;
size_t memory_alignment = OPAL_ALIGN_MIN;

if (OMPI_SUCCESS != (ret = check_win_ok(comm, flavor))) {
return ret;
assert(MPI_WIN_FLAVOR_SHARED == flavor || MPI_WIN_FLAVOR_ALLOCATE == flavor);

if (ompi_group_have_remote_peers(comm->c_local_group)) {
return OMPI_ERR_RMA_SHARED;
}

/* create module structure */
Expand Down
46 changes: 29 additions & 17 deletions opal/class/opal_cstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ static void opal_cstring_ctor(opal_cstring_t *obj)
{
*(size_t *) &(obj->length) = 0;
/* make sure the string is null-terminated */
((char *) obj->string)[0] = '\n';
((char *) obj->string)[0] = '\0';
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mea culpa, thanks for catching that :(

}

static inline size_t opal_cstring_alloc_size(size_t len)
Expand Down Expand Up @@ -90,10 +90,18 @@ int opal_cstring_to_int(opal_cstring_t *string, int *interp)
if (*endp != '\0') {
return OPAL_ERR_BAD_PARAM;
}
/* underflow */
/* base errors */
if (tmp == 0 && errno == EINVAL) {
return OPAL_ERR_BAD_PARAM;
}
/* overflow/underflow of long int (return of strtol) */
if (errno == ERANGE && (tmp == LONG_MIN || tmp == LONG_MAX)) {
return OPAL_ERR_BAD_PARAM;
}
/* overflow/underflow of int (for cases where long int is larger) */
if (tmp < INT_MIN || tmp > INT_MAX) {
return OPAL_ERR_BAD_PARAM;
}

*interp = (int) tmp;

Expand All @@ -104,24 +112,28 @@ static int opal_str_to_bool_impl(const char *string, bool *interp)
{
const char *ptr = string;

/* Trim leading whitespace */
while (isspace(*ptr)) {
++ptr;
}
if (NULL != string) {
/* Trim leading whitespace */
while (isspace(*ptr)) {
++ptr;
}

if ('\0' != *ptr) {
if (isdigit(*ptr)) {
*interp = (bool) atoi(ptr);
} else if (0 == strncasecmp(ptr, "yes", 3) || 0 == strncasecmp(ptr, "true", 4)) {
*interp = true;
} else if (0 == strncasecmp(ptr, "no", 2) && 0 == strncasecmp(ptr, "false", 5)) {
*interp = false;
} else {
*interp = false;
return OPAL_ERR_BAD_PARAM;
if ('\0' != *ptr) {
if (isdigit(*ptr)) {
*interp = (bool) atoi(ptr);
return OPAL_SUCCESS;
} else if (0 == strncasecmp(ptr, "yes", 3) || 0 == strncasecmp(ptr, "true", 4)) {
*interp = true;
return OPAL_SUCCESS;
} else if (0 == strncasecmp(ptr, "no", 2) || 0 == strncasecmp(ptr, "false", 5)) {
*interp = false;
return OPAL_SUCCESS;
}
}
}
return OPAL_SUCCESS;

*interp = false;
return OPAL_ERR_BAD_PARAM;
}

int opal_cstring_to_bool(opal_cstring_t *string, bool *interp)
Expand Down
1 change: 0 additions & 1 deletion opal/class/opal_cstring.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
#ifndef OPAL_STRING_H
#define OPAL_STRING_H

#include "opal_config.h"
#include "opal/class/opal_object.h"
#include "opal/mca/base/mca_base_var_enum.h"

Expand Down
9 changes: 8 additions & 1 deletion test/class/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ check_PROGRAMS = \
opal_value_array \
opal_pointer_array \
opal_lifo \
opal_fifo
opal_fifo \
opal_cstring

TESTS = $(check_PROGRAMS)

Expand Down Expand Up @@ -94,6 +95,12 @@ opal_fifo_LDADD = \
$(top_builddir)/test/support/libsupport.a
opal_fifo_DEPENDENCIES = $(opal_fifo_LDADD)

opal_cstring_SOURCES = opal_cstring.c
opal_cstring_LDADD = \
$(top_builddir)/opal/lib@[email protected] \
$(top_builddir)/test/support/libsupport.a
opal_cstring_DEPENDENCIES = $(opal_cstring_LDADD)

clean-local:
rm -f opal_bitmap_test_out.txt opal_hash_table_test_out.txt opal_proc_table_test_out.txt

Expand Down
Loading