Skip to content

Commit e1a70f6

Browse files
MithunMohanKadavil and amd-nithyavs
authored and committed
Tracking framework for xpmem rcache registrations in acoll.
A hash table, kept as part of the acoll module's struct, is used to track the rcache registrations made through the register_and_cache API that is called from the acoll collective components. During module destruct this hash table is iterated over and each tracked rcache registration is deregistered, which ensures that the rcache module destroy proceeds correctly. Signed-off-by: Mithun Mohan <[email protected]>
1 parent 71b87a5 commit e1a70f6

File tree

3 files changed

+55
-0
lines changed

3 files changed

+55
-0
lines changed

ompi/mca/coll/acoll/coll_acoll.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
#ifdef HAVE_XPMEM_H
2424
#include "opal/mca/rcache/base/base.h"
25+
#include "opal/class/opal_hash_table.h"
2526
#include <xpmem.h>
2627
#endif
2728

@@ -125,6 +126,7 @@ typedef struct coll_acoll_data {
125126
void **xpmem_raddr;
126127
mca_rcache_base_module_t **rcache;
127128
void *scratch;
129+
opal_hash_table_t **xpmem_reg_tracker_ht;
128130
#endif
129131
opal_shmem_ds_t *allshmseg_id;
130132
void **allshmmmap_sbuf;

ompi/mca/coll/acoll/coll_acoll_component.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,8 +243,24 @@ static void mca_coll_acoll_module_destruct(mca_coll_acoll_module_t *module)
243243
if (ompi_comm_rank(subc->orig_comm) == j) {
244244
continue;
245245
}
246+
// Dereg all rcache regs.
247+
uint64_t key = 0;
248+
uint64_t value = 0;
249+
uint64_t zero_value = 0;
250+
OPAL_HASH_TABLE_FOREACH(key,uint64,value,(data->xpmem_reg_tracker_ht[j])) {
251+
mca_rcache_base_registration_t* reg =
252+
(mca_rcache_base_registration_t*) key;
253+
254+
for (uint64_t d_i = 0; d_i < value; ++d_i) {
255+
(data->rcache[j])->rcache_deregister(data->rcache[j], reg);
256+
}
257+
opal_hash_table_set_value_uint64(data->xpmem_reg_tracker_ht[j],
258+
key, (void*)(zero_value));
259+
}
246260
xpmem_release(data->all_apid[j]);
247261
mca_rcache_base_module_destroy(data->rcache[j]);
262+
opal_hash_table_remove_all(data->xpmem_reg_tracker_ht[j]);
263+
OBJ_RELEASE(data->xpmem_reg_tracker_ht[j]);
248264
}
249265
xpmem_remove(data->allseg_id[ompi_comm_rank(subc->orig_comm)]);
250266

@@ -262,6 +278,8 @@ static void mca_coll_acoll_module_destruct(mca_coll_acoll_module_t *module)
262278
data->xpmem_raddr = NULL;
263279
free(data->scratch);
264280
data->scratch = NULL;
281+
free(data->xpmem_reg_tracker_ht);
282+
data->xpmem_reg_tracker_ht = NULL;
265283
free(data->rcache);
266284
data->rcache = NULL;
267285
#endif

ompi/mca/coll/acoll/coll_acoll_utils.h

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -692,6 +692,14 @@ static inline int coll_acoll_init(mca_coll_base_module_t *module, ompi_communica
692692
ret = OMPI_ERR_OUT_OF_RESOURCE;
693693
goto error_hndl;
694694
}
695+
data->xpmem_reg_tracker_ht = NULL;
696+
data->xpmem_reg_tracker_ht = (opal_hash_table_t **) malloc(sizeof(opal_hash_table_t*) * size);
697+
if (NULL == data->xpmem_reg_tracker_ht) {
698+
line = __LINE__;
699+
ret = OMPI_ERR_OUT_OF_RESOURCE;
700+
goto error_hndl;
701+
}
702+
695703
seg_id = xpmem_make(0, XPMEM_MAXADDR_SIZE, XPMEM_PERMIT_MODE, (void *) 0666);
696704
if (-1 == seg_id) {
697705
line = __LINE__;
@@ -733,6 +741,8 @@ static inline int coll_acoll_init(mca_coll_base_module_t *module, ompi_communica
733741
line = __LINE__;
734742
goto error_hndl;
735743
}
744+
data->xpmem_reg_tracker_ht[i] = OBJ_NEW(opal_hash_table_t);
745+
opal_hash_table_init(data->xpmem_reg_tracker_ht[i], 2048);
736746
}
737747
}
738748
#endif
@@ -831,6 +841,8 @@ static inline int coll_acoll_init(mca_coll_base_module_t *module, ompi_communica
831841
data->xpmem_saddr = NULL;
832842
free(data->xpmem_raddr);
833843
data->xpmem_raddr = NULL;
844+
free(data->xpmem_reg_tracker_ht);
845+
data->xpmem_reg_tracker_ht = NULL;
834846
free(data->rcache);
835847
data->rcache = NULL;
836848
free(data->scratch);
@@ -851,6 +863,25 @@ static inline int coll_acoll_init(mca_coll_base_module_t *module, ompi_communica
851863
}
852864

853865
#ifdef HAVE_XPMEM_H
866+
static inline void update_rcache_reg_hashtable_entry
867+
(struct acoll_xpmem_rcache_reg_t *reg,
868+
opal_hash_table_t* ht)
869+
{
870+
// Converting pointer to uint64 to use as key.
871+
uint64_t key = (uint64_t)reg;
872+
// Converting uint64_t to pointer type to use for value.
873+
uint64_t value = 1;
874+
int ht_ret = opal_hash_table_get_value_uint64(ht, key, (void**)(&value));
875+
876+
if (OPAL_ERR_NOT_FOUND == ht_ret) {
877+
value = 1;
878+
opal_hash_table_set_value_uint64(ht, key, (void*)(value));
879+
} else if (OPAL_SUCCESS == ht_ret) {
880+
value += 1;
881+
opal_hash_table_set_value_uint64(ht, key, (void*)(value));
882+
}
883+
}
884+
854885
static inline void register_and_cache(int size, size_t total_dsize, int rank,
855886
coll_acoll_data_t *data)
856887
{
@@ -870,6 +901,8 @@ static inline void register_and_cache(int size, size_t total_dsize, int rank,
870901
sbuf_reg = NULL;
871902
return;
872903
}
904+
update_rcache_reg_hashtable_entry(sbuf_reg, data->xpmem_reg_tracker_ht[i]);
905+
873906
data->xpmem_saddr[i] = (void *) ((uintptr_t) sbuf_reg->xpmem_vaddr
874907
+ ((uintptr_t) data->allshm_sbuf[i]
875908
- (uintptr_t) sbuf_reg->base.base));
@@ -884,6 +917,8 @@ static inline void register_and_cache(int size, size_t total_dsize, int rank,
884917
rbuf_reg = NULL;
885918
return;
886919
}
920+
update_rcache_reg_hashtable_entry(rbuf_reg, data->xpmem_reg_tracker_ht[i]);
921+
887922
data->xpmem_raddr[i] = (void *) ((uintptr_t) rbuf_reg->xpmem_vaddr
888923
+ ((uintptr_t) data->allshm_rbuf[i]
889924
- (uintptr_t) rbuf_reg->base.base));

0 commit comments

Comments
 (0)