@@ -446,6 +446,60 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void
446
446
return OMPI_SUCCESS ;
447
447
}
448
448
449
+ /**
450
+ * @brief gather information of module state and module state handle inside a shared comm
451
+ *
452
+ * @param module[in] ompi osc rdma module
453
+ * @param peer_state_array
454
+ */
455
+ static int gather_peer_state_and_handle (ompi_osc_rdma_module_t * module )
456
+ {
457
+ int ret , handle_size , comm_size ;
458
+
459
+ comm_size = ompi_comm_size (module -> comm );
460
+
461
+ module -> peer_state_array = calloc (comm_size , sizeof (uintptr_t ));
462
+ if (NULL == module -> peer_state_array ) {
463
+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_ERROR , "failed to allocate memory for module state array!" );
464
+ return OMPI_ERR_OUT_OF_RESOURCE ;
465
+ }
466
+
467
+ ret = module -> comm -> c_coll -> coll_allgather (& module -> state , sizeof (uintptr_t ), MPI_BYTE ,
468
+ module -> peer_state_array , sizeof (uintptr_t ), MPI_BYTE ,
469
+ module -> comm , module -> comm -> c_coll -> coll_allgather_module );
470
+ if (OMPI_SUCCESS != ret ) {
471
+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_ERROR , "module state allgather failed with ompi error code %d" , ret );
472
+ return ret ;
473
+ }
474
+
475
+ if (module -> use_memory_registration ) {
476
+ handle_size = module -> selected_btls [0 ]-> btl_registration_handle_size ;
477
+ module -> peer_state_handle_array = calloc (comm_size , handle_size );
478
+ if (NULL == module -> peer_state_handle_array ) {
479
+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_ERROR , "failed to allocate memory for module state handle array!" );
480
+ return OMPI_ERR_OUT_OF_RESOURCE ;
481
+ }
482
+
483
+ ret = ompi_osc_rdma_register (module , MCA_BTL_ENDPOINT_ANY , module -> state , module -> state_size ,
484
+ MCA_BTL_REG_FLAG_ACCESS_ANY , & module -> state_handle );
485
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
486
+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_ERROR , "module state memory registration failed with ompi error code %d" , ret );
487
+ return ret ;
488
+ }
489
+
490
+ ret = module -> comm -> c_coll -> coll_allgather (module -> state_handle , handle_size , MPI_BYTE ,
491
+ module -> peer_state_handle_array , handle_size , MPI_BYTE ,
492
+ module -> comm , module -> comm -> c_coll -> coll_allgather_module );
493
+ if (OMPI_SUCCESS != ret ) {
494
+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_ERROR , "module state allgather failed with ompi error code %d" , ret );
495
+ return ret ;
496
+ }
497
+ }
498
+
499
+ return 0 ;
500
+ }
501
+
502
+
449
503
static int allocate_state_single (ompi_osc_rdma_module_t * module , void * * base , size_t size )
450
504
{
451
505
size_t total_size , local_rank_array_size , leader_peer_data_size ;
@@ -491,20 +545,19 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
491
545
* base = (void * ) ((intptr_t ) module -> node_comm_info + leader_peer_data_size );
492
546
}
493
547
494
- /* just go ahead and register the whole segment */
495
- ret = ompi_osc_rdma_register (module , MCA_BTL_ENDPOINT_ANY , module -> rank_array , total_size ,
496
- MCA_BTL_REG_FLAG_ACCESS_ANY , & module -> state_handle );
497
- if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
498
- return ret ;
499
- }
500
-
501
548
if (MPI_WIN_FLAVOR_DYNAMIC != module -> flavor ) {
502
549
ret = ompi_osc_rdma_initialize_region (module , base , size );
503
550
if (OMPI_SUCCESS != ret ) {
504
551
return ret ;
505
552
}
506
553
}
507
554
555
+ ret = gather_peer_state_and_handle (module );
556
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
557
+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_ERROR , "failed to create shared memory segment" );
558
+ return ret ;
559
+ }
560
+
508
561
ret = ompi_osc_rdma_new_peer (module , my_rank , & my_peer );
509
562
if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
510
563
return ret ;
@@ -711,16 +764,6 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
711
764
if (0 == local_rank ) {
712
765
/* unlink the shared memory backing file */
713
766
opal_shmem_unlink (& module -> seg_ds );
714
- /* just go ahead and register the whole segment */
715
- ret = ompi_osc_rdma_register (module , MCA_BTL_ENDPOINT_ANY , module -> segment_base , total_size ,
716
- MCA_BTL_REG_FLAG_ACCESS_ANY , & module -> state_handle );
717
- if (OPAL_LIKELY (OMPI_SUCCESS == ret )) {
718
- state_region -> base = (intptr_t ) module -> segment_base ;
719
- if (module -> state_handle ) {
720
- memcpy (state_region -> btl_handle_data , module -> state_handle ,
721
- module -> selected_btls [0 ]-> btl_registration_handle_size );
722
- }
723
- }
724
767
}
725
768
726
769
/* synchronization to make sure memory is registered */
@@ -749,6 +792,11 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
749
792
break ;
750
793
}
751
794
795
+ ret = gather_peer_state_and_handle (module );
796
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
797
+ break ;
798
+ }
799
+
752
800
offset = data_base ;
753
801
ompi_osc_rdma_peer_t * local_leader ;
754
802
for (int i = 0 ; i < local_size ; ++ i ) {
@@ -777,18 +825,15 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
777
825
peer -> state = (osc_rdma_counter_t ) peer_state ;
778
826
peer -> state_endpoint = NULL ;
779
827
} else {
780
- /* use my endpoint handle to modify the peer's state */
781
828
if (module -> use_memory_registration ) {
782
- peer -> state_handle = (mca_btl_base_registration_handle_t * ) state_region -> btl_handle_data ;
783
- }
784
- peer -> state = (osc_rdma_counter_t ) ((uintptr_t ) state_region -> base + state_base + module -> state_size * i );
785
- if (i == 0 ) {
786
- peer -> state_endpoint = peer -> data_endpoint ;
787
- peer -> state_btl_index = peer -> data_btl_index ;
788
- } else {
789
- peer -> state_endpoint = local_leader -> state_endpoint ;
790
- peer -> state_btl_index = local_leader -> state_btl_index ;
829
+ assert (module -> peer_state_handle_array );
830
+ peer -> state_handle = (mca_btl_base_registration_handle_t * )(module -> peer_state_handle_array + peer_rank * module -> selected_btls [0 ]-> btl_registration_handle_size );
791
831
}
832
+
833
+ assert (NULL != module -> peer_state_array );
834
+ peer -> state = (osc_rdma_counter_t )module -> peer_state_array [peer_rank ];
835
+ peer -> state_endpoint = peer -> data_endpoint ;
836
+ peer -> state_btl_index = peer -> data_btl_index ;
792
837
}
793
838
794
839
if (my_rank == peer_rank ) {
0 commit comments