Closed
Description
Hi.
When I run a program in which one rank sends and another rank receives a large message, btl_register_mem
is called twice on each rank with same base
and size
. This behavior can be seen on both master branch and v2.x branch (since BTL interface change by @hjelmn?). Is this an intended behavior?
Here's the call stack when mca_btl_openib_register_mem
is called using openib BTL. I think the first call is for the RDMA Direct protocol and the second call is for the RDMA Pipeline protocol.
v2.x branch
sender
first call:
#0 mca_btl_openib_register_mem (btl=0x73c470, endpoint=0x8450f0, base=0x7fffffff1d60, size=50000, flags=2)
at opal/mca/btl/openib/btl_openib.c:1926
#1 0x00007fffee02955a in mca_pml_ob1_rdma_btls (bml_endpoint=0x85a6d0, base=0x7fffffff1d60 "", size=50000, rdma_btls=0x78f248)
at ompi/mca/pml/ob1/pml_ob1_rdma.c:88
#2 0x00007fffee0269b4 in mca_pml_ob1_send_request_start_btl (sendreq=0x78ef00, bml_btl=0x7acf50)
at ompi/mca/pml/ob1/pml_ob1_sendreq.h:424
#3 0x00007fffee026b35 in mca_pml_ob1_send_request_start_seq (sendreq=0x78ef00, endpoint=0x85a6d0, seqn=1)
at ompi/mca/pml/ob1/pml_ob1_sendreq.h:470
#4 0x00007fffee027b3e in mca_pml_ob1_send (buf=0x7fffffff1d60, count=50000, datatype=0x601880, dst=1, tag=0, sendmode=MCA_PML_BASE_SEND_STANDARD, comm=0x601280)
at ompi/mca/pml/ob1/pml_ob1_isend.c:259
#5 0x00007ffff7d233b7 in PMPI_Send (buf=0x7fffffff1d60, count=50000, type=0x601880, dest=1, tag=0, comm=0x601280)
at psend.c:77
#6 0x00000000004009cf in main ()
second call:
#0 mca_btl_openib_register_mem (btl=0x73c470, endpoint=0x8450f0, base=0x7fffffff1d60, size=50000, flags=0)
at opal/mca/btl/openib/btl_openib.c:1926
#1 0x00007fffee032700 in mca_bml_base_register_mem (bml_btl=0x7ac200, base=0x7fffffff1d60, size=50000, flags=0, handle=0x78c4d0)
at ompi/mca/bml/bml.h:352
#2 0x00007fffee0355a7 in mca_pml_ob1_send_request_put_frag (frag=0x78c400)
at ompi/mca/pml/ob1/pml_ob1_sendreq.c:1158
#3 0x00007fffee035897 in mca_pml_ob1_send_request_put (sendreq=0x78ef00, btl=0x73c470, hdr=0x7a4f04)
at ompi/mca/pml/ob1/pml_ob1_sendreq.c:1236
#4 0x00007fffee02be7d in mca_pml_ob1_recv_frag_callback_put (btl=0x73c470, tag=72 'H', des=0x7a2e80, cbdata=0x0)
at ompi/mca/pml/ob1/pml_ob1_recvfrag.c:414
#5 0x00007fffee66ee3e in btl_openib_handle_incoming (openib_btl=0x73c470, ep=0x8450f0, frag=0x7a2e80, byte_len=68)
at opal/mca/btl/openib/btl_openib_component.c:3080
#6 0x00007fffee66fb28 in handle_wc (device=0x7334d0, cq=0, wc=0x7ffffffeea30)
at opal/mca/btl/openib/btl_openib_component.c:3501
#7 0x00007fffee67013d in poll_device (device=0x7334d0, count=1)
at opal/mca/btl/openib/btl_openib_component.c:3628
#8 0x00007fffee67058a in progress_one_device (device=0x7334d0)
at opal/mca/btl/openib/btl_openib_component.c:3738
#9 0x00007fffee670629 in btl_openib_component_progress ()
at opal/mca/btl/openib/btl_openib_component.c:3762
#10 0x00007ffff769610a in opal_progress ()
at opal/runtime/opal_progress.c:225
#11 0x00007fffee025a55 in ompi_request_wait_completion (req=0x78ef00)
at ompi/request/request.h:397
#12 0x00007fffee027b5c in mca_pml_ob1_send (buf=0x7fffffff1d60, count=50000, datatype=0x601880, dst=1, tag=0, sendmode=MCA_PML_BASE_SEND_STANDARD, comm=0x601280)
at ompi/mca/pml/ob1/pml_ob1_isend.c:261
#13 0x00007ffff7d233b7 in PMPI_Send (buf=0x7fffffff1d60, count=50000, type=0x601880, dest=1, tag=0, comm=0x601280)
at psend.c:77
#14 0x00000000004009cf in main ()
receiver
first call:
#0 mca_btl_openib_register_mem (btl=0x73c430, endpoint=0x7fffe8012670, base=0x7fffffff1d60, size=50000, flags=2)
at opal/mca/btl/openib/btl_openib.c:1926
#1 0x00007fffee02955a in mca_pml_ob1_rdma_btls (bml_endpoint=0x86fd10, base=0x7fffffff1d60 'c' <repeats 200 times>..., size=50000, rdma_btls=0x790158)
at ompi/mca/pml/ob1/pml_ob1_rdma.c:88
#2 0x00007fffee02f2ce in mca_pml_ob1_recv_request_ack (recvreq=0x78fe80, hdr=0x7a4f04, bytes_received=0)
at ompi/mca/pml/ob1/pml_ob1_recvreq.c:279
#3 0x00007fffee030198 in mca_pml_ob1_recv_request_progress_rndv (recvreq=0x78fe80, btl=0x73c430, segments=0x7a2f08, num_segments=1)
at ompi/mca/pml/ob1/pml_ob1_recvreq.c:791
#4 0x00007fffee02c762 in mca_pml_ob1_recv_frag_match (btl=0x73c430, hdr=0x7a4f04, segments=0x7a2f08, num_segments=1, type=66)
at ompi/mca/pml/ob1/pml_ob1_recvfrag.c:715
#5 0x00007fffee02baf3 in mca_pml_ob1_recv_frag_callback_rndv (btl=0x73c430, tag=66 'B', des=0x7a2e80, cbdata=0x0)
at ompi/mca/pml/ob1/pml_ob1_recvfrag.c:268
#6 0x00007fffee66ee3e in btl_openib_handle_incoming (openib_btl=0x73c430, ep=0x7fffe8012670, frag=0x7a2e80, byte_len=36)
at opal/mca/btl/openib/btl_openib_component.c:3080
#7 0x00007fffee66fb28 in handle_wc (device=0x733490, cq=0, wc=0x7ffffffeea90)
at opal/mca/btl/openib/btl_openib_component.c:3501
#8 0x00007fffee67013d in poll_device (device=0x733490, count=1)
at opal/mca/btl/openib/btl_openib_component.c:3628
#9 0x00007fffee67058a in progress_one_device (device=0x733490)
at opal/mca/btl/openib/btl_openib_component.c:3738
#10 0x00007fffee670629 in btl_openib_component_progress ()
at opal/mca/btl/openib/btl_openib_component.c:3762
#11 0x00007ffff7696172 in opal_progress ()
at opal/runtime/opal_progress.c:231
#12 0x00007fffee022a7a in ompi_request_wait_completion (req=0x78fe80)
at ompi/request/request.h:397
#13 0x00007fffee023c9a in mca_pml_ob1_recv (addr=0x7fffffff1d60, count=50000, datatype=0x601880, src=0, tag=0, comm=0x601280, status=0x0)
at ompi/mca/pml/ob1/pml_ob1_irecv.c:129
#14 0x00007ffff7d1a107 in PMPI_Recv (buf=0x7fffffff1d60, count=50000, type=0x601880, source=0, tag=0, comm=0x601280, status=0x0)
at precv.c:77
#15 0x0000000000400a14 in main ()
second call:
#0 mca_btl_openib_register_mem (btl=0x73c430, endpoint=0x7fffe8012670, base=0x7fffffff1d60, size=50000, flags=4)
at opal/mca/btl/openib/btl_openib.c:1926
#1 0x00007fffee02d9ff in mca_bml_base_register_mem (bml_btl=0x86fef0, base=0x7fffffff1d60, size=50000, flags=4, handle=0x78c450)
at ompi/mca/bml/bml.h:352
#2 0x00007fffee0308f2 in mca_pml_ob1_recv_request_schedule_once (recvreq=0x78fe80, start_bml_btl=0x0)
at ompi/mca/pml/ob1/pml_ob1_recvreq.c:1010
#3 0x00007fffee02e858 in mca_pml_ob1_recv_request_schedule_exclusive (req=0x78fe80, start_bml_btl=0x0)
at ompi/mca/pml/ob1/pml_ob1_recvreq.h:388
#4 0x00007fffee02e8c4 in mca_pml_ob1_recv_request_schedule (req=0x78fe80, start_bml_btl=0x0)
at ompi/mca/pml/ob1/pml_ob1_recvreq.h:406
#5 0x00007fffee03033b in mca_pml_ob1_recv_request_progress_rndv (recvreq=0x78fe80, btl=0x73c430, segments=0x7a2f08, num_segments=1)
at ompi/mca/pml/ob1/pml_ob1_recvreq.c:823
#6 0x00007fffee02c762 in mca_pml_ob1_recv_frag_match (btl=0x73c430, hdr=0x7a4f04, segments=0x7a2f08, num_segments=1, type=66)
at ompi/mca/pml/ob1/pml_ob1_recvfrag.c:715
#7 0x00007fffee02baf3 in mca_pml_ob1_recv_frag_callback_rndv (btl=0x73c430, tag=66 'B', des=0x7a2e80, cbdata=0x0)
at ompi/mca/pml/ob1/pml_ob1_recvfrag.c:268
#8 0x00007fffee66ee3e in btl_openib_handle_incoming (openib_btl=0x73c430, ep=0x7fffe8012670, frag=0x7a2e80, byte_len=36)
at opal/mca/btl/openib/btl_openib_component.c:3080
#9 0x00007fffee66fb28 in handle_wc (device=0x733490, cq=0, wc=0x7ffffffeea90)
at opal/mca/btl/openib/btl_openib_component.c:3501
#10 0x00007fffee67013d in poll_device (device=0x733490, count=1)
at opal/mca/btl/openib/btl_openib_component.c:3628
#11 0x00007fffee67058a in progress_one_device (device=0x733490)
at opal/mca/btl/openib/btl_openib_component.c:3738
#12 0x00007fffee670629 in btl_openib_component_progress ()
at opal/mca/btl/openib/btl_openib_component.c:3762
#13 0x00007ffff7696172 in opal_progress ()
at opal/runtime/opal_progress.c:231
#14 0x00007fffee022a7a in ompi_request_wait_completion (req=0x78fe80)
at ompi/request/request.h:397
#15 0x00007fffee023c9a in mca_pml_ob1_recv (addr=0x7fffffff1d60, count=50000, datatype=0x601880, src=0, tag=0, comm=0x601280, status=0x0)
at ompi/mca/pml/ob1/pml_ob1_irecv.c:129
#16 0x00007ffff7d1a107 in PMPI_Recv (buf=0x7fffffff1d60, count=50000, type=0x601880, source=0, tag=0, comm=0x601280, status=0x0)
at precv.c:77
#17 0x0000000000400a14 in main ()
master branch
sender
first call:
#0 mca_btl_openib_register_mem (btl=0x74cf20, endpoint=0x86c160, base=0x7fffffff1c90, size=50000, flags=2)
at opal/mca/btl/openib/btl_openib.c:1930
#1 0x00007fffed1d1792 in mca_pml_ob1_rdma_btls (bml_endpoint=0x7be010, base=0x7fffffff1c90 "", size=50000, rdma_btls=0x7a0f48)
at ompi/mca/pml/ob1/pml_ob1_rdma.c:88
#2 0x00007fffed1ce7c2 in mca_pml_ob1_send_request_start_btl (sendreq=0x7a0c00, bml_btl=0x86c7f0)
at ompi/mca/pml/ob1/pml_ob1_sendreq.h:420
#3 0x00007fffed1ce975 in mca_pml_ob1_send_request_start_seq (sendreq=0x7a0c00, endpoint=0x7be010, seqn=1)
at ompi/mca/pml/ob1/pml_ob1_sendreq.h:466
#4 0x00007fffed1cf99c in mca_pml_ob1_send (buf=0x7fffffff1c90, count=50000, datatype=0x601880, dst=1, tag=0, sendmode=MCA_PML_BASE_SEND_STANDARD, comm=0x601280)
at ompi/mca/pml/ob1/pml_ob1_isend.c:260
#5 0x00007ffff7d04da0 in PMPI_Send (buf=0x7fffffff1c90, count=50000,
type=0x601880, dest=1, tag=0, comm=0x601280) at psend.c:78
#6 0x00000000004009cf in main ()
second call:
#0 mca_btl_openib_register_mem (btl=0x74cf20, endpoint=0x86c160, base=0x7fffffff1c90, size=50000, flags=0)
at opal/mca/btl/openib/btl_openib.c:1930
#1 0x00007fffed1dc494 in mca_bml_base_register_mem (bml_btl=0x86c710, base=0x7fffffff1c90, size=50000, flags=0, handle=0x79e4d0)
at ompi/mca/bml/bml.h:354
#2 0x00007fffed1df4b5 in mca_pml_ob1_send_request_put_frag (frag=0x79e400)
at ompi/mca/pml/ob1/pml_ob1_sendreq.c:1160
#3 0x00007fffed1df7a5 in mca_pml_ob1_send_request_put (sendreq=0x7a0c00, btl=0x74cf20, hdr=0x7b6f04)
at ompi/mca/pml/ob1/pml_ob1_sendreq.c:1238
#4 0x00007fffed1d478b in mca_pml_ob1_recv_frag_callback_put (btl=0x74cf20, tag=72 'H', des=0x7b4e80, cbdata=0x0)
at ompi/mca/pml/ob1/pml_ob1_recvfrag.c:414
#5 0x00007fffeda23b61 in btl_openib_handle_incoming (openib_btl=0x74cf20, ep=0x86c160, frag=0x7b4e80, byte_len=68)
at opal/mca/btl/openib/btl_openib_component.c:3062
#6 0x00007fffeda2491a in handle_wc (device=0x740b70, cq=0, wc=0x7ffffffee960)
at opal/mca/btl/openib/btl_openib_component.c:3483
#7 0x00007fffeda24f2f in poll_device (device=0x740b70, count=1)
at opal/mca/btl/openib/btl_openib_component.c:3610
#8 0x00007fffeda253e3 in progress_one_device (device=0x740b70)
at opal/mca/btl/openib/btl_openib_component.c:3720
#9 0x00007fffeda25482 in btl_openib_component_progress ()
at opal/mca/btl/openib/btl_openib_component.c:3744
#10 0x00007ffff762b80d in opal_progress ()
at opal/runtime/opal_progress.c:221
#11 0x00007fffed1cd665 in ompi_request_wait_completion (req=0x7a0c00)
at ompi/request/request.h:397
#12 0x00007fffed1cf9ba in mca_pml_ob1_send (buf=0x7fffffff1c90, count=50000, datatype=0x601880, dst=1, tag=0, sendmode=MCA_PML_BASE_SEND_STANDARD, comm=0x601280)
at ompi/mca/pml/ob1/pml_ob1_isend.c:262
#13 0x00007ffff7d04da0 in PMPI_Send (buf=0x7fffffff1c90, count=50000, type=0x601880, dest=1, tag=0, comm=0x601280)
at psend.c:78
#14 0x00000000004009cf in main ()
receiver
first call:
#0 mca_btl_openib_register_mem (btl=0x74cf30, endpoint=0x7ffff00123b0, base=0x7fffffff1c90, size=50000, flags=2)
at opal/mca/btl/openib/btl_openib.c:1930
#1 0x00007fffed1d1792 in mca_pml_ob1_rdma_btls (bml_endpoint=0x881500, base=0x7fffffff1c90 'c' <repeats 200 times>..., size=50000, rdma_btls=0x7a1e58)
at ompi/mca/pml/ob1/pml_ob1_rdma.c:88
#2 0x00007fffed1d852b in mca_pml_ob1_recv_request_ack (recvreq=0x7a1b80, hdr=0x7b6f04, bytes_received=0)
at ompi/mca/pml/ob1/pml_ob1_recvreq.c:277
#3 0x00007fffed1d9567 in mca_pml_ob1_recv_request_progress_rndv (recvreq=0x7a1b80, btl=0x74cf30, segments=0x7b4f08, num_segments=1)
at ompi/mca/pml/ob1/pml_ob1_recvreq.c:789
#4 0x00007fffed1d50b8 in mca_pml_ob1_recv_frag_match (btl=0x74cf30, hdr=0x7b6f04, segments=0x7b4f08, num_segments=1, type=66)
at ompi/mca/pml/ob1/pml_ob1_recvfrag.c:715
#5 0x00007fffed1d4401 in mca_pml_ob1_recv_frag_callback_rndv (btl=0x74cf30, tag=66 'B', des=0x7b4e80, cbdata=0x0)
at ompi/mca/pml/ob1/pml_ob1_recvfrag.c:268
#6 0x00007fffeda23b61 in btl_openib_handle_incoming (openib_btl=0x74cf30, ep=0x7ffff00123b0, frag=0x7b4e80, byte_len=36)
at opal/mca/btl/openib/btl_openib_component.c:3062
#7 0x00007fffeda2491a in handle_wc (device=0x740b80, cq=0, wc=0x7ffffffee9c0)
at opal/mca/btl/openib/btl_openib_component.c:3483
#8 0x00007fffeda24f2f in poll_device (device=0x740b80, count=1)
at opal/mca/btl/openib/btl_openib_component.c:3610
#9 0x00007fffeda253e3 in progress_one_device (device=0x740b80)
at opal/mca/btl/openib/btl_openib_component.c:3720
#10 0x00007fffeda25482 in btl_openib_component_progress ()
at opal/mca/btl/openib/btl_openib_component.c:3744
#11 0x00007ffff762b86a in opal_progress ()
at opal/runtime/opal_progress.c:227
#12 0x00007fffed1c9e92 in ompi_request_wait_completion (req=0x7a1b80)
at ompi/request/request.h:397
#13 0x00007fffed1cb122 in mca_pml_ob1_recv (addr=0x7fffffff1c90, count=50000, datatype=0x601880, src=0, tag=0, comm=0x601280, status=0x0)
at ompi/mca/pml/ob1/pml_ob1_irecv.c:135
#14 0x00007ffff7cfa514 in PMPI_Recv (buf=0x7fffffff1c90, count=50000, type=0x601880, source=0, tag=0, comm=0x601280, status=0x0)
at precv.c:79
#15 0x0000000000400a14 in main ()
second call:
#0 mca_btl_openib_register_mem (btl=0x74cf30, endpoint=0x7ffff00123b0, base=0x7fffffff1c90, size=50000, flags=4)
at opal/mca/btl/openib/btl_openib.c:1930
#1 0x00007fffed1d6a68 in mca_bml_base_register_mem (bml_btl=0x8816a0, base=0x7fffffff1c90, size=50000, flags=4, handle=0x79e4d0)
at ompi/mca/bml/bml.h:354
#2 0x00007fffed1d9df1 in mca_pml_ob1_recv_request_schedule_once (recvreq=0x7a1b80, start_bml_btl=0x0)
at ompi/mca/pml/ob1/pml_ob1_recvreq.c:1008
#3 0x00007fffed1d798a in mca_pml_ob1_recv_request_schedule_exclusive (req=0x7a1b80, start_bml_btl=0x0)
at ompi/mca/pml/ob1/pml_ob1_recvreq.h:388
#4 0x00007fffed1d79f6 in mca_pml_ob1_recv_request_schedule (req=0x7a1b80, start_bml_btl=0x0)
at ompi/mca/pml/ob1/pml_ob1_recvreq.h:406
#5 0x00007fffed1d9758 in mca_pml_ob1_recv_request_progress_rndv (recvreq=0x7a1b80, btl=0x74cf30, segments=0x7b4f08, num_segments=1)
at ompi/mca/pml/ob1/pml_ob1_recvreq.c:821
#6 0x00007fffed1d50b8 in mca_pml_ob1_recv_frag_match (btl=0x74cf30, hdr=0x7b6f04, segments=0x7b4f08, num_segments=1, type=66)
at ompi/mca/pml/ob1/pml_ob1_recvfrag.c:715
#7 0x00007fffed1d4401 in mca_pml_ob1_recv_frag_callback_rndv (btl=0x74cf30, tag=66 'B', des=0x7b4e80, cbdata=0x0)
at ompi/mca/pml/ob1/pml_ob1_recvfrag.c:268
#8 0x00007fffeda23b61 in btl_openib_handle_incoming (openib_btl=0x74cf30, ep=0x7ffff00123b0, frag=0x7b4e80, byte_len=36)
at opal/mca/btl/openib/btl_openib_component.c:3062
#9 0x00007fffeda2491a in handle_wc (device=0x740b80, cq=0, wc=0x7ffffffee9c0)
at opal/mca/btl/openib/btl_openib_component.c:3483
#10 0x00007fffeda24f2f in poll_device (device=0x740b80, count=1)
at opal/mca/btl/openib/btl_openib_component.c:3610
#11 0x00007fffeda253e3 in progress_one_device (device=0x740b80)
at opal/mca/btl/openib/btl_openib_component.c:3720
#12 0x00007fffeda25482 in btl_openib_component_progress ()
at opal/mca/btl/openib/btl_openib_component.c:3744
#13 0x00007ffff762b86a in opal_progress ()
at opal/runtime/opal_progress.c:227
#14 0x00007fffed1c9e92 in ompi_request_wait_completion (req=0x7a1b80)
at ompi/request/request.h:397
#15 0x00007fffed1cb122 in mca_pml_ob1_recv (addr=0x7fffffff1c90, count=50000, datatype=0x601880, src=0, tag=0, comm=0x601280, status=0x0)
at ompi/mca/pml/ob1/pml_ob1_irecv.c:135
#16 0x00007ffff7cfa514 in PMPI_Recv (buf=0x7fffffff1c90, count=50000, type=0x601880, source=0, tag=0, comm=0x601280, status=0x0)
at precv.c:79
#17 0x0000000000400a14 in main ()
Metadata
Metadata
Assignees
Labels
No labels