open-mpi
diff --git a/ompi/mca/mtl/mtl.h b/ompi/mca/mtl/mtl.h
Lines changed: 4 additions & 0 deletions
diff --git a/ompi/mca/mtl/psm2/mtl_psm2.c b/ompi/mca/mtl/psm2/mtl_psm2.c
Lines changed: 13 additions & 1 deletion
diff --git a/ompi/mca/pml/cm/pml_cm.h b/ompi/mca/pml/cm/pml_cm.h
Lines changed: 33 additions & 11 deletions
diff --git a/ompi/mca/pml/cm/pml_cm_recvreq.h b/ompi/mca/pml/cm/pml_cm_recvreq.h
Lines changed: 15 additions & 6 deletions
diff --git a/ompi/mca/pml/cm/pml_cm_request.h b/ompi/mca/pml/cm/pml_cm_request.h
Lines changed: 17 additions & 0 deletions
@@ -5,6 +5,7 @@
  * Copyright (c) 2012      Sandia National Laboratories.  All rights reserved.
  * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
  *                         reserved.
+ * Copyright (c) 2017      Intel, Inc. All rights reserved
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -61,6 +62,9 @@ typedef struct mca_mtl_request_t mca_mtl_request_t;
  * MTL module flags
  */
 #define MCA_MTL_BASE_FLAG_REQUIRE_WORLD 0x00000001
+#if OPAL_CUDA_SUPPORT /* set by an MTL; pml/cm then skips CUDA convertor init for contiguous data */
+#define MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE 0x00000002
+#endif /* OPAL_CUDA_SUPPORT */
 
 /**
  * Initialization routine for MTL component
 
@@ -11,7 +11,7 @@
  * Copyright (c) 2004-2006 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2006      QLogic Corporation. All rights reserved.
- * Copyright (c) 2013-2015 Intel, Inc. All rights reserved
+ * Copyright (c) 2013-2017 Intel, Inc. All rights reserved
  * Copyright (c) 2014      Los Alamos National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2016      Research Organization for Information Science
@@ -100,6 +100,9 @@ int ompi_mtl_psm2_module_init(int local_rank, int num_local_procs) {
     char *generated_key;
     char env_string[256];
     int rc;
+#if OPAL_CUDA_SUPPORT
+    char *cuda_env;
+#endif
 
     generated_key = getenv(OPAL_MCA_PREFIX"orte_precondition_transports");
     memset(uu, 0, sizeof(psm2_uuid_t));
@@ -173,6 +176,15 @@ int ompi_mtl_psm2_module_init(int local_rank, int num_local_procs) {
     /* register the psm2 progress function */
     opal_progress_register(ompi_mtl_psm2_progress);
 
+#if OPAL_CUDA_SUPPORT
+    ompi_mtl_psm2.super.mtl_flags |= MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE;
+
+    cuda_env = getenv("PSM2_CUDA");
+    if (!cuda_env || ( strcmp(cuda_env, "0") == 0) )
+        opal_output(0, "Warning: If running with device buffers, there is a"
+                    " chance the application might fail. Try setting PSM2_CUDA=1.\n");
+#endif
+
     return OMPI_SUCCESS;
 }
 
 
@@ -6,6 +6,7 @@
  *                         reserved.
  * Copyright (c) 2015      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2017      Intel, Inc. All rights reserved
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -79,6 +80,7 @@ mca_pml_cm_irecv_init(void *addr,
                       struct ompi_request_t **request)
 {
     mca_pml_cm_hvy_recv_request_t *recvreq;
+    uint32_t flags = 0;
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
     ompi_proc_t* ompi_proc;
 #endif
@@ -87,7 +89,7 @@ mca_pml_cm_irecv_init(void *addr,
     if( OPAL_UNLIKELY(NULL == recvreq) ) return OMPI_ERR_OUT_OF_RESOURCE;
 
     MCA_PML_CM_HVY_RECV_REQUEST_INIT(recvreq, ompi_proc, comm, tag, src,
-                                     datatype, addr, count, true);
+                                     datatype, addr, count, flags, true);
 
     *request = (ompi_request_t*) recvreq;
 
@@ -104,6 +106,7 @@ mca_pml_cm_irecv(void *addr,
                  struct ompi_request_t **request)
 {
     int ret;
+    uint32_t flags = 0;
     mca_pml_cm_thin_recv_request_t *recvreq;
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
     ompi_proc_t* ompi_proc = NULL;
@@ -118,7 +121,8 @@ mca_pml_cm_irecv(void *addr,
                                       src,
                                       datatype,
                                       addr,
-                                      count);
+                                      count,
+                                      flags);
 
     MCA_PML_CM_THIN_RECV_REQUEST_START(recvreq, comm, tag, src, ret);
 
@@ -145,6 +149,7 @@ mca_pml_cm_recv(void *addr,
                 ompi_status_public_t * status)
 {
     int ret;
+    uint32_t flags = 0;
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
     ompi_proc_t *ompi_proc;
 #endif
@@ -173,20 +178,24 @@ mca_pml_cm_recv(void *addr,
         ompi_proc = ompi_comm_peer_lookup( comm, src );
     }
 
+    MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);
+
     opal_convertor_copy_and_prepare_for_recv(
 	ompi_proc->super.proc_convertor,
 		&(datatype->super),
 		count,
 		addr,
-		0,
+		flags,
 		&convertor );
 #else
+    MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);
+
     opal_convertor_copy_and_prepare_for_recv(
 	ompi_mpi_local_convertor,
 		&(datatype->super),
 		count,
 		addr,
-		0,
+		flags,
 		&convertor );
 #endif
 
@@ -222,6 +231,7 @@ mca_pml_cm_isend_init(const void* buf,
                         ompi_request_t** request)
 {
     mca_pml_cm_hvy_send_request_t *sendreq;
+    uint32_t flags = 0;
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
     ompi_proc_t* ompi_proc;
 #endif
@@ -230,7 +240,7 @@ mca_pml_cm_isend_init(const void* buf,
     if (OPAL_UNLIKELY(NULL == sendreq)) return OMPI_ERR_OUT_OF_RESOURCE;
 
     MCA_PML_CM_HVY_SEND_REQUEST_INIT(sendreq, ompi_proc, comm, tag, dst,
-                                     datatype, sendmode, true, false, buf, count);
+                                     datatype, sendmode, true, false, buf, count, flags);
 
     /* Work around a leak in start by marking this request as complete. The
      * problem occurred because we do not have a way to differentiate an
@@ -254,6 +264,7 @@ mca_pml_cm_isend(const void* buf,
                    ompi_request_t** request)
 {
     int ret;
+    uint32_t flags = 0;
 
     if(sendmode == MCA_PML_BASE_SEND_BUFFERED ) {
         mca_pml_cm_hvy_send_request_t* sendreq;
@@ -274,7 +285,8 @@ mca_pml_cm_isend(const void* buf,
                                          false,
                                          false,
                                          buf,
-                                         count);
+                                         count,
+                                         flags);
 
         MCA_PML_CM_HVY_SEND_REQUEST_START( sendreq, ret);
 
@@ -296,7 +308,8 @@ mca_pml_cm_isend(const void* buf,
                                           datatype,
                                           sendmode,
                                           buf,
-                                          count);
+                                          count,
+                                          flags);
 
         MCA_PML_CM_THIN_SEND_REQUEST_START(
                                            sendreq,
@@ -324,6 +337,7 @@ mca_pml_cm_send(const void *buf,
                 ompi_communicator_t* comm)
 {
     int ret = OMPI_ERROR;
+    uint32_t flags = 0;
     ompi_proc_t * ompi_proc;
 
     if(sendmode == MCA_PML_BASE_SEND_BUFFERED) {
@@ -342,7 +356,8 @@ mca_pml_cm_send(const void *buf,
                                          false,
                                          false,
                                          buf,
-                                         count);
+                                         count,
+                                         flags);
         MCA_PML_CM_HVY_SEND_REQUEST_START(sendreq, ret);
         if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
             MCA_PML_CM_HVY_SEND_REQUEST_RETURN(sendreq);
@@ -368,9 +383,12 @@ mca_pml_cm_send(const void *buf,
 #endif
 	{
 		ompi_proc = ompi_comm_peer_lookup(comm, dst);
+
+                MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);
+
 		opal_convertor_copy_and_prepare_for_send(
 		ompi_proc->super.proc_convertor,
-			&datatype->super, count, buf, 0,
+			&datatype->super, count, buf, flags,
 			&convertor);
 	}
 
@@ -459,6 +477,7 @@ mca_pml_cm_imrecv(void *buf,
                   struct ompi_request_t **request)
 {
     int ret;
+    uint32_t flags = 0;
     mca_pml_cm_thin_recv_request_t *recvreq;
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
     ompi_proc_t* ompi_proc;
@@ -474,7 +493,8 @@ mca_pml_cm_imrecv(void *buf,
                                       (*message)->peer,
                                       datatype,
                                       buf,
-                                      count);
+                                      count,
+                                      flags);
 
     MCA_PML_CM_THIN_RECV_REQUEST_MATCHED_START(recvreq, message, ret);
 
@@ -491,6 +511,7 @@ mca_pml_cm_mrecv(void *buf,
                  ompi_status_public_t* status)
 {
     int ret;
+    uint32_t flags = 0;
     mca_pml_cm_thin_recv_request_t *recvreq;
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
     ompi_proc_t* ompi_proc;
@@ -506,7 +527,8 @@ mca_pml_cm_mrecv(void *buf,
                                       (*message)->peer,
                                       datatype,
                                       buf,
-                                      count);
+                                      count,
+                                      flags);
 
     MCA_PML_CM_THIN_RECV_REQUEST_MATCHED_START(recvreq,
                                                message, ret);
 
@@ -13,6 +13,7 @@
  * Copyright (c) 2012      Sandia National Laboratories.  All rights reserved.
  * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
  *                         reserved.
+ * Copyright (c) 2017      Intel, Inc. All rights reserved
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -92,7 +93,8 @@ do {                                                                           \
                                            src,                         \
                                            datatype,                    \
                                            addr,                        \
-                                           count )                      \
+                                           count,                       \
+					   flags )                      \
 do {                                                                    \
     OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, false);            \
     (request)->req_base.req_ompi.req_mpi_object.comm = comm;            \
@@ -108,12 +110,13 @@ do {                                                                    \
     } else {                                                            \
         ompi_proc = ompi_comm_peer_lookup( comm, src );                 \
     }                                                                   \
+    MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);       \
     opal_convertor_copy_and_prepare_for_recv(                           \
                                   ompi_proc->super.proc_convertor,      \
                                   &(datatype->super),                   \
                                   count,                                \
                                   addr,                                 \
-                                  0,                                    \
+                                  flags,                                    \
                                   &(request)->req_base.req_convertor ); \
 } while(0)
 #else
@@ -123,7 +126,8 @@ do {                                                                    \
                                            src,                         \
                                            datatype,                    \
                                            addr,                        \
-                                           count )                      \
+                                           count,                       \
+					   flags )                      \
 do {                                                                    \
     OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, false);            \
     (request)->req_base.req_ompi.req_mpi_object.comm = comm;            \
@@ -134,12 +138,13 @@ do {                                                                    \
     OBJ_RETAIN(comm);                                                   \
     OMPI_DATATYPE_RETAIN(datatype);                                     \
                                                                         \
+    MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);       \
     opal_convertor_copy_and_prepare_for_recv(                           \
         ompi_mpi_local_convertor,                                       \
         &(datatype->super),                                             \
         count,                                                          \
         addr,                                                           \
-        0,                                                              \
+        flags,                                                              \
         &(request)->req_base.req_convertor );                           \
 } while(0)
 #endif
@@ -153,6 +158,7 @@ do {                                                                    \
                                           datatype,                     \
                                           addr,                         \
                                           count,                        \
+					  flags,                        \
                                           persistent)                   \
 do {                                                                    \
     OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, persistent);       \
@@ -173,12 +179,13 @@ do {                                                                    \
     } else {                                                            \
         ompi_proc = ompi_comm_peer_lookup( comm, src );                 \
     }                                                                   \
+    MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);       \
     opal_convertor_copy_and_prepare_for_recv(                           \
                                   ompi_proc->super.proc_convertor,      \
                                   &(datatype->super),                   \
                                   count,                                \
                                   addr,                                 \
-                                  0,                                    \
+                                  flags,                                \
                                   &(request)->req_base.req_convertor ); \
  } while(0)
 #else
@@ -190,6 +197,7 @@ do {                                                                    \
                                           datatype,                     \
                                           addr,                         \
                                           count,                        \
+					  flags,                        \
                                           persistent)                   \
 do {                                                                    \
     OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, persistent);       \
@@ -205,12 +213,13 @@ do {                                                                    \
     OBJ_RETAIN(comm);                                                   \
     OMPI_DATATYPE_RETAIN(datatype);                                     \
                                                                         \
+    MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);       \
     opal_convertor_copy_and_prepare_for_recv(                           \
         ompi_mpi_local_convertor,                                       \
         &(datatype->super),                                             \
         count,                                                          \
         addr,                                                           \
-        0,                                                              \
+        flags,                                                              \
         &(request)->req_base.req_convertor );                           \
  } while(0)
 #endif
 
@@ -9,6 +9,7 @@
  *                         University of Stuttgart.  All rights reserved.
  * Copyright (c) 2004-2006 The Regents of the University of California.
  *                         All rights reserved.
+ * Copyright (c) 2017      Intel, Inc. All rights reserved
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -53,4 +54,20 @@ struct mca_pml_cm_request_t {
 typedef struct mca_pml_cm_request_t mca_pml_cm_request_t;
 OBJ_CLASS_DECLARATION(mca_pml_cm_request_t);
 
+/*
+ * Avoid CUDA convertor inits only for contiguous memory and if indicated by
+ * the MTL. For non-contiguous memory, do not skip CUDA convertor init phases.
+ */
+#if OPAL_CUDA_SUPPORT
+#define MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count)            \
+    {                                                                           \
+        if (opal_datatype_is_contiguous_memory_layout(&datatype->super, count)  \
+            && (ompi_mtl->mtl_flags & MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE)) {   \
+            flags |= CONVERTOR_SKIP_CUDA_INIT;                                  \
+        }                                                                       \
+    }
+#else
+#define MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count)
+#endif
+
 #endif