[SYCL] Improve memobj creation for devices w/o host unified memory (#2761)

sergey-semenov · web-flow · commit a4f092417ef9 · 2020-11-18T15:01:38.000+03:00
The aggressive use of USE_HOST_PTR memory object creation option led to
significant performance overhead on devices without host unified memory
support. Furthermore, any discard access modes were ignored during the
allocation leading to unnecessary copy.

This patch makes it so that allocations for such devices are made with
COPY_HOST_PTR instead, and only if necessitated by the access mode.
These allocations are also not linked anymore, since linked allocations
rely on USE_HOST_PTR option.
diff --git a/sycl/include/CL/sycl/detail/sycl_mem_obj_t.hpp b/sycl/include/CL/sycl/detail/sycl_mem_obj_t.hpp
@@ -295,6 +295,10 @@ class __SYCL_EXPORT SYCLMemObjT : public SYCLMemObjI {
   ContextImplPtr getInteropContext() const override { return MInteropContext; }
 
 protected:
+  // An allocateMem helper that determines which host ptr to use
+  void determineHostPtr(const ContextImplPtr &Context, bool InitFromUserData,
+                        void *&HostPtr, bool &HostPtrReadOnly);
+
   // Allocator used for allocation memory on host.
   unique_ptr_class<SYCLMemObjAllocator> MAllocator;
   // Properties passed by user.
diff --git a/sycl/source/detail/buffer_impl.cpp b/sycl/source/detail/buffer_impl.cpp
@@ -16,18 +16,8 @@ namespace sycl {
 namespace detail {
 void *buffer_impl::allocateMem(ContextImplPtr Context, bool InitFromUserData,
                   void *HostPtr, RT::PiEvent &OutEventToWait) {
-  // The host pointer for the allocation can be provided in 2 ways:
-  // 1. Initialize the allocation from user data. Check if the user pointer is
-  // read-only.
-  // 2. Use a HostPtr allocated by the runtime. Assume any such pointer to be
-  // read-write.
   bool HostPtrReadOnly = false;
-  if (InitFromUserData) {
-    assert(!HostPtr && "Cannot init from user data and reuse host ptr provided "
-                       "simultaneously");
-    HostPtr = BaseT::getUserPtr();
-    HostPtrReadOnly = BaseT::MHostPtrReadOnly;
-  }
+  BaseT::determineHostPtr(Context, InitFromUserData, HostPtr, HostPtrReadOnly);
 
   assert(!(nullptr == HostPtr && BaseT::useHostPtr() && Context->is_host()) &&
          "Internal error. Allocating memory on the host "
diff --git a/sycl/source/detail/image_impl.cpp b/sycl/source/detail/image_impl.cpp
@@ -307,18 +307,8 @@ template <int Dimensions>
 void *image_impl<Dimensions>::allocateMem(ContextImplPtr Context,
                                           bool InitFromUserData, void *HostPtr,
                                           RT::PiEvent &OutEventToWait) {
-  // The host pointer for the allocation can be provided in 2 ways:
-  // 1. Initialize the allocation from user data. Check if the user pointer is
-  // read-only.
-  // 2. Use a HostPtr allocated by the runtime. Assume any such pointer to be
-  // read-write.
   bool HostPtrReadOnly = false;
-  if (InitFromUserData) {
-    assert(!HostPtr && "Cannot init from user data and reuse host ptr provided "
-                       "simultaneously");
-    HostPtr = BaseT::getUserPtr();
-    HostPtrReadOnly = BaseT::MHostPtrReadOnly;
-  }
+  BaseT::determineHostPtr(Context, InitFromUserData, HostPtr, HostPtrReadOnly);
 
   RT::PiMemImageDesc Desc = getImageDesc(HostPtr != nullptr);
   assert(checkImageDesc(Desc, Context, HostPtr) &&
diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp
@@ -119,16 +119,38 @@ void *MemoryManager::allocateInteropMemObject(
   return UserPtr;
 }
 
+RT::PiMemFlags getMemObjCreationFlags(const ContextImplPtr &TargetContext,
+                                      void *UserPtr, bool HostPtrReadOnly) {
+  // Create read_write mem object to handle arbitrary uses.
+  RT::PiMemFlags Result = PI_MEM_FLAGS_ACCESS_RW;
+  if (UserPtr) {
+    if (HostPtrReadOnly)
+      Result |= PI_MEM_FLAGS_HOST_PTR_COPY;
+    else {
+      // Create the memory object using the host pointer only if the devices
+      // support host_unified_memory to avoid potential copy overhead.
+      // TODO This check duplicates the one performed in the GraphBuilder during
+      // AllocaCommand creation. This information should be propagated here
+      // instead, which would be a breaking ABI change.
+      bool HostUnifiedMemory = true;
+      for (const device &Device : TargetContext->getDevices())
+        HostUnifiedMemory &=
+            Device.get_info<info::device::host_unified_memory>();
+      Result |= HostUnifiedMemory ? PI_MEM_FLAGS_HOST_PTR_USE
+                                  : PI_MEM_FLAGS_HOST_PTR_COPY;
+    }
+  }
+
+  return Result;
+}
+
 void *MemoryManager::allocateImageObject(ContextImplPtr TargetContext,
                                          void *UserPtr, bool HostPtrReadOnly,
                                          const RT::PiMemImageDesc &Desc,
                                          const RT::PiMemImageFormat &Format,
                                          const sycl::property_list &) {
-  // Create read_write mem object by default to handle arbitrary uses.
-  RT::PiMemFlags CreationFlags = PI_MEM_FLAGS_ACCESS_RW;
-  if (UserPtr)
-    CreationFlags |= HostPtrReadOnly ? PI_MEM_FLAGS_HOST_PTR_COPY
-                                     : PI_MEM_FLAGS_HOST_PTR_USE;
+  RT::PiMemFlags CreationFlags =
+      getMemObjCreationFlags(TargetContext, UserPtr, HostPtrReadOnly);
 
   RT::PiMem NewMem;
   const detail::plugin &Plugin = TargetContext->getPlugin();
@@ -142,13 +164,10 @@ void *
 MemoryManager::allocateBufferObject(ContextImplPtr TargetContext, void *UserPtr,
                                     bool HostPtrReadOnly, const size_t Size,
                                     const sycl::property_list &PropsList) {
-  // Create read_write mem object by default to handle arbitrary uses.
-  RT::PiMemFlags CreationFlags = PI_MEM_FLAGS_ACCESS_RW;
-  if (UserPtr)
-    CreationFlags |= HostPtrReadOnly ? PI_MEM_FLAGS_HOST_PTR_COPY
-                                     : PI_MEM_FLAGS_HOST_PTR_USE;
-  else if (PropsList.has_property<
-               sycl::ext::oneapi::property::buffer::use_pinned_host_memory>())
+  RT::PiMemFlags CreationFlags =
+      getMemObjCreationFlags(TargetContext, UserPtr, HostPtrReadOnly);
+  if (PropsList.has_property<
+          sycl::ext::oneapi::property::buffer::use_pinned_host_memory>())
     CreationFlags |= PI_MEM_FLAGS_HOST_PTR_ALLOC;
 
   RT::PiMem NewMem = nullptr;
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -69,6 +69,22 @@ static bool isAccessModeAllowed(access::mode Required, access::mode Current) {
   return false;
 }
 
+/// Combines two access modes into a single one that allows both.
+static access::mode combineAccessModes(access::mode A, access::mode B) {
+  if (A == B)
+    return A;
+
+  if (A == access::mode::discard_write &&
+      (B == access::mode::discard_read_write || B == access::mode::write))
+    return B;
+
+  if (B == access::mode::discard_write &&
+      (A == access::mode::discard_read_write || A == access::mode::write))
+    return A;
+
+  return access::mode::read_write;
+}
+
 Scheduler::GraphBuilder::GraphBuilder() {
   if (const char *EnvVarCStr = SYCLConfig<SYCL_PRINT_EXECUTION_GRAPH>::get()) {
     std::string GraphPrintOpts(EnvVarCStr);
@@ -574,6 +590,14 @@ Scheduler::GraphBuilder::findAllocaForReq(MemObjRecord *Record,
   return (Record->MAllocaCommands.end() != It) ? *It : nullptr;
 }
 
+static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) {
+  for (const device &Device : Ctx->getDevices()) {
+    if (!Device.get_info<info::device::host_unified_memory>())
+      return false;
+  }
+  return true;
+}
+
 // The function searches for the alloca command matching context and
 // requirement. If none exists, new allocation command is created.
 // Note, creation of new allocation command can lead to the current context
@@ -603,8 +627,18 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
                                 Req->MMemoryRange, access::mode::read_write,
                                 Req->MSYCLMemObj, Req->MDims, Req->MElemSize,
                                 0 /*ReMOffsetInBytes*/, false /*MIsSubBuffer*/);
-      // Can reuse user data for the first allocation
-      const bool InitFromUserData = Record->MAllocaCommands.empty();
+      // Can reuse user data for the first allocation. Do so if host unified
+      // memory is supported regardless of the access mode (the pointer will be
+      // reused) or if it's not and the access mode is not discard (the pointer
+      // will be copied).
+      // TODO the case where the first alloca is made with a discard mode and
+      // the user pointer is read-only is still not handled: it leads to
+      // unnecessary copy on devices with unified host memory support.
+      const bool InitFromUserData =
+          Record->MAllocaCommands.empty() &&
+          (checkHostUnifiedMemory(Queue->getContextImplPtr()) ||
+           (Req->MAccessMode != access::mode::discard_write &&
+            Req->MAccessMode != access::mode::discard_read_write));
 
       AllocaCommandBase *LinkedAllocaCmd = nullptr;
       // If it is not the first allocation, try to setup a link
@@ -617,13 +651,22 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
         // "not" current allocation, but it will require memory copy.
         // Can setup link between cl and host allocations only
         if (Queue->is_host() != Record->MCurContext->is_host()) {
-
-          AllocaCommandBase *LinkedAllocaCmdCand =
-              findAllocaForReq(Record, Req, Record->MCurContext);
-
-          // Cannot setup link if candidate is linked already
-          if (LinkedAllocaCmdCand && !LinkedAllocaCmdCand->MLinkedAllocaCmd)
-            LinkedAllocaCmd = LinkedAllocaCmdCand;
+          // Linked commands assume that the host allocation is reused by the
+          // plugin runtime and that can lead to unnecessary copy overhead on
+          // devices that do not support host unified memory. Do not link the
+          // allocations in this case.
+          const ContextImplPtr &NonHostCtx = Queue->is_host()
+                                                 ? Record->MCurContext
+                                                 : Queue->getContextImplPtr();
+          if (checkHostUnifiedMemory(NonHostCtx)) {
+            AllocaCommandBase *LinkedAllocaCmdCand =
+                findAllocaForReq(Record, Req, Record->MCurContext);
+
+            // Cannot setup link if candidate is linked already
+            if (LinkedAllocaCmdCand && !LinkedAllocaCmdCand->MLinkedAllocaCmd) {
+              LinkedAllocaCmd = LinkedAllocaCmdCand;
+            }
+          }
         }
 
       AllocaCmd =
@@ -732,10 +775,30 @@ static bool isInteropHostTask(const std::unique_ptr<ExecCGCommand> &Cmd) {
   return HT.MHostTask->isInteropTask();
 }
 
+static void combineAccessModesOfReqs(std::vector<Requirement *> &Reqs) {
+  std::unordered_map<SYCLMemObjI *, access::mode> CombinedModes;
+  bool HasDuplicateMemObjects = false;
+  for (const Requirement *Req : Reqs) {
+    auto Result = CombinedModes.insert(
+        std::make_pair(Req->MSYCLMemObj, Req->MAccessMode));
+    if (!Result.second) {
+      Result.first->second =
+          combineAccessModes(Result.first->second, Req->MAccessMode);
+      HasDuplicateMemObjects = true;
+    }
+  }
+
+  if (!HasDuplicateMemObjects)
+    return;
+  for (Requirement *Req : Reqs) {
+    Req->MAccessMode = CombinedModes[Req->MSYCLMemObj];
+  }
+}
+
 Command *
 Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
                                QueueImplPtr Queue) {
-  const std::vector<Requirement *> &Reqs = CommandGroup->MRequirements;
+  std::vector<Requirement *> &Reqs = CommandGroup->MRequirements;
   const std::vector<detail::EventImplPtr> &Events = CommandGroup->MEvents;
   const CG::CGTYPE CGType = CommandGroup->getType();
 
@@ -747,6 +810,10 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
   if (MPrintOptionsArray[BeforeAddCG])
     printGraphAsDot("before_addCG");
 
+  // If there are multiple requirements for the same memory object, its
+  // AllocaCommand creation will be dependent on the access mode of the first
+  // requirement. Combine these access modes to take all of them into account.
+  combineAccessModesOfReqs(Reqs);
   for (Requirement *Req : Reqs) {
     MemObjRecord *Record = nullptr;
     AllocaCommandBase *AllocaCmd = nullptr;
diff --git a/sycl/source/detail/sycl_mem_obj_t.cpp b/sycl/source/detail/sycl_mem_obj_t.cpp
@@ -99,6 +99,35 @@ size_t SYCLMemObjT::getBufSizeForContext(const ContextImplPtr &Context,
       sizeof(size_t), &BufSize, nullptr);
   return BufSize;
 }
+
+void SYCLMemObjT::determineHostPtr(const ContextImplPtr &Context,
+                                   bool InitFromUserData, void *&HostPtr,
+                                   bool &HostPtrReadOnly) {
+  // The data for the allocation can be provided via either the user pointer
+  // (InitFromUserData, can be read-only) or a runtime-allocated read-write
+  // HostPtr. We can have one of these scenarios:
+  // 1. The allocation is the first one and on host. InitFromUserData == true.
+  // 2. The allocation is the first one and isn't on host. InitFromUserData
+  // varies based on unified host memory support and whether or not the data can
+  // be discarded.
+  // 3. The allocation is not the first one and is on host. InitFromUserData ==
+  // false, HostPtr == nullptr. This can only happen if the allocation command
+  // is not linked since it would be a no-op otherwise. Attempt to reuse the
+  // user pointer if it's read-write, but do not copy its contents if it's not.
+  // 4. The allocation is not the first one and not on host. InitFromUserData ==
+  // false, HostPtr is provided if the command is linked. The host pointer is
+  // guaranteed to be reused in this case.
+  if (Context->is_host() && !MOpenCLInterop && !MHostPtrReadOnly)
+    InitFromUserData = true;
+
+  if (InitFromUserData) {
+    assert(!HostPtr && "Cannot init from user data and reuse host ptr provided "
+                       "simultaneously");
+    HostPtr = getUserPtr();
+    HostPtrReadOnly = MHostPtrReadOnly;
+  } else
+    HostPtrReadOnly = false;
+}
 } // namespace detail
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
@@ -3714,6 +3714,7 @@ _ZN2cl4sycl6detail10image_implILi3EED1Ev
 _ZN2cl4sycl6detail10image_implILi3EED2Ev
 _ZN2cl4sycl6detail10waitEventsESt6vectorINS0_5eventESaIS3_EE
 _ZN2cl4sycl6detail11SYCLMemObjT10releaseMemESt10shared_ptrINS1_12context_implEEPv
+_ZN2cl4sycl6detail11SYCLMemObjT16determineHostPtrERKSt10shared_ptrINS1_12context_implEEbRPvRb
 _ZN2cl4sycl6detail11SYCLMemObjT16updateHostMemoryEPv
 _ZN2cl4sycl6detail11SYCLMemObjT16updateHostMemoryEv
 _ZN2cl4sycl6detail11SYCLMemObjT20getBufSizeForContextERKSt10shared_ptrINS1_12context_implEEP7_cl_mem
diff --git a/sycl/unittests/scheduler/CMakeLists.txt b/sycl/unittests/scheduler/CMakeLists.txt
@@ -8,5 +8,6 @@ add_sycl_unittest(SchedulerTests OBJECT
     WaitAfterCleanup.cpp
     LinkedAllocaDependencies.cpp
     LeavesCollection.cpp
+    NoUnifiedHostMemory.cpp
     utils.cpp
 )
diff --git a/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp b/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp
@@ -36,18 +36,30 @@ class MemObjMock : public cl::sycl::detail::SYCLMemObjI {
   detail::ContextImplPtr getInteropContext() const override { return nullptr; }
 };
 
+static cl::sycl::device getDeviceWithHostUnifiedMemory() {
+  for (cl::sycl::device &D : cl::sycl::device::get_devices()) {
+    if (!D.is_host() &&
+        D.get_info<cl::sycl::info::device::host_unified_memory>())
+      return D;
+  }
+  return {};
+}
+
 TEST_F(SchedulerTest, LinkedAllocaDependencies) {
-  default_selector Selector{};
-  if (Selector.select_device().is_host()) {
-    std::cerr << "Not run due to host-only environment\n";
+  cl::sycl::device Dev = getDeviceWithHostUnifiedMemory();
+  if (Dev.is_host()) {
+    std::cerr << "Not run: no non-host devices with host unified memory support"
+              << std::endl;
     return;
   }
 
   // 1. create two commands: alloca + alloca and link them
   // 2. call Scheduler::GraphBuilder::getOrCreateAllocaForReq
   detail::Requirement Req = getMockRequirement();
 
-  cl::sycl::queue Queue1;
+  // Commands are linked only if the device supports host unified memory.
+
+  cl::sycl::queue Queue1{Dev};
   cl::sycl::detail::QueueImplPtr Q1 = cl::sycl::detail::getSyclObjImpl(Queue1);
 
   sycl::device HostDevice;
diff --git a/sycl/unittests/scheduler/NoUnifiedHostMemory.cpp b/sycl/unittests/scheduler/NoUnifiedHostMemory.cpp
diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp

Original file line number	Diff line number	Diff line change
`@@ -8,5 +8,6 @@ add_sycl_unittest(SchedulerTests OBJECT`
`8`	`8`	`WaitAfterCleanup.cpp`
`9`	`9`	`LinkedAllocaDependencies.cpp`
`10`	`10`	`LeavesCollection.cpp`
	`11`	`+ NoUnifiedHostMemory.cpp`
`11`	`12`	`utils.cpp`
`12`	`13`	`)`