@@ -69,6 +69,22 @@ static bool isAccessModeAllowed(access::mode Required, access::mode Current) {
69
69
return false ;
70
70
}
71
71
72
+ // / Combines two access modes into a single one that allows both.
73
+ static access::mode combineAccessModes (access::mode A, access::mode B) {
74
+ if (A == B)
75
+ return A;
76
+
77
+ if (A == access::mode::discard_write &&
78
+ (B == access::mode::discard_read_write || B == access::mode::write))
79
+ return B;
80
+
81
+ if (B == access::mode::discard_write &&
82
+ (A == access::mode::discard_read_write || A == access::mode::write))
83
+ return A;
84
+
85
+ return access::mode::read_write;
86
+ }
87
+
72
88
Scheduler::GraphBuilder::GraphBuilder () {
73
89
if (const char *EnvVarCStr = SYCLConfig<SYCL_PRINT_EXECUTION_GRAPH>::get ()) {
74
90
std::string GraphPrintOpts (EnvVarCStr);
@@ -574,6 +590,14 @@ Scheduler::GraphBuilder::findAllocaForReq(MemObjRecord *Record,
574
590
return (Record->MAllocaCommands .end () != It) ? *It : nullptr ;
575
591
}
576
592
593
+ static bool checkHostUnifiedMemory (const ContextImplPtr &Ctx) {
594
+ for (const device &Device : Ctx->getDevices ()) {
595
+ if (!Device.get_info <info::device::host_unified_memory>())
596
+ return false ;
597
+ }
598
+ return true ;
599
+ }
600
+
577
601
// The function searches for the alloca command matching context and
578
602
// requirement. If none exists, new allocation command is created.
579
603
// Note, creation of new allocation command can lead to the current context
@@ -603,8 +627,18 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
603
627
Req->MMemoryRange , access::mode::read_write,
604
628
Req->MSYCLMemObj , Req->MDims , Req->MElemSize ,
605
629
0 /* ReMOffsetInBytes*/ , false /* MIsSubBuffer*/ );
606
- // Can reuse user data for the first allocation
607
- const bool InitFromUserData = Record->MAllocaCommands .empty ();
630
+ // Can reuse user data for the first allocation. Do so if host unified
631
+ // memory is supported regardless of the access mode (the pointer will be
632
+ // reused) or if it's not and the access mode is not discard (the pointer
633
+ // will be copied).
634
+ // TODO the case where the first alloca is made with a discard mode and
635
+ // the user pointer is read-only is still not handled: it leads to
636
+ // unnecessary copy on devices with unified host memory support.
637
+ const bool InitFromUserData =
638
+ Record->MAllocaCommands .empty () &&
639
+ (checkHostUnifiedMemory (Queue->getContextImplPtr ()) ||
640
+ (Req->MAccessMode != access::mode::discard_write &&
641
+ Req->MAccessMode != access::mode::discard_read_write));
608
642
609
643
AllocaCommandBase *LinkedAllocaCmd = nullptr ;
610
644
// If it is not the first allocation, try to setup a link
@@ -617,13 +651,22 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
617
651
// "not" current allocation, but it will require memory copy.
618
652
// Can setup link between cl and host allocations only
619
653
if (Queue->is_host () != Record->MCurContext ->is_host ()) {
620
-
621
- AllocaCommandBase *LinkedAllocaCmdCand =
622
- findAllocaForReq (Record, Req, Record->MCurContext );
623
-
624
- // Cannot setup link if candidate is linked already
625
- if (LinkedAllocaCmdCand && !LinkedAllocaCmdCand->MLinkedAllocaCmd )
626
- LinkedAllocaCmd = LinkedAllocaCmdCand;
654
+ // Linked commands assume that the host allocation is reused by the
655
+ // plugin runtime and that can lead to unnecessary copy overhead on
656
+ // devices that do not support host unified memory. Do not link the
657
+ // allocations in this case.
658
+ const ContextImplPtr &NonHostCtx = Queue->is_host ()
659
+ ? Record->MCurContext
660
+ : Queue->getContextImplPtr ();
661
+ if (checkHostUnifiedMemory (NonHostCtx)) {
662
+ AllocaCommandBase *LinkedAllocaCmdCand =
663
+ findAllocaForReq (Record, Req, Record->MCurContext );
664
+
665
+ // Cannot setup link if candidate is linked already
666
+ if (LinkedAllocaCmdCand && !LinkedAllocaCmdCand->MLinkedAllocaCmd ) {
667
+ LinkedAllocaCmd = LinkedAllocaCmdCand;
668
+ }
669
+ }
627
670
}
628
671
629
672
AllocaCmd =
@@ -732,10 +775,30 @@ static bool isInteropHostTask(const std::unique_ptr<ExecCGCommand> &Cmd) {
732
775
return HT.MHostTask ->isInteropTask ();
733
776
}
734
777
778
+ static void combineAccessModesOfReqs (std::vector<Requirement *> &Reqs) {
779
+ std::unordered_map<SYCLMemObjI *, access::mode> CombinedModes;
780
+ bool HasDuplicateMemObjects = false ;
781
+ for (const Requirement *Req : Reqs) {
782
+ auto Result = CombinedModes.insert (
783
+ std::make_pair (Req->MSYCLMemObj , Req->MAccessMode ));
784
+ if (!Result.second ) {
785
+ Result.first ->second =
786
+ combineAccessModes (Result.first ->second , Req->MAccessMode );
787
+ HasDuplicateMemObjects = true ;
788
+ }
789
+ }
790
+
791
+ if (!HasDuplicateMemObjects)
792
+ return ;
793
+ for (Requirement *Req : Reqs) {
794
+ Req->MAccessMode = CombinedModes[Req->MSYCLMemObj ];
795
+ }
796
+ }
797
+
735
798
Command *
736
799
Scheduler::GraphBuilder::addCG (std::unique_ptr<detail::CG> CommandGroup,
737
800
QueueImplPtr Queue) {
738
- const std::vector<Requirement *> &Reqs = CommandGroup->MRequirements ;
801
+ std::vector<Requirement *> &Reqs = CommandGroup->MRequirements ;
739
802
const std::vector<detail::EventImplPtr> &Events = CommandGroup->MEvents ;
740
803
const CG::CGTYPE CGType = CommandGroup->getType ();
741
804
@@ -747,6 +810,10 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
747
810
if (MPrintOptionsArray[BeforeAddCG])
748
811
printGraphAsDot (" before_addCG" );
749
812
813
+ // If there are multiple requirements for the same memory object, its
814
+ // AllocaCommand creation will be dependent on the access mode of the first
815
+ // requirement. Combine these access modes to take all of them into account.
816
+ combineAccessModesOfReqs (Reqs);
750
817
for (Requirement *Req : Reqs) {
751
818
MemObjRecord *Record = nullptr ;
752
819
AllocaCommandBase *AllocaCmd = nullptr ;
0 commit comments