Skip to content

Commit 2a539ee

Browse files
committed
[OpenMP][libomptarget] Implement memory lock/unlock API in NextGen plugins
This patch implements the memory lock/unlock API, introduced in patch https://reviews.llvm.org/D139208, in the NextGen plugins. Locked buffers are reference counted, and certain kinds of overlap are allowed. Given an already locked buffer A, other buffers that are fully contained inside A can be locked again, even if they are smaller than A. In this case, the reference count of locked buffer A is incremented. However, extending an existing locked buffer is not allowed. The original buffer is only actually unlocked once all its users have released the locked buffer and its sub-buffers (i.e., when the reference counter becomes zero). Differential Revision: https://reviews.llvm.org/D141227
1 parent f1764d5 commit 2a539ee

File tree

5 files changed

+330
-57
lines changed

5 files changed

+330
-57
lines changed

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1826,14 +1826,33 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
18261826
return Plugin::success();
18271827
}
18281828

1829+
/// Pin the host buffer and return the device pointer that should be used for
1830+
/// device transfers.
///
/// \param HstPtr Start address of the host buffer to pin.
/// \param Size Size of the host buffer; forwarded unchanged to
///        hsa_amd_memory_lock.
/// \returns The pointer written by hsa_amd_memory_lock on success; otherwise
///          the error produced by Plugin::check for the returned HSA status.
1831+
Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
1832+
void *PinnedPtr = nullptr;
1833+
1834+
// The agent list is passed as (nullptr, 0). Presumably this requests a lock
// that is not restricted to a specific set of agents -- TODO confirm against
// the HSA runtime documentation.
hsa_status_t Status =
1835+
hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr);
1836+
// Plugin::check turns the HSA status into an Error; propagate it on failure.
if (auto Err = Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
1837+
return Err;
1838+
1839+
return PinnedPtr;
1840+
}
1841+
1842+
/// Unpin the host buffer.
///
/// \param HstPtr Host pointer handed to hsa_amd_memory_unlock. Presumably this
///        must be the same host address that was passed to dataLockImpl --
///        TODO confirm against the HSA runtime documentation.
/// \returns The result of Plugin::check on the HSA status of the unlock call.
1843+
Error dataUnlockImpl(void *HstPtr) override {
1844+
hsa_status_t Status = hsa_amd_memory_unlock(HstPtr);
1845+
return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
1846+
}
1847+
18291848
/// Submit data to the device (host to device transfer).
18301849
Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
18311850
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
1832-
18331851
// Use one-step asynchronous operation when host memory is already pinned.
1834-
if (isHostPinnedMemoryBuffer(HstPtr)) {
1852+
if (void *PinnedPtr =
1853+
PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
18351854
AMDGPUStreamTy &Stream = getStream(AsyncInfoWrapper);
1836-
return Stream.pushPinnedMemoryCopyAsync(TgtPtr, HstPtr, Size);
1855+
return Stream.pushPinnedMemoryCopyAsync(TgtPtr, PinnedPtr, Size);
18371856
}
18381857

18391858
void *PinnedHstPtr = nullptr;
@@ -1887,10 +1906,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
18871906
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
18881907

18891908
// Use one-step asynchronous operation when host memory is already pinned.
1890-
if (isHostPinnedMemoryBuffer(HstPtr)) {
1891-
// Use one-step asynchronous operation when host memory is already pinned.
1909+
if (void *PinnedPtr =
1910+
PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
18921911
AMDGPUStreamTy &Stream = getStream(AsyncInfoWrapper);
1893-
return Stream.pushPinnedMemoryCopyAsync(HstPtr, TgtPtr, Size);
1912+
return Stream.pushPinnedMemoryCopyAsync(PinnedPtr, TgtPtr, Size);
18941913
}
18951914

18961915
void *PinnedHstPtr = nullptr;

openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp

Lines changed: 130 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,8 @@ GenericDeviceTy::GenericDeviceTy(int32_t DeviceId, int32_t NumDevices,
333333
OMPX_InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", 32),
334334
OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 32),
335335
DeviceId(DeviceId), GridValues(OMPGridValues),
336-
PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock() {
336+
PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(),
337+
PinnedAllocs(*this) {
337338
if (OMP_NumTeams > 0)
338339
GridValues.GV_Max_Teams =
339340
std::min(GridValues.GV_Max_Teams, uint32_t(OMP_NumTeams));
@@ -581,23 +582,110 @@ GenericDeviceTy::getExecutionModeForKernel(StringRef Name,
581582
return ExecModeGlobal.getValue();
582583
}
583584

584-
Error GenericDeviceTy::registerHostPinnedMemoryBuffer(const void *Buffer,
585-
size_t Size) {
586-
std::lock_guard<std::shared_mutex> Lock(HostAllocationsMutex);
585+
/// Record an already pinned host buffer in the allocation map.
///
/// \param HstPtr Start address of the host buffer; asserted to be non-null.
/// \param DevAccessiblePtr Device accessible pointer stored alongside HstPtr;
///        asserted to be non-null.
/// \param Size Size of the buffer stored in the new entry.
/// \returns Plugin::success() if the entry was inserted, or an error if the
///          insertion into Allocs did not take place.
Error PinnedAllocationMapTy::registerHostBuffer(void *HstPtr,
586+
void *DevAccessiblePtr,
587+
size_t Size) {
588+
assert(HstPtr && "Invalid pointer");
589+
assert(DevAccessiblePtr && "Invalid pointer");
587590

588-
auto Res = HostAllocations.insert({Buffer, Size});
591+
// Hold the map mutex exclusively while the map is modified.
std::lock_guard<std::shared_mutex> Lock(Mutex);
592+
593+
// No pinned allocation should intersect.
// NOTE(review): the only check performed here is whether insert() reports
// that the entry was added; which existing entries make it fail depends on
// the ordering/equality used by Allocs, which is not visible here -- confirm
// that it really covers intersecting ranges.
594+
auto Res = Allocs.insert({HstPtr, DevAccessiblePtr, Size});
589595
if (!Res.second)
590-
return Plugin::error("Registering an already registered pinned buffer");
596+
return Plugin::error("Cannot register locked buffer");
597+
598+
return Plugin::success();
599+
}
600+
601+
/// Remove a previously registered host buffer from the allocation map.
///
/// \param HstPtr Host pointer used to look up the entry; asserted to be
///        non-null.
/// \returns Plugin::success() if the entry was found and erased; an error if
///          no entry is found or if the entry's reference count is above one.
Error PinnedAllocationMapTy::unregisterHostBuffer(void *HstPtr) {
602+
assert(HstPtr && "Invalid pointer");
603+
604+
// Hold the map mutex exclusively while the map is searched and modified.
std::lock_guard<std::shared_mutex> Lock(Mutex);
605+
606+
// Find the pinned allocation starting at the host pointer address.
// The lookup key is an entry built from HstPtr alone.
607+
auto It = Allocs.find({HstPtr});
608+
if (It == Allocs.end())
609+
return Plugin::error("Cannot find locked buffer");
610+
611+
const EntryTy &Entry = *It;
612+
613+
// There should be no other references to the pinned allocation.
// A count of one is accepted; presumably that is the reference held by the
// registration itself -- TODO confirm against EntryTy's initial value.
614+
if (Entry.References > 1)
615+
return Plugin::error("The locked buffer is still being used");
616+
617+
// Remove the entry from the map.
// Note that no device unlock is issued here; only the map entry is dropped.
618+
Allocs.erase(It);
591619

592620
return Plugin::success();
593621
}
594622

595-
Error GenericDeviceTy::unregisterHostPinnedMemoryBuffer(const void *Buffer) {
596-
std::lock_guard<std::shared_mutex> Lock(HostAllocationsMutex);
623+
/// Lock a host buffer, or take an additional reference on the existing entry
/// reported by findIntersecting(HstPtr), and return the matching device
/// accessible pointer.
///
/// \param HstPtr Start address of the host buffer; asserted to be non-null.
/// \param Size Size of the host buffer; forwarded to Device.dataLockImpl when
///        a new entry has to be created.
/// \returns The device accessible pointer for HstPtr, or an error if locking
///          or registering the buffer fails.
Expected<void *> PinnedAllocationMapTy::lockHostBuffer(void *HstPtr,
624+
size_t Size) {
625+
assert(HstPtr && "Invalid pointer");
626+
627+
// The map mutex is held exclusively for the whole function, including the
// call into Device.dataLockImpl below.
std::lock_guard<std::shared_mutex> Lock(Mutex);
628+
629+
// findIntersecting is defined elsewhere; presumably it returns the entry
// whose host range contains HstPtr, or Allocs.end() -- TODO confirm.
auto It = findIntersecting(HstPtr);
630+
631+
// No intersecting registered allocation found in the map. We must lock and
632+
// register the memory buffer into the map.
633+
if (It == Allocs.end()) {
634+
// First, lock the host buffer and retrieve the device accessible pointer.
635+
auto PinnedPtrOrErr = Device.dataLockImpl(HstPtr, Size);
636+
if (!PinnedPtrOrErr)
637+
return PinnedPtrOrErr.takeError();
638+
639+
// Then, insert the host buffer entry into the map.
// NOTE(review): if this insert fails, the function returns an error without
// calling Device.dataUnlockImpl, so the buffer locked just above stays locked.
640+
auto Res = Allocs.insert({HstPtr, *PinnedPtrOrErr, Size});
641+
if (!Res.second)
642+
return Plugin::error("Cannot register locked buffer");
643+
644+
// Return the device accessible pointer.
645+
return *PinnedPtrOrErr;
646+
}
647+
648+
const EntryTy &Entry = *It;
649+
650+
// NOTE(review): the overlap check below is compiled only when
// OMPTARGET_DEBUG is defined; without it, a request that extends past the end
// of the existing entry is not rejected here.
#ifdef OMPTARGET_DEBUG
651+
// Do not allow partial overlapping among host pinned buffers.
652+
if (advanceVoidPtr(HstPtr, Size) > advanceVoidPtr(Entry.HstPtr, Entry.Size))
653+
return Plugin::error("Partial overlapping not allowed in locked memory");
654+
#endif
655+
656+
// Increase the number of references.
// The counter is modified through a const reference; presumably References
// is declared mutable in EntryTy -- TODO confirm.
657+
Entry.References++;
658+
659+
// Return the device accessible pointer after applying the correct offset.
// The offset is the distance between HstPtr and the entry's host pointer.
660+
return advanceVoidPtr(Entry.DevAccessiblePtr,
661+
getPtrDiff(HstPtr, Entry.HstPtr));
662+
}
663+
664+
/// Release one reference on the entry reported by findIntersecting(HstPtr)
/// and, when the count reaches zero, unlock the buffer on the device and erase
/// the entry from the map.
///
/// \param HstPtr Host pointer used to look up the entry; asserted to be
///        non-null.
/// \returns Plugin::success() when a reference was released (and, for the
///          last one, the buffer was unlocked and erased); an error if no
///          entry is found, if Device.dataUnlockImpl fails, or if the erase
///          removes nothing.
Error PinnedAllocationMapTy::unlockHostBuffer(void *HstPtr) {
665+
assert(HstPtr && "Invalid pointer");
666+
667+
// The map mutex is held exclusively for the whole function, including the
// call into Device.dataUnlockImpl below.
std::lock_guard<std::shared_mutex> Lock(Mutex);
597668

598-
size_t Erased = HostAllocations.erase(Buffer);
669+
// findIntersecting is defined elsewhere; presumably it returns the entry
// whose host range contains HstPtr, or Allocs.end() -- TODO confirm.
auto It = findIntersecting(HstPtr);
670+
if (It == Allocs.end())
671+
return Plugin::error("Cannot find locked buffer");
672+
673+
const EntryTy &Entry = *It;
674+
675+
// Decrease the number of references. No need to do anything if there are
676+
// others using the allocation.
// The counter is modified through a const reference; presumably References
// is declared mutable in EntryTy -- TODO confirm.
677+
if (--Entry.References > 0)
678+
return Plugin::success();
679+
680+
// This was the last user of the allocation. Unlock the original locked memory
681+
// buffer, which is the host pointer stored in the entry.
// NOTE(review): if the device unlock fails, the function returns here with the
// entry still in the map and its reference count already decremented.
682+
if (auto Err = Device.dataUnlockImpl(Entry.HstPtr))
683+
return Err;
684+
685+
// Remove the entry from the map.
686+
size_t Erased = Allocs.erase(Entry);
599687
if (!Erased)
600-
return Plugin::error("Cannot find a registered host pinned buffer");
688+
return Plugin::error("Cannot find locked buffer");
601689

602690
return Plugin::success();
603691
}
@@ -648,7 +736,7 @@ Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
648736

649737
// Register allocated buffer as pinned memory if the type is host memory.
650738
if (Kind == TARGET_ALLOC_HOST)
651-
if (auto Err = registerHostPinnedMemoryBuffer(Alloc, Size))
739+
if (auto Err = PinnedAllocs.registerHostBuffer(Alloc, Alloc, Size))
652740
return Err;
653741

654742
return Alloc;
@@ -670,7 +758,7 @@ Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
670758

671759
// Unregister deallocated pinned memory buffer if the type is host memory.
672760
if (Kind == TARGET_ALLOC_HOST)
673-
if (auto Err = unregisterHostPinnedMemoryBuffer(TgtPtr))
761+
if (auto Err = PinnedAllocs.unregisterHostBuffer(TgtPtr))
674762
return Err;
675763

676764
return Plugin::success();
@@ -998,6 +1086,36 @@ int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind) {
9981086
return OFFLOAD_SUCCESS;
9991087
}
10001088

1089+
// Plugin entry point that locks a host buffer on the given device.
//
// DeviceId selects the device through Plugin::get().getDevice(). Ptr and Size
// are forwarded to the device's dataLock(). On success the locked pointer is
// stored in *LockedPtr and OFFLOAD_SUCCESS is returned. OFFLOAD_FAIL is
// returned, after a REPORT, if dataLock() yields an error or a null pointer.
int32_t __tgt_rtl_data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
1090+
void **LockedPtr) {
1091+
auto LockedPtrOrErr = Plugin::get().getDevice(DeviceId).dataLock(Ptr, Size);
1092+
if (!LockedPtrOrErr) {
1093+
// Take the error out of the Expected so it can be turned into a message.
auto Err = LockedPtrOrErr.takeError();
1094+
REPORT("Failure to lock memory %p: %s\n", Ptr,
1095+
toString(std::move(Err)).data());
1096+
return OFFLOAD_FAIL;
1097+
}
1098+
1099+
// A null locked pointer is treated as a failure even without an error.
if (!(*LockedPtrOrErr)) {
1100+
REPORT("Failure to lock memory %p: obtained a null locked pointer\n", Ptr);
1101+
return OFFLOAD_FAIL;
1102+
}
1103+
// NOTE(review): LockedPtr is dereferenced without a null check; callers are
// presumably required to pass a valid output pointer -- confirm.
*LockedPtr = *LockedPtrOrErr;
1104+
1105+
return OFFLOAD_SUCCESS;
1106+
}
1107+
1108+
// Plugin entry point that unlocks a host buffer on the given device.
//
// DeviceId selects the device through Plugin::get().getDevice(). Ptr is
// forwarded to the device's dataUnlock(). Returns OFFLOAD_SUCCESS when
// dataUnlock() reports no error; otherwise reports the error text and returns
// OFFLOAD_FAIL.
int32_t __tgt_rtl_data_unlock(int32_t DeviceId, void *Ptr) {
1109+
auto Err = Plugin::get().getDevice(DeviceId).dataUnlock(Ptr);
1110+
if (Err) {
1111+
REPORT("Failure to unlock memory %p: %s\n", Ptr,
1112+
toString(std::move(Err)).data());
1113+
return OFFLOAD_FAIL;
1114+
}
1115+
1116+
return OFFLOAD_SUCCESS;
1117+
}
1118+
10011119
int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
10021120
int64_t Size) {
10031121
return __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size,

0 commit comments

Comments
 (0)