diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
index 7620ce92f7ebd..68bfe5c1ae614 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
@@ -10,20 +10,115 @@
 #include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_dense_map.h"
+#include "sanitizer_common/sanitizer_libc.h"
 #include "sanitizer_common/sanitizer_mutex.h"
 #include "sanitizer_common/sanitizer_placement_new.h"
 #include "sanitizer_common/sanitizer_thread_safety.h"
+#include "sanitizer_common/sanitizer_vector.h"
 #include <assert.h>
 
 using namespace __ctx_profile;
 
+namespace {
+// Keep track of all the context roots we actually saw, so we can then traverse
+// them when the user asks for the profile in __llvm_ctx_profile_fetch.
+__sanitizer::SpinMutex AllContextsMutex;
+SANITIZER_GUARDED_BY(AllContextsMutex)
+__sanitizer::Vector<ContextRoot *> AllContextRoots;
+
+// Utility to taint a pointer by setting the LSB. There is an assumption
+// throughout that the addresses of contexts are even (really, they should be
+// align(8), but "even"-ness is the minimum assumption).
+// "Scratch contexts" are buffers that we return in certain cases - they are
+// large enough to allow for memory-safe counter access, but they don't link
+// subcontexts below them (the runtime recognizes them and enforces that).
+ContextNode *markAsScratch(const ContextNode *Ctx) {
+  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
+}
+
+// Used when getting the data from TLS. We don't *really* need to reset, but
+// it's a simpler system if we do.
+template <typename T> inline T consume(T &V) {
+  auto R = V;
+  V = {0};
+  return R;
+}
+
+// We allocate at least kBuffSize Arena pages. The scratch buffer is also that
+// large.
+constexpr size_t kPower = 20;
+constexpr size_t kBuffSize = 1 << kPower;
+
+// Highly unlikely we need more than kBuffSize for a context.
+size_t getArenaAllocSize(size_t Needed) {
+  if (Needed >= kBuffSize)
+    return 2 * Needed;
+  return kBuffSize;
+}
+
+// Verify the structural integrity of the context.
+bool validate(const ContextRoot *Root) {
+  // All contexts should be laid out in some arena page. Go over each arena
+  // allocated for this Root, and jump over contained contexts based on
+  // self-reported sizes.
+  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
+  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
+    const auto *Pos = Mem->start();
+    while (Pos < Mem->pos()) {
+      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
+      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
+               .second)
+        return false;
+      Pos += Ctx->size();
+    }
+  }
+
+  // Now traverse the contexts again the same way, but validate that all
+  // non-null subcontext addresses appear in the set computed above.
+  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
+    const auto *Pos = Mem->start();
+    while (Pos < Mem->pos()) {
+      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
+      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
+        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
+          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
+            return false;
+
+      Pos += Ctx->size();
+    }
+  }
+  return true;
+}
+} // namespace
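// A minimal illustrative sketch (not part of the patch) of how the LSB
// tainting above composes with the alignment assumption; `taintRoundTripSketch`
// is a hypothetical helper, not something the runtime defines.
static void taintRoundTripSketch(ContextNode *Real) {
  // Real contexts are allocated at 8-aligned addresses, so their LSB is free.
  assert(reinterpret_cast<uint64_t>(Real) % ExpectedAlignment == 0);
  ContextNode *Tainted = markAsScratch(Real);
  assert(isScratch(Tainted)); // the set LSB marks "scratch"
  assert(!isScratch(Real));   // untainted pointers keep a clear LSB
  // Untainting is just clearing the LSB again.
  auto *Untainted = reinterpret_cast<ContextNode *>(
      reinterpret_cast<uint64_t>(Tainted) & ~1ULL);
  assert(Untainted == Real);
}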
+// The scratch buffer - what we give when we can't produce a real context (the
+// scratch isn't "real" in that it's expected to be clobbered carelessly - we
+// don't read it). The other important thing is that the callees from a scratch
+// context also get a scratch context.
+// Eventually this can be replaced with per-function buffers, a la the typical
+// (flat) instrumented FDO buffers. The clobbering aspect won't apply there,
+// but the part about determining the nature of the subcontexts does.
+__thread char __Buffer[kBuffSize] = {0};
+
+#define TheScratchContext                                                      \
+  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))
+
+// Initialize the TLS slots.
+__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
+                                                                  nullptr};
+__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};
+
+__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
+    nullptr;
+
 // FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
 // the dependency on the latter.
 Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
   assert(!Prev || Prev->Next == nullptr);
-  Arena *NewArena =
-      new (__sanitizer::InternalAlloc(Size + sizeof(Arena))) Arena(Size);
+  Arena *NewArena = new (__sanitizer::InternalAlloc(
+      Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
+      Arena(Size);
   if (Prev)
     Prev->Next = NewArena;
   return NewArena;
@@ -38,3 +133,187 @@ void Arena::freeArenaList(Arena *&A) {
   }
   A = nullptr;
 }
+
+inline ContextNode *ContextNode::alloc(char *Place, GUID Guid,
+                                       uint32_t NrCounters,
+                                       uint32_t NrCallsites,
+                                       ContextNode *Next) {
+  assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
+  return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next);
+}
+
+void ContextNode::reset() {
+  // FIXME(mtrofin): this is std::memset, which we can probably use if we
+  // drop/reduce the dependency on sanitizer_common.
+  for (uint32_t I = 0; I < NrCounters; ++I)
+    counters()[I] = 0;
+  for (uint32_t I = 0; I < NrCallsites; ++I)
+    for (auto *Next = subContexts()[I]; Next; Next = Next->Next)
+      Next->reset();
+}
+
+// If this is the first time we hit a callsite with this particular callee
+// (Guid), we need to allocate.
+ContextNode *getCallsiteSlow(uint64_t Guid, ContextNode **InsertionPoint,
+                             uint32_t NrCounters, uint32_t NrCallsites) {
+  auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites);
+  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
+  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
+  if (!AllocPlace) {
+    // If we failed to allocate on the current arena, allocate a new arena, and
+    // place it on __llvm_ctx_profile_current_context_root->CurrentMem so we
+    // find it from now on, the next time we need to getCallsiteSlow.
+    // Note that allocateNewArena will link the allocated memory in the list of
+    // Arenas.
+    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
+        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
+    AllocPlace = Mem->tryBumpAllocate(AllocSize);
+  }
+  auto *Ret = ContextNode::alloc(AllocPlace, Guid, NrCounters, NrCallsites,
+                                 *InsertionPoint);
+  *InsertionPoint = Ret;
+  return Ret;
+}
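// A minimal sketch (not part of the patch) isolating the "bump or grow"
// pattern getCallsiteSlow relies on; `bumpOrGrowSketch` is a hypothetical
// helper and assumes Root->CurrentMem is already set up (see setupContext).
static char *bumpOrGrowSketch(ContextRoot *Root, size_t AllocSize) {
  Arena *Mem = Root->CurrentMem;
  // Fast path: carve AllocSize bytes out of the current arena.
  if (char *Place = Mem->tryBumpAllocate(AllocSize))
    return Place;
  // Slow path: chain a new, large-enough arena after the current one (the
  // list stays reachable from Root->FirstMemBlock for later freeing) and
  // retry the bump allocation there.
  Mem = Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
  Root->CurrentMem = Mem;
  return Mem->tryBumpAllocate(AllocSize);
}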
+ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
+                                            uint32_t NrCounters,
+                                            uint32_t NrCallsites) {
+  // Fast "out" if we're not even doing contextual collection.
+  if (!__llvm_ctx_profile_current_context_root)
+    return TheScratchContext;
+
+  // Also fast "out" if the caller is scratch. We can tell by looking at the
+  // interior pointer into the subcontexts vector that the caller provided: if
+  // the caller's context is scratch, so is that interior pointer (because all
+  // the address calculations use even - more precisely, 8-aligned - values).
+  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
+  if (!CallsiteContext || isScratch(CallsiteContext))
+    return TheScratchContext;
+
+  // If the callee isn't the expected one, return scratch.
+  // Signal handler(s) could have been invoked at any point in the execution.
+  // Should that have happened, and had it (the handler) been built with
+  // instrumentation, its __llvm_ctx_profile_get_context would have failed
+  // here. Its sub call graph would have then populated
+  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
+  // The normal call graph may be impacted in that, if the signal handler
+  // happened somewhere before we read the TLS here, we'd see the TLS reset and
+  // we'd also fail here. That would just mean we would lose counter values for
+  // the normal subgraph, this time around. That should be very unlikely, but
+  // if it happens too frequently, we should be able to detect discrepancies in
+  // entry counts (caller-callee). At the moment, the design goes on the
+  // assumption that this is so infrequent that it's not worth doing more for
+  // that case.
+  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
+  if (ExpectedCallee != Callee)
+    return TheScratchContext;
+
+  auto *Callsite = *CallsiteContext;
+  // In the case of indirect calls, we will have all seen targets forming a
+  // linked list here. Find the one corresponding to this callee.
+  while (Callsite && Callsite->guid() != Guid) {
+    Callsite = Callsite->next();
+  }
+  auto *Ret = Callsite ? Callsite
+                       : getCallsiteSlow(Guid, CallsiteContext, NrCounters,
+                                         NrCallsites);
+  if (Ret->callsites_size() != NrCallsites ||
+      Ret->counters_size() != NrCounters)
+    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
+                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
+                        Ret, Guid, NrCallsites, NrCounters, Ret->guid(),
+                        Ret->callsites_size(), Ret->counters_size());
+  Ret->onEntry();
+  return Ret;
+}
+
+// This should be called once for a Root. Allocate the first arena, set up the
+// first context.
+void setupContext(ContextRoot *Root, GUID Guid, uint32_t NrCounters,
+                  uint32_t NrCallsites) {
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+  // Re-check; we got here without having taken the lock.
+  if (Root->FirstMemBlock)
+    return;
+  const auto Needed = ContextNode::getAllocSize(NrCounters, NrCallsites);
+  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
+  Root->FirstMemBlock = M;
+  Root->CurrentMem = M;
+  Root->FirstNode = ContextNode::alloc(M->tryBumpAllocate(Needed), Guid,
+                                       NrCounters, NrCallsites);
+  AllContextRoots.PushBack(Root);
+}
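// A conceptual sketch (not emitted by this patch) of the caller-side protocol
// __llvm_ctx_profile_get_context expects, mirroring what the unit tests do by
// hand; `announceCallsiteSketch` and its parameters are hypothetical, and the
// real lowering additionally uses slot 1 when the caller's context is scratch.
static void announceCallsiteSketch(ContextNode *CallerCtx, void *Callee,
                                   uint32_t CallsiteIdx) {
  // Record which callee we are about to enter and where, in the caller's
  // context, that callee's subcontext chain hangs.
  __llvm_ctx_profile_expected_callee[0] = Callee;
  __llvm_ctx_profile_callsite[0] = &CallerCtx->subContexts()[CallsiteIdx];
  // The call happens next; the callee's own instrumentation then calls
  // __llvm_ctx_profile_get_context, which consume()s both slots above.
}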
+ContextNode *__llvm_ctx_profile_start_context(
+    ContextRoot *Root, GUID Guid, uint32_t Counters,
+    uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  if (!Root->FirstMemBlock) {
+    setupContext(Root, Guid, Counters, Callsites);
+  }
+  if (Root->Taken.TryLock()) {
+    __llvm_ctx_profile_current_context_root = Root;
+    Root->FirstNode->onEntry();
+    return Root->FirstNode;
+  }
+  // If this thread couldn't take the lock, return scratch context.
+  __llvm_ctx_profile_current_context_root = nullptr;
+  return TheScratchContext;
+}
+
+void __llvm_ctx_profile_release_context(ContextRoot *Root)
+    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  if (__llvm_ctx_profile_current_context_root) {
+    __llvm_ctx_profile_current_context_root = nullptr;
+    Root->Taken.Unlock();
+  }
+}
+
+void __llvm_ctx_profile_start_collection() {
+  size_t NrMemUnits = 0;
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
+    auto *Root = AllContextRoots[I];
+    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
+        &Root->Taken);
+    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
+      ++NrMemUnits;
+
+    Root->FirstNode->reset();
+  }
+  __sanitizer::Printf("[ctxprof] Initial NrMemUnits: %zu \n", NrMemUnits);
+}
+
+bool __llvm_ctx_profile_fetch(
+    void *Data, bool (*Writer)(void *W, const __ctx_profile::ContextNode &)) {
+  assert(Writer);
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+
+  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
+    auto *Root = AllContextRoots[I];
+    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
+        &Root->Taken);
+    if (!validate(Root)) {
+      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
+      return false;
+    }
+    if (!Writer(Data, *Root->FirstNode))
+      return false;
+  }
+  return true;
+}
+
+void __llvm_ctx_profile_free() {
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
+    for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
+      auto *C = A;
+      A = A->next();
+      __sanitizer::InternalFree(C);
+    }
+  AllContextRoots.Reset();
+}
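// A brief illustrative sketch (not emitted by this patch) of how an
// instrumented entrypoint uses the pair of APIs defined above; the helper
// name, the GUID and the counter/callsite counts are hypothetical.
static void entrypointSketch(ContextRoot *Root) {
  ContextNode *Ctx = __llvm_ctx_profile_start_context(Root, /*Guid=*/0x1234,
                                                      /*Counters=*/2,
                                                      /*Callsites=*/1);
  // The body runs with Ctx; if another thread already holds Root->Taken, Ctx
  // is a scratch context and the work is simply not recorded in the tree.
  (void)Ctx;
  __llvm_ctx_profile_release_context(Root);
}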
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
index c1789c32a64c2..8c4be5d8a23a7 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
@@ -9,9 +9,16 @@
 #ifndef CTX_PROFILE_CTXINSTRPROFILING_H_
 #define CTX_PROFILE_CTXINSTRPROFILING_H_
 
+#include "sanitizer_common/sanitizer_mutex.h"
 #include <sanitizer/common_interface_defs.h>
 
 namespace __ctx_profile {
+using GUID = uint64_t;
+static constexpr size_t ExpectedAlignment = 8;
+// We really depend on this, see further below. We currently support x86_64.
+// When we want to support other architectures, we need to trace the places
+// Alignment is used and adjust accordingly.
+static_assert(sizeof(void *) == ExpectedAlignment);
 
 /// Arena (bump allocator) forming a linked list. Intentionally not thread safe.
 /// Allocation and de-allocation happen using sanitizer APIs. We make that
@@ -51,5 +58,206 @@ class Arena final {
   const uint64_t Size;
 };
 
+// The memory available for allocation follows the Arena header, and we expect
+// it to be thus aligned.
+static_assert(alignof(Arena) == ExpectedAlignment);
+
+/// The contextual profile is a directed tree where each node has one parent. A
+/// node (ContextNode) corresponds to a function activation. The root of the
+/// tree is at a function that was marked as entrypoint to the compiler. A node
+/// stores counter values for edges and a vector of subcontexts. These are the
+/// contexts of callees. The index in the subcontext vector corresponds to the
+/// index of the callsite (as was instrumented via llvm.instrprof.callsite). At
+/// that index we find a linked list, potentially empty, of ContextNodes. Direct
+/// calls will have 0 or 1 values in the linked list, but indirect callsites may
+/// have more.
+///
+/// The ContextNode has a fixed-size header describing it - the GUID of the
+/// function, and the sizes of the counter and callsite vectors. It is also an
+/// (intrusive) linked list for the purposes of the indirect call case above.
+///
+/// Allocation is expected to happen on an Arena. The allocation lays out the
+/// counter and subcontexts vectors inline. The class offers APIs to correctly
+/// reference the latter.
+///
+/// The layout is as follows:
+///
+/// [[declared fields][counters vector][vector of ptrs to subcontexts]]
+///
+/// See also documentation on the counters and subContexts members below.
+///
+/// The structure of the ContextNode is known to LLVM, because LLVM needs to:
+///   (1) increment counts, and
+///   (2) form a GEP for the position in the subcontext list of a callsite.
+/// This means changes to LLVM contextual profile lowering and changes here
+/// must be coupled.
+/// Note: the header content isn't interesting to LLVM (other than its size).
+///
+/// Part of contextual collection is the notion of "scratch contexts". These
+/// are buffers that are "large enough" to allow for memory-safe accesses
+/// during counter increments - meaning the counter increment code in LLVM
+/// doesn't need to be concerned with memory safety. Their subcontexts never
+/// get populated, though. The runtime code here produces and recognizes them.
+class ContextNode final {
+  const GUID Guid;
+  ContextNode *const Next;
+  const uint32_t NrCounters;
+  const uint32_t NrCallsites;
+
+public:
+  ContextNode(GUID Guid, uint32_t NrCounters, uint32_t NrCallsites,
+              ContextNode *Next = nullptr)
+      : Guid(Guid), Next(Next), NrCounters(NrCounters),
+        NrCallsites(NrCallsites) {}
+
+  static inline ContextNode *alloc(char *Place, GUID Guid, uint32_t NrCounters,
+                                   uint32_t NrCallsites,
+                                   ContextNode *Next = nullptr);
+
+  static inline size_t getAllocSize(uint32_t NrCounters,
+                                    uint32_t NrCallsites) {
+    return sizeof(ContextNode) + sizeof(uint64_t) * NrCounters +
+           sizeof(ContextNode *) * NrCallsites;
+  }
+
+  // The counters vector starts right after the static header.
+  uint64_t *counters() {
+    ContextNode *addr_after = &(this[1]);
+    return reinterpret_cast<uint64_t *>(addr_after);
+  }
+
+  uint32_t counters_size() const { return NrCounters; }
+  uint32_t callsites_size() const { return NrCallsites; }
+
+  const uint64_t *counters() const {
+    return const_cast<ContextNode *>(this)->counters();
+  }
+
+  // The subcontexts vector starts right after the end of the counters vector.
+  ContextNode **subContexts() {
+    return reinterpret_cast<ContextNode **>(&(counters()[NrCounters]));
+  }
+
+  ContextNode *const *subContexts() const {
+    return const_cast<ContextNode *>(this)->subContexts();
+  }
+
+  GUID guid() const { return Guid; }
+  ContextNode *next() { return Next; }
+
+  size_t size() const { return getAllocSize(NrCounters, NrCallsites); }
+
+  void reset();
+
+  // Since we go through the runtime to get a context back to LLVM in the entry
+  // basic block anyway, we might as well handle incrementing the entry basic
+  // block counter here.
+  void onEntry() { ++counters()[0]; }
+
+  uint64_t entrycount() const { return counters()[0]; }
+};
+
+// Verify that maintenance to ContextNode doesn't break this invariant, which
+// makes sure the inlined vectors are appropriately aligned.
+static_assert(alignof(ContextNode) == ExpectedAlignment);
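// A small sketch (not part of the patch) spelling out the inline layout the
// accessors above rely on; `layoutCheckSketch` is a hypothetical helper with
// arbitrary counter/callsite counts, and it assumes <assert.h> is available.
inline void layoutCheckSketch(char *Place) {
  // One contiguous block: [ header | 3 x uint64_t counters | 2 x ContextNode* ].
  ContextNode *N = ContextNode::alloc(Place, /*Guid=*/42, /*NrCounters=*/3,
                                      /*NrCallsites=*/2);
  // counters() is simply "right past the header"...
  assert(reinterpret_cast<char *>(N->counters()) ==
         reinterpret_cast<char *>(N) + sizeof(ContextNode));
  // ...and subContexts() is "right past the counters".
  assert(reinterpret_cast<char *>(N->subContexts()) ==
         reinterpret_cast<char *>(N->counters()) + 3 * sizeof(uint64_t));
  assert(N->size() == ContextNode::getAllocSize(3, 2));
}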
+/// ContextRoots are allocated by LLVM for entrypoints. LLVM is only concerned
+/// with allocating and zero-initializing the global value (as in, GlobalValue)
+/// for it.
+struct ContextRoot {
+  ContextNode *FirstNode = nullptr;
+  Arena *FirstMemBlock = nullptr;
+  Arena *CurrentMem = nullptr;
+  // This is init-ed by the static zero initializer in LLVM.
+  // Taken is used to ensure only one thread traverses the contextual graph -
+  // either to read it or to write it. On the server side, the same entrypoint
+  // will be entered by numerous threads, but over time, the profile aggregated
+  // by collecting sequentially on one thread at a time is expected to converge
+  // to the aggregate profile that may have been observable on all the threads.
+  // Note that this is node-by-node aggregation, i.e. summing counters of nodes
+  // at the same position in the graph, not flattening.
+  // Threads that cannot lock Taken (fail TryLock) are given a "scratch
+  // context" - a buffer they can clobber, safely from a memory access
+  // perspective.
+  //
+  // Note about "scratch"-ness: we currently ignore the data written in them
+  // (which is anyway clobbered). The design allows for that not to be the case
+  // - because "scratch"-ness is first and foremost about not trying to build
+  // subcontexts, and is captured by tainting the pointer value (pointer to the
+  // memory treated as context), but right now, we drop that info.
+  //
+  // We could consider relaxing the "one collecting thread per entrypoint"
+  // requirement by holding a few context trees per entrypoint and then
+  // aggregating them (as explained above) at the end of the profile collection
+  // - it's a tradeoff between collection time and memory use: higher precision
+  // can be obtained with either fewer concurrent collections but more
+  // collection time, or with more concurrent collections (==more memory) and
+  // less collection time. Note that concurrent collection does happen for
+  // different entrypoints, regardless.
+  ::__sanitizer::StaticSpinMutex Taken;
+
+  // If (unlikely) StaticSpinMutex internals change, we need to modify the LLVM
+  // instrumentation lowering side because it is responsible for allocating and
+  // zero-initializing ContextRoots.
+  static_assert(sizeof(Taken) == 1);
+};
+
+/// This API is exposed for testing. See the APIs below about the contract with
+/// LLVM.
+inline bool isScratch(const void *Ctx) {
+  return (reinterpret_cast<uint64_t>(Ctx) & 1);
+}
+
 } // namespace __ctx_profile
+
+extern "C" {
+
+// LLVM fills these in when lowering a llvm.instrprof.callsite intrinsic.
+// Position 0 is used when the current context isn't scratch, 1 when it is.
+// They are volatile because of signal handlers - we mean to specifically
+// control when the data is loaded.
+//
+/// TLS where LLVM stores the pointer of the called value, as part of lowering
+/// a llvm.instrprof.callsite.
+extern __thread void *volatile __llvm_ctx_profile_expected_callee[2];
+/// TLS where LLVM stores the pointer inside a caller's subcontexts vector that
+/// corresponds to the callsite being lowered.
+extern __thread __ctx_profile::ContextNode *
+    *volatile __llvm_ctx_profile_callsite[2];
+
+// __llvm_ctx_profile_current_context_root is exposed for unit testing,
+// otherwise it's only used internally by compiler-rt/ctx_profile.
+extern __thread __ctx_profile::ContextRoot
+    *volatile __llvm_ctx_profile_current_context_root;
+
+/// Called by LLVM in the entry BB of an "entry point" function. The returned
+/// pointer may be "tainted" - its LSB set to 1 - to indicate it's scratch.
+__ctx_profile::ContextNode *
+__llvm_ctx_profile_start_context(__ctx_profile::ContextRoot *Root,
+                                 __ctx_profile::GUID Guid, uint32_t Counters,
+                                 uint32_t Callsites);
+
+/// Paired with __llvm_ctx_profile_start_context, and called at the exit of the
+/// entry point function.
+void __llvm_ctx_profile_release_context(__ctx_profile::ContextRoot *Root);
+
+/// Called in the entry BB of any function other than entry points. The same
+/// consideration about the LSB of the returned value applies as for
+/// ..._start_context.
+__ctx_profile::ContextNode *
+__llvm_ctx_profile_get_context(void *Callee, __ctx_profile::GUID Guid,
+                               uint32_t NrCounters, uint32_t NrCallsites);
+
+/// Prepares for collection. Currently this resets counter values but preserves
+/// internal context tree structure.
+void __llvm_ctx_profile_start_collection();
+
+/// Completely free allocated memory.
+void __llvm_ctx_profile_free();
+
+/// Used to obtain the profile. The Writer is called for each root ContextNode,
+/// with the corresponding ContextRoot::Taken held. The Writer is responsible
+/// for traversing the structure underneath.
+/// The Writer's first parameter plays the role of closure for Writer, and is
+/// what the caller of __llvm_ctx_profile_fetch passes as the Data parameter.
+/// The second parameter is the root of a context tree.
+bool __llvm_ctx_profile_fetch(
+    void *Data, bool (*Writer)(void *, const __ctx_profile::ContextNode &));
+}
 #endif // CTX_PROFILE_CTXINSTRPROFILING_H_
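// A sketch (hypothetical consumer, not part of the patch) of how a client
// could drain the profile via __llvm_ctx_profile_fetch; `ProfileSumSketch` and
// `sumEntryCountsSketch` are illustrative names only.
struct ProfileSumSketch {
  uint64_t TotalEntries = 0;
  void visit(const __ctx_profile::ContextNode &Node) {
    TotalEntries += Node.entrycount();
    // The Writer must walk the tree itself: each callsite slot holds a
    // (possibly empty) linked list of callee contexts.
    for (uint32_t I = 0; I < Node.callsites_size(); ++I)
      for (auto *Sub = Node.subContexts()[I]; Sub; Sub = Sub->next())
        visit(*Sub);
  }
};

inline bool sumEntryCountsSketch(ProfileSumSketch &Out) {
  return __llvm_ctx_profile_fetch(
      &Out, [](void *Data, const __ctx_profile::ContextNode &Root) -> bool {
        reinterpret_cast<ProfileSumSketch *>(Data)->visit(Root);
        return true; // keep iterating over the remaining roots
      });
}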
diff --git a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
index 44f37d2576320..f6ebe6ab2e50c 100644
--- a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
+++ b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
@@ -1,8 +1,17 @@
 #include "../CtxInstrProfiling.h"
 #include "gtest/gtest.h"
+#include <thread>
 
 using namespace __ctx_profile;
 
+class ContextTest : public ::testing::Test {
+  void SetUp() override { memset(&Root, 0, sizeof(ContextRoot)); }
+  void TearDown() override { __llvm_ctx_profile_free(); }
+
+public:
+  ContextRoot Root;
+};
+
 TEST(ArenaTest, Basic) {
   Arena *A = Arena::allocateNewArena(1024);
   EXPECT_EQ(A->size(), 1024U);
@@ -20,3 +29,186 @@ TEST(ArenaTest, Basic) {
   Arena::freeArenaList(A);
   EXPECT_EQ(A, nullptr);
 }
+
+TEST_F(ContextTest, Basic) {
+  auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+  ASSERT_NE(Ctx, nullptr);
+  EXPECT_NE(Root.CurrentMem, nullptr);
+  EXPECT_EQ(Root.FirstMemBlock, Root.CurrentMem);
+  EXPECT_EQ(Ctx->size(), sizeof(ContextNode) + 10 * sizeof(uint64_t) +
+                             4 * sizeof(ContextNode *));
+  EXPECT_EQ(Ctx->counters_size(), 10U);
+  EXPECT_EQ(Ctx->callsites_size(), 4U);
+  EXPECT_EQ(__llvm_ctx_profile_current_context_root, &Root);
+  Root.Taken.CheckLocked();
+  EXPECT_FALSE(Root.Taken.TryLock());
+  __llvm_ctx_profile_release_context(&Root);
+  EXPECT_EQ(__llvm_ctx_profile_current_context_root, nullptr);
+  EXPECT_TRUE(Root.Taken.TryLock());
+  Root.Taken.Unlock();
+}
+
+TEST_F(ContextTest, Callsite) {
+  auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+  int FakeCalleeAddress = 0;
+  const bool IsScratch = isScratch(Ctx);
+  EXPECT_FALSE(IsScratch);
+  // This is the sequence the caller performs - it's the lowering of the
+  // instrumentation of callsite "2"; the index 2 is arbitrary here.
+  __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
+  __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+  // This is what the callee does.
+  auto *Subctx = __llvm_ctx_profile_get_context(&FakeCalleeAddress, 2, 3, 1);
+  // We expect the subcontext to be appropriately placed and dimensioned.
+  EXPECT_EQ(Ctx->subContexts()[2], Subctx);
+  EXPECT_EQ(Subctx->counters_size(), 3U);
+  EXPECT_EQ(Subctx->callsites_size(), 1U);
+  // We reset these in _get_context.
+  EXPECT_EQ(__llvm_ctx_profile_expected_callee[0], nullptr);
+  EXPECT_EQ(__llvm_ctx_profile_callsite[0], nullptr);
+
+  EXPECT_EQ(Subctx->size(), sizeof(ContextNode) + 3 * sizeof(uint64_t) +
+                                1 * sizeof(ContextNode *));
+  __llvm_ctx_profile_release_context(&Root);
+}
+
+TEST_F(ContextTest, ScratchNoCollection) {
+  EXPECT_EQ(__llvm_ctx_profile_current_context_root, nullptr);
+  int FakeCalleeAddress = 0;
+  // This would be the very first function executing this. The TLS is empty,
+  // too.
+  auto *Ctx = __llvm_ctx_profile_get_context(&FakeCalleeAddress, 2, 3, 1);
+  // We never entered a context (_start_context was never called) - so the
+  // returned context must be scratch.
+  EXPECT_TRUE(isScratch(Ctx));
+}
+
+TEST_F(ContextTest, ScratchDuringCollection) {
+  auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+  int FakeCalleeAddress = 0;
+  int OtherFakeCalleeAddress = 0;
+  __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
+  __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+  auto *Subctx =
+      __llvm_ctx_profile_get_context(&OtherFakeCalleeAddress, 2, 3, 1);
+  // We expected a different callee - so we return scratch. It mimics what
+  // happens in the case of a signal handler - in this case,
+  // OtherFakeCalleeAddress is the signal handler.
+  EXPECT_TRUE(isScratch(Subctx));
+  EXPECT_EQ(__llvm_ctx_profile_expected_callee[0], nullptr);
+  EXPECT_EQ(__llvm_ctx_profile_callsite[0], nullptr);
+
+  int ThirdFakeCalleeAddress = 0;
+  __llvm_ctx_profile_expected_callee[1] = &ThirdFakeCalleeAddress;
+  __llvm_ctx_profile_callsite[1] = &Subctx->subContexts()[0];
+
+  auto *Subctx2 =
+      __llvm_ctx_profile_get_context(&ThirdFakeCalleeAddress, 3, 0, 0);
+  // We again expect scratch because the '0' position is where the runtime
+  // looks, so it doesn't matter that the '1' position is populated correctly.
+  EXPECT_TRUE(isScratch(Subctx2));
+
+  __llvm_ctx_profile_expected_callee[0] = &ThirdFakeCalleeAddress;
+  __llvm_ctx_profile_callsite[0] = &Subctx->subContexts()[0];
+  auto *Subctx3 =
+      __llvm_ctx_profile_get_context(&ThirdFakeCalleeAddress, 3, 0, 0);
+  // We expect scratch here, too, because the value placed in
+  // __llvm_ctx_profile_callsite is scratch.
+  EXPECT_TRUE(isScratch(Subctx3));
+
+  __llvm_ctx_profile_release_context(&Root);
+}
+
+TEST_F(ContextTest, NeedMoreMemory) {
+  auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+  int FakeCalleeAddress = 0;
+  const bool IsScratch = isScratch(Ctx);
+  EXPECT_FALSE(IsScratch);
+  const auto *CurrentMem = Root.CurrentMem;
+  __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
+  __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+  // Allocate a massive subcontext to force new arena allocation.
+  auto *Subctx =
+      __llvm_ctx_profile_get_context(&FakeCalleeAddress, 3, 1 << 20, 1);
+  EXPECT_EQ(Ctx->subContexts()[2], Subctx);
+  EXPECT_NE(CurrentMem, Root.CurrentMem);
+  EXPECT_NE(Root.CurrentMem, nullptr);
+}
+
+TEST_F(ContextTest, ConcurrentRootCollection) {
+  std::atomic<int> NonScratch = 0;
+  std::atomic<int> Executions = 0;
+
+  __sanitizer::Semaphore GotCtx;
+
+  auto Entrypoint = [&]() {
+    ++Executions;
+    auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+    GotCtx.Post();
+    const bool IS = isScratch(Ctx);
+    NonScratch += (!IS);
+    if (!IS) {
+      GotCtx.Wait();
+      GotCtx.Wait();
+    }
+    __llvm_ctx_profile_release_context(&Root);
+  };
+  std::thread T1(Entrypoint);
+  std::thread T2(Entrypoint);
+  T1.join();
+  T2.join();
+  EXPECT_EQ(NonScratch, 1);
+  EXPECT_EQ(Executions, 2);
+}
+
+TEST_F(ContextTest, Dump) {
+  auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+  int FakeCalleeAddress = 0;
+  __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
+  __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+  auto *Subctx = __llvm_ctx_profile_get_context(&FakeCalleeAddress, 2, 3, 1);
+  (void)Subctx;
+  __llvm_ctx_profile_release_context(&Root);
+
+  struct Writer {
+    ContextRoot *const Root;
+    const size_t Entries;
+    bool State = false;
+    Writer(ContextRoot *Root, size_t Entries) : Root(Root), Entries(Entries) {}
+
+    bool write(const ContextNode &Node) {
+      EXPECT_FALSE(Root->Taken.TryLock());
+      EXPECT_EQ(Node.guid(), 1);
+      EXPECT_EQ(Node.counters()[0], Entries);
+      EXPECT_EQ(Node.counters_size(), 10);
+      EXPECT_EQ(Node.callsites_size(), 4);
+      EXPECT_EQ(Node.subContexts()[0], nullptr);
+      EXPECT_EQ(Node.subContexts()[1], nullptr);
+      EXPECT_NE(Node.subContexts()[2], nullptr);
+      EXPECT_EQ(Node.subContexts()[3], nullptr);
+      const auto &SN = *Node.subContexts()[2];
+      EXPECT_EQ(SN.guid(), 2);
+      EXPECT_EQ(SN.counters()[0], Entries);
+      EXPECT_EQ(SN.counters_size(), 3);
+      EXPECT_EQ(SN.callsites_size(), 1);
+      EXPECT_EQ(SN.subContexts()[0], nullptr);
+      State = true;
+      return true;
+    }
+  };
+  Writer W(&Root, 1);
+  EXPECT_FALSE(W.State);
+  __llvm_ctx_profile_fetch(&W, [](void *W, const ContextNode &Node) -> bool {
+    return reinterpret_cast<Writer *>(W)->write(Node);
+  });
+  EXPECT_TRUE(W.State);
+
+  // This resets all counters but not the internal structure.
+  __llvm_ctx_profile_start_collection();
+  Writer W2(&Root, 0);
+  EXPECT_FALSE(W2.State);
+  __llvm_ctx_profile_fetch(&W2, [](void *W, const ContextNode &Node) -> bool {
+    return reinterpret_cast<Writer *>(W)->write(Node);
+  });
+  EXPECT_TRUE(W2.State);
+}
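// A closing sketch (hypothetical driver, not part of the patch) of the coarse
// collection lifecycle the Dump test above exercises piecewise.
static void collectionLifecycleSketch() {
  // Zero all counters while keeping the accumulated context tree structure.
  __llvm_ctx_profile_start_collection();
  // ... run the instrumented workload for the desired interval ...
  // Serialize with __llvm_ctx_profile_fetch (see the Writer in the Dump test),
  // then release every arena once profiling is done for good.
  __llvm_ctx_profile_free();
}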