Commit ccf765c

[compiler-rt][ctx_profile] Add the instrumented contextual profiling APIs (#89838)
APIs for contextual profiling. `ContextNode` is the call-context-specific counter buffer. `ContextRoot` is associated with the functions that constitute roots into interesting call graphs; it is the object off which we hang the `Arena`s used to allocate `ContextNode`s, as well as the `ContextNode` corresponding to such a function. Graphs of `ContextNode`s are accessible by one thread at a time. (Tracking issue: #89287; more details in the RFC referenced there.)
1 parent a5044e6 commit ccf765c
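
To make the shape of these APIs concrete, here is a hand-written sketch of roughly how instrumented code might drive them. This is illustrative only: the real calls are emitted by the compiler's instrumentation lowering, and the GUID, the counter/callsite counts, the statically allocated root, and the counter-increment accessor used below are assumptions for the example, not part of this commit.

// Illustrative sketch (not from the commit): approximately what
// compiler-emitted instrumentation could look like when using these APIs.
#include "CtxInstrProfiling.h"

using namespace __ctx_profile;

static ContextRoot DemoRoot = {}; // hypothetical per-root storage

extern "C" void Callee(); // some other instrumented function

void DemoRootFunction() {
  // Entering a root: get its ContextNode, or a scratch context if another
  // thread already owns this root.
  ContextNode *Ctx = __llvm_ctx_profile_start_context(
      &DemoRoot, /*Guid=*/0x1234, /*Counters=*/2, /*Callsites=*/1);
  ++Ctx->counters()[0]; // e.g. an entry counter (assumed accessor usage)

  // Before the call site, publish the expected callee and where the callee
  // should link its subcontext; the callee's own
  // __llvm_ctx_profile_get_context call consumes these TLS slots.
  __llvm_ctx_profile_expected_callee[0] = reinterpret_cast<void *>(&Callee);
  __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[0];
  Callee();

  __llvm_ctx_profile_release_context(&DemoRoot);
}

A thread that fails to take `Root->Taken` in `__llvm_ctx_profile_start_context` gets the scratch context back, so the counter writes above remain memory safe but are simply discarded.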

File tree

3 files changed: +681 -2 lines changed

compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp

Lines changed: 281 additions & 2 deletions
@@ -10,20 +10,115 @@
 #include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_dense_map.h"
+#include "sanitizer_common/sanitizer_libc.h"
 #include "sanitizer_common/sanitizer_mutex.h"
 #include "sanitizer_common/sanitizer_placement_new.h"
 #include "sanitizer_common/sanitizer_thread_safety.h"
+#include "sanitizer_common/sanitizer_vector.h"
 
 #include <assert.h>
 
 using namespace __ctx_profile;
 
+namespace {
+// Keep track of all the context roots we actually saw, so we can then traverse
+// them when the user asks for the profile in __llvm_ctx_profile_fetch.
+__sanitizer::SpinMutex AllContextsMutex;
+SANITIZER_GUARDED_BY(AllContextsMutex)
+__sanitizer::Vector<ContextRoot *> AllContextRoots;
+
+// Utility to taint a pointer by setting the LSB. There is an assumption
+// throughout that the addresses of contexts are even (really, they should be
+// align(8), but "even"-ness is the minimum assumption).
+// "Scratch contexts" are buffers that we return in certain cases - they are
+// large enough to allow for memory-safe counter access, but they don't link
+// subcontexts below them (the runtime recognizes them and enforces that).
+ContextNode *markAsScratch(const ContextNode *Ctx) {
+  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
+}
+
+// Used when getting the data from TLS. We don't *really* need to reset, but
+// it's a simpler system if we do.
+template <typename T> inline T consume(T &V) {
+  auto R = V;
+  V = {0};
+  return R;
+}
+
+// We allocate at least kBuffSize Arena pages. The scratch buffer is also that
+// large.
+constexpr size_t kPower = 20;
+constexpr size_t kBuffSize = 1 << kPower;
+
+// Highly unlikely we need more than kBuffSize for a context.
+size_t getArenaAllocSize(size_t Needed) {
+  if (Needed >= kBuffSize)
+    return 2 * Needed;
+  return kBuffSize;
+}
+
+// Verify the structural integrity of the context.
+bool validate(const ContextRoot *Root) {
+  // All contexts should be laid out in some arena page. Go over each arena
+  // allocated for this Root, and jump over contained contexts based on
+  // self-reported sizes.
+  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
+  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
+    const auto *Pos = Mem->start();
+    while (Pos < Mem->pos()) {
+      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
+      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
+               .second)
+        return false;
+      Pos += Ctx->size();
+    }
+  }
+
+  // Now traverse the contexts again the same way, but validate that all
+  // non-null subcontext addresses appear in the set computed above.
+  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
+    const auto *Pos = Mem->start();
+    while (Pos < Mem->pos()) {
+      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
+      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
+        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
+          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
+            return false;
+
+      Pos += Ctx->size();
+    }
+  }
+  return true;
+}
+} // namespace
+
+// The scratch buffer - what we give when we can't produce a real context (the
+// scratch isn't "real" in that it's expected to be clobbered carelessly - we
+// don't read it). The other important thing is that the callees from a scratch
+// context also get a scratch context.
+// Eventually this can be replaced with per-function buffers, a la the typical
+// (flat) instrumented FDO buffers. The clobbering aspect won't apply there, but
+// the part about determining the nature of the subcontexts does.
+__thread char __Buffer[kBuffSize] = {0};
+
+#define TheScratchContext                                                      \
+  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))
+
+// Init the TLSes.
+__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
+                                                                 nullptr};
+__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};
+
+__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
+    nullptr;
+
 // FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
 // the dependency on the latter.
 Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
   assert(!Prev || Prev->Next == nullptr);
-  Arena *NewArena =
-      new (__sanitizer::InternalAlloc(Size + sizeof(Arena))) Arena(Size);
+  Arena *NewArena = new (__sanitizer::InternalAlloc(
+      Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
+      Arena(Size);
   if (Prev)
     Prev->Next = NewArena;
   return NewArena;
@@ -38,3 +133,187 @@ void Arena::freeArenaList(Arena *&A) {
   }
   A = nullptr;
 }
+
+inline ContextNode *ContextNode::alloc(char *Place, GUID Guid,
+                                       uint32_t NrCounters,
+                                       uint32_t NrCallsites,
+                                       ContextNode *Next) {
+  assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
+  return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next);
+}
+
+void ContextNode::reset() {
+  // FIXME(mtrofin): this is std::memset, which we can probably use if we
+  // drop/reduce the dependency on sanitizer_common.
+  for (uint32_t I = 0; I < NrCounters; ++I)
+    counters()[I] = 0;
+  for (uint32_t I = 0; I < NrCallsites; ++I)
+    for (auto *Next = subContexts()[I]; Next; Next = Next->Next)
+      Next->reset();
+}
+
+// If this is the first time we hit a callsite with this (Guid) particular
+// callee, we need to allocate.
+ContextNode *getCallsiteSlow(uint64_t Guid, ContextNode **InsertionPoint,
+                             uint32_t NrCounters, uint32_t NrCallsites) {
+  auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites);
+  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
+  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
+  if (!AllocPlace) {
+    // If we failed to allocate on the current arena, allocate a new arena,
+    // and place it on __llvm_ctx_profile_current_context_root->CurrentMem so we
+    // find it from now on for other cases when we need to getCallsiteSlow.
+    // Note that allocateNewArena will link the allocated memory in the list of
+    // Arenas.
+    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
+        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
+    AllocPlace = Mem->tryBumpAllocate(AllocSize);
+  }
+  auto *Ret = ContextNode::alloc(AllocPlace, Guid, NrCounters, NrCallsites,
+                                 *InsertionPoint);
+  *InsertionPoint = Ret;
+  return Ret;
+}
+
+ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
+                                            uint32_t NrCounters,
+                                            uint32_t NrCallsites) {
+  // Fast "out" if we're not even doing contextual collection.
+  if (!__llvm_ctx_profile_current_context_root)
+    return TheScratchContext;
+
+  // Also fast "out" if the caller is scratch. We can tell by looking at the
+  // interior pointer into the subcontexts vector that the caller provided: if
+  // the caller's context is scratch, so is that interior pointer (because all
+  // the address calculations use even - more precisely, 8-aligned - values).
+  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
+  if (!CallsiteContext || isScratch(CallsiteContext))
+    return TheScratchContext;
+
+  // If the callee isn't the expected one, return scratch.
+  // Signal handler(s) could have been invoked at any point in the execution.
+  // Should that have happened, and had it (the handler) been built with
+  // instrumentation, its __llvm_ctx_profile_get_context would have failed here.
+  // Its sub call graph would have then populated
+  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
+  // The normal call graph may be impacted in that, if the signal handler
+  // happened somewhere before we read the TLS here, we'd see the TLS reset and
+  // we'd also fail here. That would just mean we would lose counter values for
+  // the normal subgraph, this time around. That should be very unlikely, but if
+  // it happens too frequently, we should be able to detect discrepancies in
+  // entry counts (caller-callee). At the moment, the design goes on the
+  // assumption that this is so infrequent that it's not worth doing more for
+  // that case.
+  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
+  if (ExpectedCallee != Callee)
+    return TheScratchContext;
+
+  auto *Callsite = *CallsiteContext;
+  // In the case of indirect calls, all the targets seen so far form a linked
+  // list here. Find the one corresponding to this callee.
+  while (Callsite && Callsite->guid() != Guid) {
+    Callsite = Callsite->next();
+  }
+  auto *Ret = Callsite ? Callsite
+                       : getCallsiteSlow(Guid, CallsiteContext, NrCounters,
+                                         NrCallsites);
+  if (Ret->callsites_size() != NrCallsites ||
+      Ret->counters_size() != NrCounters)
+    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
+                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
+                        Ret, Guid, NrCallsites, NrCounters, Ret->guid(),
+                        Ret->callsites_size(), Ret->counters_size());
+  Ret->onEntry();
+  return Ret;
+}
+
+// This should be called once for a Root. Allocate the first arena, set up the
+// first context.
+void setupContext(ContextRoot *Root, GUID Guid, uint32_t NrCounters,
+                  uint32_t NrCallsites) {
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+  // Re-check - we got here without having taken the lock.
+  if (Root->FirstMemBlock)
+    return;
+  const auto Needed = ContextNode::getAllocSize(NrCounters, NrCallsites);
+  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
+  Root->FirstMemBlock = M;
+  Root->CurrentMem = M;
+  Root->FirstNode = ContextNode::alloc(M->tryBumpAllocate(Needed), Guid,
+                                       NrCounters, NrCallsites);
+  AllContextRoots.PushBack(Root);
+}
+
+ContextNode *__llvm_ctx_profile_start_context(
+    ContextRoot *Root, GUID Guid, uint32_t Counters,
+    uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  if (!Root->FirstMemBlock) {
+    setupContext(Root, Guid, Counters, Callsites);
+  }
+  if (Root->Taken.TryLock()) {
+    __llvm_ctx_profile_current_context_root = Root;
+    Root->FirstNode->onEntry();
+    return Root->FirstNode;
+  }
+  // If this thread couldn't take the lock, return the scratch context.
+  __llvm_ctx_profile_current_context_root = nullptr;
+  return TheScratchContext;
+}
+
+void __llvm_ctx_profile_release_context(ContextRoot *Root)
+    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  if (__llvm_ctx_profile_current_context_root) {
+    __llvm_ctx_profile_current_context_root = nullptr;
+    Root->Taken.Unlock();
+  }
+}
+
+void __llvm_ctx_profile_start_collection() {
+  size_t NrMemUnits = 0;
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
+    auto *Root = AllContextRoots[I];
+    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
+        &Root->Taken);
+    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
+      ++NrMemUnits;
+
+    Root->FirstNode->reset();
+  }
+  __sanitizer::Printf("[ctxprof] Initial NrMemUnits: %zu \n", NrMemUnits);
+}
+
+bool __llvm_ctx_profile_fetch(
+    void *Data, bool (*Writer)(void *W, const __ctx_profile::ContextNode &)) {
+  assert(Writer);
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+
+  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
+    auto *Root = AllContextRoots[I];
+    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
+        &Root->Taken);
+    if (!validate(Root)) {
+      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
+      return false;
+    }
+    if (!Writer(Data, *Root->FirstNode))
+      return false;
+  }
+  return true;
+}
+
+void __llvm_ctx_profile_free() {
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
+    for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
+      auto *C = A;
+      A = A->next();
+      __sanitizer::InternalFree(C);
+    }
+  AllContextRoots.Reset();
+}
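
For completeness, a minimal sketch of how a consumer might drain the collected profile through `__llvm_ctx_profile_fetch`, based only on the signatures shown above. The printing writer, the const-callable accessors it relies on, and the surrounding driver function are assumptions for illustration; the actual serialization format is defined elsewhere, not in this commit.

// Illustrative consumer (not from the commit): visit each root's ContextNode
// via the Writer callback and print a few fields.
#include <cstdio>

#include "CtxInstrProfiling.h"

namespace {
// Hypothetical writer: return true to keep iterating over the remaining roots.
bool PrintRoot(void *Data, const __ctx_profile::ContextNode &Root) {
  auto *Out = static_cast<FILE *>(Data);
  fprintf(Out, "root guid=%llu counters=%u callsites=%u\n",
          static_cast<unsigned long long>(Root.guid()), Root.counters_size(),
          Root.callsites_size());
  return true;
}
} // namespace

void DumpContextualProfile() {
  __llvm_ctx_profile_start_collection(); // reset counters before the run
  // ... run the workload of interest ...
  if (!__llvm_ctx_profile_fetch(stdout, PrintRoot))
    fprintf(stderr, "[demo] contextual profile failed validation\n");
  __llvm_ctx_profile_free(); // release all arenas once done
}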
