diff --git a/ddprof-lib/src/main/cpp/flightRecorder.cpp b/ddprof-lib/src/main/cpp/flightRecorder.cpp
index 12472a29..ab233e64 100644
--- a/ddprof-lib/src/main/cpp/flightRecorder.cpp
+++ b/ddprof-lib/src/main/cpp/flightRecorder.cpp
@@ -318,7 +318,7 @@ char *Recording::_jvm_flags = NULL;
 char *Recording::_java_command = NULL;
 
 Recording::Recording(int fd, Arguments &args)
-    : _fd(fd), _thread_set(), _method_map() {
+    : _fd(fd), _method_map() {
   args.save(_args);
   _chunk_start = lseek(_fd, 0, SEEK_END);
@@ -330,6 +330,8 @@ Recording::Recording(int fd, Arguments &args)
   _bytes_written = 0;
   _tid = OS::threadId();
 
+  _active_index.store(0, std::memory_order_relaxed);
+
   VM::jvmti()->GetAvailableProcessors(&_available_processors);
 
   writeHeader(_buf);
@@ -1059,11 +1061,18 @@ void Recording::writeExecutionModes(Buffer *buf) {
 }
 
 void Recording::writeThreads(Buffer *buf) {
-  addThread(_tid);
-  std::vector<int> threads;
-  threads.reserve(_thread_set.size());
-  _thread_set.collect(threads);
-  _thread_set.clear();
+  int old_index = _active_index.fetch_xor(1, std::memory_order_acq_rel);
+  // After the flip, new samples go into the new active set;
+  // we flush from old_index, the previously active set.
+
+  std::unordered_set<int> threads;
+  threads.insert(_tid);
+
+  for (int i = 0; i < CONCURRENCY_LEVEL; ++i) {
+    // Collect thread IDs from the fixed-size tables into the main set
+    _thread_ids[i][old_index].collect(threads);
+    _thread_ids[i][old_index].clear();
+  }
 
   Profiler *profiler = Profiler::instance();
   ThreadInfo *t_info = &profiler->_thread_info;
@@ -1072,15 +1081,15 @@
   buf->putVar64(T_THREAD);
   buf->putVar64(threads.size());
-  for (int i = 0; i < threads.size(); i++) {
+  for (auto tid : threads) {
     const char *thread_name;
     jlong thread_id;
-    std::pair<std::shared_ptr<std::string>, u64> info = t_info->get(threads[i]);
+    std::pair<std::shared_ptr<std::string>, u64> info = t_info->get(tid);
     if (info.first) {
       thread_name = info.first->c_str();
       thread_id = info.second;
     } else {
-      snprintf(name_buf, sizeof(name_buf), "[tid=%d]", threads[i]);
+      snprintf(name_buf, sizeof(name_buf), "[tid=%d]", tid);
       thread_name = name_buf;
       thread_id = 0;
     }
@@ -1090,9 +1099,9 @@
         (thread_id == 0 ? length + 1 : 2 * length) - 3 * 10; // 3x max varint length
     flushIfNeeded(buf, required);
-    buf->putVar64(threads[i]);
+    buf->putVar64(tid);
     buf->putUtf8(thread_name, length);
-    buf->putVar64(threads[i]);
+    buf->putVar64(tid);
     if (thread_id == 0) {
       buf->put8(0);
     } else {
@@ -1442,7 +1451,11 @@ void Recording::recordCpuLoad(Buffer *buf, float proc_user, float proc_system,
   flushIfNeeded(buf);
 }
 
-void Recording::addThread(int tid) { _thread_set.add(tid); }
+// Assumes the caller already holds the recording lock that corresponds
+// to lock_index.
+void Recording::addThread(int lock_index, int tid) {
+  int active = _active_index.load(std::memory_order_acquire);
+  _thread_ids[lock_index][active].insert(tid);
+}
 
 Error FlightRecorder::start(Arguments &args, bool reset) {
   const char *file = args.file();
@@ -1599,7 +1612,7 @@ void FlightRecorder::recordEvent(int lock_index, int tid, u32 call_trace_id,
       break;
     }
     _rec->flushIfNeeded(buf);
-    _rec->addThread(tid);
+    _rec->addThread(lock_index, tid);
   }
 }
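The writeThreads()/addThread() pair above is a double-buffer handoff: samplers insert into the active side while the flusher xor-flips the index and drains the side that was active before. A minimal sketch of the pattern, simplified to a single table and with std::unordered_set standing in for the signal-safe ThreadIdTable:

#include <atomic>
#include <unordered_set>

// Sketch only: one table instead of CONCURRENCY_LEVEL tables, and a
// std::unordered_set standing in for the signal-safe ThreadIdTable.
class DoubleBufferedTids {
  std::unordered_set<int> _side[2];
  std::atomic<int> _active{0}; // index of the side samplers write to

public:
  void add(int tid) {
    // hot path: insert into the currently active side
    _side[_active.load(std::memory_order_acquire)].insert(tid);
  }

  template <typename Consume>
  void flush(Consume&& consume) {
    // flip first: new add() calls land in the other side from now on
    int old_index = _active.fetch_xor(1, std::memory_order_acq_rel);
    for (int tid : _side[old_index]) {
      consume(tid);
    }
    _side[old_index].clear();
  }
};

int main() {
  DoubleBufferedTids tids;
  tids.add(42); // 42 is an illustrative tid
  tids.flush([](int) { /* write one T_THREAD constant-pool entry */ });
}

fetch_xor(1) returns the previous index, so the flusher drains exactly the side samplers were writing to before the flip; a sample that races with the flip may land in the already-drained side, and is simply picked up by the next flush.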
diff --git a/ddprof-lib/src/main/cpp/flightRecorder.h b/ddprof-lib/src/main/cpp/flightRecorder.h
index 22c4b3d3..541c4db1 100644
--- a/ddprof-lib/src/main/cpp/flightRecorder.h
+++ b/ddprof-lib/src/main/cpp/flightRecorder.h
@@ -18,6 +18,7 @@
 #define _FLIGHTRECORDER_H
 
 #include
+#include <atomic>
 #include
 #include
@@ -34,6 +35,7 @@
 #include "mutex.h"
 #include "objectSampler.h"
 #include "threadFilter.h"
+#include "threadIdTable.h"
 #include "vmEntry.h"
 
 const u64 MAX_JLONG = 0x7fffffffffffffffULL;
@@ -127,9 +129,13 @@ class Recording {
   static char *_java_command;
 
   RecordingBuffer _buf[CONCURRENCY_LEVEL];
+  // Several tables reduce lock contention; the second dimension allows
+  // switching the active table while the previous one is being flushed.
+  ThreadIdTable _thread_ids[CONCURRENCY_LEVEL][2];
+  std::atomic<int> _active_index{0}; // 0 or 1, shared by all tables
+
   int _fd;
   off_t _chunk_start;
-  ThreadFilter _thread_set;
   MethodMap _method_map;
 
   Arguments _args;
@@ -258,7 +264,8 @@ class Recording {
                  LockEvent *event);
   void recordCpuLoad(Buffer *buf, float proc_user, float proc_system,
                      float machine_total);
-  void addThread(int tid);
+
+  void addThread(int lock_index, int tid);
 };
 
 class Lookup {
diff --git a/ddprof-lib/src/main/cpp/javaApi.cpp b/ddprof-lib/src/main/cpp/javaApi.cpp
index 017f23c4..3bff4e8d 100644
--- a/ddprof-lib/src/main/cpp/javaApi.cpp
+++ b/ddprof-lib/src/main/cpp/javaApi.cpp
@@ -123,19 +123,101 @@ Java_com_datadoghq_profiler_JavaProfiler_getSamples(JNIEnv *env,
   return (jlong)Profiler::instance()->total_samples();
 }
 
+// There is some duplication between add and remove; it is kept to avoid an
+// extra branch in the hot path.
+extern "C" DLLEXPORT void JNICALL
+Java_com_datadoghq_profiler_JavaProfiler_filterThreadAdd0(JNIEnv *env,
+                                                          jobject unused) {
+  ProfiledThread *current = ProfiledThread::current();
+  if (unlikely(current == nullptr)) {
+    assert(false);
+    return;
+  }
+  int tid = current->tid();
+  if (unlikely(tid < 0)) {
+    return;
+  }
+  ThreadFilter *thread_filter = Profiler::instance()->threadFilter();
+  if (unlikely(!thread_filter->enabled())) {
+    return;
+  }
+
+  int slot_id = current->filterSlotId();
+  if (unlikely(slot_id == -1)) {
+    // Thread doesn't have a slot ID yet (e.g., main thread), so register it
+    slot_id = thread_filter->registerThread();
+    current->setFilterSlotId(slot_id);
+  }
+
+  if (unlikely(slot_id == -1)) {
+    return; // Failed to register thread
+  }
+  thread_filter->add(tid, slot_id);
+}
+
+extern "C" DLLEXPORT void JNICALL
+Java_com_datadoghq_profiler_JavaProfiler_filterThreadRemove0(JNIEnv *env,
+                                                             jobject unused) {
+  ProfiledThread *current = ProfiledThread::current();
+  if (unlikely(current == nullptr)) {
+    assert(false);
+    return;
+  }
+  int tid = current->tid();
+  if (unlikely(tid < 0)) {
+    return;
+  }
+  ThreadFilter *thread_filter = Profiler::instance()->threadFilter();
+  if (unlikely(!thread_filter->enabled())) {
+    return;
+  }
+
+  int slot_id = current->filterSlotId();
+  if (unlikely(slot_id == -1)) {
+    // Thread doesn't have a slot ID yet - nothing to remove
+    return;
+  }
+  thread_filter->remove(slot_id);
+}
+
+// Backward compatibility for existing code
 extern "C" DLLEXPORT void JNICALL
 Java_com_datadoghq_profiler_JavaProfiler_filterThread0(JNIEnv *env,
                                                        jobject unused,
                                                        jboolean enable) {
-  int tid = ProfiledThread::currentTid();
-  if (tid < 0) {
+  ProfiledThread *current = ProfiledThread::current();
+  if (unlikely(current == nullptr)) {
+    assert(false);
+    return;
+  }
+  int tid = current->tid();
+  if (unlikely(tid < 0)) {
     return;
   }
   ThreadFilter *thread_filter = Profiler::instance()->threadFilter();
+  if (unlikely(!thread_filter->enabled())) {
+    return;
+  }
+
+  int slot_id = current->filterSlotId();
+  if (unlikely(slot_id == -1)) {
+    if (enable) {
+      // Thread doesn't have a slot ID yet, so register it
+      slot_id = thread_filter->registerThread();
+      current->setFilterSlotId(slot_id);
+    } else {
+      // Thread doesn't have a slot ID yet - nothing to remove
+      return;
+    }
+  }
+
+  if (unlikely(slot_id == -1)) {
+    return; // Failed to register thread
+  }
+
   if (enable) {
-    thread_filter->add(tid);
+    thread_filter->add(tid, slot_id);
   } else {
-    thread_filter->remove(tid);
+    thread_filter->remove(slot_id);
   }
 }
 
@@ -406,24 +488,3 @@
 Java_com_datadoghq_profiler_JVMAccess_healthCheck0(JNIEnv *env,
                                                    jobject unused) {
   return true;
 }
-
-extern "C" DLLEXPORT jlong JNICALL
-Java_com_datadoghq_profiler_ActiveBitmap_bitmapAddressFor0(JNIEnv *env,
-                                                           jclass unused,
-                                                           jint tid) {
-  u64* bitmap = Profiler::instance()->threadFilter()->bitmapAddressFor((int)tid);
-  return (jlong)bitmap;
-}
-
-extern "C" DLLEXPORT jboolean JNICALL
-Java_com_datadoghq_profiler_ActiveBitmap_isActive0(JNIEnv *env,
-                                                   jclass unused,
-                                                   jint tid) {
-  return Profiler::instance()->threadFilter()->accept((int)tid) ? JNI_TRUE : JNI_FALSE;
-}
-
-extern "C" DLLEXPORT jlong JNICALL
-Java_com_datadoghq_profiler_ActiveBitmap_getActiveCountAddr0(JNIEnv *env,
-                                                             jclass unused) {
-  return (jlong)Profiler::instance()->threadFilter()->addressOfSize();
-}
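All three entry points follow the same slot protocol: a thread registers once, caches the slot ID in its ProfiledThread, and add/remove then only touch that slot. A compilable sketch of the lifecycle against the PR's ThreadFilter API (the tid value is illustrative, error paths elided):

#include <cassert>
#include "threadFilter.h"

// Sketch of the slot-ID lifecycle implemented by the JNI entry points above.
void slotLifecycle(ThreadFilter& filter, int tid) {
  // Once per thread: reserve a slot; the real code caches it via
  // ProfiledThread::setFilterSlotId().
  ThreadFilter::SlotID slot = filter.registerThread();
  if (slot == -1) {
    return; // table full: kMaxThreads slots already live
  }

  filter.add(tid, slot);          // addThread(): publish the tid
  assert(filter.accept(slot));    // samplers now keep this thread

  filter.remove(slot);            // removeThread(): clear the tid, keep the slot
  filter.unregisterThread(slot);  // thread exit: recycle the slot via the free list
}

int main() {
  ThreadFilter filter;
  filter.init("");         // any non-null filter string enables filtering
  slotLifecycle(filter, 42); // 42 is an illustrative tid
  return 0;
}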
diff --git a/ddprof-lib/src/main/cpp/profiler.cpp b/ddprof-lib/src/main/cpp/profiler.cpp
index d268d73c..135b523a 100644
--- a/ddprof-lib/src/main/cpp/profiler.cpp
+++ b/ddprof-lib/src/main/cpp/profiler.cpp
@@ -104,10 +104,12 @@ void Profiler::addRuntimeStub(const void *address, int length,
 
 void Profiler::onThreadStart(jvmtiEnv *jvmti, JNIEnv *jni, jthread thread) {
   ProfiledThread::initCurrentThread();
-
-  int tid = ProfiledThread::currentTid();
+  ProfiledThread *current = ProfiledThread::current();
+  int tid = current->tid();
   if (_thread_filter.enabled()) {
-    _thread_filter.remove(tid);
+    int slot_id = _thread_filter.registerThread();
+    current->setFilterSlotId(slot_id);
+    _thread_filter.remove(slot_id); // Remove from filtering initially
   }
   updateThreadName(jvmti, jni, thread, true);
@@ -116,16 +118,33 @@
 }
 
 void Profiler::onThreadEnd(jvmtiEnv *jvmti, JNIEnv *jni, jthread thread) {
-  int tid = ProfiledThread::currentTid();
-  if (_thread_filter.enabled()) {
-    _thread_filter.remove(tid);
+  ProfiledThread *current = ProfiledThread::current();
+  int tid = -1;
+
+  if (current != nullptr) {
+    // ProfiledThread is alive - do full cleanup and use efficient tid access
+    int slot_id = current->filterSlotId();
+    tid = current->tid();
+
+    if (_thread_filter.enabled()) {
+      _thread_filter.unregisterThread(slot_id);
+      current->setFilterSlotId(-1);
+    }
+
+    ProfiledThread::release();
+  } else {
+    // ProfiledThread already cleaned up - try to get the tid from JVMTI as a fallback
+    tid = VMThread::nativeThreadId(jni, thread);
+    if (tid < 0) {
+      // No ProfiledThread AND no tid from JVMTI - nothing we can do
+      return;
+    }
   }
-  updateThreadName(jvmti, jni, thread, true);
-
+
+  // These can run as long as we have a valid tid
+  updateThreadName(jvmti, jni, thread, false); // false = not self
   _cpu_engine->unregisterThread(tid);
-  // unregister here because JNI callers generally don't know about thread exits
   _wall_engine->unregisterThread(tid);
-  ProfiledThread::release();
 }
 
 int Profiler::registerThread(int tid) {
diff --git a/ddprof-lib/src/main/cpp/thread.h b/ddprof-lib/src/main/cpp/thread.h
index 264ad6b8..bba5145a 100644
--- a/ddprof-lib/src/main/cpp/thread.h
+++ b/ddprof-lib/src/main/cpp/thread.h
@@ -43,11 +43,12 @@ class ProfiledThread : public ThreadLocalData {
   u32 _wall_epoch;
   u32 _call_trace_id;
   u32 _recording_epoch;
+  int _filter_slot_id; // Slot ID for thread filtering
   UnwindFailures _unwind_failures;
 
   ProfiledThread(int buffer_pos, int tid)
       : ThreadLocalData(), _pc(0), _span_id(0), _crash_depth(0),
         _buffer_pos(buffer_pos), _tid(tid), _cpu_epoch(0),
-        _wall_epoch(0), _call_trace_id(0), _recording_epoch(0) {};
+        _wall_epoch(0), _call_trace_id(0), _recording_epoch(0),
+        _filter_slot_id(-1) {};
 
   void releaseFromBuffer();
@@ -120,6 +121,9 @@
   }
 
   static void signalHandler(int signo, siginfo_t *siginfo, void *ucontext);
+
+  int filterSlotId() const { return _filter_slot_id; }
+  void setFilterSlotId(int slotId) { _filter_slot_id = slotId; }
 };
 
 #endif // _THREAD_H
diff --git a/ddprof-lib/src/main/cpp/threadFilter.cpp b/ddprof-lib/src/main/cpp/threadFilter.cpp
index 13e6c2ae..825fdbeb 100644
--- a/ddprof-lib/src/main/cpp/threadFilter.cpp
+++ b/ddprof-lib/src/main/cpp/threadFilter.cpp
@@ -1,175 +1,270 @@
-/*
- * Copyright 2020 Andrei Pangin
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+// Copyright (C) Datadog 2025
+// High-performance lock-free thread filter implementation
 
 #include "threadFilter.h"
-#include "counters.h"
-#include "os.h"
-#include "reverse_bits.h"
-#include
-#include
-#include
-
-void trackPage() {
-  Counters::increment(THREAD_FILTER_PAGES, 1);
-  Counters::increment(THREAD_FILTER_BYTES, BITMAP_SIZE);
-}
-ThreadFilter::ThreadFilter() {
-  _max_thread_id = OS::getMaxThreadId(128 * 1024);
-  _max_bitmaps = (_max_thread_id + BITMAP_SIZE - 1) / BITMAP_SIZE;
-  u32 capacity = _max_bitmaps * sizeof(u64 *);
-  _bitmap = (u64 **)OS::safeAlloc(capacity);
-  memset(_bitmap, 0, capacity);
-  _bitmap[0] = (u64 *)OS::safeAlloc(BITMAP_SIZE);
-  trackPage();
-  _enabled = false;
-  _size = 0;
+#include <atomic>
+#include <functional>
+#include <memory>
+#include <thread>
+
+ThreadFilter::ShardHead ThreadFilter::_free_heads[ThreadFilter::kShardCount] {};
+
+ThreadFilter::ThreadFilter() : _enabled(false) {
+  // Initialize chunk pointers to null (lazy allocation)
+  for (int i = 0; i < kMaxChunks; ++i) {
+    _chunks[i].store(nullptr, std::memory_order_relaxed);
+  }
+  _free_list = std::make_unique<FreeListNode[]>(kFreeListSize);
+
+  // Initialize the first chunk
+  initializeChunk(0);
+  clear();
 }
 
 ThreadFilter::~ThreadFilter() {
-  for (int i = 0; i < _max_bitmaps; i++) {
-    if (_bitmap[i] != NULL) {
-      OS::safeFree(_bitmap[i], BITMAP_SIZE);
-    }
-  }
-  if (_bitmap) {
-    OS::safeFree(_bitmap, _max_bitmaps * sizeof(u64 *));
-  }
+  // Clean up allocated chunks
+  for (int i = 0; i < kMaxChunks; ++i) {
+    ChunkStorage* chunk = _chunks[i].load(std::memory_order_relaxed);
+    delete chunk;
+  }
 }
 
-void ThreadFilter::init(const char *filter) {
-  if (filter == NULL) {
-    _enabled = false;
-    return;
-  }
+void ThreadFilter::initializeChunk(int chunk_idx) {
+  if (chunk_idx >= kMaxChunks) return;
+
+  // Check if the chunk already exists
+  ChunkStorage* existing = _chunks[chunk_idx].load(std::memory_order_acquire);
+  if (existing != nullptr) return;
 
-  char *end;
-  do {
-    int id = strtol(filter, &end, 0);
-    if (id <= 0) {
-      break;
+  // Allocate and initialize the new chunk completely before publishing it
+  ChunkStorage* new_chunk = new ChunkStorage();
+  for (auto& slot : new_chunk->slots) {
+    slot.value.store(-1, std::memory_order_relaxed);
   }
-    if (*end == '-') {
-      int to = strtol(end + 1, &end, 0);
-      while (id <= to) {
-        add(id++);
-      }
+  // Try to install it atomically
+  ChunkStorage* expected = nullptr;
+  if (_chunks[chunk_idx].compare_exchange_strong(expected, new_chunk,
+                                                 std::memory_order_acq_rel)) {
+    // Successfully installed
   } else {
-      add(id);
+    // Another thread beat us to it - clean up our allocation
+    delete new_chunk;
+  }
+}
+
+ThreadFilter::SlotID ThreadFilter::registerThread() {
+  // First, try to get a slot from the free list (lock-free stack)
+  SlotID reused_slot = popFromFreeList();
+  if (reused_slot >= 0) {
+    return reused_slot;
   }
-    filter = end + 1;
-  } while (*end);
 
+  // Allocate a new slot
+  SlotID index = _next_index.fetch_add(1, std::memory_order_relaxed);
+  if (index >= kMaxThreads) {
+    // Revert the increment and return failure
+    _next_index.fetch_sub(1, std::memory_order_relaxed);
+    return -1;
+  }
-  _enabled = true;
+
+  const int chunk_idx = index >> kChunkShift;
+
+  // Ensure the chunk is initialized (lock-free)
+  if (chunk_idx >= _num_chunks.load(std::memory_order_acquire)) {
+    // Publish the grown chunk count so collect()/clear() can see this chunk.
+    // Always propose chunk_idx + 1: proposing expected_chunks + 1 after a
+    // failed CAS could publish a count that is still too small.
+    int expected_chunks = _num_chunks.load(std::memory_order_acquire);
+    while (expected_chunks <= chunk_idx &&
+           !_num_chunks.compare_exchange_weak(expected_chunks, chunk_idx + 1,
+                                              std::memory_order_acq_rel)) {
+      // expected_chunks is reloaded by the failed CAS; stop once it already
+      // covers chunk_idx (another thread updated it)
+    }
+  }
+
+  // Initialize the chunk if needed
+  initializeChunk(chunk_idx);
+
+  return index;
 }
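Slot IDs are dense integers that decompose into a chunk index (high bits) and an offset within the chunk (low bits). A worked example using the constants from threadFilter.h:

int main() {
  constexpr int kChunkShift = 8;                      // log2(256), from threadFilter.h
  constexpr int kChunkMask  = (1 << kChunkShift) - 1; // 0xFF

  // Slot 517 lives in chunk 2 at offset 5, because 517 = 2 * 256 + 5
  constexpr int slot_id   = 517;
  constexpr int chunk_idx = slot_id >> kChunkShift;   // 517 / 256 = 2
  constexpr int slot_idx  = slot_id & kChunkMask;     // 517 % 256 = 5
  static_assert(chunk_idx == 2 && slot_idx == 5, "decomposition");

  // With kMaxThreads = 2048, at most 2048 / 256 = 8 chunks (kMaxChunks) exist
  return 0;
}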
 
 void ThreadFilter::clear() {
-  for (int i = 0; i < _max_bitmaps; i++) {
-    if (_bitmap[i] != NULL) {
-      memset(_bitmap[i], 0, BITMAP_SIZE);
+  // Clear all initialized chunks
+  int num_chunks = _num_chunks.load(std::memory_order_relaxed);
+  for (int i = 0; i < num_chunks; ++i) {
+    ChunkStorage* chunk = _chunks[i].load(std::memory_order_relaxed);
+    if (chunk != nullptr) {
+      for (auto& slot : chunk->slots) {
+        slot.value.store(-1, std::memory_order_relaxed);
+      }
+    }
   }
-  }
-  _size = 0;
-}
 
-// The mapping has to be reversible: f(f(x)) == x
-int ThreadFilter::mapThreadId(int thread_id) {
-  // We want to map the thread_id inside the same bitmap
-  static_assert(BITMAP_SIZE >= (u16)0xffff, "Potential verflow");
-  u16 lower16 = (u16)(thread_id & 0xffff);
-  lower16 = reverse16(lower16);
-  int tid = (thread_id & ~0xffff) | lower16;
-  return tid;
+  // Clear the free list
+  for (int i = 0; i < kFreeListSize; ++i) {
+    _free_list[i].value.store(-1, std::memory_order_relaxed);
+    _free_list[i].next.store(-1, std::memory_order_relaxed);
+  }
+
+  // Reset the free heads for each shard
+  for (int s = 0; s < kShardCount; ++s) {
+    _free_heads[s].head.store(-1, std::memory_order_relaxed);
+  }
 }
 
-// Get bitmap that contains the thread id, create one if it does not exist
-u64* ThreadFilter::getBitmapFor(int thread_id) {
-  int index = thread_id / BITMAP_CAPACITY;
-  assert(index >= 0 && index < (int)_max_bitmaps);
-  u64* b = _bitmap[index];
-  if (b == NULL) {
-    b = (u64 *)OS::safeAlloc(BITMAP_SIZE);
-    u64 *oldb = __sync_val_compare_and_swap(&_bitmap[index], NULL, b);
-    if (oldb != NULL) {
-      OS::safeFree(b, BITMAP_SIZE);
-      b = oldb;
-    } else {
-      trackPage();
+bool ThreadFilter::accept(SlotID slot_id) const {
+  if (!_enabled) {
+    return true;
   }
-  }
-  return b;
+  if (slot_id < 0) return false;
+
+  int chunk_idx = slot_id >> kChunkShift;
+  int slot_idx = slot_id & kChunkMask;
+
+  if (chunk_idx >= kMaxChunks) return false;
+  if (slot_idx >= kChunkSize) return false;
+
+  ChunkStorage* chunk = _chunks[chunk_idx].load(std::memory_order_relaxed);
+  if (chunk == nullptr) return false; // Fail fast if not allocated
+
+  return chunk->slots[slot_idx].value.load(std::memory_order_acquire) != -1;
 }
 
-u64* ThreadFilter::bitmapAddressFor(int thread_id) {
-  u64* b = getBitmapFor(thread_id);
-  thread_id = mapThreadId(thread_id);
-  assert(b == bitmap(thread_id));
-  return wordAddress(b, thread_id);
+void ThreadFilter::add(int tid, SlotID slot_id) {
+  if (slot_id < 0) return;
+
+  int chunk_idx = slot_id >> kChunkShift;
+  int slot_idx = slot_id & kChunkMask;
+
+  if (chunk_idx >= kMaxChunks) return;
+  if (slot_idx >= kChunkSize) return;
+
+  ChunkStorage* chunk = _chunks[chunk_idx].load(std::memory_order_relaxed);
+  if (chunk == nullptr) return; // Fail fast if not allocated
+
+  // Store the tid
+  chunk->slots[slot_idx].value.store(tid, std::memory_order_release);
 }
 
-bool ThreadFilter::accept(int thread_id) {
-  thread_id = mapThreadId(thread_id);
-  u64 *b = bitmap(thread_id);
-  return b != NULL && (word(b, thread_id) & (1ULL << (thread_id & 0x3f)));
+void ThreadFilter::remove(SlotID slot_id) {
+  if (slot_id < 0) return;
+
+  int chunk_idx = slot_id >> kChunkShift;
+  int slot_idx = slot_id & kChunkMask;
+
+  if (chunk_idx >= kMaxChunks) return;
+  if (slot_idx >= kChunkSize) return;
+
+  ChunkStorage* chunk = _chunks[chunk_idx].load(std::memory_order_relaxed);
+  if (chunk == nullptr) return; // Fail fast if not allocated
+
+  // Remove the tid (release: a plain store cannot use acq_rel ordering)
+  chunk->slots[slot_idx].value.store(-1, std::memory_order_release);
 }
 
-void ThreadFilter::add(int thread_id) {
-  u64 *b = getBitmapFor(thread_id);
-  assert(b != NULL);
-  thread_id = mapThreadId(thread_id);
-  assert(b == bitmap(thread_id));
-  u64 bit = 1ULL << (thread_id & 0x3f);
-  if (!(__atomic_fetch_or(&word(b, thread_id), bit, __ATOMIC_RELAXED) & bit)) {
-    atomicInc(_size);
-  }
+void ThreadFilter::unregisterThread(SlotID slot_id) {
+  if (slot_id < 0) return;
+
+  // Clear the slot first
+  remove(slot_id);
+
+  // Add the slot to the free list for reuse
+  pushToFreeList(slot_id);
+}
 
+bool ThreadFilter::pushToFreeList(SlotID slot_id) {
+  // Lock-free sharded Treiber stack push
+  const int shard = shardOfSlot(slot_id);
+  auto& head = _free_heads[shard].head; // private cache line
+
+  for (int i = 0; i < kFreeListSize; ++i) {
+    int expected = -1;
+    if (_free_list[i].value.compare_exchange_strong(
+            expected, slot_id, std::memory_order_acq_rel)) {
+      // Link the node into this shard's Treiber stack
+      int old_head = head.load(std::memory_order_acquire);
+      do {
+        _free_list[i].next.store(old_head, std::memory_order_relaxed);
+      } while (!head.compare_exchange_weak(old_head, i,
+                                           std::memory_order_acq_rel,
+                                           std::memory_order_relaxed));
+      return true;
+    }
+  }
+  return false; // Free list full; the slot is lost, but this is rare
+}
 
-void ThreadFilter::remove(int thread_id) {
-  thread_id = mapThreadId(thread_id);
-  u64 *b = bitmap(thread_id);
-  if (b == NULL) {
-    return;
-  }
-
-  u64 bit = 1ULL << (thread_id & 0x3f);
-  if (__atomic_fetch_and(&word(b, thread_id), ~bit, __ATOMIC_RELAXED) & bit) {
-    atomicInc(_size, -1);
-  }
+ThreadFilter::SlotID ThreadFilter::popFromFreeList() {
+  // Lock-free sharded Treiber stack pop
+  int hash = static_cast<int>(std::hash<std::thread::id>{}(std::this_thread::get_id()));
+  int start = shardOf(hash);
+
+  for (int pass = 0; pass < kShardCount; ++pass) {
+    int s = (start + pass) & (kShardCount - 1);
+    auto& head = _free_heads[s].head;
+
+    while (true) {
+      int node = head.load(std::memory_order_acquire);
+      if (node == -1) break; // shard empty, try the next one
+
+      int next = _free_list[node].next.load(std::memory_order_relaxed);
+      if (head.compare_exchange_weak(node, next,
+                                     std::memory_order_acq_rel,
+                                     std::memory_order_relaxed)) {
+        int id = _free_list[node].value.exchange(-1, std::memory_order_relaxed);
+        _free_list[node].next.store(-1, std::memory_order_relaxed);
+        return id;
+      }
+    }
+  }
+  return -1; // Empty list
 }
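The free list is a Treiber stack whose links are array indices rather than pointers, so push and pop never allocate. A self-contained single-shard sketch of that index-linked design (the PR additionally shards the head across 64 cache lines and claims a node's value slot with a separate CAS, which narrows the classic ABA window):

#include <atomic>
#include <cassert>

// Index-linked Treiber stack: next-"pointers" are array indices, -1 is null.
// Single shard for clarity.
template <int N>
class IndexStack {
  std::atomic<int> _next[N];
  std::atomic<int> _head{-1};

public:
  IndexStack() {
    for (auto& n : _next) n.store(-1, std::memory_order_relaxed);
  }

  void push(int node) {
    int old_head = _head.load(std::memory_order_acquire);
    do {
      _next[node].store(old_head, std::memory_order_relaxed);
    } while (!_head.compare_exchange_weak(old_head, node,
                                          std::memory_order_acq_rel,
                                          std::memory_order_relaxed));
  }

  int pop() {
    int node = _head.load(std::memory_order_acquire);
    while (node != -1 &&
           !_head.compare_exchange_weak(node,
                                        _next[node].load(std::memory_order_relaxed),
                                        std::memory_order_acq_rel,
                                        std::memory_order_relaxed)) {
      // CAS failure reloads node; retry until empty or detached
    }
    return node; // -1 when empty
  }
};

int main() {
  IndexStack<8> s;
  s.push(3);
  s.push(5);
  assert(s.pop() == 5); // LIFO order
  assert(s.pop() == 3);
  assert(s.pop() == -1);
}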
 
-void ThreadFilter::collect(std::vector<int> &v) {
-  for (int i = 0; i < _max_bitmaps; i++) {
-    u64 *b = _bitmap[i];
-    if (b != NULL) {
-      int start_id = i * BITMAP_CAPACITY;
-      for (int j = 0; j < BITMAP_SIZE / sizeof(u64); j++) {
-        // Considering the functional impact, relaxed could be a reasonable
-        // order here
-        u64 word = __atomic_load_n(&b[j], __ATOMIC_ACQUIRE);
-        while (word != 0) {
-          int tid = start_id + j * 64 + __builtin_ctzl(word);
-          // restore thread id
-          tid = mapThreadId(tid);
-          v.push_back(tid);
-          word &= (word - 1);
+void ThreadFilter::collect(std::vector<int>& tids) const {
+  tids.clear();
+
+  // Reserve a reasonable size up front; the occasional resize is not the
+  // bottleneck here.
+  tids.reserve(512);
+
+  // Scan only initialized chunks
+  int num_chunks = _num_chunks.load(std::memory_order_relaxed);
+  for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
+    ChunkStorage* chunk = _chunks[chunk_idx].load(std::memory_order_relaxed);
+    if (chunk == nullptr) {
+      continue; // Skip unallocated chunks
+    }
+
+    for (const auto& slot : chunk->slots) {
+      int slot_tid = slot.value.load(std::memory_order_acquire);
+      if (slot_tid != -1) {
+        tids.push_back(slot_tid);
+      }
     }
-      }
   }
-  }
+
+  // Optional: shrink if we over-reserved significantly
+  if (tids.capacity() > tids.size() * 2) {
+    tids.shrink_to_fit();
+  }
+}
+
+void ThreadFilter::init(const char* filter) {
+  if (!filter) {
+    _enabled = false;
+    return;
+  }
+
+  // Simple logic: any filter value (including "0") enables filtering, and
+  // only threads explicitly registered via addThread() will be sampled.
+  // The old syntax that manually forced specific thread IDs is no longer
+  // supported.
+  _enabled = true;
+}
+
+bool ThreadFilter::enabled() const {
+  return _enabled;
 }
diff --git a/ddprof-lib/src/main/cpp/threadFilter.h b/ddprof-lib/src/main/cpp/threadFilter.h
index 7454be57..9151f199 100644
--- a/ddprof-lib/src/main/cpp/threadFilter.h
+++ b/ddprof-lib/src/main/cpp/threadFilter.h
@@ -1,85 +1,76 @@
-/*
- * Copyright 2020 Andrei Pangin
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
 #ifndef _THREADFILTER_H
 #define _THREADFILTER_H
 
-#include "arch_dd.h"
+#include <array>
+#include <atomic>
 #include <vector>
+#include <memory>
+#include
 
-// The size of thread ID bitmap in bytes. Must be at least 64K to allow mmap()
-const u32 BITMAP_SIZE = 65536;
-// How many thread IDs one bitmap can hold
-const u32 BITMAP_CAPACITY = BITMAP_SIZE * 8;
-
-// ThreadFilter query operations must be lock-free and signal-safe;
-// update operations are mostly lock-free, except rare bitmap allocations
 class ThreadFilter {
-private:
-  // Total number of bitmaps required to hold the entire range of thread IDs
-  u32 _max_thread_id;
-  u32 _max_bitmaps;
-  u64 **_bitmap;
-  bool _enabled;
-  volatile int _size;
-
-  u64 *bitmap(int thread_id) {
-    if (thread_id >= _max_thread_id) {
-      return NULL;
-    }
-    return __atomic_load_n(
-        &(_bitmap[static_cast<u32>(thread_id) / BITMAP_CAPACITY]),
-        __ATOMIC_ACQUIRE);
-  }
-
-  static int mapThreadId(int thread_id);
-
-  u64 &word(u64 *bitmap, int thread_id) {
-    // todo: add thread safe APIs
-    return bitmap[((u32)thread_id % BITMAP_CAPACITY) >> 6];
-  }
-
-  u64* const wordAddress(u64 *bitmap, int thread_id) const {
-    return &bitmap[((u32)thread_id % BITMAP_CAPACITY) >> 6];
-  }
-
-  u64* getBitmapFor(int thread_id);
 public:
-  ThreadFilter();
-  ThreadFilter(ThreadFilter &threadFilter) = delete;
-  ~ThreadFilter();
-
-  bool enabled() const { return _enabled; }
-
-  int size() const { return _size; }
-  const volatile int* addressOfSize() const { return &_size; }
-
-  void init(const char *filter);
-  void clear();
+  using SlotID = int;
+
+  // Limits chosen for reasonable memory usage
+  static constexpr int kChunkSize = 256;
+  static constexpr int kChunkShift = 8; // log2(256)
+  static constexpr int kChunkMask = kChunkSize - 1;
+  static constexpr int kMaxThreads = 2048;
+  static constexpr int kMaxChunks = (kMaxThreads + kChunkSize - 1) / kChunkSize; // = 8 chunks
+  // High-performance free list using a Treiber stack with 64 shards
+  static constexpr int kFreeListSize = 1024;
+  static constexpr int kShardCount = 64; // power of two
 
+  ThreadFilter();
+  ~ThreadFilter();
+
+  void init(const char* filter);
+  void clear();
+  bool enabled() const;
+  bool accept(SlotID slot_id) const;
+  void add(int tid, SlotID slot_id);
+  void remove(SlotID slot_id);
+  void collect(std::vector<int>& tids) const;
+
+  SlotID registerThread();
+  void unregisterThread(SlotID slot_id);
 
-  inline bool isValid(int thread_id) {
-    return thread_id >= 0 && thread_id < _max_thread_id;
-  }
-
-  bool accept(int thread_id);
-  void add(int thread_id);
-  void remove(int thread_id);
-  u64* bitmapAddressFor(int thread_id);
-
-  void collect(std::vector<int> &v);
+private:
+  // Slot padded to a full cache line to avoid false sharing
+  struct alignas(64) Slot {
+    std::atomic<int> value{-1};
+    char padding[60]; // Pad to cache line size
+  };
+
+  // Lock-free free list using a stack-like structure
+  struct FreeListNode {
+    std::atomic<int> value{-1};
+    std::atomic<int> next{-1};
+  };
+
+  // Pre-allocated chunk storage to eliminate mutex contention
+  struct ChunkStorage {
+    std::array<Slot, kChunkSize> slots;
+  };
+
+  bool _enabled = false;
+
+  // Lazily allocated storage for chunks
+  std::atomic<ChunkStorage*> _chunks[kMaxChunks];
+  std::atomic<int> _num_chunks{1};
+
+  // Lock-free slot allocation
+  std::atomic<int> _next_index{0};
+  std::unique_ptr<FreeListNode[]> _free_list;
+
+  struct alignas(64) ShardHead { std::atomic<int> head{-1}; };
+  static ShardHead _free_heads[kShardCount]; // one cache line each
+
+  static inline int shardOf(int tid) { return tid & (kShardCount - 1); }
+  static inline int shardOfSlot(int s) { return s & (kShardCount - 1); }
+
+  // Helper methods for lock-free operations
+  void initializeChunk(int chunk_idx);
+  bool pushToFreeList(SlotID slot_id);
+  SlotID popFromFreeList();
 };
 
 #endif // _THREADFILTER_H
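Each Slot is aligned and padded to a full cache line so that two threads updating adjacent slots never invalidate each other's line. The layout can be checked at compile time; a sketch mirroring the header's definitions, assuming a 4-byte int:

#include <atomic>

struct alignas(64) Slot {
  std::atomic<int> value{-1};
  char padding[60]; // 4 bytes of atomic<int> + 60 bytes of padding = 64
};

// One slot per cache line: adjacent slots cannot falsely share.
static_assert(alignof(Slot) == 64, "Slot must start on a cache-line boundary");
static_assert(sizeof(Slot) == 64, "Slot must occupy exactly one cache line");

int main() { return 0; }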
diff --git a/ddprof-lib/src/main/cpp/threadIdTable.h b/ddprof-lib/src/main/cpp/threadIdTable.h
new file mode 100644
index 00000000..1856c652
--- /dev/null
+++ b/ddprof-lib/src/main/cpp/threadIdTable.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright The async-profiler authors
+ * SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021, 2025 Datadog, Inc
+ */
+
+#ifndef _THREADIDTABLE_H
+#define _THREADIDTABLE_H
+
+#include <atomic>
+#include <unordered_set>
+
+// Signal-safe thread ID table with fixed size
+class ThreadIdTable {
+private:
+  // We have 256 slots per concurrency level (currently 16).
+  // This should cater for 4096 threads - if it turns out to be too small, we
+  // can increase it or make it configurable.
+  static const int TABLE_SIZE = 256;
+  std::atomic<int> table[TABLE_SIZE];
+
+  int hash(int tid) const {
+    // Simple hash function - could be improved if needed
+    return tid % TABLE_SIZE;
+  }
+
+public:
+  ThreadIdTable() {
+    clear();
+  }
+
+  // Signal-safe insertion using atomic operations only
+  void insert(int tid) {
+    if (tid == 0) return; // Invalid thread ID; 0 is reserved for empty slots
+
+    int start_slot = hash(tid);
+    for (int probe = 0; probe < TABLE_SIZE; probe++) {
+      int slot = (start_slot + probe) % TABLE_SIZE;
+      int expected = 0;
+
+      // Try to claim an empty slot
+      if (table[slot].compare_exchange_strong(expected, tid, std::memory_order_relaxed)) {
+        return; // Successfully inserted
+      }
+
+      // Check if already present
+      if (table[slot].load(std::memory_order_relaxed) == tid) {
+        return; // Already exists
+      }
+    }
+    // Table full - the thread ID will be lost, but this is rare and non-critical.
+    // We could increment a counter here for diagnostics if needed.
+  }
+
+  // Clear the table (not signal-safe, called during buffer switch)
+  void clear() {
+    for (int i = 0; i < TABLE_SIZE; i++) {
+      table[i].store(0, std::memory_order_relaxed);
+    }
+  }
+
+  // Collect all thread IDs into a set (not signal-safe, called during buffer switch)
+  void collect(std::unordered_set<int>& result) {
+    for (int i = 0; i < TABLE_SIZE; i++) {
+      int tid = table[i].load(std::memory_order_relaxed);
+      if (tid != 0) {
+        result.insert(tid);
+      }
+    }
+  }
+};
+
+#endif // _THREADIDTABLE_H
\ No newline at end of file
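insert() uses only bounded linear probing over atomics, with no locks and no allocation, which is what makes it callable from a signal handler; collect() and clear() are reserved for the flush path. A standalone usage sketch (function names and tid values are illustrative):

#include <cstdio>
#include <unordered_set>
#include "threadIdTable.h"

ThreadIdTable g_table;

// Imagine this running inside the sampling signal handler:
// bounded probing over atomics only, so it is async-signal-safe.
void onSample(int tid) {
  g_table.insert(tid);
}

// Runs on the recording thread during a buffer switch, outside any signal.
void onFlush() {
  std::unordered_set<int> tids;
  g_table.collect(tids);
  g_table.clear();
  for (int tid : tids) printf("saw tid %d\n", tid);
}

int main() {
  onSample(42); // illustrative tids
  onSample(42); // duplicate: probing finds it already present
  onSample(7);
  onFlush();    // prints the two distinct tids
}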
diff --git a/ddprof-lib/src/main/cpp/wallClock.cpp b/ddprof-lib/src/main/cpp/wallClock.cpp
index d261cdce..0b16d0db 100644
--- a/ddprof-lib/src/main/cpp/wallClock.cpp
+++ b/ddprof-lib/src/main/cpp/wallClock.cpp
@@ -26,6 +26,7 @@
 #include "vmStructs_dd.h"
 #include
 #include
+#include <algorithm> // For std::sort
 
 std::atomic<bool> BaseWallClock::_enabled{false};
 
@@ -196,6 +197,14 @@ void WallClockJVMTI::timerLoop() {
   bool do_filter = threadFilter->enabled();
   int self = OS::threadId();
 
+  // If filtering is enabled, collect the filtered TIDs first
+  std::vector<int> filtered_tids;
+  if (do_filter) {
+    Profiler::instance()->threadFilter()->collect(filtered_tids);
+    // Sort the TIDs for efficient lookup
+    std::sort(filtered_tids.begin(), filtered_tids.end());
+  }
+
   for (int i = 0; i < threads_count; i++) {
     jthread thread = threads_ptr[i];
     if (thread != nullptr) {
         continue;
       }
       int tid = nThread->osThreadId();
-      if (!threadFilter->isValid(tid)) {
-        continue;
-      }
-
-      if (tid != self && (!do_filter || threadFilter->accept(tid))) {
-        threads.push_back({nThread, thread, tid});
+      if (tid != self && (!do_filter ||
+          // Use binary search to efficiently check whether tid is in filtered_tids
+          std::binary_search(filtered_tids.begin(), filtered_tids.end(), tid))) {
+        threads.push_back({nThread, thread});
       }
     }
   }
@@ -264,13 +271,17 @@
 }
 
 void WallClockASGCT::timerLoop() {
+  // TODO: re-allocating the vector on every pass is not efficient
   auto collectThreads = [&](std::vector<int>& tids) {
+    // Get thread IDs from the filter if it is enabled;
+    // otherwise, list all threads in the system.
     if (Profiler::instance()->threadFilter()->enabled()) {
       Profiler::instance()->threadFilter()->collect(tids);
     } else {
       ThreadList *thread_list = OS::listThreads();
       int tid = thread_list->next();
       while (tid != -1) {
+        // Don't include the current thread
         if (tid != OS::threadId()) {
           tids.push_back(tid);
         }
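With filtering on, the JVMTI loop replaces per-thread accept() calls with one collect() followed by sorted-vector membership tests; for the small tid counts involved, a sorted vector is compact and cache-friendly. The pattern in isolation (tid values are illustrative):

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  // Stand-in for ThreadFilter::collect() output
  std::vector<int> filtered_tids = {41, 7, 1003, 88};

  // Sort once per sampling pass: O(n log n)
  std::sort(filtered_tids.begin(), filtered_tids.end());

  // Each membership test is then O(log n), with no per-thread accept() call
  assert(std::binary_search(filtered_tids.begin(), filtered_tids.end(), 88));
  assert(!std::binary_search(filtered_tids.begin(), filtered_tids.end(), 13));
  return 0;
}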
diff --git a/ddprof-lib/src/main/java/com/datadoghq/profiler/JavaProfiler.java b/ddprof-lib/src/main/java/com/datadoghq/profiler/JavaProfiler.java
index a7cda6bc..fbc96555 100644
--- a/ddprof-lib/src/main/java/com/datadoghq/profiler/JavaProfiler.java
+++ b/ddprof-lib/src/main/java/com/datadoghq/profiler/JavaProfiler.java
@@ -109,7 +109,6 @@ public static synchronized JavaProfiler getInstance(String libLocation, String s
             throw new IOException("Failed to load Datadog Java profiler library", result.error);
         }
         init0();
-        ActiveBitmap.initialize();
         profiler.initializeContextStorage();
         instance = profiler;
 
@@ -210,11 +209,7 @@ public boolean recordTraceRoot(long rootSpanId, String endpoint, int sizeLimit)
      * 'filter' option must be enabled to use this method.
      */
    public void addThread() {
-        if (UNSAFE != null) {
-            ActiveBitmap.setActive(TID.get(), true);
-        } else {
-            filterThread0(true);
-        }
+        filterThreadAdd0();
    }
 
    /**
@@ -222,14 +217,9 @@
      * 'filter' option must be enabled to use this method.
      */
    public void removeThread() {
-        if (UNSAFE != null) {
-            ActiveBitmap.setActive(TID.get(), false);
-        } else {
-            filterThread0(false);
-        }
+        filterThreadRemove0();
    }
 
-
    /**
     * Passing context identifier to a profiler. This ID is thread-local and is dumped in
     * the JFR output only. 0 is a reserved value for "no-context".
@@ -479,6 +469,10 @@ public Map getDebugCounters() {
    private static native boolean init0();
    private native void stop0() throws IllegalStateException;
    private native String execute0(String command) throws IllegalArgumentException, IllegalStateException, IOException;
+
+    private native void filterThreadAdd0();
+    private native void filterThreadRemove0();
+    // Backward compatibility for existing code
    private native void filterThread0(boolean enable);
 
    private static native int getTid0();
diff --git a/ddprof-lib/src/test/cpp/ddprof_ut.cpp b/ddprof-lib/src/test/cpp/ddprof_ut.cpp
index a754c9b0..88d2e657 100644
--- a/ddprof-lib/src/test/cpp/ddprof_ut.cpp
+++ b/ddprof-lib/src/test/cpp/ddprof_ut.cpp
@@ -14,6 +14,9 @@
   #include
   #include
   #include
+  #include <algorithm> // For std::sort
+  #include <thread>
+  #include <atomic>
 
   ssize_t callback(char* ptr, int len) {
     return len;
@@ -121,31 +124,109 @@
   }
 
   TEST(ThreadFilter, testThreadFilter) {
-    int maxTid = OS::getMaxThreadId();
     ThreadFilter filter;
     filter.init("");
     ASSERT_TRUE(filter.enabled());
-    EXPECT_EQ(0, filter.size());
-    // increase step gradually to create different bit densities
-    int step = 1;
-    int size = 0;
-    for (int tid = 1; tid < maxTid - step - 1; tid += step, size++) {
-      EXPECT_FALSE(filter.accept(tid));
-      filter.add(tid);
-      EXPECT_TRUE(filter.accept(tid));
-      step++;
+
+    const int num_threads = 10;
+    const int num_ops = 100;
+    std::vector<std::thread> threads;
+    std::atomic<int> completed_threads{0};
+
+    // Each thread will add and remove its own thread ID multiple times
+    for (int i = 1; i <= num_threads; i++) {
+      threads.emplace_back([&filter, i, &completed_threads]() {
+        for (int j = 0; j < num_ops; j++) {
+          // Register a new slot for this thread
+          int slot_id = filter.registerThread();
+
+          // Add the thread ID to the slot
+          filter.add(i, slot_id);
+          bool accepted = filter.accept(slot_id);
+          if (!accepted) {
+            fprintf(stderr, "FAIL: Thread %d, op %d, slot %d: accept(slot=%d) returned false after add\n",
+                    i, j, slot_id, slot_id);
+          }
+          EXPECT_TRUE(accepted);
+
+          // Remove the thread ID
+          filter.remove(slot_id);
+          accepted = filter.accept(slot_id);
+          if (accepted) {
+            fprintf(stderr, "FAIL: Thread %d, op %d, slot %d: accept(slot=%d) returned true after remove\n",
+                    i, j, slot_id, slot_id);
+          }
+          EXPECT_FALSE(accepted);
+        }
+        completed_threads++;
+      });
+    }
+
+    // Wait for all threads to complete
+    for (auto& t : threads) {
+      t.join();
     }
-    ASSERT_EQ(size, filter.size());
+
+    // Verify all threads completed
+    ASSERT_EQ(completed_threads.load(), num_threads);
+
+    // Collect and verify that all thread IDs were properly removed
     std::vector<int> tids;
-    tids.reserve(size);
     filter.collect(tids);
-    ASSERT_EQ(size, tids.size());
-    for (int tid : tids) {
-      ASSERT_TRUE(filter.accept(tid));
-      filter.remove(tid);
-      ASSERT_FALSE(filter.accept(tid));
+    ASSERT_EQ(tids.size(), 0);
+  }
+
+  TEST(ThreadFilter, testThreadFilterCollect) {
+    ThreadFilter filter;
+    filter.init("");
+    ASSERT_TRUE(filter.enabled());
+
+    const int num_threads = 10;
+    std::vector<std::thread> threads;
+    std::atomic<int> completed_threads{0};
+    std::vector<int> expected_tids;
+    std::vector<int> slots(num_threads); // Track slot IDs
+
+    // Pre-register slots for each thread
+    for (int i = 0; i < num_threads; i++) {
+      slots[i] = filter.registerThread();
+    }
+
+    // Each thread will add its thread ID
+    for (int i = 1; i <= num_threads; i++) {
+      expected_tids.push_back(i);
+      int slot_id = slots[i-1]; // Use the pre-registered slot
+
+      threads.emplace_back([&filter, i, slot_id, &completed_threads]() {
+        filter.add(i, slot_id);
+        EXPECT_TRUE(filter.accept(slot_id));
+        completed_threads++;
+      });
+    }
+
+    // Wait for all threads to complete
+    for (auto& t : threads) {
+      t.join();
+    }
+
+    // Verify all threads completed
+    ASSERT_EQ(completed_threads.load(), num_threads);
+
+    // Collect and verify that all thread IDs are present
+    std::vector<int> collected_tids;
+    filter.collect(collected_tids);
+
+    // Sort both vectors for comparison
+    std::sort(expected_tids.begin(), expected_tids.end());
+    std::sort(collected_tids.begin(), collected_tids.end());
+
+    ASSERT_EQ(expected_tids.size(), collected_tids.size());
+    for (size_t i = 0; i < expected_tids.size(); i++) {
+      EXPECT_EQ(expected_tids[i], collected_tids[i])
+          << "Mismatch at index " << i
+          << ": expected " << expected_tids[i]
+          << ", got " << collected_tids[i];
     }
-    EXPECT_EQ(0, filter.size());
   }
 
   TEST(ThreadInfoTest, testThreadInfoCleanupAllDead) {
diff --git a/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/scenarios/counters/ThreadFilterBenchmark.java b/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/scenarios/counters/ThreadFilterBenchmark.java
index eab18e7a..eb0df4d9 100644
--- a/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/scenarios/counters/ThreadFilterBenchmark.java
+++ b/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/scenarios/counters/ThreadFilterBenchmark.java
@@ -15,7 +15,10 @@
 @State(Scope.Benchmark)
 public class ThreadFilterBenchmark extends Configuration {
 
-    private static final int NUM_THREADS = 4;
+    @Param({"true", "false"}) // Parameterize the filter usage
+    public boolean useThreadFilters;
+
+    private static final int NUM_THREADS = 15;
     private ExecutorService executorService;
     private JavaProfiler profiler;
     private AtomicBoolean running;
@@ -30,13 +33,13 @@ public class ThreadFilterBenchmark extends Configuration {
     private static final AtomicIntegerArray atomicArray = new AtomicIntegerArray(ARRAY_SIZE);
     private static final int CACHE_LINE_SIZE = 64; // Typical cache line size
     private static final int STRIDE = CACHE_LINE_SIZE / Integer.BYTES; // Elements per cache line
-    private boolean useThreadFilters = true; // Flag to control the use of thread filters
     private AtomicLong addThreadCount = new AtomicLong(0);
     private AtomicLong removeThreadCount = new AtomicLong(0);
 
     @Setup(Level.Trial)
     public void setup() throws IOException {
         System.out.println("Setting up benchmark...");
+        System.out.println("Thread filters enabled: " + useThreadFilters);
         System.out.println("Creating thread pool with " + NUM_THREADS + " threads");
         executorService = Executors.newFixedThreadPool(NUM_THREADS);
         System.out.println("Getting profiler instance");
@@ -53,6 +56,7 @@ public void setup() throws IOException {
         System.out.println("Starting profiler with " + config);
         profiler.execute(config);
         System.out.println("Started profiler with output file");
+
         running = new AtomicBoolean(true);
         operationCount = new AtomicLong(0);
         startTime = System.currentTimeMillis();
@@ -139,7 +143,7 @@ public void setUseThreadFilters(boolean useThreadFilters) {
 
     @Benchmark
     @BenchmarkMode(Mode.Throughput)
-    @Fork(value = 1, warmups = 0)
+    @Fork(value = 1, warmups = 1)
     @Warmup(iterations = 1, time = 1)
     @Measurement(iterations = 1, time = 2)
     @Threads(1)
@@ -149,14 +153,13 @@ public long threadFilterStress() throws InterruptedException {
         startLatch = new CountDownLatch(NUM_THREADS);
         stopLatch = new CountDownLatch(NUM_THREADS);
 
         // Start all worker threads
         for (int i = 0; i < NUM_THREADS; i++) {
             final int threadId = i;
             executorService.submit(() -> {
                 try {
                     startLatch.countDown();
                     startLatch.await(30, TimeUnit.SECONDS);
-
                     String startMsg = String.format("Thread %d started%n", threadId);
                     System.out.print(startMsg);
                     if (logWriter != null) {
@@ -216,14 +219,14 @@
                         operationCount.incrementAndGet();
                     }
 
-                    if (operationCount.get() % 1000 == 0) {
-                        String progressMsg = String.format("Thread %d completed %d operations%n", threadId, operationCount.get());
-                        System.out.print(progressMsg);
-                        if (logWriter != null) {
-                            logWriter.print(progressMsg);
-                            logWriter.flush();
-                        }
-                    }
+                    // if (operationCount.get() % 1000 == 0) {
+                    //     String progressMsg = String.format("Thread %d completed %d operations%n", threadId, operationCount.get());
+                    //     System.out.print(progressMsg);
+                    //     if (logWriter != null) {
+                    //         logWriter.print(progressMsg);
+                    //         logWriter.flush();
+                    //     }
+                    // }
                 }
             } catch (InterruptedException e) {
                 Thread.currentThread().interrupt();
@@ -236,4 +239,4 @@
         stopLatch.await();
         return operationCount.get();
     }
-}
+}
\ No newline at end of file
diff --git a/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/scenarios/throughput/ThreadFilterBenchmark.java b/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/scenarios/throughput/ThreadFilterBenchmark.java
index bc03e0ca..f56bd258 100644
--- a/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/scenarios/throughput/ThreadFilterBenchmark.java
+++ b/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/scenarios/throughput/ThreadFilterBenchmark.java
@@ -13,6 +13,7 @@
 import org.openjdk.jmh.annotations.Scope;
 import org.openjdk.jmh.annotations.Setup;
 import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
 import org.openjdk.jmh.annotations.Threads;
 import org.openjdk.jmh.annotations.Warmup;
 import org.openjdk.jmh.infra.Blackhole;
@@ -39,6 +40,17 @@ public class ThreadFilterBenchmark extends Configuration {
     public void setup() throws IOException {
         profiler = JavaProfiler.getInstance();
         workloadNum = Long.parseLong(workload);
+        // Start profiling to enable JVMTI ThreadStart callbacks.
+        // Add a JFR file parameter to satisfy profiler requirements.
+        profiler.execute("start," + command + ",jfr,file=/tmp/thread-filter-benchmark.jfr");
+    }
+
+    @TearDown(Level.Trial)
+    public void tearDown() throws IOException {
+        // Stop profiling and clean up
+        if (profiler != null) {
+            profiler.execute("stop");
+        }
     }
 
     @Benchmark