diff --git a/ddprof-lib/src/main/cpp/flightRecorder.cpp b/ddprof-lib/src/main/cpp/flightRecorder.cpp
index 12472a29..ab233e64 100644
--- a/ddprof-lib/src/main/cpp/flightRecorder.cpp
+++ b/ddprof-lib/src/main/cpp/flightRecorder.cpp
@@ -318,7 +318,7 @@ char *Recording::_jvm_flags = NULL;
 char *Recording::_java_command = NULL;
 
 Recording::Recording(int fd, Arguments &args)
-    : _fd(fd), _thread_set(), _method_map() {
+    : _fd(fd), _method_map() {
   args.save(_args);
   _chunk_start = lseek(_fd, 0, SEEK_END);
@@ -330,6 +330,8 @@ Recording::Recording(int fd, Arguments &args)
   _bytes_written = 0;
   _tid = OS::threadId();
 
+  _active_index.store(0, std::memory_order_relaxed);
+
   VM::jvmti()->GetAvailableProcessors(&_available_processors);
 
   writeHeader(_buf);
@@ -1059,11 +1061,18 @@ void Recording::writeExecutionModes(Buffer *buf) {
 }
 
 void Recording::writeThreads(Buffer *buf) {
-  addThread(_tid);
-  std::vector<int> threads;
-  threads.reserve(_thread_set.size());
-  _thread_set.collect(threads);
-  _thread_set.clear();
+  int old_index = _active_index.fetch_xor(1, std::memory_order_acq_rel);
+  // After the flip, new samples go into the new active set;
+  // we flush from old_index, the previously active set.
+
+  std::unordered_set<int> threads;
+  threads.insert(_tid);
+
+  for (int i = 0; i < CONCURRENCY_LEVEL; ++i) {
+    // Collect thread IDs from the fixed-size tables into the main set
+    _thread_ids[i][old_index].collect(threads);
+    _thread_ids[i][old_index].clear();
+  }
 
   Profiler *profiler = Profiler::instance();
   ThreadInfo *t_info = &profiler->_thread_info;
@@ -1072,15 +1081,15 @@
   buf->putVar64(T_THREAD);
   buf->putVar64(threads.size());
-  for (int i = 0; i < threads.size(); i++) {
+  for (auto tid : threads) {
     const char *thread_name;
     jlong thread_id;
-    std::pair<std::shared_ptr<std::string>, u64> info = t_info->get(threads[i]);
+    std::pair<std::shared_ptr<std::string>, u64> info = t_info->get(tid);
     if (info.first) {
       thread_name = info.first->c_str();
       thread_id = info.second;
     } else {
-      snprintf(name_buf, sizeof(name_buf), "[tid=%d]", threads[i]);
+      snprintf(name_buf, sizeof(name_buf), "[tid=%d]", tid);
       thread_name = name_buf;
       thread_id = 0;
     }
@@ -1090,9 +1099,9 @@
         (thread_id == 0 ? length + 1 : 2 * length) - 3 * 10; // 3x max varint length
     flushIfNeeded(buf, required);
-    buf->putVar64(threads[i]);
+    buf->putVar64(tid);
     buf->putUtf8(thread_name, length);
-    buf->putVar64(threads[i]);
+    buf->putVar64(tid);
     if (thread_id == 0) {
       buf->put8(0);
     } else {
@@ -1442,7 +1451,11 @@ void Recording::recordCpuLoad(Buffer *buf, float proc_user, float proc_system,
   flushIfNeeded(buf);
 }
 
-void Recording::addThread(int tid) { _thread_set.add(tid); }
+// Assumes the caller already holds the recording lock that corresponds
+// to lock_index.
+void Recording::addThread(int lock_index, int tid) {
+  int active = _active_index.load(std::memory_order_acquire);
+  _thread_ids[lock_index][active].insert(tid);
+}
 
 Error FlightRecorder::start(Arguments &args, bool reset) {
   const char *file = args.file();
@@ -1599,7 +1612,7 @@ void FlightRecorder::recordEvent(int lock_index, int tid, u32 call_trace_id,
       break;
     }
     _rec->flushIfNeeded(buf);
-    _rec->addThread(tid);
+    _rec->addThread(lock_index, tid);
   }
 }
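The writeThreads()/addThread() pair above is a double-buffer handoff: samplers insert into the active side while the flusher xor-flips the index and drains the side that was active before. A minimal sketch of the pattern, simplified to a single table and with std::unordered_set standing in for the signal-safe ThreadIdTable:

#include <atomic>
#include <unordered_set>

// Sketch only: one table instead of CONCURRENCY_LEVEL tables, and a
// std::unordered_set standing in for the signal-safe ThreadIdTable.
class DoubleBufferedTids {
  std::unordered_set<int> _side[2];
  std::atomic<int> _active{0}; // index of the side samplers write to

public:
  void add(int tid) {
    // hot path: insert into the currently active side
    _side[_active.load(std::memory_order_acquire)].insert(tid);
  }

  template <typename Consume>
  void flush(Consume&& consume) {
    // flip first: new add() calls land in the other side from now on
    int old_index = _active.fetch_xor(1, std::memory_order_acq_rel);
    for (int tid : _side[old_index]) {
      consume(tid);
    }
    _side[old_index].clear();
  }
};

int main() {
  DoubleBufferedTids tids;
  tids.add(42); // 42 is an illustrative tid
  tids.flush([](int) { /* write one T_THREAD constant-pool entry */ });
}

fetch_xor(1) returns the previous index, so the flusher drains exactly the side samplers were writing to before the flip; a sample that races with the flip may land in the already-drained side, and is simply picked up by the next flush.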
diff --git a/ddprof-lib/src/main/cpp/flightRecorder.h b/ddprof-lib/src/main/cpp/flightRecorder.h
index 22c4b3d3..541c4db1 100644
--- a/ddprof-lib/src/main/cpp/flightRecorder.h
+++ b/ddprof-lib/src/main/cpp/flightRecorder.h
@@ -18,6 +18,7 @@
 #define _FLIGHTRECORDER_H
 
 #include
+#include <atomic>
 #include
 #include
@@ -34,6 +35,7 @@
 #include "mutex.h"
 #include "objectSampler.h"
 #include "threadFilter.h"
+#include "threadIdTable.h"
 #include "vmEntry.h"
 
 const u64 MAX_JLONG = 0x7fffffffffffffffULL;
@@ -127,9 +129,13 @@ class Recording {
   static char *_java_command;
 
   RecordingBuffer _buf[CONCURRENCY_LEVEL];
+  // Several tables reduce lock contention; the second dimension allows
+  // switching the active table while the previous one is being flushed.
+  ThreadIdTable _thread_ids[CONCURRENCY_LEVEL][2];
+  std::atomic<int> _active_index{0}; // 0 or 1, shared by all tables
+
   int _fd;
   off_t _chunk_start;
-  ThreadFilter _thread_set;
   MethodMap _method_map;
 
   Arguments _args;
@@ -258,7 +264,8 @@ class Recording {
                  LockEvent *event);
   void recordCpuLoad(Buffer *buf, float proc_user, float proc_system,
                      float machine_total);
-  void addThread(int tid);
+
+  void addThread(int lock_index, int tid);
 };
 
 class Lookup {
diff --git a/ddprof-lib/src/main/cpp/javaApi.cpp b/ddprof-lib/src/main/cpp/javaApi.cpp
index 017f23c4..3bff4e8d 100644
--- a/ddprof-lib/src/main/cpp/javaApi.cpp
+++ b/ddprof-lib/src/main/cpp/javaApi.cpp
@@ -123,19 +123,101 @@ Java_com_datadoghq_profiler_JavaProfiler_getSamples(JNIEnv *env,
   return (jlong)Profiler::instance()->total_samples();
 }
 
+// There is some duplication between add and remove; it is kept to avoid an
+// extra branch in the hot path.
+extern "C" DLLEXPORT void JNICALL
+Java_com_datadoghq_profiler_JavaProfiler_filterThreadAdd0(JNIEnv *env,
+                                                          jobject unused) {
+  ProfiledThread *current = ProfiledThread::current();
+  if (unlikely(current == nullptr)) {
+    assert(false);
+    return;
+  }
+  int tid = current->tid();
+  if (unlikely(tid < 0)) {
+    return;
+  }
+  ThreadFilter *thread_filter = Profiler::instance()->threadFilter();
+  if (unlikely(!thread_filter->enabled())) {
+    return;
+  }
+
+  int slot_id = current->filterSlotId();
+  if (unlikely(slot_id == -1)) {
+    // Thread doesn't have a slot ID yet (e.g., main thread), so register it
+    slot_id = thread_filter->registerThread();
+    current->setFilterSlotId(slot_id);
+  }
+
+  if (unlikely(slot_id == -1)) {
+    return; // Failed to register thread
+  }
+  thread_filter->add(tid, slot_id);
+}
+
+extern "C" DLLEXPORT void JNICALL
+Java_com_datadoghq_profiler_JavaProfiler_filterThreadRemove0(JNIEnv *env,
+                                                             jobject unused) {
+  ProfiledThread *current = ProfiledThread::current();
+  if (unlikely(current == nullptr)) {
+    assert(false);
+    return;
+  }
+  int tid = current->tid();
+  if (unlikely(tid < 0)) {
+    return;
+  }
+  ThreadFilter *thread_filter = Profiler::instance()->threadFilter();
+  if (unlikely(!thread_filter->enabled())) {
+    return;
+  }
+
+  int slot_id = current->filterSlotId();
+  if (unlikely(slot_id == -1)) {
+    // Thread doesn't have a slot ID yet - nothing to remove
+    return;
+  }
+  thread_filter->remove(slot_id);
+}
+
+// Backward compatibility for existing code
 extern "C" DLLEXPORT void JNICALL
 Java_com_datadoghq_profiler_JavaProfiler_filterThread0(JNIEnv *env,
                                                        jobject unused,
                                                        jboolean enable) {
-  int tid = ProfiledThread::currentTid();
-  if (tid < 0) {
+  ProfiledThread *current = ProfiledThread::current();
+  if (unlikely(current == nullptr)) {
+    assert(false);
+    return;
+  }
+  int tid = current->tid();
+  if (unlikely(tid < 0)) {
     return;
   }
   ThreadFilter *thread_filter = Profiler::instance()->threadFilter();
+  if (unlikely(!thread_filter->enabled())) {
+    return;
+  }
+
+  int slot_id = current->filterSlotId();
+  if (unlikely(slot_id == -1)) {
+    if (enable) {
+      // Thread doesn't have a slot ID yet, so register it
+      slot_id = thread_filter->registerThread();
+      current->setFilterSlotId(slot_id);
+    } else {
+      // Thread doesn't have a slot ID yet - nothing to remove
+      return;
+    }
+  }
+
+  if (unlikely(slot_id == -1)) {
+    return; // Failed to register thread
+  }
+
   if (enable) {
-    thread_filter->add(tid);
+    thread_filter->add(tid, slot_id);
   } else {
-    thread_filter->remove(tid);
+    thread_filter->remove(slot_id);
   }
 }
 
@@ -406,24 +488,3 @@
 Java_com_datadoghq_profiler_JVMAccess_healthCheck0(JNIEnv *env,
                                                    jobject unused) {
   return true;
 }
-
-extern "C" DLLEXPORT jlong JNICALL
-Java_com_datadoghq_profiler_ActiveBitmap_bitmapAddressFor0(JNIEnv *env,
-                                                           jclass unused,
-                                                           jint tid) {
-  u64* bitmap = Profiler::instance()->threadFilter()->bitmapAddressFor((int)tid);
-  return (jlong)bitmap;
-}
-
-extern "C" DLLEXPORT jboolean JNICALL
-Java_com_datadoghq_profiler_ActiveBitmap_isActive0(JNIEnv *env,
-                                                   jclass unused,
-                                                   jint tid) {
-  return Profiler::instance()->threadFilter()->accept((int)tid) ? JNI_TRUE : JNI_FALSE;
-}
-
-extern "C" DLLEXPORT jlong JNICALL
-Java_com_datadoghq_profiler_ActiveBitmap_getActiveCountAddr0(JNIEnv *env,
-                                                             jclass unused) {
-  return (jlong)Profiler::instance()->threadFilter()->addressOfSize();
-}
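All three entry points follow the same slot protocol: a thread registers once, caches the slot ID in its ProfiledThread, and add/remove then only touch that slot. A compilable sketch of the lifecycle against the PR's ThreadFilter API (the tid value is illustrative, error paths elided):

#include <cassert>
#include "threadFilter.h"

// Sketch of the slot-ID lifecycle implemented by the JNI entry points above.
void slotLifecycle(ThreadFilter& filter, int tid) {
  // Once per thread: reserve a slot; the real code caches it via
  // ProfiledThread::setFilterSlotId().
  ThreadFilter::SlotID slot = filter.registerThread();
  if (slot == -1) {
    return; // table full: kMaxThreads slots already live
  }

  filter.add(tid, slot);          // addThread(): publish the tid
  assert(filter.accept(slot));    // samplers now keep this thread

  filter.remove(slot);            // removeThread(): clear the tid, keep the slot
  filter.unregisterThread(slot);  // thread exit: recycle the slot via the free list
}

int main() {
  ThreadFilter filter;
  filter.init("");         // any non-null filter string enables filtering
  slotLifecycle(filter, 42); // 42 is an illustrative tid
  return 0;
}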
diff --git a/ddprof-lib/src/main/cpp/profiler.cpp b/ddprof-lib/src/main/cpp/profiler.cpp
index d268d73c..135b523a 100644
--- a/ddprof-lib/src/main/cpp/profiler.cpp
+++ b/ddprof-lib/src/main/cpp/profiler.cpp
@@ -104,10 +104,12 @@ void Profiler::addRuntimeStub(const void *address, int length,
 
 void Profiler::onThreadStart(jvmtiEnv *jvmti, JNIEnv *jni, jthread thread) {
   ProfiledThread::initCurrentThread();
-
-  int tid = ProfiledThread::currentTid();
+  ProfiledThread *current = ProfiledThread::current();
+  int tid = current->tid();
   if (_thread_filter.enabled()) {
-    _thread_filter.remove(tid);
+    int slot_id = _thread_filter.registerThread();
+    current->setFilterSlotId(slot_id);
+    _thread_filter.remove(slot_id); // Remove from filtering initially
   }
   updateThreadName(jvmti, jni, thread, true);
@@ -116,16 +118,33 @@
 }
 
 void Profiler::onThreadEnd(jvmtiEnv *jvmti, JNIEnv *jni, jthread thread) {
-  int tid = ProfiledThread::currentTid();
-  if (_thread_filter.enabled()) {
-    _thread_filter.remove(tid);
+  ProfiledThread *current = ProfiledThread::current();
+  int tid = -1;
+
+  if (current != nullptr) {
+    // ProfiledThread is alive - do full cleanup and use efficient tid access
+    int slot_id = current->filterSlotId();
+    tid = current->tid();
+
+    if (_thread_filter.enabled()) {
+      _thread_filter.unregisterThread(slot_id);
+      current->setFilterSlotId(-1);
+    }
+
+    ProfiledThread::release();
+  } else {
+    // ProfiledThread already cleaned up - try to get the tid from JVMTI as a fallback
+    tid = VMThread::nativeThreadId(jni, thread);
+    if (tid < 0) {
+      // No ProfiledThread AND no tid from JVMTI - nothing we can do
+      return;
+    }
   }
-  updateThreadName(jvmti, jni, thread, true);
-
+
+  // These can run as long as we have a valid tid
+  updateThreadName(jvmti, jni, thread, false); // false = not self
   _cpu_engine->unregisterThread(tid);
-  // unregister here because JNI callers generally don't know about thread exits
   _wall_engine->unregisterThread(tid);
-  ProfiledThread::release();
 }
 
 int Profiler::registerThread(int tid) {
diff --git a/ddprof-lib/src/main/cpp/thread.h b/ddprof-lib/src/main/cpp/thread.h
index 264ad6b8..bba5145a 100644
--- a/ddprof-lib/src/main/cpp/thread.h
+++ b/ddprof-lib/src/main/cpp/thread.h
@@ -43,11 +43,12 @@ class ProfiledThread : public ThreadLocalData {
   u32 _wall_epoch;
   u32 _call_trace_id;
   u32 _recording_epoch;
+  int _filter_slot_id; // Slot ID for thread filtering
   UnwindFailures _unwind_failures;
 
   ProfiledThread(int buffer_pos, int tid)
       : ThreadLocalData(), _pc(0), _span_id(0), _crash_depth(0),
         _buffer_pos(buffer_pos), _tid(tid), _cpu_epoch(0),
-        _wall_epoch(0), _call_trace_id(0), _recording_epoch(0) {};
+        _wall_epoch(0), _call_trace_id(0), _recording_epoch(0),
+        _filter_slot_id(-1) {};
 
   void releaseFromBuffer();
@@ -120,6 +121,9 @@
   }
 
   static void signalHandler(int signo, siginfo_t *siginfo, void *ucontext);
+
+  int filterSlotId() const { return _filter_slot_id; }
+  void setFilterSlotId(int slotId) { _filter_slot_id = slotId; }
 };
 
 #endif // _THREAD_H
diff --git a/ddprof-lib/src/main/cpp/threadFilter.cpp b/ddprof-lib/src/main/cpp/threadFilter.cpp
index 13e6c2ae..825fdbeb 100644
--- a/ddprof-lib/src/main/cpp/threadFilter.cpp
+++ b/ddprof-lib/src/main/cpp/threadFilter.cpp
@@ -1,175 +1,270 @@
-/*
- * Copyright 2020 Andrei Pangin
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+// Copyright (C) Datadog 2025
+// High-performance lock-free thread filter implementation
 
 #include "threadFilter.h"
-#include "counters.h"
-#include "os.h"
-#include "reverse_bits.h"
-#include
-#include
-#include
-
-void trackPage() {
-  Counters::increment(THREAD_FILTER_PAGES, 1);
-  Counters::increment(THREAD_FILTER_BYTES, BITMAP_SIZE);
-}
-ThreadFilter::ThreadFilter() {
-  _max_thread_id = OS::getMaxThreadId(128 * 1024);
-  _max_bitmaps = (_max_thread_id + BITMAP_SIZE - 1) / BITMAP_SIZE;
-  u32 capacity = _max_bitmaps * sizeof(u64 *);
-  _bitmap = (u64 **)OS::safeAlloc(capacity);
-  memset(_bitmap, 0, capacity);
-  _bitmap[0] = (u64 *)OS::safeAlloc(BITMAP_SIZE);
-  trackPage();
-  _enabled = false;
-  _size = 0;
+#include <atomic>
+#include <functional>
+#include <memory>
+#include <thread>
+
+ThreadFilter::ShardHead ThreadFilter::_free_heads[ThreadFilter::kShardCount] {};
+
+ThreadFilter::ThreadFilter() : _enabled(false) {
+  // Initialize chunk pointers to null (lazy allocation)
+  for (int i = 0; i < kMaxChunks; ++i) {
+    _chunks[i].store(nullptr, std::memory_order_relaxed);
+  }
+  _free_list = std::make_unique<FreeListNode[]>(kFreeListSize);
+
+  // Initialize the first chunk
+  initializeChunk(0);
+  clear();
 }
 
 ThreadFilter::~ThreadFilter() {
-  for (int i = 0; i < _max_bitmaps; i++) {
-    if (_bitmap[i] != NULL) {
-      OS::safeFree(_bitmap[i], BITMAP_SIZE);
-    }
-  }
-  if (_bitmap) {
-    OS::safeFree(_bitmap, _max_bitmaps * sizeof(u64 *));
-  }
+  // Clean up allocated chunks
+  for (int i = 0; i < kMaxChunks; ++i) {
+    ChunkStorage* chunk = _chunks[i].load(std::memory_order_relaxed);
+    delete chunk;
+  }
 }
 
-void ThreadFilter::init(const char *filter) {
-  if (filter == NULL) {
-    _enabled = false;
-    return;
-  }
+void ThreadFilter::initializeChunk(int chunk_idx) {
+  if (chunk_idx >= kMaxChunks) return;
+
+  // Check if the chunk already exists
+  ChunkStorage* existing = _chunks[chunk_idx].load(std::memory_order_acquire);
+  if (existing != nullptr) return;
 
-  char *end;
-  do {
-    int id = strtol(filter, &end, 0);
-    if (id <= 0) {
-      break;
+  // Allocate and initialize the new chunk completely before publishing it
+  ChunkStorage* new_chunk = new ChunkStorage();
+  for (auto& slot : new_chunk->slots) {
+    slot.value.store(-1, std::memory_order_relaxed);
   }
-    if (*end == '-') {
-      int to = strtol(end + 1, &end, 0);
-      while (id <= to) {
-        add(id++);
-      }
+  // Try to install it atomically
+  ChunkStorage* expected = nullptr;
+  if (_chunks[chunk_idx].compare_exchange_strong(expected, new_chunk,
+                                                 std::memory_order_acq_rel)) {
+    // Successfully installed
   } else {
-      add(id);
+    // Another thread beat us to it - clean up our allocation
+    delete new_chunk;
+  }
+}
+
+ThreadFilter::SlotID ThreadFilter::registerThread() {
+  // First, try to get a slot from the free list (lock-free stack)
+  SlotID reused_slot = popFromFreeList();
+  if (reused_slot >= 0) {
+    return reused_slot;
   }
-    filter = end + 1;
-  } while (*end);
 
+  // Allocate a new slot
+  SlotID index = _next_index.fetch_add(1, std::memory_order_relaxed);
+  if (index >= kMaxThreads) {
+    // Revert the increment and return failure
+    _next_index.fetch_sub(1, std::memory_order_relaxed);
+    return -1;
+  }
-  _enabled = true;
+
+  const int chunk_idx = index >> kChunkShift;
+
+  // Ensure the chunk is initialized (lock-free)
+  if (chunk_idx >= _num_chunks.load(std::memory_order_acquire)) {
+    // Publish the grown chunk count so collect()/clear() can see this chunk.
+    // Always propose chunk_idx + 1: proposing expected_chunks + 1 after a
+    // failed CAS could publish a count that is still too small.
+    int expected_chunks = _num_chunks.load(std::memory_order_acquire);
+    while (expected_chunks <= chunk_idx &&
+           !_num_chunks.compare_exchange_weak(expected_chunks, chunk_idx + 1,
+                                              std::memory_order_acq_rel)) {
+      // expected_chunks is reloaded by the failed CAS; stop once it already
+      // covers chunk_idx (another thread updated it)
+    }
+  }
+
+  // Initialize the chunk if needed
+  initializeChunk(chunk_idx);
+
+  return index;
 }
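Slot IDs are dense integers that decompose into a chunk index (high bits) and an offset within the chunk (low bits). A worked example using the constants from threadFilter.h:

int main() {
  constexpr int kChunkShift = 8;                      // log2(256), from threadFilter.h
  constexpr int kChunkMask  = (1 << kChunkShift) - 1; // 0xFF

  // Slot 517 lives in chunk 2 at offset 5, because 517 = 2 * 256 + 5
  constexpr int slot_id   = 517;
  constexpr int chunk_idx = slot_id >> kChunkShift;   // 517 / 256 = 2
  constexpr int slot_idx  = slot_id & kChunkMask;     // 517 % 256 = 5
  static_assert(chunk_idx == 2 && slot_idx == 5, "decomposition");

  // With kMaxThreads = 2048, at most 2048 / 256 = 8 chunks (kMaxChunks) exist
  return 0;
}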
 
 void ThreadFilter::clear() {
-  for (int i = 0; i < _max_bitmaps; i++) {
-    if (_bitmap[i] != NULL) {
-      memset(_bitmap[i], 0, BITMAP_SIZE);
+  // Clear all initialized chunks
+  int num_chunks = _num_chunks.load(std::memory_order_relaxed);
+  for (int i = 0; i < num_chunks; ++i) {
+    ChunkStorage* chunk = _chunks[i].load(std::memory_order_relaxed);
+    if (chunk != nullptr) {
+      for (auto& slot : chunk->slots) {
+        slot.value.store(-1, std::memory_order_relaxed);
+      }
+    }
   }
-  }
-  _size = 0;
-}
 
-// The mapping has to be reversible: f(f(x)) == x
-int ThreadFilter::mapThreadId(int thread_id) {
-  // We want to map the thread_id inside the same bitmap
-  static_assert(BITMAP_SIZE >= (u16)0xffff, "Potential verflow");
-  u16 lower16 = (u16)(thread_id & 0xffff);
-  lower16 = reverse16(lower16);
-  int tid = (thread_id & ~0xffff) | lower16;
-  return tid;
+  // Clear the free list
+  for (int i = 0; i < kFreeListSize; ++i) {
+    _free_list[i].value.store(-1, std::memory_order_relaxed);
+    _free_list[i].next.store(-1, std::memory_order_relaxed);
+  }
+
+  // Reset the free heads for each shard
+  for (int s = 0; s < kShardCount; ++s) {
+    _free_heads[s].head.store(-1, std::memory_order_relaxed);
+  }
 }
 
-// Get bitmap that contains the thread id, create one if it does not exist
-u64* ThreadFilter::getBitmapFor(int thread_id) {
-  int index = thread_id / BITMAP_CAPACITY;
-  assert(index >= 0 && index < (int)_max_bitmaps);
-  u64* b = _bitmap[index];
-  if (b == NULL) {
-    b = (u64 *)OS::safeAlloc(BITMAP_SIZE);
-    u64 *oldb = __sync_val_compare_and_swap(&_bitmap[index], NULL, b);
-    if (oldb != NULL) {
-      OS::safeFree(b, BITMAP_SIZE);
-      b = oldb;
-    } else {
-      trackPage();
+bool ThreadFilter::accept(SlotID slot_id) const {
+  if (!_enabled) {
+    return true;
   }
-  }
-  return b;
+  if (slot_id < 0) return false;
+
+  int chunk_idx = slot_id >> kChunkShift;
+  int slot_idx = slot_id & kChunkMask;
+
+  if (chunk_idx >= kMaxChunks) return false;
+  if (slot_idx >= kChunkSize) return false;
+
+  ChunkStorage* chunk = _chunks[chunk_idx].load(std::memory_order_relaxed);
+  if (chunk == nullptr) return false; // Fail fast if not allocated
+
+  return chunk->slots[slot_idx].value.load(std::memory_order_acquire) != -1;
 }
 
-u64* ThreadFilter::bitmapAddressFor(int thread_id) {
-  u64* b = getBitmapFor(thread_id);
-  thread_id = mapThreadId(thread_id);
-  assert(b == bitmap(thread_id));
-  return wordAddress(b, thread_id);
+void ThreadFilter::add(int tid, SlotID slot_id) {
+  if (slot_id < 0) return;
+
+  int chunk_idx = slot_id >> kChunkShift;
+  int slot_idx = slot_id & kChunkMask;
+
+  if (chunk_idx >= kMaxChunks) return;
+  if (slot_idx >= kChunkSize) return;
+
+  ChunkStorage* chunk = _chunks[chunk_idx].load(std::memory_order_relaxed);
+  if (chunk == nullptr) return; // Fail fast if not allocated
+
+  // Store the tid
+  chunk->slots[slot_idx].value.store(tid, std::memory_order_release);
 }
 
-bool ThreadFilter::accept(int thread_id) {
-  thread_id = mapThreadId(thread_id);
-  u64 *b = bitmap(thread_id);
-  return b != NULL && (word(b, thread_id) & (1ULL << (thread_id & 0x3f)));
+void ThreadFilter::remove(SlotID slot_id) {
+  if (slot_id < 0) return;
+
+  int chunk_idx = slot_id >> kChunkShift;
+  int slot_idx = slot_id & kChunkMask;
+
+  if (chunk_idx >= kMaxChunks) return;
+  if (slot_idx >= kChunkSize) return;
+
+  ChunkStorage* chunk = _chunks[chunk_idx].load(std::memory_order_relaxed);
+  if (chunk == nullptr) return; // Fail fast if not allocated
+
+  // Remove the tid (release: a plain store cannot use acq_rel ordering)
+  chunk->slots[slot_idx].value.store(-1, std::memory_order_release);
 }
 
-void ThreadFilter::add(int thread_id) {
-  u64 *b = getBitmapFor(thread_id);
-  assert(b != NULL);
-  thread_id = mapThreadId(thread_id);
-  assert(b == bitmap(thread_id));
-  u64 bit = 1ULL << (thread_id & 0x3f);
-  if (!(__atomic_fetch_or(&word(b, thread_id), bit, __ATOMIC_RELAXED) & bit)) {
-    atomicInc(_size);
-  }
+void ThreadFilter::unregisterThread(SlotID slot_id) {
+  if (slot_id < 0) return;
+
+  // Clear the slot first
+  remove(slot_id);
+
+  // Add the slot to the free list for reuse
+  pushToFreeList(slot_id);
+}
 
+bool ThreadFilter::pushToFreeList(SlotID slot_id) {
+  // Lock-free sharded Treiber stack push
+  const int shard = shardOfSlot(slot_id);
+  auto& head = _free_heads[shard].head; // private cache line
+
+  for (int i = 0; i < kFreeListSize; ++i) {
+    int expected = -1;
+    if (_free_list[i].value.compare_exchange_strong(
+            expected, slot_id, std::memory_order_acq_rel)) {
+      // Link the node into this shard's Treiber stack
+      int old_head = head.load(std::memory_order_acquire);
+      do {
+        _free_list[i].next.store(old_head, std::memory_order_relaxed);
+      } while (!head.compare_exchange_weak(old_head, i,
+                                           std::memory_order_acq_rel,
+                                           std::memory_order_relaxed));
+      return true;
+    }
+  }
+  return false; // Free list full; the slot is lost, but this is rare
+}
 
-void ThreadFilter::remove(int thread_id) {
-  thread_id = mapThreadId(thread_id);
-  u64 *b = bitmap(thread_id);
-  if (b == NULL) {
-    return;
-  }
-
-  u64 bit = 1ULL << (thread_id & 0x3f);
-  if (__atomic_fetch_and(&word(b, thread_id), ~bit, __ATOMIC_RELAXED) & bit) {
-    atomicInc(_size, -1);
-  }
+ThreadFilter::SlotID ThreadFilter::popFromFreeList() {
+  // Lock-free sharded Treiber stack pop
+  int hash = static_cast<int>(std::hash<std::thread::id>{}(std::this_thread::get_id()));
+  int start = shardOf(hash);
+
+  for (int pass = 0; pass < kShardCount; ++pass) {
+    int s = (start + pass) & (kShardCount - 1);
+    auto& head = _free_heads[s].head;
+
+    while (true) {
+      int node = head.load(std::memory_order_acquire);
+      if (node == -1) break; // shard empty, try the next one
+
+      int next = _free_list[node].next.load(std::memory_order_relaxed);
+      if (head.compare_exchange_weak(node, next,
+                                     std::memory_order_acq_rel,
+                                     std::memory_order_relaxed)) {
+        int id = _free_list[node].value.exchange(-1, std::memory_order_relaxed);
+        _free_list[node].next.store(-1, std::memory_order_relaxed);
+        return id;
+      }
+    }
+  }
+  return -1; // Empty list
 }
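The free list is a Treiber stack whose links are array indices rather than pointers, so push and pop never allocate. A self-contained single-shard sketch of that index-linked design (the PR additionally shards the head across 64 cache lines and claims a node's value slot with a separate CAS, which narrows the classic ABA window):

#include <atomic>
#include <cassert>

// Index-linked Treiber stack: next-"pointers" are array indices, -1 is null.
// Single shard for clarity.
template <int N>
class IndexStack {
  std::atomic<int> _next[N];
  std::atomic<int> _head{-1};

public:
  IndexStack() {
    for (auto& n : _next) n.store(-1, std::memory_order_relaxed);
  }

  void push(int node) {
    int old_head = _head.load(std::memory_order_acquire);
    do {
      _next[node].store(old_head, std::memory_order_relaxed);
    } while (!_head.compare_exchange_weak(old_head, node,
                                          std::memory_order_acq_rel,
                                          std::memory_order_relaxed));
  }

  int pop() {
    int node = _head.load(std::memory_order_acquire);
    while (node != -1 &&
           !_head.compare_exchange_weak(node,
                                        _next[node].load(std::memory_order_relaxed),
                                        std::memory_order_acq_rel,
                                        std::memory_order_relaxed)) {
      // CAS failure reloads node; retry until empty or detached
    }
    return node; // -1 when empty
  }
};

int main() {
  IndexStack<8> s;
  s.push(3);
  s.push(5);
  assert(s.pop() == 5); // LIFO order
  assert(s.pop() == 3);
  assert(s.pop() == -1);
}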
 
-void ThreadFilter::collect(std::vector<int> &v) {
-  for (int i = 0; i < _max_bitmaps; i++) {
-    u64 *b = _bitmap[i];
-    if (b != NULL) {
-      int start_id = i * BITMAP_CAPACITY;
-      for (int j = 0; j < BITMAP_SIZE / sizeof(u64); j++) {
-        // Considering the functional impact, relaxed could be a reasonable
-        // order here
-        u64 word = __atomic_load_n(&b[j], __ATOMIC_ACQUIRE);
-        while (word != 0) {
-          int tid = start_id + j * 64 + __builtin_ctzl(word);
-          // restore thread id
-          tid = mapThreadId(tid);
-          v.push_back(tid);
-          word &= (word - 1);
+void ThreadFilter::collect(std::vector<int>& tids) const {
+  tids.clear();
+
+  // Reserve a reasonable size up front; the occasional resize is not the
+  // bottleneck here.
+  tids.reserve(512);
+
+  // Scan only initialized chunks
+  int num_chunks = _num_chunks.load(std::memory_order_relaxed);
+  for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
+    ChunkStorage* chunk = _chunks[chunk_idx].load(std::memory_order_relaxed);
+    if (chunk == nullptr) {
+      continue; // Skip unallocated chunks
+    }
+
+    for (const auto& slot : chunk->slots) {
+      int slot_tid = slot.value.load(std::memory_order_acquire);
+      if (slot_tid != -1) {
+        tids.push_back(slot_tid);
+      }
     }
-      }
   }
-  }
+
+  // Optional: shrink if we over-reserved significantly
+  if (tids.capacity() > tids.size() * 2) {
+    tids.shrink_to_fit();
+  }
+}
+
+void ThreadFilter::init(const char* filter) {
+  if (!filter) {
+    _enabled = false;
+    return;
+  }
+
+  // Simple logic: any filter value (including "0") enables filtering, and
+  // only threads explicitly registered via addThread() will be sampled.
+  // The old syntax that manually forced specific thread IDs is no longer
+  // supported.
+  _enabled = true;
+}
+
+bool ThreadFilter::enabled() const {
+  return _enabled;
 }
diff --git a/ddprof-lib/src/main/cpp/threadFilter.h b/ddprof-lib/src/main/cpp/threadFilter.h
index 7454be57..9151f199 100644
--- a/ddprof-lib/src/main/cpp/threadFilter.h
+++ b/ddprof-lib/src/main/cpp/threadFilter.h
@@ -1,85 +1,76 @@
-/*
- * Copyright 2020 Andrei Pangin
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
 #ifndef _THREADFILTER_H
 #define _THREADFILTER_H
 
-#include "arch_dd.h"
+#include <array>
+#include <atomic>
 #include <vector>
+#include <memory>
+#include
 
-// The size of thread ID bitmap in bytes. Must be at least 64K to allow mmap()
-const u32 BITMAP_SIZE = 65536;
-// How many thread IDs one bitmap can hold
-const u32 BITMAP_CAPACITY = BITMAP_SIZE * 8;
-
-// ThreadFilter query operations must be lock-free and signal-safe;
-// update operations are mostly lock-free, except rare bitmap allocations
 class ThreadFilter {
-private:
-  // Total number of bitmaps required to hold the entire range of thread IDs
-  u32 _max_thread_id;
-  u32 _max_bitmaps;
-  u64 **_bitmap;
-  bool _enabled;
-  volatile int _size;
-
-  u64 *bitmap(int thread_id) {
-    if (thread_id >= _max_thread_id) {
-      return NULL;
-    }
-    return __atomic_load_n(
-        &(_bitmap[static_cast<u32>(thread_id) / BITMAP_CAPACITY]),
-        __ATOMIC_ACQUIRE);
-  }
-
-  static int mapThreadId(int thread_id);
-
-  u64 &word(u64 *bitmap, int thread_id) {
-    // todo: add thread safe APIs
-    return bitmap[((u32)thread_id % BITMAP_CAPACITY) >> 6];
-  }
-
-  u64* const wordAddress(u64 *bitmap, int thread_id) const {
-    return &bitmap[((u32)thread_id % BITMAP_CAPACITY) >> 6];
-  }
-
-  u64* getBitmapFor(int thread_id);
 public:
-  ThreadFilter();
-  ThreadFilter(ThreadFilter &threadFilter) = delete;
-  ~ThreadFilter();
-
-  bool enabled() const { return _enabled; }
-
-  int size() const { return _size; }
-  const volatile int* addressOfSize() const { return &_size; }
-
-  void init(const char *filter);
-  void clear();
+  using SlotID = int;
+
+  // Limits chosen for reasonable memory usage
+  static constexpr int kChunkSize = 256;
+  static constexpr int kChunkShift = 8; // log2(256)
+  static constexpr int kChunkMask = kChunkSize - 1;
+  static constexpr int kMaxThreads = 2048;
+  static constexpr int kMaxChunks = (kMaxThreads + kChunkSize - 1) / kChunkSize; // = 8 chunks
+  // High-performance free list using a Treiber stack with 64 shards
+  static constexpr int kFreeListSize = 1024;
+  static constexpr int kShardCount = 64; // power of two
 
+  ThreadFilter();
+  ~ThreadFilter();
+
+  void init(const char* filter);
+  void clear();
+  bool enabled() const;
+  bool accept(SlotID slot_id) const;
+  void add(int tid, SlotID slot_id);
+  void remove(SlotID slot_id);
+  void collect(std::vector<int>& tids) const;
+
+  SlotID registerThread();
+  void unregisterThread(SlotID slot_id);
 
-  inline bool isValid(int thread_id) {
-    return thread_id >= 0 && thread_id < _max_thread_id;
-  }
-
-  bool accept(int thread_id);
-  void add(int thread_id);
-  void remove(int thread_id);
-  u64* bitmapAddressFor(int thread_id);
-
-  void collect(std::vector<int> &v);
+private:
+  // Slot padded to a full cache line to avoid false sharing
+  struct alignas(64) Slot {
+    std::atomic<int> value{-1};
+    char padding[60]; // Pad to cache line size
+  };
+
+  // Lock-free free list using a stack-like structure
+  struct FreeListNode {
+    std::atomic<int> value{-1};
+    std::atomic<int> next{-1};
+  };
+
+  // Pre-allocated chunk storage to eliminate mutex contention
+  struct ChunkStorage {
+    std::array<Slot, kChunkSize> slots;
+  };
+
+  bool _enabled = false;
+
+  // Lazily allocated storage for chunks
+  std::atomic<ChunkStorage*> _chunks[kMaxChunks];
+  std::atomic<int> _num_chunks{1};
+
+  // Lock-free slot allocation
+  std::atomic<int> _next_index{0};
+  std::unique_ptr<FreeListNode[]> _free_list;
+
+  struct alignas(64) ShardHead { std::atomic<int> head{-1}; };
+  static ShardHead _free_heads[kShardCount]; // one cache line each
+
+  static inline int shardOf(int tid) { return tid & (kShardCount - 1); }
+  static inline int shardOfSlot(int s) { return s & (kShardCount - 1); }
+
+  // Helper methods for lock-free operations
+  void initializeChunk(int chunk_idx);
+  bool pushToFreeList(SlotID slot_id);
+  SlotID popFromFreeList();
 };
 
 #endif // _THREADFILTER_H
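Each Slot is aligned and padded to a full cache line so that two threads updating adjacent slots never invalidate each other's line. The layout can be checked at compile time; a sketch mirroring the header's definitions, assuming a 4-byte int:

#include <atomic>

struct alignas(64) Slot {
  std::atomic<int> value{-1};
  char padding[60]; // 4 bytes of atomic<int> + 60 bytes of padding = 64
};

// One slot per cache line: adjacent slots cannot falsely share.
static_assert(alignof(Slot) == 64, "Slot must start on a cache-line boundary");
static_assert(sizeof(Slot) == 64, "Slot must occupy exactly one cache line");

int main() { return 0; }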
diff --git a/ddprof-lib/src/main/cpp/threadIdTable.h b/ddprof-lib/src/main/cpp/threadIdTable.h
new file mode 100644
index 00000000..1856c652
--- /dev/null
+++ b/ddprof-lib/src/main/cpp/threadIdTable.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright The async-profiler authors
+ * SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021, 2025 Datadog, Inc
+ */
+
+#ifndef _THREADIDTABLE_H
+#define _THREADIDTABLE_H
+
+#include <atomic>
+#include <unordered_set>
+
+// Signal-safe thread ID table with fixed size
+class ThreadIdTable {
+private:
+  // We have 256 slots per concurrency level (currently 16).
+  // This should cater for 4096 threads - if it turns out to be too small, we
+  // can increase it or make it configurable.
+  static const int TABLE_SIZE = 256;
+  std::atomic<int> table[TABLE_SIZE];
+
+  int hash(int tid) const {
+    // Simple hash function - could be improved if needed
+    return tid % TABLE_SIZE;
+  }
+
+public:
+  ThreadIdTable() {
+    clear();
+  }
+
+  // Signal-safe insertion using atomic operations only
+  void insert(int tid) {
+    if (tid == 0) return; // Invalid thread ID; 0 is reserved for empty slots
+
+    int start_slot = hash(tid);
+    for (int probe = 0; probe < TABLE_SIZE; probe++) {
+      int slot = (start_slot + probe) % TABLE_SIZE;
+      int expected = 0;
+
+      // Try to claim an empty slot
+      if (table[slot].compare_exchange_strong(expected, tid, std::memory_order_relaxed)) {
+        return; // Successfully inserted
+      }
+
+      // Check if already present
+      if (table[slot].load(std::memory_order_relaxed) == tid) {
+        return; // Already exists
+      }
+    }
+    // Table full - the thread ID will be lost, but this is rare and non-critical.
+    // We could increment a counter here for diagnostics if needed.
+  }
+
+  // Clear the table (not signal-safe, called during buffer switch)
+  void clear() {
+    for (int i = 0; i < TABLE_SIZE; i++) {
+      table[i].store(0, std::memory_order_relaxed);
+    }
+  }
+
+  // Collect all thread IDs into a set (not signal-safe, called during buffer switch)
+  void collect(std::unordered_set<int>& result) {
+    for (int i = 0; i < TABLE_SIZE; i++) {
+      int tid = table[i].load(std::memory_order_relaxed);
+      if (tid != 0) {
+        result.insert(tid);
+      }
+    }
+  }
+};
+
+#endif // _THREADIDTABLE_H
\ No newline at end of file
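insert() uses only bounded linear probing over atomics, with no locks and no allocation, which is what makes it callable from a signal handler; collect() and clear() are reserved for the flush path. A standalone usage sketch (function names and tid values are illustrative):

#include <cstdio>
#include <unordered_set>
#include "threadIdTable.h"

ThreadIdTable g_table;

// Imagine this running inside the sampling signal handler:
// bounded probing over atomics only, so it is async-signal-safe.
void onSample(int tid) {
  g_table.insert(tid);
}

// Runs on the recording thread during a buffer switch, outside any signal.
void onFlush() {
  std::unordered_set<int> tids;
  g_table.collect(tids);
  g_table.clear();
  for (int tid : tids) printf("saw tid %d\n", tid);
}

int main() {
  onSample(42); // illustrative tids
  onSample(42); // duplicate: probing finds it already present
  onSample(7);
  onFlush();    // prints the two distinct tids
}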
diff --git a/ddprof-lib/src/main/cpp/wallClock.cpp b/ddprof-lib/src/main/cpp/wallClock.cpp
index d261cdce..0b16d0db 100644
--- a/ddprof-lib/src/main/cpp/wallClock.cpp
+++ b/ddprof-lib/src/main/cpp/wallClock.cpp
@@ -26,6 +26,7 @@
 #include "vmStructs_dd.h"
 #include
 #include
+#include <algorithm> // For std::sort
 
 std::atomic<bool> BaseWallClock::_enabled{false};
 
@@ -196,6 +197,14 @@ void WallClockJVMTI::timerLoop() {
   bool do_filter = threadFilter->enabled();
   int self = OS::threadId();
 
+  // If filtering is enabled, collect the filtered TIDs first
+  std::vector<int> filtered_tids;
+  if (do_filter) {
+    Profiler::instance()->threadFilter()->collect(filtered_tids);
+    // Sort the TIDs for efficient lookup
+    std::sort(filtered_tids.begin(), filtered_tids.end());
+  }
+
   for (int i = 0; i < threads_count; i++) {
     jthread thread = threads_ptr[i];
     if (thread != nullptr) {
         continue;
       }
       int tid = nThread->osThreadId();
-      if (!threadFilter->isValid(tid)) {
-        continue;
-      }
-
-      if (tid != self && (!do_filter || threadFilter->accept(tid))) {
-        threads.push_back({nThread, thread, tid});
+      if (tid != self && (!do_filter ||
+          // Use binary search to efficiently check whether tid is in filtered_tids
+          std::binary_search(filtered_tids.begin(), filtered_tids.end(), tid))) {
+        threads.push_back({nThread, thread});
       }
     }
   }
@@ -264,13 +271,17 @@
 }
 
 void WallClockASGCT::timerLoop() {
+  // TODO: re-allocating the vector on every pass is not efficient
   auto collectThreads = [&](std::vector<int>& tids) {
+    // Get thread IDs from the filter if it is enabled;
+    // otherwise, list all threads in the system.
     if (Profiler::instance()->threadFilter()->enabled()) {
       Profiler::instance()->threadFilter()->collect(tids);
     } else {
       ThreadList *thread_list = OS::listThreads();
       int tid = thread_list->next();
       while (tid != -1) {
+        // Don't include the current thread
         if (tid != OS::threadId()) {
           tids.push_back(tid);
         }
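With filtering on, the JVMTI loop replaces per-thread accept() calls with one collect() followed by sorted-vector membership tests; for the small tid counts involved, a sorted vector is compact and cache-friendly. The pattern in isolation (tid values are illustrative):

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  // Stand-in for ThreadFilter::collect() output
  std::vector<int> filtered_tids = {41, 7, 1003, 88};

  // Sort once per sampling pass: O(n log n)
  std::sort(filtered_tids.begin(), filtered_tids.end());

  // Each membership test is then O(log n), with no per-thread accept() call
  assert(std::binary_search(filtered_tids.begin(), filtered_tids.end(), 88));
  assert(!std::binary_search(filtered_tids.begin(), filtered_tids.end(), 13));
  return 0;
}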
diff --git a/ddprof-lib/src/main/java/com/datadoghq/profiler/JavaProfiler.java b/ddprof-lib/src/main/java/com/datadoghq/profiler/JavaProfiler.java
index a7cda6bc..fbc96555 100644
--- a/ddprof-lib/src/main/java/com/datadoghq/profiler/JavaProfiler.java
+++ b/ddprof-lib/src/main/java/com/datadoghq/profiler/JavaProfiler.java
@@ -109,7 +109,6 @@ public static synchronized JavaProfiler getInstance(String libLocation, String s
             throw new IOException("Failed to load Datadog Java profiler library", result.error);
         }
         init0();
-        ActiveBitmap.initialize();
         profiler.initializeContextStorage();
         instance = profiler;
 
@@ -210,11 +209,7 @@ public boolean recordTraceRoot(long rootSpanId, String endpoint, int sizeLimit)
      * 'filter' option must be enabled to use this method.
      */
    public void addThread() {
-        if (UNSAFE != null) {
-            ActiveBitmap.setActive(TID.get(), true);
-        } else {
-            filterThread0(true);
-        }
+        filterThreadAdd0();
    }
 
    /**
@@ -222,14 +217,9 @@
      * 'filter' option must be enabled to use this method.
      */
    public void removeThread() {
-        if (UNSAFE != null) {
-            ActiveBitmap.setActive(TID.get(), false);
-        } else {
-            filterThread0(false);
-        }
+        filterThreadRemove0();
    }
 
-
    /**
     * Passing context identifier to a profiler. This ID is thread-local and is dumped in
     * the JFR output only. 0 is a reserved value for "no-context".
@@ -479,6 +469,10 @@ public Map getDebugCounters() {
    private static native boolean init0();
    private native void stop0() throws IllegalStateException;
    private native String execute0(String command) throws IllegalArgumentException, IllegalStateException, IOException;
+
+    private native void filterThreadAdd0();
+    private native void filterThreadRemove0();
+    // Backward compatibility for existing code
    private native void filterThread0(boolean enable);
 
    private static native int getTid0();
diff --git a/ddprof-lib/src/test/cpp/ddprof_ut.cpp b/ddprof-lib/src/test/cpp/ddprof_ut.cpp
index a754c9b0..88d2e657 100644
--- a/ddprof-lib/src/test/cpp/ddprof_ut.cpp
+++ b/ddprof-lib/src/test/cpp/ddprof_ut.cpp
@@ -14,6 +14,9 @@
   #include
   #include
   #include
+  #include <algorithm> // For std::sort
+  #include <thread>
+  #include <atomic>
 
   ssize_t callback(char* ptr, int len) {
     return len;
@@ -121,31 +124,109 @@
   }
 
   TEST(ThreadFilter, testThreadFilter) {
-    int maxTid = OS::getMaxThreadId();
     ThreadFilter filter;
     filter.init("");
     ASSERT_TRUE(filter.enabled());
-    EXPECT_EQ(0, filter.size());
-    // increase step gradually to create different bit densities
-    int step = 1;
-    int size = 0;
-    for (int tid = 1; tid < maxTid - step - 1; tid += step, size++) {
-      EXPECT_FALSE(filter.accept(tid));
-      filter.add(tid);
-      EXPECT_TRUE(filter.accept(tid));
-      step++;
+
+    const int num_threads = 10;
+    const int num_ops = 100;
+    std::vector<std::thread> threads;
+    std::atomic<int> completed_threads{0};
+
+    // Each thread will add and remove its own thread ID multiple times
+    for (int i = 1; i <= num_threads; i++) {
+      threads.emplace_back([&filter, i, &completed_threads]() {
+        for (int j = 0; j < num_ops; j++) {
+          // Register a new slot for this thread
+          int slot_id = filter.registerThread();
+
+          // Add the thread ID to the slot
+          filter.add(i, slot_id);
+          bool accepted = filter.accept(slot_id);
+          if (!accepted) {
+            fprintf(stderr, "FAIL: Thread %d, op %d, slot %d: accept(slot=%d) returned false after add\n",
+                    i, j, slot_id, slot_id);
+          }
+          EXPECT_TRUE(accepted);
+
+          // Remove the thread ID
+          filter.remove(slot_id);
+          accepted = filter.accept(slot_id);
+          if (accepted) {
+            fprintf(stderr, "FAIL: Thread %d, op %d, slot %d: accept(slot=%d) returned true after remove\n",
+                    i, j, slot_id, slot_id);
+          }
+          EXPECT_FALSE(accepted);
+        }
+        completed_threads++;
+      });
+    }
+
+    // Wait for all threads to complete
+    for (auto& t : threads) {
+      t.join();
     }
-    ASSERT_EQ(size, filter.size());
+
+    // Verify all threads completed
+    ASSERT_EQ(completed_threads.load(), num_threads);
+
+    // Collect and verify that all thread IDs were properly removed
     std::vector<int> tids;
-    tids.reserve(size);
     filter.collect(tids);
-    ASSERT_EQ(size, tids.size());
-    for (int tid : tids) {
-      ASSERT_TRUE(filter.accept(tid));
-      filter.remove(tid);
-      ASSERT_FALSE(filter.accept(tid));
+    ASSERT_EQ(tids.size(), 0);
+  }
+
+  TEST(ThreadFilter, testThreadFilterCollect) {
+    ThreadFilter filter;
+    filter.init("");
+    ASSERT_TRUE(filter.enabled());
+
+    const int num_threads = 10;
+    std::vector<std::thread> threads;
+    std::atomic<int> completed_threads{0};
+    std::vector<int> expected_tids;
+    std::vector<int> slots(num_threads); // Track slot IDs
+
+    // Pre-register slots for each thread
+    for (int i = 0; i < num_threads; i++) {
+      slots[i] = filter.registerThread();
+    }
+
+    // Each thread will add its thread ID
+    for (int i = 1; i <= num_threads; i++) {
+      expected_tids.push_back(i);
+      int slot_id = slots[i-1]; // Use the pre-registered slot
+
+      threads.emplace_back([&filter, i, slot_id, &completed_threads]() {
+        filter.add(i, slot_id);
+        EXPECT_TRUE(filter.accept(slot_id));
+        completed_threads++;
+      });
+    }
+
+    // Wait for all threads to complete
+    for (auto& t : threads) {
+      t.join();
+    }
+
+    // Verify all threads completed
+    ASSERT_EQ(completed_threads.load(), num_threads);
+
+    // Collect and verify that all thread IDs are present
+    std::vector<int> collected_tids;
+    filter.collect(collected_tids);
+
+    // Sort both vectors for comparison
+    std::sort(expected_tids.begin(), expected_tids.end());
+    std::sort(collected_tids.begin(), collected_tids.end());
+
+    ASSERT_EQ(expected_tids.size(), collected_tids.size());
+    for (size_t i = 0; i < expected_tids.size(); i++) {
+      EXPECT_EQ(expected_tids[i], collected_tids[i])
+          << "Mismatch at index " << i
+          << ": expected " << expected_tids[i]
+          << ", got " << collected_tids[i];
     }
-    EXPECT_EQ(0, filter.size());
   }
 
   TEST(ThreadInfoTest, testThreadInfoCleanupAllDead) {
diff --git a/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/scenarios/counters/ThreadFilterBenchmark.java b/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/scenarios/counters/ThreadFilterBenchmark.java
index eab18e7a..eb0df4d9 100644
--- a/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/scenarios/counters/ThreadFilterBenchmark.java
+++ b/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/scenarios/counters/ThreadFilterBenchmark.java
@@ -15,7 +15,10 @@
 @State(Scope.Benchmark)
 public class ThreadFilterBenchmark extends Configuration {
 
-    private static final int NUM_THREADS = 4;
+    @Param({"true", "false"}) // Parameterize the filter usage
+    public boolean useThreadFilters;
+
+    private static final int NUM_THREADS = 15;
     private ExecutorService executorService;
     private JavaProfiler profiler;
     private AtomicBoolean running;
@@ -30,13 +33,13 @@ public class ThreadFilterBenchmark extends Configuration {
     private static final AtomicIntegerArray atomicArray = new AtomicIntegerArray(ARRAY_SIZE);
     private static final int CACHE_LINE_SIZE = 64; // Typical cache line size
     private static final int STRIDE = CACHE_LINE_SIZE / Integer.BYTES; // Elements per cache line
-    private boolean useThreadFilters = true; // Flag to control the use of thread filters
     private AtomicLong addThreadCount = new AtomicLong(0);
     private AtomicLong removeThreadCount = new AtomicLong(0);
 
     @Setup(Level.Trial)
     public void setup() throws IOException {
         System.out.println("Setting up benchmark...");
+        System.out.println("Thread filters enabled: " + useThreadFilters);
         System.out.println("Creating thread pool with " + NUM_THREADS + " threads");
         executorService = Executors.newFixedThreadPool(NUM_THREADS);
         System.out.println("Getting profiler instance");
@@ -53,6 +56,7 @@ public void setup() throws IOException {
         System.out.println("Starting profiler with " + config);
         profiler.execute(config);
         System.out.println("Started profiler with output file");
+
         running = new AtomicBoolean(true);
         operationCount = new AtomicLong(0);
         startTime = System.currentTimeMillis();
@@ -139,7 +143,7 @@ public void setUseThreadFilters(boolean useThreadFilters) {
 
     @Benchmark
     @BenchmarkMode(Mode.Throughput)
-    @Fork(value = 1, warmups = 0)
+    @Fork(value = 1, warmups = 1)
     @Warmup(iterations = 1, time = 1)
     @Measurement(iterations = 1, time = 2)
     @Threads(1)
@@ -149,14 +153,13 @@ public long threadFilterStress() throws InterruptedException {
         startLatch = new CountDownLatch(NUM_THREADS);
         stopLatch = new CountDownLatch(NUM_THREADS);
 
         // Start all worker threads
         for (int i = 0; i < NUM_THREADS; i++) {
             final int threadId = i;
             executorService.submit(() -> {
                 try {
                     startLatch.countDown();
                     startLatch.await(30, TimeUnit.SECONDS);
-
                     String startMsg = String.format("Thread %d started%n", threadId);
                     System.out.print(startMsg);
                     if (logWriter != null) {
@@ -216,14 +219,14 @@
                         operationCount.incrementAndGet();
                     }
 
-                    if (operationCount.get() % 1000 == 0) {
-                        String progressMsg = String.format("Thread %d completed %d operations%n", threadId, operationCount.get());
-                        System.out.print(progressMsg);
-                        if (logWriter != null) {
-                            logWriter.print(progressMsg);
-                            logWriter.flush();
-                        }
-                    }
+                    // if (operationCount.get() % 1000 == 0) {
+                    //     String progressMsg = String.format("Thread %d completed %d operations%n", threadId, operationCount.get());
+                    //     System.out.print(progressMsg);
+                    //     if (logWriter != null) {
+                    //         logWriter.print(progressMsg);
+                    //         logWriter.flush();
+                    //     }
+                    // }
                 }
             } catch (InterruptedException e) {
                 Thread.currentThread().interrupt();
@@ -236,4 +239,4 @@
         stopLatch.await();
         return operationCount.get();
     }
-}
+}
\ No newline at end of file
diff --git a/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/scenarios/throughput/ThreadFilterBenchmark.java b/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/scenarios/throughput/ThreadFilterBenchmark.java
index bc03e0ca..f56bd258 100644
--- a/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/scenarios/throughput/ThreadFilterBenchmark.java
+++ b/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/scenarios/throughput/ThreadFilterBenchmark.java
@@ -13,6 +13,7 @@
 import org.openjdk.jmh.annotations.Scope;
 import org.openjdk.jmh.annotations.Setup;
 import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
 import org.openjdk.jmh.annotations.Threads;
 import org.openjdk.jmh.annotations.Warmup;
 import org.openjdk.jmh.infra.Blackhole;
@@ -39,6 +40,17 @@ public class ThreadFilterBenchmark extends Configuration {
     public void setup() throws IOException {
         profiler = JavaProfiler.getInstance();
         workloadNum = Long.parseLong(workload);
+        // Start profiling to enable JVMTI ThreadStart callbacks.
+        // Add a JFR file parameter to satisfy profiler requirements.
+        profiler.execute("start," + command + ",jfr,file=/tmp/thread-filter-benchmark.jfr");
+    }
+
+    @TearDown(Level.Trial)
+    public void tearDown() throws IOException {
+        // Stop profiling and clean up
+        if (profiler != null) {
+            profiler.execute("stop");
+        }
     }
 
     @Benchmark