diff --git a/.gitignore b/.gitignore index c7a8f76b0..65996175a 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,7 @@ zig-out/ zig-cache/ *.dot + +.gitignore/ +build-clang/ +build-icx/ diff --git a/examples/dolly-v2/main.cpp b/examples/dolly-v2/main.cpp index ce634213b..b3a7d4477 100644 --- a/examples/dolly-v2/main.cpp +++ b/examples/dolly-v2/main.cpp @@ -231,6 +231,7 @@ bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vo /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); @@ -492,6 +493,7 @@ bool dollyv2_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/dolly-v2/quantize.cpp b/examples/dolly-v2/quantize.cpp index 0c0d24ccf..56c423d58 100644 --- a/examples/dolly-v2/quantize.cpp +++ b/examples/dolly-v2/quantize.cpp @@ -139,7 +139,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 7e12eab5f..61fbc6484 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -203,6 +203,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); @@ -425,6 +426,7 @@ bool gpt2_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/gpt-2/quantize.cpp b/examples/gpt-2/quantize.cpp index 9d8d53a67..5c4a2359e 100644 --- a/examples/gpt-2/quantize.cpp +++ b/examples/gpt-2/quantize.cpp @@ -145,7 +145,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/gpt-j/main.cpp b/examples/gpt-j/main.cpp index b42764ce2..76ced4a66 100644 --- a/examples/gpt-j/main.cpp +++ b/examples/gpt-j/main.cpp @@ -202,6 +202,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); @@ -421,6 +422,7 @@ bool gptj_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/gpt-j/quantize.cpp b/examples/gpt-j/quantize.cpp index 437053b7d..0dd5c0367 100644 --- a/examples/gpt-j/quantize.cpp +++ b/examples/gpt-j/quantize.cpp @@ -143,7 +143,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/gpt-neox/main.cpp b/examples/gpt-neox/main.cpp index 4af771b84..84755da6f 100644 --- a/examples/gpt-neox/main.cpp +++ b/examples/gpt-neox/main.cpp @@ -205,6 +205,7 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_ /*.mem_size 
=*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); @@ -472,6 +473,7 @@ bool gpt_neox_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/gpt-neox/quantize.cpp b/examples/gpt-neox/quantize.cpp index 96208c1e8..1a07be2d2 100644 --- a/examples/gpt-neox/quantize.cpp +++ b/examples/gpt-neox/quantize.cpp @@ -139,7 +139,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp index 2000c9aac..a554c0e77 100644 --- a/examples/mnist/main-cpu.cpp +++ b/examples/mnist/main-cpu.cpp @@ -51,6 +51,7 @@ int mnist_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx_work = ggml_init(params); diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index a8d47ac9c..41b112b5f 100644 --- a/examples/mnist/main-mtl.cpp +++ b/examples/mnist/main-mtl.cpp @@ -45,6 +45,7 @@ int mnist_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ 1, }; struct ggml_context * ctx_work = ggml_init(params); diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index 5ff4ac20f..6eb8419c8 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -80,6 +80,7 @@ bool mnist_model_load(const std::string & fname, mnist_model & model) { /*.mem_size =*/ ctx_size + 1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); @@ -182,6 +183,7 @@ int mnist_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/mpt/main.cpp b/examples/mpt/main.cpp index 457dc3d5b..6294185fe 100644 --- a/examples/mpt/main.cpp +++ b/examples/mpt/main.cpp @@ -298,6 +298,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); @@ -495,6 +496,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/mpt/quantize.cpp b/examples/mpt/quantize.cpp index d0c9dda82..45c3eaf66 100644 --- a/examples/mpt/quantize.cpp +++ b/examples/mpt/quantize.cpp @@ -144,7 +144,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = {0, NULL, false}; + struct ggml_init_params params = {0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/replit/main.cpp b/examples/replit/main.cpp index aed7f268b..59929dd8c 100644 --- a/examples/replit/main.cpp +++ b/examples/replit/main.cpp @@ -280,6 +280,7 @@ bool replit_model_load(const std::string & fname, replit_model & model, replit_t /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); @@ -472,6 +473,7 @@ bool 
replit_eval(const replit_model & model, const int n_threads, const int n_pa /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/replit/quantize.cpp b/examples/replit/quantize.cpp index f274074bb..329e5b56a 100644 --- a/examples/replit/quantize.cpp +++ b/examples/replit/quantize.cpp @@ -140,7 +140,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = {0, NULL, false}; + struct ggml_init_params params = {0, NULL, false, 1}; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/starcoder/main.cpp b/examples/starcoder/main.cpp index d84e36634..5686e1196 100644 --- a/examples/starcoder/main.cpp +++ b/examples/starcoder/main.cpp @@ -226,6 +226,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); @@ -460,6 +461,7 @@ bool starcoder_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/starcoder/quantize.cpp b/examples/starcoder/quantize.cpp index d3aee3f26..b95e24939 100644 --- a/examples/starcoder/quantize.cpp +++ b/examples/starcoder/quantize.cpp @@ -145,7 +145,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/starcoder/starcoder-mmap.cpp b/examples/starcoder/starcoder-mmap.cpp index 094c441d8..8c606833d 100644 --- a/examples/starcoder/starcoder-mmap.cpp +++ b/examples/starcoder/starcoder-mmap.cpp @@ -352,6 +352,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); @@ -450,6 +451,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp c_params.mem_size = model.cache.buf.size; c_params.mem_buffer = model.cache.buf.addr; c_params.no_alloc = false; + c_params.n_threads = 1; model.cache.ctx = ggml_init(c_params); @@ -667,6 +669,7 @@ bool starcoder_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/whisper/quantize.cpp b/examples/whisper/quantize.cpp index 64e8f35c3..d4258dbb3 100644 --- a/examples/whisper/quantize.cpp +++ b/examples/whisper/quantize.cpp @@ -184,7 +184,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/whisper/whisper.cpp b/examples/whisper/whisper.cpp index cad40426c..ca546f638 100644 --- a/examples/whisper/whisper.cpp +++ b/examples/whisper/whisper.cpp @@ -741,6 +741,7 @@ static bool kv_cache_init( /*.mem_size =*/ cache.buf.size(), /*.mem_buffer =*/ cache.buf.data(), /*.no_alloc =*/ false, + /*.threads =*/ 1, }; cache.ctx = ggml_init(params); @@ -777,6 +778,7 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) { 
/*.mem_size =*/ cache.buf.size(), /*.mem_buffer =*/ cache.buf.data(), /*.no_alloc =*/ false, + /*.threads =*/ 1, }; cache.ctx = ggml_init(params); @@ -1136,6 +1138,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con /*.mem_size =*/ wctx.model.buf->size(), /*.mem_buffer =*/ wctx.model.buf->data(), /*.no_alloc =*/ false, + /*.threads =*/ 1, }; model.ctx = ggml_init(params); @@ -1456,6 +1459,7 @@ static bool whisper_encode_internal( /*.mem_size =*/ wstate.buf_compute.size(), /*.mem_buffer =*/ wstate.buf_compute.data(), /*.no_alloc =*/ false, + /*.threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); @@ -1935,6 +1939,7 @@ static bool whisper_decode_internal( /*.mem_size =*/ wstate.buf_compute.size(), /*.mem_buffer =*/ wstate.buf_compute.data(), /*.no_alloc =*/ false, + /*.threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); @@ -5084,6 +5089,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { /*.mem_size =*/ buf.size(), /*.mem_buffer =*/ buf.data(), /*.no_alloc =*/ false, + /*.threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(gparams); diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index 24856a255..9c378fdd7 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -482,6 +482,7 @@ extern "C" { size_t mem_size; // bytes void * mem_buffer; // if NULL, memory will be allocated internally bool no_alloc; // don't allocate memory for the tensor data + int n_threads; // number of threads for the thread pool }; @@ -1350,7 +1351,7 @@ extern "C" { // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); - GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan, void * tpool); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // same as ggml_graph_compute() but the work data is allocated as a part of the context diff --git a/include/ggml/pthread-win32.h b/include/ggml/pthread-win32.h new file mode 100644 index 000000000..bb0e7dcf3 --- /dev/null +++ b/include/ggml/pthread-win32.h @@ -0,0 +1,253 @@ +#pragma once + +#include +#include +#include + +static DWORD timespec_to_ms(const struct timespec* abstime) +{ + DWORD t; + + if (abstime == NULL) + return INFINITE; + + t = ((abstime->tv_sec - time(NULL)) * 1000) + (abstime->tv_nsec / 1000000); + if (t < 0) + t = 1; + return t; +} + +static void ms_to_timespec(struct timespec* ts, unsigned int ms) +{ + if (ts == NULL) + return; + ts->tv_sec = (ms / 1000) + time(NULL); + ts->tv_nsec = (ms % 1000) * 1000000; +} + +typedef HANDLE pthread_t; +typedef void pthread_attr_t; +typedef DWORD thread_ret_t; + +typedef struct { + void *(*start_routine)(void *); + void *start_arg; +} win_thread_start_t; + +static DWORD WINAPI win_thread_start(void *arg) +{ + win_thread_start_t *data = arg; + void *(*start_routine)(void *) = data->start_routine; + void *start_arg = data->start_arg; + + free(data); + + start_routine(start_arg); + return 0; /* ERROR_SUCCESS */ +} + +static int pthread_create(pthread_t *thread, pthread_attr_t *attr, void *(*start_routine)(void *), void *arg) +{ + win_thread_start_t *data; + + if (thread == NULL || start_routine == NULL) + return 1; + + data = malloc(sizeof(*data)); + data->start_routine = 
start_routine;
+    data->start_arg = arg;
+
+    *thread = CreateThread(NULL, 0, win_thread_start, data, 0, NULL);
+    if (*thread == NULL)
+        return 1;
+    return 0;
+}
+
+static int pthread_join(pthread_t thread, void **value_ptr)
+{
+    (void)value_ptr;
+    WaitForSingleObject(thread, INFINITE);
+    CloseHandle(thread);
+    return 0;
+}
+
+static int pthread_detach(pthread_t thread)
+{
+    CloseHandle(thread);
+    return 0;
+}
+
+typedef CRITICAL_SECTION pthread_mutex_t;
+typedef void pthread_mutexattr_t;
+
+static int pthread_mutex_init(pthread_mutex_t *mutex, pthread_mutexattr_t *attr)
+{
+    (void)attr;
+
+    if (mutex == NULL)
+        return 1;
+
+    InitializeCriticalSection(mutex);
+    return 0;
+}
+
+static int pthread_mutex_destroy(pthread_mutex_t *mutex)
+{
+    if (mutex == NULL)
+        return 1;
+    DeleteCriticalSection(mutex);
+    return 0;
+}
+
+static int pthread_mutex_lock(pthread_mutex_t *mutex)
+{
+    if (mutex == NULL)
+        return 1;
+    EnterCriticalSection(mutex);
+    return 0;
+}
+
+static int pthread_mutex_unlock(pthread_mutex_t *mutex)
+{
+    if (mutex == NULL)
+        return 1;
+    LeaveCriticalSection(mutex);
+    return 0;
+}
+
+typedef CONDITION_VARIABLE pthread_cond_t;
+typedef void pthread_condattr_t;
+
+#ifdef NEEDED
+struct timespec {
+    long tv_sec;
+    long tv_nsec;
+};
+#endif
+
+static int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr)
+{
+    (void)attr;
+    if (cond == NULL)
+        return 1;
+    InitializeConditionVariable(cond);
+    return 0;
+}
+
+static int pthread_cond_destroy(pthread_cond_t *cond)
+{
+    /* Windows does not have a destroy for conditionals */
+    (void)cond;
+    return 0;
+}
+
+static int pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
+                                  const struct timespec *abstime)
+{
+    if (cond == NULL || mutex == NULL)
+        return 1;
+    if (!SleepConditionVariableCS(cond, mutex, timespec_to_ms(abstime)))
+        return 1;
+    return 0;
+}
+
+static int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
+{
+    if (cond == NULL || mutex == NULL)
+        return 1;
+    return pthread_cond_timedwait(cond, mutex, NULL);
+}
+
+static int pthread_cond_signal(pthread_cond_t *cond)
+{
+    if (cond == NULL)
+        return 1;
+    WakeConditionVariable(cond);
+    return 0;
+}
+
+static int pthread_cond_broadcast(pthread_cond_t *cond)
+{
+    if (cond == NULL)
+        return 1;
+    WakeAllConditionVariable(cond);
+    return 0;
+}
+
+typedef struct {
+    SRWLOCK lock;
+    bool exclusive;
+} pthread_rwlock_t;
+
+typedef void pthread_rwlockattr_t;
+
+static int pthread_rwlock_init(pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr)
+{
+    (void)attr;
+    if (rwlock == NULL)
+        return 1;
+    InitializeSRWLock(&rwlock->lock);
+    rwlock->exclusive = false;
+    return 0;
+}
+
+static int pthread_rwlock_destroy(pthread_rwlock_t *rwlock)
+{
+    (void)rwlock;
+    return 0;
+}
+
+static int pthread_rwlock_rdlock(pthread_rwlock_t *rwlock)
+{
+    if (rwlock == NULL)
+        return 1;
+    AcquireSRWLockShared(&rwlock->lock);
+    return 0;
+}
+
+static int pthread_rwlock_tryrdlock(pthread_rwlock_t *rwlock)
+{
+    if (rwlock == NULL)
+        return 1;
+    return !TryAcquireSRWLockShared(&rwlock->lock);
+}
+
+static int pthread_rwlock_wrlock(pthread_rwlock_t *rwlock)
+{
+    if (rwlock == NULL)
+        return 1;
+    AcquireSRWLockExclusive(&rwlock->lock);
+    rwlock->exclusive = true;
+    return 0;
+}
+
+static int pthread_rwlock_trywrlock(pthread_rwlock_t *rwlock)
+{
+    BOOLEAN ret;
+
+    if (rwlock == NULL)
+        return 1;
+
+    ret = TryAcquireSRWLockExclusive(&rwlock->lock);
+    if (ret)
+        rwlock->exclusive = true;
+    return ret;
+}
+
+static int pthread_rwlock_unlock(pthread_rwlock_t *rwlock)
+{
+    if (rwlock == NULL)
+        return 1;
+
+    if (rwlock->exclusive) {
+        rwlock->exclusive = false;
+        ReleaseSRWLockExclusive(&rwlock->lock);
+    } else {
+        ReleaseSRWLockShared(&rwlock->lock);
+    }
+    return 0;
+}
+
+static unsigned int pcthread_get_num_procs()
+{
+    SYSTEM_INFO sysinfo;
+
+    GetSystemInfo(&sysinfo);
+    return sysinfo.dwNumberOfProcessors;
+}
+
diff --git a/include/ggml/tpool.h b/include/ggml/tpool.h
new file mode 100644
index 000000000..d1afcbb56
--- /dev/null
+++ b/include/ggml/tpool.h
@@ -0,0 +1,203 @@
+#pragma once
+
+#include <stdlib.h>
+#include <stdbool.h>
+
+#if defined(_WIN32)
+#include <windows.h>
+#include "pthread-win32.h"
+#else
+#include <pthread.h>
+#endif
+
+typedef void (*thread_func_t)(void *arg);
+
+struct tpool_work {
+    thread_func_t func;
+    void *arg;
+    struct tpool_work *next;
+};
+typedef struct tpool_work tpool_work_t;
+
+static tpool_work_t *tpool_work_create(thread_func_t func, void *arg)
+{
+    tpool_work_t *work;
+
+    if (func == NULL)
+        return NULL;
+
+    work = malloc(sizeof(*work));
+    work->func = func;
+    work->arg = arg;
+    work->next = NULL;
+    return work;
+}
+
+static void tpool_work_destroy(tpool_work_t *work)
+{
+    if (work == NULL)
+        return;
+    free(work);
+}
+
+struct tpool {
+    tpool_work_t *work_first;
+    tpool_work_t *work_last;
+    pthread_mutex_t work_mutex;
+    pthread_cond_t work_cond;
+    pthread_cond_t working_cond;
+    size_t working_cnt;
+    size_t thread_cnt;
+    bool stop;
+};
+typedef struct tpool tpool_t;
+
+static tpool_work_t *tpool_work_get(tpool_t *tm)
+{
+    tpool_work_t *work;
+
+    if (tm == NULL)
+        return NULL;
+
+    work = tm->work_first;
+    if (work == NULL)
+        return NULL;
+
+    if (work->next == NULL) {
+        tm->work_first = NULL;
+        tm->work_last = NULL;
+    } else {
+        tm->work_first = work->next;
+    }
+
+    return work;
+}
+
+static void *tpool_worker(void *arg)
+{
+    tpool_t *tm = arg;
+    tpool_work_t *work;
+
+    while (1) {
+        pthread_mutex_lock(&tm->work_mutex);
+        while (tm->work_first == NULL && !tm->stop)
+            pthread_cond_wait(&tm->work_cond, &tm->work_mutex);
+        if (tm->stop)
+            break;
+        work = tpool_work_get(tm);
+        tm->working_cnt++;
+        pthread_mutex_unlock(&tm->work_mutex);
+
+        if (work != NULL) {
+            work->func(work->arg);
+            tpool_work_destroy(work);
+        }
+
+        pthread_mutex_lock(&tm->work_mutex);
+        tm->working_cnt--;
+        bool predicate = tm->working_cnt == 0 && tm->work_first == NULL;
+        pthread_mutex_unlock(&tm->work_mutex);
+        if (predicate)
+            pthread_cond_signal(&tm->working_cond);
+    }
+
+    tm->thread_cnt--;
+    pthread_mutex_unlock(&tm->work_mutex);
+    pthread_cond_signal(&tm->working_cond);
+    return NULL;
+}
+
+static tpool_t *tpool_create(size_t num)
+{
+    tpool_t *tm;
+    pthread_t thread;
+    size_t i;
+
+    if (num == 0)
+        num = 2;
+
+    tm = calloc(1, sizeof(*tm));
+    tm->thread_cnt = num;
+
+    pthread_mutex_init(&tm->work_mutex, NULL);
+    pthread_cond_init(&tm->work_cond, NULL);
+    pthread_cond_init(&tm->working_cond, NULL);
+
+    tm->work_first = NULL;
+    tm->work_last = NULL;
+
+    for (i = 0; i < num; i++) {
+        pthread_create(&thread, NULL, tpool_worker, tm);
+        pthread_detach(thread);
+    }
+
+    return tm;
+}
+
+static void tpool_wait(tpool_t *tm)
+{
+    if (tm == NULL)
+        return;
+
+    pthread_mutex_lock(&tm->work_mutex);
+    while (tm->working_cnt != 0 || tm->work_first != NULL) {
+        pthread_cond_wait(&tm->working_cond, &tm->work_mutex);
+    }
+    pthread_mutex_unlock(&tm->work_mutex);
+}
+
+static void tpool_destroy(tpool_t *tm)
+{
+    tpool_work_t *work;
+    tpool_work_t *work2;
+
+    if (tm == NULL)
+        return;
+
+    pthread_mutex_lock(&tm->work_mutex);
+    work = tm->work_first;
+    while (work != NULL) {
+        work2 = work->next;
+        tpool_work_destroy(work);
+        work = work2;
+    }
+    tm->stop = true;
+    pthread_mutex_unlock(&tm->work_mutex);
+    pthread_cond_broadcast(&tm->work_cond);
+
+    tpool_wait(tm);
+
+    pthread_mutex_lock(&tm->work_mutex);
+    while (tm->thread_cnt > 0)
+        pthread_cond_wait(&tm->working_cond, &tm->work_mutex);
+
pthread_mutex_unlock(&tm->work_mutex); + + pthread_mutex_destroy(&tm->work_mutex); + pthread_cond_destroy(&tm->work_cond); + pthread_cond_destroy(&tm->working_cond); + + free(tm); +} + +static bool tpool_add_work(tpool_t *tm, thread_func_t func, void *arg) +{ + tpool_work_t *work; + + if (tm == NULL) + return false; + + work = tpool_work_create(func, arg); + if (work == NULL) + return false; + + pthread_mutex_lock(&tm->work_mutex); + if (tm->work_first == NULL) { + tm->work_first = work; + tm->work_last = tm->work_first; + } else { + tm->work_last->next = work; + tm->work_last = work; + } + + pthread_mutex_unlock(&tm->work_mutex); + pthread_cond_broadcast(&tm->work_cond); + + return true; +} diff --git a/src/ggml.c b/src/ggml.c index c56a3d0e0..9a9d56a07 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -69,31 +69,15 @@ static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { return atomic_fetch_add(ptr, -(dec)); } -typedef HANDLE pthread_t; - -typedef DWORD thread_ret_t; -static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) { - (void) unused; - HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); - if (handle == NULL) - { - return EAGAIN; - } - - *out = handle; - return 0; -} - -static int pthread_join(pthread_t thread, void * unused) { - (void) unused; - return (int) WaitForSingleObject(thread, INFINITE); -} - static int sched_yield (void) { Sleep (0); return 0; } + +#include "pthread-win32.h" + #else + #include #include @@ -105,6 +89,8 @@ typedef void * thread_ret_t; #endif +#include "tpool.h" + // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) #ifndef __FMA__ @@ -3948,6 +3934,8 @@ struct ggml_context { struct ggml_scratch scratch; struct ggml_scratch scratch_save; + + tpool_t * tpool; }; struct ggml_context_container { @@ -4389,6 +4377,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { /*.objects_end =*/ NULL, /*.scratch =*/ { 0, 0, NULL, }, /*.scratch_save =*/ { 0, 0, NULL, }, + /*.tpool =*/ tpool_create(params.n_threads - 1), }; GGML_ASSERT(ctx->mem_buffer != NULL); @@ -4410,6 +4399,7 @@ void ggml_free(struct ggml_context * ctx) { for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { if (&g_state.contexts[i].context == ctx) { + tpool_destroy(g_state.contexts[i].context.tpool); g_state.contexts[i].used = false; GGML_PRINT_DEBUG("%s: context %d has been freed. 
memory used = %zu\n", @@ -16259,9 +16249,7 @@ struct ggml_compute_state_shared { const int n_threads; - // synchronization primitives - atomic_int n_active; // num active threads - atomic_int node_n; // active graph node + int node_n; // active graph node bool (*abort_callback)(void * data); // abort ggml_graph_compute when true void * abort_callback_data; @@ -16282,116 +16270,26 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const node->perf_time_us += time_us_cur; } -static thread_ret_t ggml_graph_compute_thread(void * data) { +static void ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; const struct ggml_cgraph * cgraph = state->shared->cgraph; const struct ggml_cplan * cplan = state->shared->cplan; - const int * n_tasks_arr = cplan->n_tasks; - const int n_threads = state->shared->n_threads; + int n_threads = state->shared->n_threads; + int node_n = state->shared->node_n; set_numa_thread_affinity(state->ith, n_threads); - int node_n = -1; - - while (true) { - if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { - state->shared->node_n += 1; - return (thread_ret_t) GGML_EXIT_ABORTED; - } - if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { - // all other threads are finished and spinning - // do finalize and init here so we don't have synchronize again - struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_FINALIZE, - /*.ith =*/ 0, - /*.nth =*/ 0, - /*.wsize =*/ cplan->work_size, - /*.wdata =*/ cplan->work_data, - }; - - if (node_n != -1) { - /* FINALIZE */ - struct ggml_tensor * node = state->shared->cgraph->nodes[node_n]; - if (GGML_OP_HAS_FINALIZE[node->op]) { - params.nth = n_tasks_arr[node_n]; - ggml_compute_forward(¶ms, node); - } - ggml_graph_compute_perf_stats_node(node, state->shared); - } - - // distribute new work or execute it direct if 1T - while (++node_n < cgraph->n_nodes) { - GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); - - struct ggml_tensor * node = cgraph->nodes[node_n]; - const int n_tasks = n_tasks_arr[node_n]; - - state->shared->perf_node_start_cycles = ggml_perf_cycles(); - state->shared->perf_node_start_time_us = ggml_perf_time_us(); - - params.nth = n_tasks; - - /* INIT */ - if (GGML_OP_HAS_INIT[node->op]) { - params.type = GGML_TASK_INIT; - ggml_compute_forward(¶ms, node); - } - - if (n_tasks == 1) { - // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, - // they do something more efficient than spinning (?) 
- params.type = GGML_TASK_COMPUTE; - ggml_compute_forward(¶ms, node); - - if (GGML_OP_HAS_FINALIZE[node->op]) { - params.type = GGML_TASK_FINALIZE; - ggml_compute_forward(¶ms, node); - } - - ggml_graph_compute_perf_stats_node(node, state->shared); - } else { - break; - } - - if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { - break; - } - } - - atomic_store(&state->shared->n_active, n_threads); - atomic_store(&state->shared->node_n, node_n); - } else { - // wait for other threads to finish - const int last = node_n; - do { - //sched_yield(); - node_n = atomic_load(&state->shared->node_n); - } while (node_n == last); - } - - // check if we should stop - if (node_n >= cgraph->n_nodes) break; - - /* COMPUTE */ - struct ggml_tensor * node = cgraph->nodes[node_n]; - const int n_tasks = n_tasks_arr[node_n]; - - struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_COMPUTE, - /*.ith =*/ state->ith, - /*.nth =*/ n_tasks, - /*.wsize =*/ cplan->work_size, - /*.wdata =*/ cplan->work_data, - }; - - if (state->ith < n_tasks) { - ggml_compute_forward(¶ms, node); - } - } + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_COMPUTE, + /*.ith =*/ state->ith, + /*.nth =*/ cplan->n_tasks[node_n], + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; - return GGML_EXIT_SUCCESS; + ggml_compute_forward(¶ms, cgraph->nodes[node_n]); } struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { @@ -16737,7 +16635,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { return cplan; } -int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { +int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan, void * tpool) { { GGML_ASSERT(cplan); GGML_ASSERT(cplan->n_threads > 0); @@ -16756,59 +16654,80 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { const int n_threads = cplan->n_threads; struct ggml_compute_state_shared state_shared = { - /*.cgraph =*/ cgraph, - /*.cgraph_plan =*/ cplan, - /*.perf_node_start_cycles =*/ 0, - /*.perf_node_start_time_us =*/ 0, - /*.n_threads =*/ n_threads, - /*.n_active =*/ n_threads, - /*.node_n =*/ -1, - /*.abort_callback =*/ NULL, - /*.abort_callback_data =*/ NULL, + .cgraph = cgraph, + .cplan = cplan, + .perf_node_start_cycles = 0, + .perf_node_start_time_us = 0, + .n_threads = n_threads, + .node_n = -1, + .abort_callback = NULL, + .abort_callback_data = NULL, }; struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); - // create thread pool - if (n_threads > 1) { - for (int j = 1; j < n_threads; ++j) { - workers[j] = (struct ggml_compute_state) { - .thrd = 0, - .ith = j, - .shared = &state_shared, - }; + const int * n_tasks_arr = cplan->n_tasks; - const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); - GGML_ASSERT(rc == 0); - } + // create thread pool arguments + for (int j = 1; j < n_threads; ++j) { + workers[j] = (struct ggml_compute_state) { + .thrd = 0, + .ith = j, + .shared = &state_shared, + }; } workers[0].ith = 0; workers[0].shared = &state_shared; - const int64_t perf_start_cycles = ggml_perf_cycles(); - const int64_t perf_start_time_us = ggml_perf_time_us(); + int compute_status = GGML_EXIT_SUCCESS; + for (int node_n = 0; node_n < cgraph->n_nodes; ++ node_n) { + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + compute_status = GGML_EXIT_ABORTED; + break; + } - // this is 
a work thread too - int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]); + struct ggml_tensor* node = cgraph->nodes[node_n]; - // don't leave affinity set on the main thread - clear_numa_thread_affinity(); + state_shared.perf_node_start_cycles = ggml_perf_cycles(); + state_shared.perf_node_start_time_us = ggml_perf_time_us(); + + struct ggml_compute_params params = { + /*.type =*/ -1, + /*.ith =*/ 0, + /*.nth =*/ n_tasks_arr[node_n], + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; - // join or kill thread pool - if (n_threads > 1) { - for (int j = 1; j < n_threads; j++) { - const int rc = ggml_thread_join(workers[j].thrd, NULL); - GGML_ASSERT(rc == 0); + if (GGML_OP_HAS_INIT[node->op]) { + params.type = GGML_TASK_INIT; + ggml_compute_forward(¶ms, node); } + + if (n_tasks_arr[node_n] > 1) { + state_shared.node_n = node_n; + for (int j = 1; j < n_tasks_arr[node_n]; ++j) + tpool_add_work(tpool, ggml_graph_compute_thread, &workers[j]); + } + + params.type = GGML_TASK_COMPUTE; + ggml_compute_forward(¶ms, node); + + if (n_tasks_arr[node_n] > 1) { + tpool_wait(tpool); + } + + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.type = GGML_TASK_FINALIZE; + ggml_compute_forward(¶ms, node); + } + + ggml_graph_compute_perf_stats_node(node, &state_shared); } // performance stats (graph) { - int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles; - int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us; - - cgraph->perf_runs++; - cgraph->perf_cycles += perf_cycles_cur; - cgraph->perf_time_us += perf_time_us_cur; + int64_t perf_cycles_cur = ggml_perf_cycles() - state_shared.perf_node_start_cycles; + int64_t perf_time_us_cur = ggml_perf_time_us() - state_shared.perf_node_start_time_us; GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n", __func__, cgraph->perf_runs, @@ -16839,7 +16758,7 @@ void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cplan.work_data = buf->data; - ggml_graph_compute(cgraph, &cplan); + ggml_graph_compute(cgraph, &cplan, ctx->tpool); } struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { @@ -17105,6 +17024,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** .mem_size = fsize + overhead, .mem_buffer = NULL, .no_alloc = false, + .n_threads = 1, }; *ctx_data = ggml_init(params); @@ -17163,6 +17083,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** .mem_size = size_eval + overhead, .mem_buffer = NULL, .no_alloc = true, + .n_threads = 1, }; *ctx_eval = ggml_init(params); @@ -18263,6 +18184,7 @@ enum ggml_opt_result ggml_opt( .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; ctx = ggml_init(params_ctx); diff --git a/tests/test-blas0.c b/tests/test-blas0.c index 0977d3ef8..5bf5d49f1 100644 --- a/tests/test-blas0.c +++ b/tests/test-blas0.c @@ -69,6 +69,7 @@ int main(int argc, const char ** argv) { .mem_size = 2048ul*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/tests/test-grad0.c b/tests/test-grad0.c index 01467bc18..2186f5cb9 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -345,6 +345,7 @@ int main(int argc, const char ** argv) { .mem_size = 128*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; int64_t ne[4]; diff --git a/tests/test-mul-mat0.c 
b/tests/test-mul-mat0.c index 1bd6e140b..85d510da4 100644 --- a/tests/test-mul-mat0.c +++ b/tests/test-mul-mat0.c @@ -235,6 +235,7 @@ int main(int argc, const char ** argv) { .mem_size = 128*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; int64_t ne[4]; diff --git a/tests/test-mul-mat2.c b/tests/test-mul-mat2.c index ad30492b4..264434361 100644 --- a/tests/test-mul-mat2.c +++ b/tests/test-mul-mat2.c @@ -2375,7 +2375,7 @@ int main(int argc, const char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, GGML_DEFAULT_N_THREADS }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/tests/test-opt.c b/tests/test-opt.c index 5531814c4..eae8273fc 100644 --- a/tests/test-opt.c +++ b/tests/test-opt.c @@ -122,6 +122,7 @@ int main(void) { .mem_size = 1024*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; struct ggml_context * ctx = ggml_init(params); diff --git a/tests/test-pool.c b/tests/test-pool.c index cdf00f4ec..1c5dd3e67 100644 --- a/tests/test-pool.c +++ b/tests/test-pool.c @@ -7,6 +7,7 @@ struct ggml_context* make_ctx(void) { struct ggml_init_params params = { .mem_size = 2 * 1024 * 1024, + .n_threads = GGML_DEFAULT_N_THREADS, }; return ggml_init(params); diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 8d3c162d2..23de47dbd 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -117,6 +117,7 @@ int main(int argc, char * argv[]) { /* .mem_size = */ 1*1024, /* .mem_buffer = */ NULL, /* .no_alloc = */ true, + /* .n_threads = */ GGML_DEFAULT_N_THREADS, }; struct ggml_context * ctx = ggml_init(ggml_params); diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index 0bb9537f6..bfea4d926 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -266,6 +266,7 @@ int main(int argc, char * argv[]) { /* .mem_size = */ 1*1024, /* .mem_buffer = */ NULL, /* .no_alloc = */ true, + /* .n_threads = */ GGML_DEFAULT_N_THREADS, }; struct ggml_context * ctx = ggml_init(ggml_params); diff --git a/tests/test0.c b/tests/test0.c index 7fba63e77..74b75f00e 100644 --- a/tests/test0.c +++ b/tests/test0.c @@ -8,6 +8,7 @@ int main(int argc, const char ** argv) { .mem_size = 128*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/tests/test1.c b/tests/test1.c index c313bf8e1..ae67a1f41 100644 --- a/tests/test1.c +++ b/tests/test1.c @@ -10,6 +10,7 @@ int main(int argc, const char ** argv) { .mem_size = 128*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/tests/test2.c b/tests/test2.c index 839e3e6de..763453fce 100644 --- a/tests/test2.c +++ b/tests/test2.c @@ -18,6 +18,7 @@ int main(int argc, const char ** argv) { .mem_size = 128*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; //struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM); diff --git a/tests/test3.c b/tests/test3.c index b92d6233d..f18c8984a 100644 --- a/tests/test3.c +++ b/tests/test3.c @@ -13,6 +13,7 @@ int main(int argc, const char ** argv) { .mem_size = 1024*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; //struct ggml_opt_params opt_params = 
ggml_opt_default_params(GGML_OPT_ADAM);
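

Usage sketch (commentary, not part of the patch): the work queue in include/ggml/tpool.h can be exercised on its own. The program below is a minimal sketch assuming the repository's include/ directory is on the include path and that <stdlib.h>/<stdbool.h> are visible before the header, as they are by the time src/ggml.c includes it; square_job and the job count are made up for illustration.

```c
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

#include "ggml/tpool.h"   // assumed path: build with -Iinclude from the repo root

// illustrative job: square one integer in place
static void square_job(void * arg) {
    int * v = (int *) arg;
    *v = (*v) * (*v);
}

int main(void) {
    enum { N_JOBS = 8 };
    int values[N_JOBS];

    // spawn 3 detached workers; they sleep on work_cond until work is queued
    tpool_t * tp = tpool_create(3);

    for (int i = 0; i < N_JOBS; ++i) {
        values[i] = i + 1;
        tpool_add_work(tp, square_job, &values[i]);   // enqueue and wake the workers
    }

    tpool_wait(tp);   // block until the queue is empty and every worker is idle

    for (int i = 0; i < N_JOBS; ++i) {
        printf("%d ", values[i]);   // 1 4 9 ... 64
    }
    printf("\n");

    tpool_destroy(tp);   // drop pending work, signal stop, wait for workers to exit
    return 0;
}
```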
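Caller-side sketch (also commentary, not part of the patch): ggml_init_params gains n_threads, which ggml_init() uses to size the per-context pool (tpool_create(n_threads - 1)), and ggml_graph_compute() now takes the pool as a third argument; ggml_graph_compute_with_ctx() forwards ctx->tpool internally, so a typical caller only has to fill in the new struct field. A minimal sketch under those assumptions, with made-up tensor shapes and values:

```c
#include <stdio.h>

#include "ggml/ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
        /*.n_threads  =*/ 4,   // workers are created once here and reused for every node
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    ggml_set_f32(a, 2.0f);
    ggml_set_f32(b, 3.0f);

    struct ggml_tensor * c = ggml_mul(ctx, a, b);

    struct ggml_cgraph gf = ggml_build_forward(c);
    ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads =*/ 4);

    printf("c[0] = %f\n", ggml_get_f32_1d(c, 0));   // expected: 6.0

    ggml_free(ctx);   // also tears down the context's thread pool
    return 0;
}
```

As the diff reads, the point of the change is that worker threads are created once per context and handed per-node work items, instead of being spawned and spin-synchronized inside every ggml_graph_compute() call.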