From 70a6c547fa4b7cb636f859a1bf0011e44636aed3 Mon Sep 17 00:00:00 2001 From: goerch Date: Thu, 20 Jul 2023 12:21:12 +0200 Subject: [PATCH 1/9] Initial commit --- examples/dolly-v2/main.cpp | 2 + examples/dolly-v2/quantize.cpp | 2 +- examples/gpt-2/main.cpp | 2 + examples/gpt-2/quantize.cpp | 2 +- examples/gpt-j/main.cpp | 2 + examples/gpt-j/quantize.cpp | 2 +- examples/gpt-neox/main.cpp | 2 + examples/gpt-neox/quantize.cpp | 2 +- examples/mnist/main-cpu.cpp | 1 + examples/mnist/main-mtl.cpp | 1 + examples/mnist/main.cpp | 2 + examples/mpt/main.cpp | 2 + examples/mpt/quantize.cpp | 2 +- examples/replit/main.cpp | 2 + examples/replit/quantize.cpp | 2 +- examples/starcoder/main.cpp | 2 + examples/starcoder/quantize.cpp | 2 +- examples/starcoder/starcoder-mmap.cpp | 3 + examples/whisper/quantize.cpp | 2 +- examples/whisper/whisper.cpp | 6 + include/ggml/ggml.h | 3 +- include/ggml/pthreads.h | 448 ++++++++++++++++++++++++++ src/ggml.c | 248 +++++--------- tests/test-blas0.c | 1 + tests/test-grad0.c | 1 + tests/test-mul-mat0.c | 1 + tests/test-mul-mat2.c | 2 +- tests/test-opt.c | 1 + tests/test-pool.c | 1 + tests/test-quantize-fns.cpp | 1 + tests/test-quantize-perf.cpp | 1 + tests/test0.c | 1 + tests/test1.c | 1 + tests/test2.c | 1 + tests/test3.c | 1 + 35 files changed, 581 insertions(+), 174 deletions(-) create mode 100644 include/ggml/pthreads.h diff --git a/examples/dolly-v2/main.cpp b/examples/dolly-v2/main.cpp index ce634213b..2c0eb6a00 100644 --- a/examples/dolly-v2/main.cpp +++ b/examples/dolly-v2/main.cpp @@ -231,6 +231,7 @@ bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vo /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ GGML_DEFAULT_N_THREADS, }; model.ctx = ggml_init(params); @@ -492,6 +493,7 @@ bool dollyv2_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git 
a/examples/dolly-v2/quantize.cpp b/examples/dolly-v2/quantize.cpp index 0c0d24ccf..f7bfbf531 100644 --- a/examples/dolly-v2/quantize.cpp +++ b/examples/dolly-v2/quantize.cpp @@ -139,7 +139,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, GGML_DEFAULT_N_THREADS }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 7e12eab5f..08dfb59fe 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -203,6 +203,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ GGML_DEFAULT_N_THREADS, }; model.ctx = ggml_init(params); @@ -425,6 +426,7 @@ bool gpt2_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/gpt-2/quantize.cpp b/examples/gpt-2/quantize.cpp index 9d8d53a67..27ea72f0b 100644 --- a/examples/gpt-2/quantize.cpp +++ b/examples/gpt-2/quantize.cpp @@ -145,7 +145,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, GGML_DEFAULT_N_THREADS }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/gpt-j/main.cpp b/examples/gpt-j/main.cpp index b42764ce2..395a929b7 100644 --- a/examples/gpt-j/main.cpp +++ b/examples/gpt-j/main.cpp @@ -202,6 +202,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ GGML_DEFAULT_N_THREADS, }; model.ctx = ggml_init(params); @@ -421,6 +422,7 @@ bool gptj_eval( /*.mem_size =*/ buf_size, 
/*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/gpt-j/quantize.cpp b/examples/gpt-j/quantize.cpp index 437053b7d..62550a86c 100644 --- a/examples/gpt-j/quantize.cpp +++ b/examples/gpt-j/quantize.cpp @@ -143,7 +143,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, GGML_DEFAULT_N_THREADS }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/gpt-neox/main.cpp b/examples/gpt-neox/main.cpp index 4af771b84..ac2e3b90e 100644 --- a/examples/gpt-neox/main.cpp +++ b/examples/gpt-neox/main.cpp @@ -205,6 +205,7 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_ /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ GGML_DEFAULT_N_THREADS, }; model.ctx = ggml_init(params); @@ -472,6 +473,7 @@ bool gpt_neox_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/gpt-neox/quantize.cpp b/examples/gpt-neox/quantize.cpp index 96208c1e8..b0cc09fbf 100644 --- a/examples/gpt-neox/quantize.cpp +++ b/examples/gpt-neox/quantize.cpp @@ -139,7 +139,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, GGML_DEFAULT_N_THREADS }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp index 2000c9aac..a554c0e77 100644 --- a/examples/mnist/main-cpu.cpp +++ b/examples/mnist/main-cpu.cpp @@ -51,6 +51,7 @@ int mnist_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct 
ggml_context * ctx_work = ggml_init(params); diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index a8d47ac9c..1047a3814 100644 --- a/examples/mnist/main-mtl.cpp +++ b/examples/mnist/main-mtl.cpp @@ -45,6 +45,7 @@ int mnist_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ GGML_DEFAULT_N_THREADS, }; struct ggml_context * ctx_work = ggml_init(params); diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index 5ff4ac20f..afef54ad6 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -80,6 +80,7 @@ bool mnist_model_load(const std::string & fname, mnist_model & model) { /*.mem_size =*/ ctx_size + 1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ GGML_DEFAULT_N_THREADS, }; model.ctx = ggml_init(params); @@ -182,6 +183,7 @@ int mnist_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/mpt/main.cpp b/examples/mpt/main.cpp index 457dc3d5b..1ab737a02 100644 --- a/examples/mpt/main.cpp +++ b/examples/mpt/main.cpp @@ -298,6 +298,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ GGML_DEFAULT_N_THREADS, }; model.ctx = ggml_init(params); @@ -495,6 +496,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/mpt/quantize.cpp b/examples/mpt/quantize.cpp index d0c9dda82..f4c900a2f 100644 --- a/examples/mpt/quantize.cpp +++ b/examples/mpt/quantize.cpp @@ -144,7 +144,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = {0, NULL, false}; + struct 
ggml_init_params params = {0, NULL, false, GGML_DEFAULT_N_THREADS }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/replit/main.cpp b/examples/replit/main.cpp index aed7f268b..a1e3b383e 100644 --- a/examples/replit/main.cpp +++ b/examples/replit/main.cpp @@ -280,6 +280,7 @@ bool replit_model_load(const std::string & fname, replit_model & model, replit_t /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ GGML_DEFAULT_N_THREADS, }; model.ctx = ggml_init(params); @@ -472,6 +473,7 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/replit/quantize.cpp b/examples/replit/quantize.cpp index f274074bb..24bbf86ed 100644 --- a/examples/replit/quantize.cpp +++ b/examples/replit/quantize.cpp @@ -140,7 +140,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = {0, NULL, false}; + struct ggml_init_params params = {0, NULL, false, GGML_DEFAULT_N_THREADS}; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/starcoder/main.cpp b/examples/starcoder/main.cpp index d84e36634..5cfd9b8d0 100644 --- a/examples/starcoder/main.cpp +++ b/examples/starcoder/main.cpp @@ -226,6 +226,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ GGML_DEFAULT_N_THREADS, }; model.ctx = ggml_init(params); @@ -460,6 +461,7 @@ bool starcoder_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/starcoder/quantize.cpp b/examples/starcoder/quantize.cpp index d3aee3f26..edafacb3f 100644 --- 
a/examples/starcoder/quantize.cpp +++ b/examples/starcoder/quantize.cpp @@ -145,7 +145,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, GGML_DEFAULT_N_THREADS }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/starcoder/starcoder-mmap.cpp b/examples/starcoder/starcoder-mmap.cpp index 094c441d8..b240abcae 100644 --- a/examples/starcoder/starcoder-mmap.cpp +++ b/examples/starcoder/starcoder-mmap.cpp @@ -352,6 +352,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, + /*.n_threads =*/ GGML_DEFAULT_N_THREADS, }; model.ctx = ggml_init(params); @@ -450,6 +451,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp c_params.mem_size = model.cache.buf.size; c_params.mem_buffer = model.cache.buf.addr; c_params.no_alloc = false; + c_params.n_threads = GGML_DEFAULT_N_THREADS; model.cache.ctx = ggml_init(c_params); @@ -667,6 +669,7 @@ bool starcoder_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/whisper/quantize.cpp b/examples/whisper/quantize.cpp index 64e8f35c3..82030d386 100644 --- a/examples/whisper/quantize.cpp +++ b/examples/whisper/quantize.cpp @@ -184,7 +184,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, GGML_DEFAULT_N_THREADS }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/whisper/whisper.cpp b/examples/whisper/whisper.cpp index cad40426c..de1f47bf5 100644 --- a/examples/whisper/whisper.cpp +++ b/examples/whisper/whisper.cpp @@ -741,6 +741,7 @@ static bool 
kv_cache_init( /*.mem_size =*/ cache.buf.size(), /*.mem_buffer =*/ cache.buf.data(), /*.no_alloc =*/ false, + /*.threads =*/ GGML_DEFAULT_N_THREADS, }; cache.ctx = ggml_init(params); @@ -777,6 +778,7 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) { /*.mem_size =*/ cache.buf.size(), /*.mem_buffer =*/ cache.buf.data(), /*.no_alloc =*/ false, + /*.threads =*/ GGML_DEFAULT_N_THREADS, }; cache.ctx = ggml_init(params); @@ -1136,6 +1138,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con /*.mem_size =*/ wctx.model.buf->size(), /*.mem_buffer =*/ wctx.model.buf->data(), /*.no_alloc =*/ false, + /*.threads =*/ GGML_DEFAULT_N_THREADS, }; model.ctx = ggml_init(params); @@ -1456,6 +1459,7 @@ static bool whisper_encode_internal( /*.mem_size =*/ wstate.buf_compute.size(), /*.mem_buffer =*/ wstate.buf_compute.data(), /*.no_alloc =*/ false, + /*.threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); @@ -1935,6 +1939,7 @@ static bool whisper_decode_internal( /*.mem_size =*/ wstate.buf_compute.size(), /*.mem_buffer =*/ wstate.buf_compute.data(), /*.no_alloc =*/ false, + /*.threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); @@ -5084,6 +5089,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { /*.mem_size =*/ buf.size(), /*.mem_buffer =*/ buf.data(), /*.no_alloc =*/ false, + /*.threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(gparams); diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index 24856a255..9c378fdd7 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -482,6 +482,7 @@ extern "C" { size_t mem_size; // bytes void * mem_buffer; // if NULL, memory will be allocated internally bool no_alloc; // don't allocate memory for the tensor data + int n_threads; // number of threads for the thread pool }; @@ -1350,7 +1351,7 @@ extern "C" { // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must 
allocate memory for plan.work_data GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); - GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan, void * tpool); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // same as ggml_graph_compute() but the work data is allocated as a part of the context diff --git a/include/ggml/pthreads.h b/include/ggml/pthreads.h new file mode 100644 index 000000000..c67fa3341 --- /dev/null +++ b/include/ggml/pthreads.h @@ -0,0 +1,448 @@ +#include <windows.h> +#include <stdbool.h> +#include <stdlib.h> + +static DWORD timespec_to_ms(const struct timespec* abstime) +{ + DWORD t; + + if (abstime == NULL) + return INFINITE; + + t = ((abstime->tv_sec - time(NULL)) * 1000) + (abstime->tv_nsec / 1000000); + if (t < 0) + t = 1; + return t; +} + +static void ms_to_timespec(struct timespec* ts, unsigned int ms) +{ + if (ts == NULL) + return; + ts->tv_sec = (ms / 1000) + time(NULL); + ts->tv_nsec = (ms % 1000) * 1000000; +} + +typedef HANDLE pthread_t; +typedef void pthread_attr_t; +typedef DWORD thread_ret_t; + +typedef struct { + void *(*start_routine)(void *); + void *start_arg; +} win_thread_start_t; + +static DWORD WINAPI win_thread_start(void *arg) +{ + win_thread_start_t *data = arg; + void *(*start_routine)(void *) = data->start_routine; + void *start_arg = data->start_arg; + + free(data); + + start_routine(start_arg); + return 0; /* ERROR_SUCCESS */ +} + +static int pthread_create(pthread_t *thread, pthread_attr_t *attr, void *(*start_routine)(void *), void *arg) +{ + win_thread_start_t *data; + + if (thread == NULL || start_routine == NULL) + return 1; + + data = malloc(sizeof(*data)); + data->start_routine = start_routine; + data->start_arg = arg; + + *thread = CreateThread(NULL, 0, win_thread_start, data, 0, NULL); + if (*thread == NULL) + return 1; + return 0; +} + 
+static int pthread_join(pthread_t thread, void **value_ptr) +{ + (void)value_ptr; + WaitForSingleObject(thread, INFINITE); + CloseHandle(thread); + return 0; +} + +static int pthread_detach(pthread_t thread) +{ + CloseHandle(thread); +} + +typedef CRITICAL_SECTION pthread_mutex_t; +typedef void pthread_mutexattr_t; + +static int pthread_mutex_init(pthread_mutex_t *mutex, pthread_mutexattr_t *attr) +{ + (void)attr; + + if (mutex == NULL) + return 1; + + InitializeCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_destroy(pthread_mutex_t *mutex) +{ + if (mutex == NULL) + return 1; + DeleteCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_lock(pthread_mutex_t *mutex) +{ + if (mutex == NULL) + return 1; + EnterCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_unlock(pthread_mutex_t *mutex) +{ + if (mutex == NULL) + return 1; + LeaveCriticalSection(mutex); + return 0; +} + +typedef CONDITION_VARIABLE pthread_cond_t; +typedef void pthread_condattr_t; + +#ifdef NEEDED +struct timespec { + long tv_sec; + long tv_nsec; +}; +#endif + +static int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr) +{ + (void)attr; + if (cond == NULL) + return 1; + InitializeConditionVariable(cond); + return 0; +} + +static int pthread_cond_destroy(pthread_cond_t *cond) +{ + /* Windows does not have a destroy for conditionals */ + (void)cond; + return 0; +} + +static int pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex, + const struct timespec *abstime) +{ + if (cond == NULL || mutex == NULL) + return 1; + if (!SleepConditionVariableCS(cond, mutex, timespec_to_ms(abstime))) + return 1; + return 0; +} + +static int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) +{ + if (cond == NULL || mutex == NULL) + return 1; + return pthread_cond_timedwait(cond, mutex, NULL); +} + +static int pthread_cond_signal(pthread_cond_t *cond) +{ + if (cond == NULL) + return 1; + WakeConditionVariable(cond); + return 
0; +} + +static int pthread_cond_broadcast(pthread_cond_t *cond) +{ + if (cond == NULL) + return 1; + WakeAllConditionVariable(cond); + return 0; +} + +typedef struct { + SRWLOCK lock; + bool exclusive; +} pthread_rwlock_t; + +typedef void pthread_rwlockattr_t; + +static int pthread_rwlock_init(pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr) +{ + (void)attr; + if (rwlock == NULL) + return 1; + InitializeSRWLock(&rwlock->lock); + rwlock->exclusive = false; + return 0; +} + +static int pthread_rwlock_destroy(pthread_rwlock_t *rwlock) +{ + (void)rwlock; +} + +static int pthread_rwlock_rdlock(pthread_rwlock_t *rwlock) +{ + if (rwlock == NULL) + return 1; + AcquireSRWLockShared(&rwlock->lock); +} + +static int pthread_rwlock_tryrdlock(pthread_rwlock_t *rwlock) +{ + if (rwlock == NULL) + return 1; + return !TryAcquireSRWLockShared(&rwlock->lock); +} + +static int pthread_rwlock_wrlock(pthread_rwlock_t *rwlock) +{ + if (rwlock == NULL) + return 1; + AcquireSRWLockExclusive(&rwlock->lock); + rwlock->exclusive = true; +} + +static int pthread_rwlock_trywrlock(pthread_rwlock_t *rwlock) +{ + BOOLEAN ret; + + if (rwlock == NULL) + return 1; + + ret = TryAcquireSRWLockExclusive(&rwlock->lock); + if (ret) + rwlock->exclusive = true; + return ret; +} + +static int pthread_rwlock_unlock(pthread_rwlock_t *rwlock) +{ + if (rwlock == NULL) + return 1; + + if (rwlock->exclusive) { + rwlock->exclusive = false; + ReleaseSRWLockExclusive(&rwlock->lock); + } else { + ReleaseSRWLockShared(&rwlock->lock); + } +} + +static unsigned int pcthread_get_num_procs() +{ + SYSTEM_INFO sysinfo; + + GetSystemInfo(&sysinfo); + return sysinfo.dwNumberOfProcessors; +} + +typedef void (*thread_func_t)(void *arg); + +struct tpool_work { + thread_func_t func; + void *arg; + struct tpool_work *next; +}; +typedef struct tpool_work tpool_work_t; + +static tpool_work_t *tpool_work_create(thread_func_t func, void *arg) +{ + tpool_work_t *work; + + if (func == NULL) + return NULL; + + work = 
malloc(sizeof(*work)); + work->func = func; + work->arg = arg; + work->next = NULL; + return work; +} + +static void tpool_work_destroy(tpool_work_t *work) +{ + if (work == NULL) + return; + free(work); +} + +struct tpool { + tpool_work_t *work_first; + tpool_work_t *work_last; + pthread_mutex_t work_mutex; + pthread_cond_t work_cond; + pthread_cond_t working_cond; + size_t working_cnt; + size_t thread_cnt; + bool stop; +}; +typedef struct tpool tpool_t; + +static tpool_work_t *tpool_work_get(tpool_t *tm) +{ + tpool_work_t *work; + + if (tm == NULL) + return NULL; + + work = tm->work_first; + if (work == NULL) + return NULL; + + if (work->next == NULL) { + tm->work_first = NULL; + tm->work_last = NULL; + } else { + tm->work_first = work->next; + } + + return work; +} + +static void *tpool_worker(void *arg) +{ + tpool_t *tm = arg; + tpool_work_t *work; + + // printf("pthreads %p starts\n", arg); + while (1) { + pthread_mutex_lock(&tm->work_mutex); + while (tm->work_first == NULL && !tm->stop) + pthread_cond_wait(&tm->work_cond, &tm->work_mutex); + if (tm->stop) + break; + work = tpool_work_get(tm); + tm->working_cnt++; + pthread_mutex_unlock(&tm->work_mutex); + + // printf("pthreads %p works\n", arg); + if (work != NULL) { + work->func(work->arg); + tpool_work_destroy(work); + } + // printf("pthreads %p waits\n", arg); + + pthread_mutex_lock(&tm->work_mutex); + tm->working_cnt--; + if (tm->working_cnt == 0 && tm->work_first == NULL) + pthread_cond_signal(&tm->working_cond); + pthread_mutex_unlock(&tm->work_mutex); + } + + // printf("pthreads %p stops\n", arg); + tm->thread_cnt--; + pthread_mutex_unlock(&tm->work_mutex); + pthread_cond_signal(&tm->working_cond); + return NULL; +} + +static tpool_t *tpool_create(size_t num) +{ + tpool_t *tm; + pthread_t thread; + size_t i; + + if (num == 0) + num = 2; + + tm = calloc(1, sizeof(*tm)); + tm->thread_cnt = num; + + pthread_mutex_init(&tm->work_mutex, NULL); + pthread_cond_init(&tm->work_cond, NULL); + 
pthread_cond_init(&tm->working_cond, NULL); + + tm->work_first = NULL; + tm->work_last = NULL; + + for (i=0; i<num; i++) { + pthread_create(&thread, NULL, tpool_worker, tm); + pthread_detach(thread); + } + + return tm; +} + +static void tpool_wait(tpool_t *tm) +{ + if (tm == NULL) + return; + + pthread_mutex_lock(&tm->work_mutex); + while (tm->working_cnt != 0 || tm->work_first != NULL) { + pthread_cond_wait(&tm->working_cond, &tm->work_mutex); + } + pthread_mutex_unlock(&tm->work_mutex); +} + +static void tpool_destroy(tpool_t *tm) +{ + tpool_work_t *work; + tpool_work_t *work2; + + if (tm == NULL) + return; + + pthread_mutex_lock(&tm->work_mutex); + work = tm->work_first; + while (work != NULL) { + work2 = work->next; + tpool_work_destroy(work); + work = work2; + } + tm->stop = true; + pthread_mutex_unlock(&tm->work_mutex); + pthread_cond_broadcast(&tm->work_cond); + + tpool_wait(tm); + + pthread_mutex_lock(&tm->work_mutex); + while (tm->thread_cnt > 0) + pthread_cond_wait(&tm->working_cond, &tm->work_mutex); + pthread_mutex_unlock(&tm->work_mutex); + + pthread_mutex_destroy(&tm->work_mutex); + pthread_cond_destroy(&tm->work_cond); + pthread_cond_destroy(&tm->working_cond); + + free(tm); +} + +static bool tpool_add_work(tpool_t *tm, thread_func_t func, void *arg) +{ + tpool_work_t *work; + + if (tm == NULL) + return false; + + work = tpool_work_create(func, arg); + if (work == NULL) + return false; + + pthread_mutex_lock(&tm->work_mutex); + if (tm->work_first == NULL) { + tm->work_first = work; + tm->work_last = tm->work_first; + } else { + tm->work_last->next = work; + tm->work_last = work; + } + + pthread_cond_broadcast(&tm->work_cond); + pthread_mutex_unlock(&tm->work_mutex); + + return true; +} diff --git a/src/ggml.c b/src/ggml.c index c56a3d0e0..cf5f18416 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -69,30 +69,13 @@ static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { return atomic_fetch_add(ptr, -(dec)); } -typedef HANDLE pthread_t; - -typedef DWORD thread_ret_t; -static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) { - (void) unused; - HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, 
NULL); - if (handle == NULL) - { - return EAGAIN; - } - - *out = handle; - return 0; -} - -static int pthread_join(pthread_t thread, void * unused) { - (void) unused; - return (int) WaitForSingleObject(thread, INFINITE); -} - static int sched_yield (void) { Sleep (0); return 0; } + +#include "pthreads.h" + #else #include <pthread.h> #include <stdatomic.h> @@ -119,7 +102,7 @@ typedef void * thread_ret_t; #endif /*#define GGML_PERF*/ -#define GGML_DEBUG 0 +#define GGML_DEBUG 10 #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 #define GGML_SILU_FP16 @@ -250,6 +233,7 @@ inline static void* ggml_aligned_malloc(size_t size) { #include "ggml-opencl.h" #endif #elif defined(GGML_USE_OPENBLAS) +#define GGML_BLAS_USE_MKL #if defined(GGML_BLAS_USE_MKL) #include <mkl.h> #else @@ -3948,6 +3932,8 @@ struct ggml_context { struct ggml_scratch scratch; struct ggml_scratch scratch_save; + + tpool_t * tpool; }; struct ggml_context_container { @@ -4389,6 +4375,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { /*.objects_end =*/ NULL, /*.scratch =*/ { 0, 0, NULL, }, /*.scratch_save =*/ { 0, 0, NULL, }, + /*.tpool =*/ tpool_create(params.n_threads - 1), }; GGML_ASSERT(ctx->mem_buffer != NULL); @@ -4410,6 +4397,7 @@ void ggml_free(struct ggml_context * ctx) { for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { if (&g_state.contexts[i].context == ctx) { + tpool_destroy(g_state.contexts[i].context.tpool); g_state.contexts[i].used = false; GGML_PRINT_DEBUG("%s: context %d has been freed. 
memory used = %zu\n", @@ -16259,9 +16247,7 @@ struct ggml_compute_state_shared { const int n_threads; - // synchronization primitives - atomic_int n_active; // num active threads - atomic_int node_n; // active graph node + int node_n; // active graph node bool (*abort_callback)(void * data); // abort ggml_graph_compute when true void * abort_callback_data; @@ -16282,116 +16268,26 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const node->perf_time_us += time_us_cur; } -static thread_ret_t ggml_graph_compute_thread(void * data) { +static void ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; const struct ggml_cgraph * cgraph = state->shared->cgraph; const struct ggml_cplan * cplan = state->shared->cplan; - const int * n_tasks_arr = cplan->n_tasks; - const int n_threads = state->shared->n_threads; + int n_threads = state->shared->n_threads; + int node_n = state->shared->node_n; set_numa_thread_affinity(state->ith, n_threads); - int node_n = -1; - - while (true) { - if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { - state->shared->node_n += 1; - return (thread_ret_t) GGML_EXIT_ABORTED; - } - if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { - // all other threads are finished and spinning - // do finalize and init here so we don't have synchronize again - struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_FINALIZE, - /*.ith =*/ 0, - /*.nth =*/ 0, - /*.wsize =*/ cplan->work_size, - /*.wdata =*/ cplan->work_data, - }; - - if (node_n != -1) { - /* FINALIZE */ - struct ggml_tensor * node = state->shared->cgraph->nodes[node_n]; - if (GGML_OP_HAS_FINALIZE[node->op]) { - params.nth = n_tasks_arr[node_n]; - ggml_compute_forward(¶ms, node); - } - ggml_graph_compute_perf_stats_node(node, state->shared); - } - - // distribute new work or execute it direct if 1T - while (++node_n < cgraph->n_nodes) { - GGML_PRINT_DEBUG_5("%s: %d/%d\n", 
__func__, node_n, cgraph->n_nodes); - - struct ggml_tensor * node = cgraph->nodes[node_n]; - const int n_tasks = n_tasks_arr[node_n]; - - state->shared->perf_node_start_cycles = ggml_perf_cycles(); - state->shared->perf_node_start_time_us = ggml_perf_time_us(); - - params.nth = n_tasks; - - /* INIT */ - if (GGML_OP_HAS_INIT[node->op]) { - params.type = GGML_TASK_INIT; - ggml_compute_forward(¶ms, node); - } - - if (n_tasks == 1) { - // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, - // they do something more efficient than spinning (?) - params.type = GGML_TASK_COMPUTE; - ggml_compute_forward(¶ms, node); - - if (GGML_OP_HAS_FINALIZE[node->op]) { - params.type = GGML_TASK_FINALIZE; - ggml_compute_forward(¶ms, node); - } - - ggml_graph_compute_perf_stats_node(node, state->shared); - } else { - break; - } - - if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { - break; - } - } - - atomic_store(&state->shared->n_active, n_threads); - atomic_store(&state->shared->node_n, node_n); - } else { - // wait for other threads to finish - const int last = node_n; - do { - //sched_yield(); - node_n = atomic_load(&state->shared->node_n); - } while (node_n == last); - } - - // check if we should stop - if (node_n >= cgraph->n_nodes) break; - - /* COMPUTE */ - struct ggml_tensor * node = cgraph->nodes[node_n]; - const int n_tasks = n_tasks_arr[node_n]; - - struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_COMPUTE, - /*.ith =*/ state->ith, - /*.nth =*/ n_tasks, - /*.wsize =*/ cplan->work_size, - /*.wdata =*/ cplan->work_data, - }; - - if (state->ith < n_tasks) { - ggml_compute_forward(¶ms, node); - } - } + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_COMPUTE, + /*.ith =*/ state->ith, + /*.nth =*/ cplan->n_tasks[node_n], + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; - return GGML_EXIT_SUCCESS; + ggml_compute_forward(¶ms, cgraph->nodes[node_n]); } struct ggml_cplan 
ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { @@ -16737,7 +16633,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { return cplan; } -int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { +int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan, void * tpool) { { GGML_ASSERT(cplan); GGML_ASSERT(cplan->n_threads > 0); @@ -16756,59 +16652,80 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { const int n_threads = cplan->n_threads; struct ggml_compute_state_shared state_shared = { - /*.cgraph =*/ cgraph, - /*.cgraph_plan =*/ cplan, - /*.perf_node_start_cycles =*/ 0, - /*.perf_node_start_time_us =*/ 0, - /*.n_threads =*/ n_threads, - /*.n_active =*/ n_threads, - /*.node_n =*/ -1, - /*.abort_callback =*/ NULL, - /*.abort_callback_data =*/ NULL, + .cgraph = cgraph, + .cplan = cplan, + .perf_node_start_cycles = 0, + .perf_node_start_time_us = 0, + .n_threads = n_threads, + .node_n = -1, + .abort_callback = NULL, + .abort_callback_data = NULL, }; struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); - // create thread pool - if (n_threads > 1) { - for (int j = 1; j < n_threads; ++j) { - workers[j] = (struct ggml_compute_state) { - .thrd = 0, - .ith = j, - .shared = &state_shared, - }; + const int * n_tasks_arr = cplan->n_tasks; - const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); - GGML_ASSERT(rc == 0); - } + // create thread pool arguments + for (int j = 1; j < n_threads; ++j) { + workers[j] = (struct ggml_compute_state) { + .thrd = 0, + .ith = j, + .shared = &state_shared, + }; } workers[0].ith = 0; workers[0].shared = &state_shared; - const int64_t perf_start_cycles = ggml_perf_cycles(); - const int64_t perf_start_time_us = ggml_perf_time_us(); + int compute_status = GGML_EXIT_SUCCESS; + for (int node_n = 0; node_n < cgraph->n_nodes; ++ node_n) { + if 
(cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + compute_status = GGML_EXIT_ABORTED; + break; + } + + struct ggml_tensor* node = cgraph->nodes[node_n]; + + state_shared.perf_node_start_cycles = ggml_perf_cycles(); + state_shared.perf_node_start_time_us = ggml_perf_time_us(); + + struct ggml_compute_params params = { + /*.type =*/ -1, + /*.ith =*/ 0, + /*.nth =*/ n_tasks_arr[node_n], + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; - // this is a work thread too - int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]); + if (GGML_OP_HAS_INIT[node->op]) { + params.type = GGML_TASK_INIT; + ggml_compute_forward(¶ms, node); + } - // don't leave affinity set on the main thread - clear_numa_thread_affinity(); + if (n_tasks_arr[node_n] > 1) { + state_shared.node_n = node_n; + for (int j = 1; j < n_tasks_arr[node_n]; ++j) + tpool_add_work(tpool, ggml_graph_compute_thread, &workers[j]); + } - // join or kill thread pool - if (n_threads > 1) { - for (int j = 1; j < n_threads; j++) { - const int rc = ggml_thread_join(workers[j].thrd, NULL); - GGML_ASSERT(rc == 0); + params.type = GGML_TASK_COMPUTE; + ggml_compute_forward(¶ms, node); + + if (n_tasks_arr[node_n] > 1) { + tpool_wait(tpool); } + + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.type = GGML_TASK_FINALIZE; + ggml_compute_forward(¶ms, node); + } + + ggml_graph_compute_perf_stats_node(node, &state_shared); } // performance stats (graph) { - int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles; - int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us; - - cgraph->perf_runs++; - cgraph->perf_cycles += perf_cycles_cur; - cgraph->perf_time_us += perf_time_us_cur; + int64_t perf_cycles_cur = ggml_perf_cycles() - state_shared.perf_node_start_cycles; + int64_t perf_time_us_cur = ggml_perf_time_us() - state_shared.perf_node_start_time_us; GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n", __func__, 
cgraph->perf_runs, @@ -16839,7 +16756,7 @@ void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cplan.work_data = buf->data; - ggml_graph_compute(cgraph, &cplan); + ggml_graph_compute(cgraph, &cplan, ctx->tpool); } struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { @@ -17105,6 +17022,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** .mem_size = fsize + overhead, .mem_buffer = NULL, .no_alloc = false, + .n_threads = 1, }; *ctx_data = ggml_init(params); @@ -17163,6 +17081,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** .mem_size = size_eval + overhead, .mem_buffer = NULL, .no_alloc = true, + .n_threads = 1, }; *ctx_eval = ggml_init(params); @@ -18263,6 +18182,7 @@ enum ggml_opt_result ggml_opt( .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = 1, }; ctx = ggml_init(params_ctx); diff --git a/tests/test-blas0.c b/tests/test-blas0.c index 0977d3ef8..5bf5d49f1 100644 --- a/tests/test-blas0.c +++ b/tests/test-blas0.c @@ -69,6 +69,7 @@ int main(int argc, const char ** argv) { .mem_size = 2048ul*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/tests/test-grad0.c b/tests/test-grad0.c index 01467bc18..2186f5cb9 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -345,6 +345,7 @@ int main(int argc, const char ** argv) { .mem_size = 128*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; int64_t ne[4]; diff --git a/tests/test-mul-mat0.c b/tests/test-mul-mat0.c index 1bd6e140b..85d510da4 100644 --- a/tests/test-mul-mat0.c +++ b/tests/test-mul-mat0.c @@ -235,6 +235,7 @@ int main(int argc, const char ** argv) { .mem_size = 128*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; int64_t ne[4]; diff --git 
a/tests/test-mul-mat2.c b/tests/test-mul-mat2.c index ad30492b4..264434361 100644 --- a/tests/test-mul-mat2.c +++ b/tests/test-mul-mat2.c @@ -2375,7 +2375,7 @@ int main(int argc, const char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, GGML_DEFAULT_N_THREADS }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/tests/test-opt.c b/tests/test-opt.c index 5531814c4..b24f74647 100644 --- a/tests/test-opt.c +++ b/tests/test-opt.c @@ -122,6 +122,7 @@ int main(void) { .mem_size = 1024*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = 1, }; struct ggml_context * ctx = ggml_init(params); diff --git a/tests/test-pool.c b/tests/test-pool.c index cdf00f4ec..1c5dd3e67 100644 --- a/tests/test-pool.c +++ b/tests/test-pool.c @@ -7,6 +7,7 @@ struct ggml_context* make_ctx(void) { struct ggml_init_params params = { .mem_size = 2 * 1024 * 1024, + .n_threads = GGML_DEFAULT_N_THREADS, }; return ggml_init(params); diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 8d3c162d2..23de47dbd 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -117,6 +117,7 @@ int main(int argc, char * argv[]) { /* .mem_size = */ 1*1024, /* .mem_buffer = */ NULL, /* .no_alloc = */ true, + /* .n_threads = */ GGML_DEFAULT_N_THREADS, }; struct ggml_context * ctx = ggml_init(ggml_params); diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index 0bb9537f6..bfea4d926 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -266,6 +266,7 @@ int main(int argc, char * argv[]) { /* .mem_size = */ 1*1024, /* .mem_buffer = */ NULL, /* .no_alloc = */ true, + /* .n_threads = */ GGML_DEFAULT_N_THREADS, }; struct ggml_context * ctx = ggml_init(ggml_params); diff --git a/tests/test0.c b/tests/test0.c index 7fba63e77..74b75f00e 100644 --- a/tests/test0.c +++ b/tests/test0.c @@ -8,6 
+8,7 @@ int main(int argc, const char ** argv) { .mem_size = 128*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/tests/test1.c b/tests/test1.c index c313bf8e1..ae67a1f41 100644 --- a/tests/test1.c +++ b/tests/test1.c @@ -10,6 +10,7 @@ int main(int argc, const char ** argv) { .mem_size = 128*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/tests/test2.c b/tests/test2.c index 839e3e6de..763453fce 100644 --- a/tests/test2.c +++ b/tests/test2.c @@ -18,6 +18,7 @@ int main(int argc, const char ** argv) { .mem_size = 128*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; //struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM); diff --git a/tests/test3.c b/tests/test3.c index b92d6233d..f18c8984a 100644 --- a/tests/test3.c +++ b/tests/test3.c @@ -13,6 +13,7 @@ int main(int argc, const char ** argv) { .mem_size = 1024*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; //struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM); From 6198dca2445f8a305798d62c3b1be644deb08f53 Mon Sep 17 00:00:00 2001 From: goerch Date: Thu, 20 Jul 2023 13:27:58 +0200 Subject: [PATCH 2/9] Some cleanup --- examples/dolly-v2/main.cpp | 2 +- examples/dolly-v2/quantize.cpp | 2 +- examples/gpt-2/main.cpp | 2 +- examples/gpt-2/quantize.cpp | 2 +- examples/gpt-j/main.cpp | 2 +- examples/gpt-j/quantize.cpp | 2 +- examples/gpt-neox/main.cpp | 2 +- examples/gpt-neox/quantize.cpp | 2 +- examples/mnist/main-mtl.cpp | 2 +- examples/mnist/main.cpp | 2 +- examples/mpt/main.cpp | 2 +- examples/mpt/quantize.cpp | 2 +- examples/replit/main.cpp | 2 +- examples/replit/quantize.cpp | 2 +- examples/starcoder/main.cpp | 2 +- examples/starcoder/quantize.cpp | 2 +- 
examples/starcoder/starcoder-mmap.cpp | 4 ++-- examples/whisper/quantize.cpp | 2 +- examples/whisper/whisper.cpp | 6 +++--- src/ggml.c | 5 ++--- tests/test-opt.c | 2 +- 21 files changed, 25 insertions(+), 26 deletions(-) diff --git a/examples/dolly-v2/main.cpp b/examples/dolly-v2/main.cpp index 2c0eb6a00..b3a7d4477 100644 --- a/examples/dolly-v2/main.cpp +++ b/examples/dolly-v2/main.cpp @@ -231,7 +231,7 @@ bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vo /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); diff --git a/examples/dolly-v2/quantize.cpp b/examples/dolly-v2/quantize.cpp index f7bfbf531..56c423d58 100644 --- a/examples/dolly-v2/quantize.cpp +++ b/examples/dolly-v2/quantize.cpp @@ -139,7 +139,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false, GGML_DEFAULT_N_THREADS }; + struct ggml_init_params params = { 0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 08dfb59fe..61fbc6484 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -203,7 +203,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); diff --git a/examples/gpt-2/quantize.cpp b/examples/gpt-2/quantize.cpp index 27ea72f0b..5c4a2359e 100644 --- a/examples/gpt-2/quantize.cpp +++ b/examples/gpt-2/quantize.cpp @@ -145,7 +145,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false, GGML_DEFAULT_N_THREADS }; + struct ggml_init_params params = { 0, NULL, false, 1 }; struct ggml_context * ctx 
= ggml_init(params); ggml_free(ctx); } diff --git a/examples/gpt-j/main.cpp b/examples/gpt-j/main.cpp index 395a929b7..76ced4a66 100644 --- a/examples/gpt-j/main.cpp +++ b/examples/gpt-j/main.cpp @@ -202,7 +202,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); diff --git a/examples/gpt-j/quantize.cpp b/examples/gpt-j/quantize.cpp index 62550a86c..0dd5c0367 100644 --- a/examples/gpt-j/quantize.cpp +++ b/examples/gpt-j/quantize.cpp @@ -143,7 +143,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false, GGML_DEFAULT_N_THREADS }; + struct ggml_init_params params = { 0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/gpt-neox/main.cpp b/examples/gpt-neox/main.cpp index ac2e3b90e..84755da6f 100644 --- a/examples/gpt-neox/main.cpp +++ b/examples/gpt-neox/main.cpp @@ -205,7 +205,7 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_ /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); diff --git a/examples/gpt-neox/quantize.cpp b/examples/gpt-neox/quantize.cpp index b0cc09fbf..1a07be2d2 100644 --- a/examples/gpt-neox/quantize.cpp +++ b/examples/gpt-neox/quantize.cpp @@ -139,7 +139,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false, GGML_DEFAULT_N_THREADS }; + struct ggml_init_params params = { 0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index 1047a3814..41b112b5f 100644 --- a/examples/mnist/main-mtl.cpp +++ 
b/examples/mnist/main-mtl.cpp @@ -45,7 +45,7 @@ int mnist_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, + /*.n_threads =*/ 1, }; struct ggml_context * ctx_work = ggml_init(params); diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index afef54ad6..6eb8419c8 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -80,7 +80,7 @@ bool mnist_model_load(const std::string & fname, mnist_model & model) { /*.mem_size =*/ ctx_size + 1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); diff --git a/examples/mpt/main.cpp b/examples/mpt/main.cpp index 1ab737a02..6294185fe 100644 --- a/examples/mpt/main.cpp +++ b/examples/mpt/main.cpp @@ -298,7 +298,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); diff --git a/examples/mpt/quantize.cpp b/examples/mpt/quantize.cpp index f4c900a2f..45c3eaf66 100644 --- a/examples/mpt/quantize.cpp +++ b/examples/mpt/quantize.cpp @@ -144,7 +144,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = {0, NULL, false, GGML_DEFAULT_N_THREADS }; + struct ggml_init_params params = {0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/replit/main.cpp b/examples/replit/main.cpp index a1e3b383e..59929dd8c 100644 --- a/examples/replit/main.cpp +++ b/examples/replit/main.cpp @@ -280,7 +280,7 @@ bool replit_model_load(const std::string & fname, replit_model & model, replit_t /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); 
diff --git a/examples/replit/quantize.cpp b/examples/replit/quantize.cpp index 24bbf86ed..329e5b56a 100644 --- a/examples/replit/quantize.cpp +++ b/examples/replit/quantize.cpp @@ -140,7 +140,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = {0, NULL, false, GGML_DEFAULT_N_THREADS}; + struct ggml_init_params params = {0, NULL, false, 1}; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/starcoder/main.cpp b/examples/starcoder/main.cpp index 5cfd9b8d0..5686e1196 100644 --- a/examples/starcoder/main.cpp +++ b/examples/starcoder/main.cpp @@ -226,7 +226,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); diff --git a/examples/starcoder/quantize.cpp b/examples/starcoder/quantize.cpp index edafacb3f..b95e24939 100644 --- a/examples/starcoder/quantize.cpp +++ b/examples/starcoder/quantize.cpp @@ -145,7 +145,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false, GGML_DEFAULT_N_THREADS }; + struct ggml_init_params params = { 0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/starcoder/starcoder-mmap.cpp b/examples/starcoder/starcoder-mmap.cpp index b240abcae..8c606833d 100644 --- a/examples/starcoder/starcoder-mmap.cpp +++ b/examples/starcoder/starcoder-mmap.cpp @@ -352,7 +352,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); @@ -451,7 +451,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp c_params.mem_size 
= model.cache.buf.size; c_params.mem_buffer = model.cache.buf.addr; c_params.no_alloc = false; - c_params.n_threads = GGML_DEFAULT_N_THREADS; + c_params.n_threads = 1; model.cache.ctx = ggml_init(c_params); diff --git a/examples/whisper/quantize.cpp b/examples/whisper/quantize.cpp index 82030d386..d4258dbb3 100644 --- a/examples/whisper/quantize.cpp +++ b/examples/whisper/quantize.cpp @@ -184,7 +184,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false, GGML_DEFAULT_N_THREADS }; + struct ggml_init_params params = { 0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/whisper/whisper.cpp b/examples/whisper/whisper.cpp index de1f47bf5..ca546f638 100644 --- a/examples/whisper/whisper.cpp +++ b/examples/whisper/whisper.cpp @@ -741,7 +741,7 @@ static bool kv_cache_init( /*.mem_size =*/ cache.buf.size(), /*.mem_buffer =*/ cache.buf.data(), /*.no_alloc =*/ false, - /*.threads =*/ GGML_DEFAULT_N_THREADS, + /*.threads =*/ 1, }; cache.ctx = ggml_init(params); @@ -778,7 +778,7 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) { /*.mem_size =*/ cache.buf.size(), /*.mem_buffer =*/ cache.buf.data(), /*.no_alloc =*/ false, - /*.threads =*/ GGML_DEFAULT_N_THREADS, + /*.threads =*/ 1, }; cache.ctx = ggml_init(params); @@ -1138,7 +1138,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con /*.mem_size =*/ wctx.model.buf->size(), /*.mem_buffer =*/ wctx.model.buf->data(), /*.no_alloc =*/ false, - /*.threads =*/ GGML_DEFAULT_N_THREADS, + /*.threads =*/ 1, }; model.ctx = ggml_init(params); diff --git a/src/ggml.c b/src/ggml.c index cf5f18416..2974afca4 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -102,7 +102,7 @@ typedef void * thread_ret_t; #endif /*#define GGML_PERF*/ -#define GGML_DEBUG 10 +#define GGML_DEBUG 0 #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 #define GGML_SILU_FP16 @@ -233,7 +233,6 @@ inline 
static void* ggml_aligned_malloc(size_t size) { #include "ggml-opencl.h" #endif #elif defined(GGML_USE_OPENBLAS) -#define GGML_BLAS_USE_MKL #if defined(GGML_BLAS_USE_MKL) #include #else @@ -18182,7 +18181,7 @@ enum ggml_opt_result ggml_opt( .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false, - .n_threads = 1, + .n_threads = GGML_DEFAULT_N_THREADS, }; ctx = ggml_init(params_ctx); diff --git a/tests/test-opt.c b/tests/test-opt.c index b24f74647..eae8273fc 100644 --- a/tests/test-opt.c +++ b/tests/test-opt.c @@ -122,7 +122,7 @@ int main(void) { .mem_size = 1024*1024*1024, .mem_buffer = NULL, .no_alloc = false, - .n_threads = 1, + .n_threads = GGML_DEFAULT_N_THREADS, }; struct ggml_context * ctx = ggml_init(params); From 2b8dcb781f81bb53271329a8c754335af7ba0882 Mon Sep 17 00:00:00 2001 From: goerch Date: Thu, 20 Jul 2023 14:00:53 +0200 Subject: [PATCH 3/9] Possible fix for non Windows OS --- include/ggml/pthreads.h | 199 +-------------------------------------- include/ggml/tpool.h | 201 ++++++++++++++++++++++++++++++++++++++++ src/ggml.c | 4 +- 3 files changed, 205 insertions(+), 199 deletions(-) create mode 100644 include/ggml/tpool.h diff --git a/include/ggml/pthreads.h b/include/ggml/pthreads.h index c67fa3341..bb0e7dcf3 100644 --- a/include/ggml/pthreads.h +++ b/include/ggml/pthreads.h @@ -1,3 +1,5 @@ +#pragma once + #include #include #include @@ -249,200 +251,3 @@ static unsigned int pcthread_get_num_procs() return sysinfo.dwNumberOfProcessors; } -typedef void (*thread_func_t)(void *arg); - -struct tpool_work { - thread_func_t func; - void *arg; - struct tpool_work *next; -}; -typedef struct tpool_work tpool_work_t; - -static tpool_work_t *tpool_work_create(thread_func_t func, void *arg) -{ - tpool_work_t *work; - - if (func == NULL) - return NULL; - - work = malloc(sizeof(*work)); - work->func = func; - work->arg = arg; - work->next = NULL; - return work; -} - -static void tpool_work_destroy(tpool_work_t *work) -{ - if (work == NULL) - return; - 
free(work); -} - -struct tpool { - tpool_work_t *work_first; - tpool_work_t *work_last; - pthread_mutex_t work_mutex; - pthread_cond_t work_cond; - pthread_cond_t working_cond; - size_t working_cnt; - size_t thread_cnt; - bool stop; -}; -typedef struct tpool tpool_t; - -static tpool_work_t *tpool_work_get(tpool_t *tm) -{ - tpool_work_t *work; - - if (tm == NULL) - return NULL; - - work = tm->work_first; - if (work == NULL) - return NULL; - - if (work->next == NULL) { - tm->work_first = NULL; - tm->work_last = NULL; - } else { - tm->work_first = work->next; - } - - return work; -} - -static void *tpool_worker(void *arg) -{ - tpool_t *tm = arg; - tpool_work_t *work; - - // printf("pthreads %p starts\n", arg); - while (1) { - pthread_mutex_lock(&tm->work_mutex); - while (tm->work_first == NULL && !tm->stop) - pthread_cond_wait(&tm->work_cond, &tm->work_mutex); - if (tm->stop) - break; - work = tpool_work_get(tm); - tm->working_cnt++; - pthread_mutex_unlock(&tm->work_mutex); - - // printf("pthreads %p works\n", arg); - if (work != NULL) { - work->func(work->arg); - tpool_work_destroy(work); - } - // printf("pthreads %p waits\n", arg); - - pthread_mutex_lock(&tm->work_mutex); - tm->working_cnt--; - if (tm->working_cnt == 0 && tm->work_first == NULL) - pthread_cond_signal(&tm->working_cond); - pthread_mutex_unlock(&tm->work_mutex); - } - - // printf("pthreads %p stops\n", arg); - tm->thread_cnt--; - pthread_mutex_unlock(&tm->work_mutex); - pthread_cond_signal(&tm->working_cond); - return NULL; -} - -static tpool_t *tpool_create(size_t num) -{ - tpool_t *tm; - pthread_t thread; - size_t i; - - if (num == 0) - num = 2; - - tm = calloc(1, sizeof(*tm)); - tm->thread_cnt = num; - - pthread_mutex_init(&tm->work_mutex, NULL); - pthread_cond_init(&tm->work_cond, NULL); - pthread_cond_init(&tm->working_cond, NULL); - - tm->work_first = NULL; - tm->work_last = NULL; - - for (i=0; iwork_mutex); - while (tm->working_cnt != 0 || tm->work_first != NULL) { - 
pthread_cond_wait(&tm->working_cond, &tm->work_mutex); - } - pthread_mutex_unlock(&tm->work_mutex); -} - -static void tpool_destroy(tpool_t *tm) -{ - tpool_work_t *work; - tpool_work_t *work2; - - if (tm == NULL) - return; - - pthread_mutex_lock(&tm->work_mutex); - work = tm->work_first; - while (work != NULL) { - work2 = work->next; - tpool_work_destroy(work); - work = work2; - } - tm->stop = true; - pthread_mutex_unlock(&tm->work_mutex); - pthread_cond_broadcast(&tm->work_cond); - - tpool_wait(tm); - - pthread_mutex_lock(&tm->work_mutex); - while (tm->thread_cnt > 0) - pthread_cond_wait(&tm->working_cond, &tm->work_mutex); - pthread_mutex_unlock(&tm->work_mutex); - - pthread_mutex_destroy(&tm->work_mutex); - pthread_cond_destroy(&tm->work_cond); - pthread_cond_destroy(&tm->working_cond); - - free(tm); -} - -static bool tpool_add_work(tpool_t *tm, thread_func_t func, void *arg) -{ - tpool_work_t *work; - - if (tm == NULL) - return false; - - work = tpool_work_create(func, arg); - if (work == NULL) - return false; - - pthread_mutex_lock(&tm->work_mutex); - if (tm->work_first == NULL) { - tm->work_first = work; - tm->work_last = tm->work_first; - } else { - tm->work_last->next = work; - tm->work_last = work; - } - - pthread_cond_broadcast(&tm->work_cond); - pthread_mutex_unlock(&tm->work_mutex); - - return true; -} diff --git a/include/ggml/tpool.h b/include/ggml/tpool.h new file mode 100644 index 000000000..a7cdf96db --- /dev/null +++ b/include/ggml/tpool.h @@ -0,0 +1,201 @@ +#pragma once + +#include "pthreads.h" + +typedef void (*thread_func_t)(void *arg); + +struct tpool_work { + thread_func_t func; + void *arg; + struct tpool_work *next; +}; +typedef struct tpool_work tpool_work_t; + +static tpool_work_t *tpool_work_create(thread_func_t func, void *arg) +{ + tpool_work_t *work; + + if (func == NULL) + return NULL; + + work = malloc(sizeof(*work)); + work->func = func; + work->arg = arg; + work->next = NULL; + return work; +} + +static void 
tpool_work_destroy(tpool_work_t *work) +{ + if (work == NULL) + return; + free(work); +} + +struct tpool { + tpool_work_t *work_first; + tpool_work_t *work_last; + pthread_mutex_t work_mutex; + pthread_cond_t work_cond; + pthread_cond_t working_cond; + size_t working_cnt; + size_t thread_cnt; + bool stop; +}; +typedef struct tpool tpool_t; + +static tpool_work_t *tpool_work_get(tpool_t *tm) +{ + tpool_work_t *work; + + if (tm == NULL) + return NULL; + + work = tm->work_first; + if (work == NULL) + return NULL; + + if (work->next == NULL) { + tm->work_first = NULL; + tm->work_last = NULL; + } else { + tm->work_first = work->next; + } + + return work; +} + +static void *tpool_worker(void *arg) +{ + tpool_t *tm = arg; + tpool_work_t *work; + + // printf("pthreads %p starts\n", arg); + while (1) { + pthread_mutex_lock(&tm->work_mutex); + while (tm->work_first == NULL && !tm->stop) + pthread_cond_wait(&tm->work_cond, &tm->work_mutex); + if (tm->stop) + break; + work = tpool_work_get(tm); + tm->working_cnt++; + pthread_mutex_unlock(&tm->work_mutex); + + // printf("pthreads %p works\n", arg); + if (work != NULL) { + work->func(work->arg); + tpool_work_destroy(work); + } + // printf("pthreads %p waits\n", arg); + + pthread_mutex_lock(&tm->work_mutex); + tm->working_cnt--; + if (tm->working_cnt == 0 && tm->work_first == NULL) + pthread_cond_signal(&tm->working_cond); + pthread_mutex_unlock(&tm->work_mutex); + } + + // printf("pthreads %p stops\n", arg); + tm->thread_cnt--; + pthread_mutex_unlock(&tm->work_mutex); + pthread_cond_signal(&tm->working_cond); + return NULL; +} + +static tpool_t *tpool_create(size_t num) +{ + tpool_t *tm; + pthread_t thread; + size_t i; + + if (num == 0) + num = 2; + + tm = calloc(1, sizeof(*tm)); + tm->thread_cnt = num; + + pthread_mutex_init(&tm->work_mutex, NULL); + pthread_cond_init(&tm->work_cond, NULL); + pthread_cond_init(&tm->working_cond, NULL); + + tm->work_first = NULL; + tm->work_last = NULL; + + for (i=0; iwork_mutex); + while 
(tm->working_cnt != 0 || tm->work_first != NULL) { + pthread_cond_wait(&tm->working_cond, &tm->work_mutex); + } + pthread_mutex_unlock(&tm->work_mutex); +} + +static void tpool_destroy(tpool_t *tm) +{ + tpool_work_t *work; + tpool_work_t *work2; + + if (tm == NULL) + return; + + pthread_mutex_lock(&tm->work_mutex); + work = tm->work_first; + while (work != NULL) { + work2 = work->next; + tpool_work_destroy(work); + work = work2; + } + tm->stop = true; + pthread_mutex_unlock(&tm->work_mutex); + pthread_cond_broadcast(&tm->work_cond); + + tpool_wait(tm); + + pthread_mutex_lock(&tm->work_mutex); + while (tm->thread_cnt > 0) + pthread_cond_wait(&tm->working_cond, &tm->work_mutex); + pthread_mutex_unlock(&tm->work_mutex); + + pthread_mutex_destroy(&tm->work_mutex); + pthread_cond_destroy(&tm->work_cond); + pthread_cond_destroy(&tm->working_cond); + + free(tm); +} + +static bool tpool_add_work(tpool_t *tm, thread_func_t func, void *arg) +{ + tpool_work_t *work; + + if (tm == NULL) + return false; + + work = tpool_work_create(func, arg); + if (work == NULL) + return false; + + pthread_mutex_lock(&tm->work_mutex); + if (tm->work_first == NULL) { + tm->work_first = work; + tm->work_last = tm->work_first; + } else { + tm->work_last->next = work; + tm->work_last = work; + } + + pthread_cond_broadcast(&tm->work_cond); + pthread_mutex_unlock(&tm->work_mutex); + + return true; +} diff --git a/src/ggml.c b/src/ggml.c index 2974afca4..11faa09ef 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -75,7 +75,6 @@ static int sched_yield (void) { } #include "pthreads.h" - #else #include #include @@ -85,9 +84,10 @@ typedef void * thread_ret_t; #include #include #include - #endif +#include "tpool.h" + // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) #ifndef __FMA__ From c163f27ad247658a178062c97badb1564a90085a Mon Sep 17 00:00:00 2001 From: goerch Date: Thu, 20 Jul 2023 14:04:49 +0200 
Subject: [PATCH 4/9] Another trial --- include/ggml/tpool.h | 5 +++++ src/ggml.c | 3 +++ 2 files changed, 8 insertions(+) diff --git a/include/ggml/tpool.h b/include/ggml/tpool.h index a7cdf96db..7ec0d5420 100644 --- a/include/ggml/tpool.h +++ b/include/ggml/tpool.h @@ -1,6 +1,11 @@ #pragma once +#if defined(_WIN32) +#include #include "pthreads.h" +#else +#include +#endif typedef void (*thread_func_t)(void *arg); diff --git a/src/ggml.c b/src/ggml.c index 11faa09ef..f3960b9e6 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -75,7 +75,9 @@ static int sched_yield (void) { } #include "pthreads.h" + #else + #include #include @@ -84,6 +86,7 @@ typedef void * thread_ret_t; #include #include #include + #endif #include "tpool.h" From a427e10cf79c540bcdc216cfe39a3d3aa8eff335 Mon Sep 17 00:00:00 2001 From: goerch Date: Thu, 20 Jul 2023 14:07:30 +0200 Subject: [PATCH 5/9] Rename header file --- include/ggml/{pthreads.h => pthreads-win32.h} | 0 include/ggml/tpool.h | 2 +- src/ggml.c | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename include/ggml/{pthreads.h => pthreads-win32.h} (100%) diff --git a/include/ggml/pthreads.h b/include/ggml/pthreads-win32.h similarity index 100% rename from include/ggml/pthreads.h rename to include/ggml/pthreads-win32.h diff --git a/include/ggml/tpool.h b/include/ggml/tpool.h index 7ec0d5420..20c21be73 100644 --- a/include/ggml/tpool.h +++ b/include/ggml/tpool.h @@ -2,7 +2,7 @@ #if defined(_WIN32) #include -#include "pthreads.h" +#include "pthreads-win32.h" #else #include #endif diff --git a/src/ggml.c b/src/ggml.c index f3960b9e6..dce4b5b3a 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -74,7 +74,7 @@ static int sched_yield (void) { return 0; } -#include "pthreads.h" +#include "pthreads-win32.h" #else From 85756a6e92c89c008c9c4545fb2be8f952cfbacc Mon Sep 17 00:00:00 2001 From: goerch Date: Thu, 20 Jul 2023 14:11:56 +0200 Subject: [PATCH 6/9] Some more renaming required --- include/ggml/{pthreads-win32.h => pthread-win32.h} | 0 
include/ggml/tpool.h | 4 ++-- src/ggml.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename include/ggml/{pthreads-win32.h => pthread-win32.h} (100%) diff --git a/include/ggml/pthreads-win32.h b/include/ggml/pthread-win32.h similarity index 100% rename from include/ggml/pthreads-win32.h rename to include/ggml/pthread-win32.h diff --git a/include/ggml/tpool.h b/include/ggml/tpool.h index 20c21be73..68a656dc7 100644 --- a/include/ggml/tpool.h +++ b/include/ggml/tpool.h @@ -2,9 +2,9 @@ #if defined(_WIN32) #include -#include "pthreads-win32.h" +#include "pthread-win32.h" #else -#include +#include #endif typedef void (*thread_func_t)(void *arg); diff --git a/src/ggml.c b/src/ggml.c index dce4b5b3a..9a9d56a07 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -74,7 +74,7 @@ static int sched_yield (void) { return 0; } -#include "pthreads-win32.h" +#include "pthread-win32.h" #else From 4fa00ba7bd1b6a5d40db0310b413fa1280f98b98 Mon Sep 17 00:00:00 2001 From: goerch Date: Thu, 20 Jul 2023 15:26:09 +0200 Subject: [PATCH 7/9] Uniform order for signal and unlock. 
--- .gitignore | 4 ++++ include/ggml/tpool.h | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index c7a8f76b0..65996175a 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,7 @@ zig-out/ zig-cache/ *.dot + +.gitignore/ +build-clang/ +build-icx/ diff --git a/include/ggml/tpool.h b/include/ggml/tpool.h index 68a656dc7..2b95e978a 100644 --- a/include/ggml/tpool.h +++ b/include/ggml/tpool.h @@ -102,8 +102,8 @@ static void *tpool_worker(void *arg) // printf("pthreads %p stops\n", arg); tm->thread_cnt--; - pthread_mutex_unlock(&tm->work_mutex); pthread_cond_signal(&tm->working_cond); + pthread_mutex_unlock(&tm->work_mutex); return NULL; } @@ -162,8 +162,8 @@ static void tpool_destroy(tpool_t *tm) work = work2; } tm->stop = true; - pthread_mutex_unlock(&tm->work_mutex); pthread_cond_broadcast(&tm->work_cond); + pthread_mutex_unlock(&tm->work_mutex); tpool_wait(tm); From 3e852d64dfd9538e6de48617ee10faefba718b82 Mon Sep 17 00:00:00 2001 From: goerch Date: Thu, 20 Jul 2023 16:35:05 +0200 Subject: [PATCH 8/9] Fixing gitignore oops --- .gitignore | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.gitignore b/.gitignore index 65996175a..c7a8f76b0 100644 --- a/.gitignore +++ b/.gitignore @@ -29,7 +29,3 @@ zig-out/ zig-cache/ *.dot - -.gitignore/ -build-clang/ -build-icx/ From ae8f0517f638e5ae0a9fa1901b4fbed5c6f621fd Mon Sep 17 00:00:00 2001 From: goerch Date: Sun, 23 Jul 2023 12:12:49 +0200 Subject: [PATCH 9/9] Reorder unlocking and signalling --- .gitignore | 4 ++++ include/ggml/tpool.h | 15 ++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index c7a8f76b0..65996175a 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,7 @@ zig-out/ zig-cache/ *.dot + +.gitignore/ +build-clang/ +build-icx/ diff --git a/include/ggml/tpool.h b/include/ggml/tpool.h index 2b95e978a..d1afcbb56 100644 --- a/include/ggml/tpool.h +++ b/include/ggml/tpool.h @@ -75,7 +75,6 @@ static void 
*tpool_worker(void *arg) tpool_t *tm = arg; tpool_work_t *work; - // printf("pthreads %p starts\n", arg); while (1) { pthread_mutex_lock(&tm->work_mutex); while (tm->work_first == NULL && !tm->stop) @@ -86,24 +85,22 @@ static void *tpool_worker(void *arg) tm->working_cnt++; pthread_mutex_unlock(&tm->work_mutex); - // printf("pthreads %p works\n", arg); if (work != NULL) { work->func(work->arg); tpool_work_destroy(work); } - // printf("pthreads %p waits\n", arg); pthread_mutex_lock(&tm->work_mutex); tm->working_cnt--; - if (tm->working_cnt == 0 && tm->work_first == NULL) - pthread_cond_signal(&tm->working_cond); + bool predicate = tm->working_cnt == 0 && tm->work_first == NULL; pthread_mutex_unlock(&tm->work_mutex); + if(predicate) + pthread_cond_signal(&tm->working_cond); } - // printf("pthreads %p stops\n", arg); tm->thread_cnt--; - pthread_cond_signal(&tm->working_cond); pthread_mutex_unlock(&tm->work_mutex); + pthread_cond_signal(&tm->working_cond); return NULL; } @@ -162,8 +159,8 @@ static void tpool_destroy(tpool_t *tm) work = work2; } tm->stop = true; - pthread_cond_broadcast(&tm->work_cond); pthread_mutex_unlock(&tm->work_mutex); + pthread_cond_broadcast(&tm->work_cond); tpool_wait(tm); @@ -199,8 +196,8 @@ static bool tpool_add_work(tpool_t *tm, thread_func_t func, void *arg) tm->work_last = work; } - pthread_cond_broadcast(&tm->work_cond); pthread_mutex_unlock(&tm->work_mutex); + pthread_cond_broadcast(&tm->work_cond); return true; }