diff --git a/.gitignore b/.gitignore index c7a8f76b0..65996175a 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,7 @@ zig-out/ zig-cache/ *.dot + +.gitignore/ +build-clang/ +build-icx/ diff --git a/examples/dolly-v2/main.cpp b/examples/dolly-v2/main.cpp index ce634213b..b3a7d4477 100644 --- a/examples/dolly-v2/main.cpp +++ b/examples/dolly-v2/main.cpp @@ -231,6 +231,7 @@ bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vo /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); @@ -492,6 +493,7 @@ bool dollyv2_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/dolly-v2/quantize.cpp b/examples/dolly-v2/quantize.cpp index 0c0d24ccf..56c423d58 100644 --- a/examples/dolly-v2/quantize.cpp +++ b/examples/dolly-v2/quantize.cpp @@ -139,7 +139,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 7e12eab5f..61fbc6484 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -203,6 +203,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); @@ -425,6 +426,7 @@ bool gpt2_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/gpt-2/quantize.cpp b/examples/gpt-2/quantize.cpp index 9d8d53a67..5c4a2359e 100644 --- a/examples/gpt-2/quantize.cpp +++ b/examples/gpt-2/quantize.cpp @@ -145,7 +145,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/gpt-j/main.cpp b/examples/gpt-j/main.cpp index b42764ce2..76ced4a66 100644 --- a/examples/gpt-j/main.cpp +++ b/examples/gpt-j/main.cpp @@ -202,6 +202,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); @@ -421,6 +422,7 @@ bool gptj_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/gpt-j/quantize.cpp b/examples/gpt-j/quantize.cpp index 437053b7d..0dd5c0367 100644 --- a/examples/gpt-j/quantize.cpp +++ b/examples/gpt-j/quantize.cpp @@ -143,7 +143,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/gpt-neox/main.cpp b/examples/gpt-neox/main.cpp index 4af771b84..84755da6f 100644 --- a/examples/gpt-neox/main.cpp +++ b/examples/gpt-neox/main.cpp @@ -205,6 +205,7 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_ /*.mem_size 
=*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); @@ -472,6 +473,7 @@ bool gpt_neox_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/gpt-neox/quantize.cpp b/examples/gpt-neox/quantize.cpp index 96208c1e8..1a07be2d2 100644 --- a/examples/gpt-neox/quantize.cpp +++ b/examples/gpt-neox/quantize.cpp @@ -139,7 +139,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp index 2000c9aac..a554c0e77 100644 --- a/examples/mnist/main-cpu.cpp +++ b/examples/mnist/main-cpu.cpp @@ -51,6 +51,7 @@ int mnist_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx_work = ggml_init(params); diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index a8d47ac9c..41b112b5f 100644 --- a/examples/mnist/main-mtl.cpp +++ b/examples/mnist/main-mtl.cpp @@ -45,6 +45,7 @@ int mnist_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ 1, }; struct ggml_context * ctx_work = ggml_init(params); diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index 5ff4ac20f..6eb8419c8 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -80,6 +80,7 @@ bool mnist_model_load(const std::string & fname, mnist_model & model) { /*.mem_size =*/ ctx_size + 1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); @@ -182,6 +183,7 @@ int mnist_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/mpt/main.cpp b/examples/mpt/main.cpp index 457dc3d5b..6294185fe 100644 --- a/examples/mpt/main.cpp +++ b/examples/mpt/main.cpp @@ -298,6 +298,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); @@ -495,6 +496,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/mpt/quantize.cpp b/examples/mpt/quantize.cpp index d0c9dda82..45c3eaf66 100644 --- a/examples/mpt/quantize.cpp +++ b/examples/mpt/quantize.cpp @@ -144,7 +144,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = {0, NULL, false}; + struct ggml_init_params params = {0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/replit/main.cpp b/examples/replit/main.cpp index aed7f268b..59929dd8c 100644 --- a/examples/replit/main.cpp +++ b/examples/replit/main.cpp @@ -280,6 +280,7 @@ bool replit_model_load(const std::string & fname, replit_model & model, replit_t /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); @@ -472,6 +473,7 @@ bool 
replit_eval(const replit_model & model, const int n_threads, const int n_pa /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/replit/quantize.cpp b/examples/replit/quantize.cpp index f274074bb..329e5b56a 100644 --- a/examples/replit/quantize.cpp +++ b/examples/replit/quantize.cpp @@ -140,7 +140,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = {0, NULL, false}; + struct ggml_init_params params = {0, NULL, false, 1}; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/starcoder/main.cpp b/examples/starcoder/main.cpp index d84e36634..5686e1196 100644 --- a/examples/starcoder/main.cpp +++ b/examples/starcoder/main.cpp @@ -226,6 +226,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); @@ -460,6 +461,7 @@ bool starcoder_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.n_threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/starcoder/quantize.cpp b/examples/starcoder/quantize.cpp index d3aee3f26..b95e24939 100644 --- a/examples/starcoder/quantize.cpp +++ b/examples/starcoder/quantize.cpp @@ -145,7 +145,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/starcoder/starcoder-mmap.cpp b/examples/starcoder/starcoder-mmap.cpp index 094c441d8..8c606833d 100644 --- a/examples/starcoder/starcoder-mmap.cpp +++ b/examples/starcoder/starcoder-mmap.cpp @@ -352,6 +352,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, + /*.n_threads =*/ 1, }; model.ctx = ggml_init(params); @@ -450,6 +451,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp c_params.mem_size = model.cache.buf.size; c_params.mem_buffer = model.cache.buf.addr; c_params.no_alloc = false; + c_params.n_threads = 1; model.cache.ctx = ggml_init(c_params); @@ -667,6 +669,7 @@ bool starcoder_eval( /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf, /*.no_alloc =*/ false, + /*.threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/whisper/quantize.cpp b/examples/whisper/quantize.cpp index 64e8f35c3..d4258dbb3 100644 --- a/examples/whisper/quantize.cpp +++ b/examples/whisper/quantize.cpp @@ -184,7 +184,7 @@ int main(int argc, char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, 1 }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/examples/whisper/whisper.cpp b/examples/whisper/whisper.cpp index cad40426c..ca546f638 100644 --- a/examples/whisper/whisper.cpp +++ b/examples/whisper/whisper.cpp @@ -741,6 +741,7 @@ static bool kv_cache_init( /*.mem_size =*/ cache.buf.size(), /*.mem_buffer =*/ cache.buf.data(), /*.no_alloc =*/ false, + /*.threads =*/ 1, }; cache.ctx = ggml_init(params); @@ -777,6 +778,7 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) { 
/*.mem_size =*/ cache.buf.size(), /*.mem_buffer =*/ cache.buf.data(), /*.no_alloc =*/ false, + /*.threads =*/ 1, }; cache.ctx = ggml_init(params); @@ -1136,6 +1138,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con /*.mem_size =*/ wctx.model.buf->size(), /*.mem_buffer =*/ wctx.model.buf->data(), /*.no_alloc =*/ false, + /*.threads =*/ 1, }; model.ctx = ggml_init(params); @@ -1456,6 +1459,7 @@ static bool whisper_encode_internal( /*.mem_size =*/ wstate.buf_compute.size(), /*.mem_buffer =*/ wstate.buf_compute.data(), /*.no_alloc =*/ false, + /*.threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); @@ -1935,6 +1939,7 @@ static bool whisper_decode_internal( /*.mem_size =*/ wstate.buf_compute.size(), /*.mem_buffer =*/ wstate.buf_compute.data(), /*.no_alloc =*/ false, + /*.threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(params); @@ -5084,6 +5089,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { /*.mem_size =*/ buf.size(), /*.mem_buffer =*/ buf.data(), /*.no_alloc =*/ false, + /*.threads =*/ n_threads, }; struct ggml_context * ctx0 = ggml_init(gparams); diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index 24856a255..9c378fdd7 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -482,6 +482,7 @@ extern "C" { size_t mem_size; // bytes void * mem_buffer; // if NULL, memory will be allocated internally bool no_alloc; // don't allocate memory for the tensor data + int n_threads; // number of threads for the thread pool }; @@ -1350,7 +1351,7 @@ extern "C" { // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); - GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan, void * tpool); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // same as ggml_graph_compute() but the work data is allocated as a part of the context diff --git a/include/ggml/pthread-win32.h b/include/ggml/pthread-win32.h new file mode 100644 index 000000000..bb0e7dcf3 --- /dev/null +++ b/include/ggml/pthread-win32.h @@ -0,0 +1,253 @@ +#pragma once + +#include +#include +#include + +static DWORD timespec_to_ms(const struct timespec* abstime) +{ + DWORD t; + + if (abstime == NULL) + return INFINITE; + + t = ((abstime->tv_sec - time(NULL)) * 1000) + (abstime->tv_nsec / 1000000); + if (t < 0) + t = 1; + return t; +} + +static void ms_to_timespec(struct timespec* ts, unsigned int ms) +{ + if (ts == NULL) + return; + ts->tv_sec = (ms / 1000) + time(NULL); + ts->tv_nsec = (ms % 1000) * 1000000; +} + +typedef HANDLE pthread_t; +typedef void pthread_attr_t; +typedef DWORD thread_ret_t; + +typedef struct { + void *(*start_routine)(void *); + void *start_arg; +} win_thread_start_t; + +static DWORD WINAPI win_thread_start(void *arg) +{ + win_thread_start_t *data = arg; + void *(*start_routine)(void *) = data->start_routine; + void *start_arg = data->start_arg; + + free(data); + + start_routine(start_arg); + return 0; /* ERROR_SUCCESS */ +} + +static int pthread_create(pthread_t *thread, pthread_attr_t *attr, void *(*start_routine)(void *), void *arg) +{ + win_thread_start_t *data; + + if (thread == NULL || start_routine == NULL) + return 1; + + data = malloc(sizeof(*data)); + data->start_routine = 
start_routine;
+    data->start_arg = arg;
+
+    *thread = CreateThread(NULL, 0, win_thread_start, data, 0, NULL);
+    if (*thread == NULL)
+        return 1;
+    return 0;
+}
+
+static int pthread_join(pthread_t thread, void **value_ptr)
+{
+    (void)value_ptr;
+    WaitForSingleObject(thread, INFINITE);
+    CloseHandle(thread);
+    return 0;
+}
+
+static int pthread_detach(pthread_t thread)
+{
+    CloseHandle(thread);
+    return 0;
+}
+
+typedef CRITICAL_SECTION pthread_mutex_t;
+typedef void pthread_mutexattr_t;
+
+static int pthread_mutex_init(pthread_mutex_t *mutex, pthread_mutexattr_t *attr)
+{
+    (void)attr;
+
+    if (mutex == NULL)
+        return 1;
+
+    InitializeCriticalSection(mutex);
+    return 0;
+}
+
+static int pthread_mutex_destroy(pthread_mutex_t *mutex)
+{
+    if (mutex == NULL)
+        return 1;
+    DeleteCriticalSection(mutex);
+    return 0;
+}
+
+static int pthread_mutex_lock(pthread_mutex_t *mutex)
+{
+    if (mutex == NULL)
+        return 1;
+    EnterCriticalSection(mutex);
+    return 0;
+}
+
+static int pthread_mutex_unlock(pthread_mutex_t *mutex)
+{
+    if (mutex == NULL)
+        return 1;
+    LeaveCriticalSection(mutex);
+    return 0;
+}
+
+typedef CONDITION_VARIABLE pthread_cond_t;
+typedef void pthread_condattr_t;
+
+#ifdef NEEDED
+struct timespec {
+    long tv_sec;
+    long tv_nsec;
+};
+#endif
+
+static int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr)
+{
+    (void)attr;
+    if (cond == NULL)
+        return 1;
+    InitializeConditionVariable(cond);
+    return 0;
+}
+
+static int pthread_cond_destroy(pthread_cond_t *cond)
+{
+    /* Windows does not have a destroy for conditionals */
+    (void)cond;
+    return 0;
+}
+
+static int pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
+                                  const struct timespec *abstime)
+{
+    if (cond == NULL || mutex == NULL)
+        return 1;
+    if (!SleepConditionVariableCS(cond, mutex, timespec_to_ms(abstime)))
+        return 1;
+    return 0;
+}
+
+static int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
+{
+    if (cond == NULL || mutex == NULL)
+        return 1;
+    return pthread_cond_timedwait(cond, mutex, NULL);
+}
+
+static int pthread_cond_signal(pthread_cond_t *cond)
+{
+    if (cond == NULL)
+        return 1;
+    WakeConditionVariable(cond);
+    return 0;
+}
+
+static int pthread_cond_broadcast(pthread_cond_t *cond)
+{
+    if (cond == NULL)
+        return 1;
+    WakeAllConditionVariable(cond);
+    return 0;
+}
+
+typedef struct {
+    SRWLOCK lock;
+    bool exclusive;
+} pthread_rwlock_t;
+
+typedef void pthread_rwlockattr_t;
+
+static int pthread_rwlock_init(pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr)
+{
+    (void)attr;
+    if (rwlock == NULL)
+        return 1;
+    InitializeSRWLock(&rwlock->lock);
+    rwlock->exclusive = false;
+    return 0;
+}
+
+static int pthread_rwlock_destroy(pthread_rwlock_t *rwlock)
+{
+    (void)rwlock;
+    return 0;
+}
+
+static int pthread_rwlock_rdlock(pthread_rwlock_t *rwlock)
+{
+    if (rwlock == NULL)
+        return 1;
+    AcquireSRWLockShared(&rwlock->lock);
+    return 0;
+}
+
+static int pthread_rwlock_tryrdlock(pthread_rwlock_t *rwlock)
+{
+    if (rwlock == NULL)
+        return 1;
+    return !TryAcquireSRWLockShared(&rwlock->lock);
+}
+
+static int pthread_rwlock_wrlock(pthread_rwlock_t *rwlock)
+{
+    if (rwlock == NULL)
+        return 1;
+    AcquireSRWLockExclusive(&rwlock->lock);
+    rwlock->exclusive = true;
+    return 0;
+}
+
+static int pthread_rwlock_trywrlock(pthread_rwlock_t *rwlock)
+{
+    BOOLEAN ret;
+
+    if (rwlock == NULL)
+        return 1;
+
+    ret = TryAcquireSRWLockExclusive(&rwlock->lock);
+    if (ret)
+        rwlock->exclusive = true;
+    return ret;
+}
+
+static int pthread_rwlock_unlock(pthread_rwlock_t *rwlock)
+{
+    if (rwlock == NULL)
+        return 1;
+
+    if (rwlock->exclusive) {
+        rwlock->exclusive = false;
+        ReleaseSRWLockExclusive(&rwlock->lock);
+    } else {
+        ReleaseSRWLockShared(&rwlock->lock);
+    }
+    return 0;
+}
+
+static unsigned int pcthread_get_num_procs()
+{
+    SYSTEM_INFO sysinfo;
+
+    GetSystemInfo(&sysinfo);
+    return sysinfo.dwNumberOfProcessors;
+}
+
diff --git a/include/ggml/tpool.h b/include/ggml/tpool.h
new file mode 100644
index 000000000..d1afcbb56
--- /dev/null
+++ b/include/ggml/tpool.h
@@ -0,0 +1,203 @@
+#pragma once
+
+#include <stdlib.h>
+#include <stdbool.h>
+
+#if defined(_WIN32)
+#include <windows.h>
+#include "pthread-win32.h"
+#else
+#include <pthread.h>
+#endif
+
+typedef void (*thread_func_t)(void *arg);
+
+struct tpool_work {
+    thread_func_t func;
+    void *arg;
+    struct tpool_work *next;
+};
+typedef struct tpool_work tpool_work_t;
+
+static tpool_work_t *tpool_work_create(thread_func_t func, void *arg)
+{
+    tpool_work_t *work;
+
+    if (func == NULL)
+        return NULL;
+
+    work = malloc(sizeof(*work));
+    work->func = func;
+    work->arg = arg;
+    work->next = NULL;
+    return work;
+}
+
+static void tpool_work_destroy(tpool_work_t *work)
+{
+    if (work == NULL)
+        return;
+    free(work);
+}
+
+struct tpool {
+    tpool_work_t *work_first;
+    tpool_work_t *work_last;
+    pthread_mutex_t work_mutex;
+    pthread_cond_t work_cond;
+    pthread_cond_t working_cond;
+    size_t working_cnt;
+    size_t thread_cnt;
+    bool stop;
+};
+typedef struct tpool tpool_t;
+
+static tpool_work_t *tpool_work_get(tpool_t *tm)
+{
+    tpool_work_t *work;
+
+    if (tm == NULL)
+        return NULL;
+
+    work = tm->work_first;
+    if (work == NULL)
+        return NULL;
+
+    if (work->next == NULL) {
+        tm->work_first = NULL;
+        tm->work_last = NULL;
+    } else {
+        tm->work_first = work->next;
+    }
+
+    return work;
+}
+
+static void *tpool_worker(void *arg)
+{
+    tpool_t *tm = arg;
+    tpool_work_t *work;
+
+    while (1) {
+        pthread_mutex_lock(&tm->work_mutex);
+        while (tm->work_first == NULL && !tm->stop)
+            pthread_cond_wait(&tm->work_cond, &tm->work_mutex);
+        if (tm->stop)
+            break;
+        work = tpool_work_get(tm);
+        tm->working_cnt++;
+        pthread_mutex_unlock(&tm->work_mutex);
+
+        if (work != NULL) {
+            work->func(work->arg);
+            tpool_work_destroy(work);
+        }
+
+        pthread_mutex_lock(&tm->work_mutex);
+        tm->working_cnt--;
+        bool predicate = tm->working_cnt == 0 && tm->work_first == NULL;
+        pthread_mutex_unlock(&tm->work_mutex);
+        if (predicate)
+            pthread_cond_signal(&tm->working_cond);
+    }
+
+    tm->thread_cnt--;
+    pthread_mutex_unlock(&tm->work_mutex);
+    pthread_cond_signal(&tm->working_cond);
+    return NULL;
+}
+
+static tpool_t *tpool_create(size_t num)
+{
+    tpool_t *tm;
+    pthread_t thread;
+    size_t i;
+
+    if (num == 0)
+        num = 2;
+
+    tm = calloc(1, sizeof(*tm));
+    tm->thread_cnt = num;
+
+    pthread_mutex_init(&tm->work_mutex, NULL);
+    pthread_cond_init(&tm->work_cond, NULL);
+    pthread_cond_init(&tm->working_cond, NULL);
+
+    tm->work_first = NULL;
+    tm->work_last = NULL;
+
+    for (i = 0; i < num; i++) {
+        pthread_create(&thread, NULL, tpool_worker, tm);
+        pthread_detach(thread);
+    }
+
+    return tm;
+}
+
+static void tpool_wait(tpool_t *tm)
+{
+    if (tm == NULL)
+        return;
+
+    pthread_mutex_lock(&tm->work_mutex);
+    while (tm->working_cnt != 0 || tm->work_first != NULL) {
+        pthread_cond_wait(&tm->working_cond, &tm->work_mutex);
+    }
+    pthread_mutex_unlock(&tm->work_mutex);
+}
+
+static void tpool_destroy(tpool_t *tm)
+{
+    tpool_work_t *work;
+    tpool_work_t *work2;
+
+    if (tm == NULL)
+        return;
+
+    pthread_mutex_lock(&tm->work_mutex);
+    work = tm->work_first;
+    while (work != NULL) {
+        work2 = work->next;
+        tpool_work_destroy(work);
+        work = work2;
+    }
+    tm->stop = true;
+    pthread_mutex_unlock(&tm->work_mutex);
+    pthread_cond_broadcast(&tm->work_cond);
+
+    tpool_wait(tm);
+
+    pthread_mutex_lock(&tm->work_mutex);
+    while (tm->thread_cnt > 0)
+        pthread_cond_wait(&tm->working_cond, &tm->work_mutex);
+
pthread_mutex_unlock(&tm->work_mutex); + + pthread_mutex_destroy(&tm->work_mutex); + pthread_cond_destroy(&tm->work_cond); + pthread_cond_destroy(&tm->working_cond); + + free(tm); +} + +static bool tpool_add_work(tpool_t *tm, thread_func_t func, void *arg) +{ + tpool_work_t *work; + + if (tm == NULL) + return false; + + work = tpool_work_create(func, arg); + if (work == NULL) + return false; + + pthread_mutex_lock(&tm->work_mutex); + if (tm->work_first == NULL) { + tm->work_first = work; + tm->work_last = tm->work_first; + } else { + tm->work_last->next = work; + tm->work_last = work; + } + + pthread_mutex_unlock(&tm->work_mutex); + pthread_cond_broadcast(&tm->work_cond); + + return true; +} diff --git a/src/ggml.c b/src/ggml.c index c56a3d0e0..9a9d56a07 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -69,31 +69,15 @@ static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { return atomic_fetch_add(ptr, -(dec)); } -typedef HANDLE pthread_t; - -typedef DWORD thread_ret_t; -static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) { - (void) unused; - HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); - if (handle == NULL) - { - return EAGAIN; - } - - *out = handle; - return 0; -} - -static int pthread_join(pthread_t thread, void * unused) { - (void) unused; - return (int) WaitForSingleObject(thread, INFINITE); -} - static int sched_yield (void) { Sleep (0); return 0; } + +#include "pthread-win32.h" + #else + #include #include @@ -105,6 +89,8 @@ typedef void * thread_ret_t; #endif +#include "tpool.h" + // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) #ifndef __FMA__ @@ -3948,6 +3934,8 @@ struct ggml_context { struct ggml_scratch scratch; struct ggml_scratch scratch_save; + + tpool_t * tpool; }; struct ggml_context_container { @@ -4389,6 +4377,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { /*.objects_end =*/ NULL, /*.scratch =*/ { 0, 0, NULL, }, /*.scratch_save =*/ { 0, 0, NULL, }, + /*.tpool =*/ tpool_create(params.n_threads - 1), }; GGML_ASSERT(ctx->mem_buffer != NULL); @@ -4410,6 +4399,7 @@ void ggml_free(struct ggml_context * ctx) { for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { if (&g_state.contexts[i].context == ctx) { + tpool_destroy(g_state.contexts[i].context.tpool); g_state.contexts[i].used = false; GGML_PRINT_DEBUG("%s: context %d has been freed. 
memory used = %zu\n", @@ -16259,9 +16249,7 @@ struct ggml_compute_state_shared { const int n_threads; - // synchronization primitives - atomic_int n_active; // num active threads - atomic_int node_n; // active graph node + int node_n; // active graph node bool (*abort_callback)(void * data); // abort ggml_graph_compute when true void * abort_callback_data; @@ -16282,116 +16270,26 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const node->perf_time_us += time_us_cur; } -static thread_ret_t ggml_graph_compute_thread(void * data) { +static void ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; const struct ggml_cgraph * cgraph = state->shared->cgraph; const struct ggml_cplan * cplan = state->shared->cplan; - const int * n_tasks_arr = cplan->n_tasks; - const int n_threads = state->shared->n_threads; + int n_threads = state->shared->n_threads; + int node_n = state->shared->node_n; set_numa_thread_affinity(state->ith, n_threads); - int node_n = -1; - - while (true) { - if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { - state->shared->node_n += 1; - return (thread_ret_t) GGML_EXIT_ABORTED; - } - if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { - // all other threads are finished and spinning - // do finalize and init here so we don't have synchronize again - struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_FINALIZE, - /*.ith =*/ 0, - /*.nth =*/ 0, - /*.wsize =*/ cplan->work_size, - /*.wdata =*/ cplan->work_data, - }; - - if (node_n != -1) { - /* FINALIZE */ - struct ggml_tensor * node = state->shared->cgraph->nodes[node_n]; - if (GGML_OP_HAS_FINALIZE[node->op]) { - params.nth = n_tasks_arr[node_n]; - ggml_compute_forward(¶ms, node); - } - ggml_graph_compute_perf_stats_node(node, state->shared); - } - - // distribute new work or execute it direct if 1T - while (++node_n < cgraph->n_nodes) { - GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); - - struct ggml_tensor * node = cgraph->nodes[node_n]; - const int n_tasks = n_tasks_arr[node_n]; - - state->shared->perf_node_start_cycles = ggml_perf_cycles(); - state->shared->perf_node_start_time_us = ggml_perf_time_us(); - - params.nth = n_tasks; - - /* INIT */ - if (GGML_OP_HAS_INIT[node->op]) { - params.type = GGML_TASK_INIT; - ggml_compute_forward(¶ms, node); - } - - if (n_tasks == 1) { - // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, - // they do something more efficient than spinning (?) 
- params.type = GGML_TASK_COMPUTE; - ggml_compute_forward(¶ms, node); - - if (GGML_OP_HAS_FINALIZE[node->op]) { - params.type = GGML_TASK_FINALIZE; - ggml_compute_forward(¶ms, node); - } - - ggml_graph_compute_perf_stats_node(node, state->shared); - } else { - break; - } - - if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { - break; - } - } - - atomic_store(&state->shared->n_active, n_threads); - atomic_store(&state->shared->node_n, node_n); - } else { - // wait for other threads to finish - const int last = node_n; - do { - //sched_yield(); - node_n = atomic_load(&state->shared->node_n); - } while (node_n == last); - } - - // check if we should stop - if (node_n >= cgraph->n_nodes) break; - - /* COMPUTE */ - struct ggml_tensor * node = cgraph->nodes[node_n]; - const int n_tasks = n_tasks_arr[node_n]; - - struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_COMPUTE, - /*.ith =*/ state->ith, - /*.nth =*/ n_tasks, - /*.wsize =*/ cplan->work_size, - /*.wdata =*/ cplan->work_data, - }; - - if (state->ith < n_tasks) { - ggml_compute_forward(¶ms, node); - } - } + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_COMPUTE, + /*.ith =*/ state->ith, + /*.nth =*/ cplan->n_tasks[node_n], + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; - return GGML_EXIT_SUCCESS; + ggml_compute_forward(¶ms, cgraph->nodes[node_n]); } struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { @@ -16737,7 +16635,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { return cplan; } -int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { +int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan, void * tpool) { { GGML_ASSERT(cplan); GGML_ASSERT(cplan->n_threads > 0); @@ -16756,59 +16654,80 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { const int n_threads = cplan->n_threads; struct ggml_compute_state_shared state_shared = { - /*.cgraph =*/ cgraph, - /*.cgraph_plan =*/ cplan, - /*.perf_node_start_cycles =*/ 0, - /*.perf_node_start_time_us =*/ 0, - /*.n_threads =*/ n_threads, - /*.n_active =*/ n_threads, - /*.node_n =*/ -1, - /*.abort_callback =*/ NULL, - /*.abort_callback_data =*/ NULL, + .cgraph = cgraph, + .cplan = cplan, + .perf_node_start_cycles = 0, + .perf_node_start_time_us = 0, + .n_threads = n_threads, + .node_n = -1, + .abort_callback = NULL, + .abort_callback_data = NULL, }; struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); - // create thread pool - if (n_threads > 1) { - for (int j = 1; j < n_threads; ++j) { - workers[j] = (struct ggml_compute_state) { - .thrd = 0, - .ith = j, - .shared = &state_shared, - }; + const int * n_tasks_arr = cplan->n_tasks; - const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); - GGML_ASSERT(rc == 0); - } + // create thread pool arguments + for (int j = 1; j < n_threads; ++j) { + workers[j] = (struct ggml_compute_state) { + .thrd = 0, + .ith = j, + .shared = &state_shared, + }; } workers[0].ith = 0; workers[0].shared = &state_shared; - const int64_t perf_start_cycles = ggml_perf_cycles(); - const int64_t perf_start_time_us = ggml_perf_time_us(); + int compute_status = GGML_EXIT_SUCCESS; + for (int node_n = 0; node_n < cgraph->n_nodes; ++ node_n) { + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + compute_status = GGML_EXIT_ABORTED; + break; + } - // this is 
a work thread too - int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]); + struct ggml_tensor* node = cgraph->nodes[node_n]; - // don't leave affinity set on the main thread - clear_numa_thread_affinity(); + state_shared.perf_node_start_cycles = ggml_perf_cycles(); + state_shared.perf_node_start_time_us = ggml_perf_time_us(); + + struct ggml_compute_params params = { + /*.type =*/ -1, + /*.ith =*/ 0, + /*.nth =*/ n_tasks_arr[node_n], + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; - // join or kill thread pool - if (n_threads > 1) { - for (int j = 1; j < n_threads; j++) { - const int rc = ggml_thread_join(workers[j].thrd, NULL); - GGML_ASSERT(rc == 0); + if (GGML_OP_HAS_INIT[node->op]) { + params.type = GGML_TASK_INIT; + ggml_compute_forward(¶ms, node); } + + if (n_tasks_arr[node_n] > 1) { + state_shared.node_n = node_n; + for (int j = 1; j < n_tasks_arr[node_n]; ++j) + tpool_add_work(tpool, ggml_graph_compute_thread, &workers[j]); + } + + params.type = GGML_TASK_COMPUTE; + ggml_compute_forward(¶ms, node); + + if (n_tasks_arr[node_n] > 1) { + tpool_wait(tpool); + } + + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.type = GGML_TASK_FINALIZE; + ggml_compute_forward(¶ms, node); + } + + ggml_graph_compute_perf_stats_node(node, &state_shared); } // performance stats (graph) { - int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles; - int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us; - - cgraph->perf_runs++; - cgraph->perf_cycles += perf_cycles_cur; - cgraph->perf_time_us += perf_time_us_cur; + int64_t perf_cycles_cur = ggml_perf_cycles() - state_shared.perf_node_start_cycles; + int64_t perf_time_us_cur = ggml_perf_time_us() - state_shared.perf_node_start_time_us; GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n", __func__, cgraph->perf_runs, @@ -16839,7 +16758,7 @@ void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cplan.work_data = buf->data; - ggml_graph_compute(cgraph, &cplan); + ggml_graph_compute(cgraph, &cplan, ctx->tpool); } struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { @@ -17105,6 +17024,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** .mem_size = fsize + overhead, .mem_buffer = NULL, .no_alloc = false, + .n_threads = 1, }; *ctx_data = ggml_init(params); @@ -17163,6 +17083,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** .mem_size = size_eval + overhead, .mem_buffer = NULL, .no_alloc = true, + .n_threads = 1, }; *ctx_eval = ggml_init(params); @@ -18263,6 +18184,7 @@ enum ggml_opt_result ggml_opt( .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; ctx = ggml_init(params_ctx); diff --git a/tests/test-blas0.c b/tests/test-blas0.c index 0977d3ef8..5bf5d49f1 100644 --- a/tests/test-blas0.c +++ b/tests/test-blas0.c @@ -69,6 +69,7 @@ int main(int argc, const char ** argv) { .mem_size = 2048ul*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/tests/test-grad0.c b/tests/test-grad0.c index 01467bc18..2186f5cb9 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -345,6 +345,7 @@ int main(int argc, const char ** argv) { .mem_size = 128*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; int64_t ne[4]; diff --git a/tests/test-mul-mat0.c 
b/tests/test-mul-mat0.c index 1bd6e140b..85d510da4 100644 --- a/tests/test-mul-mat0.c +++ b/tests/test-mul-mat0.c @@ -235,6 +235,7 @@ int main(int argc, const char ** argv) { .mem_size = 128*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; int64_t ne[4]; diff --git a/tests/test-mul-mat2.c b/tests/test-mul-mat2.c index ad30492b4..264434361 100644 --- a/tests/test-mul-mat2.c +++ b/tests/test-mul-mat2.c @@ -2375,7 +2375,7 @@ int main(int argc, const char ** argv) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, NULL, false, GGML_DEFAULT_N_THREADS }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } diff --git a/tests/test-opt.c b/tests/test-opt.c index 5531814c4..eae8273fc 100644 --- a/tests/test-opt.c +++ b/tests/test-opt.c @@ -122,6 +122,7 @@ int main(void) { .mem_size = 1024*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; struct ggml_context * ctx = ggml_init(params); diff --git a/tests/test-pool.c b/tests/test-pool.c index cdf00f4ec..1c5dd3e67 100644 --- a/tests/test-pool.c +++ b/tests/test-pool.c @@ -7,6 +7,7 @@ struct ggml_context* make_ctx(void) { struct ggml_init_params params = { .mem_size = 2 * 1024 * 1024, + .n_threads = GGML_DEFAULT_N_THREADS, }; return ggml_init(params); diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 8d3c162d2..23de47dbd 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -117,6 +117,7 @@ int main(int argc, char * argv[]) { /* .mem_size = */ 1*1024, /* .mem_buffer = */ NULL, /* .no_alloc = */ true, + /* .n_threads = */ GGML_DEFAULT_N_THREADS, }; struct ggml_context * ctx = ggml_init(ggml_params); diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index 0bb9537f6..bfea4d926 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -266,6 +266,7 @@ int main(int argc, char * argv[]) { /* .mem_size = */ 1*1024, /* .mem_buffer = */ NULL, /* .no_alloc = */ true, + /* .n_threads = */ GGML_DEFAULT_N_THREADS, }; struct ggml_context * ctx = ggml_init(ggml_params); diff --git a/tests/test0.c b/tests/test0.c index 7fba63e77..74b75f00e 100644 --- a/tests/test0.c +++ b/tests/test0.c @@ -8,6 +8,7 @@ int main(int argc, const char ** argv) { .mem_size = 128*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/tests/test1.c b/tests/test1.c index c313bf8e1..ae67a1f41 100644 --- a/tests/test1.c +++ b/tests/test1.c @@ -10,6 +10,7 @@ int main(int argc, const char ** argv) { .mem_size = 128*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/tests/test2.c b/tests/test2.c index 839e3e6de..763453fce 100644 --- a/tests/test2.c +++ b/tests/test2.c @@ -18,6 +18,7 @@ int main(int argc, const char ** argv) { .mem_size = 128*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; //struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM); diff --git a/tests/test3.c b/tests/test3.c index b92d6233d..f18c8984a 100644 --- a/tests/test3.c +++ b/tests/test3.c @@ -13,6 +13,7 @@ int main(int argc, const char ** argv) { .mem_size = 1024*1024*1024, .mem_buffer = NULL, .no_alloc = false, + .n_threads = GGML_DEFAULT_N_THREADS, }; //struct ggml_opt_params opt_params = 
ggml_opt_default_params(GGML_OPT_ADAM);
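

Usage sketch (commentary, not part of the patch): the work queue in include/ggml/tpool.h can be exercised on its own. The program below is a minimal sketch assuming the repository's include/ directory is on the include path and that <stdlib.h>/<stdbool.h> are visible before the header, as they are by the time src/ggml.c includes it; square_job and the job count are made up for illustration.

```c
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

#include "ggml/tpool.h"   // assumed path: build with -Iinclude from the repo root

// illustrative job: square one integer in place
static void square_job(void * arg) {
    int * v = (int *) arg;
    *v = (*v) * (*v);
}

int main(void) {
    enum { N_JOBS = 8 };
    int values[N_JOBS];

    // spawn 3 detached workers; they sleep on work_cond until work is queued
    tpool_t * tp = tpool_create(3);

    for (int i = 0; i < N_JOBS; ++i) {
        values[i] = i + 1;
        tpool_add_work(tp, square_job, &values[i]);   // enqueue and wake the workers
    }

    tpool_wait(tp);   // block until the queue is empty and every worker is idle

    for (int i = 0; i < N_JOBS; ++i) {
        printf("%d ", values[i]);   // 1 4 9 ... 64
    }
    printf("\n");

    tpool_destroy(tp);   // drop pending work, signal stop, wait for workers to exit
    return 0;
}
```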
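Caller-side sketch (also commentary, not part of the patch): ggml_init_params gains n_threads, which ggml_init() uses to size the per-context pool (tpool_create(n_threads - 1)), and ggml_graph_compute() now takes the pool as a third argument; ggml_graph_compute_with_ctx() forwards ctx->tpool internally, so a typical caller only has to fill in the new struct field. A minimal sketch under those assumptions, with made-up tensor shapes and values:

```c
#include <stdio.h>

#include "ggml/ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
        /*.n_threads  =*/ 4,   // workers are created once here and reused for every node
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    ggml_set_f32(a, 2.0f);
    ggml_set_f32(b, 3.0f);

    struct ggml_tensor * c = ggml_mul(ctx, a, b);

    struct ggml_cgraph gf = ggml_build_forward(c);
    ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads =*/ 4);

    printf("c[0] = %f\n", ggml_get_f32_1d(c, 0));   // expected: 6.0

    ggml_free(ctx);   // also tears down the context's thread pool
    return 0;
}
```

As the diff reads, the point of the change is that worker threads are created once per context and handed per-node work items, instead of being spawned and spin-synchronized inside every ggml_graph_compute() call.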