llama : add thread safety test #14035

Merged (16 commits) on Jun 16, 2025
1 change: 1 addition & 0 deletions .github/workflows/build.yml
@@ -778,6 +778,7 @@ jobs:
cmake -S . -B build ${{ matrix.defines }} `
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release

- name: Add libopenblas.dll
id: add_libopenblas_dll
2 changes: 1 addition & 1 deletion ci/run.sh
@@ -39,7 +39,7 @@ sd=`dirname $0`
cd $sd/../
SRC=`pwd`

CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF"
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"

if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
16 changes: 12 additions & 4 deletions common/common.cpp
@@ -767,6 +767,9 @@ bool fs_validate_filename(const std::string & filename) {
return true;
}

#include <iostream>


// returns true if successful, false otherwise
bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
@@ -784,9 +787,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
// process path from front to back, procedurally creating directories
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
const std::wstring subpath = wpath.substr(0, pos_slash);
const wchar_t * test = subpath.c_str();

const bool success = CreateDirectoryW(test, NULL);
pos_slash += 1;

// skip the drive letter; on some systems it can return an access-denied error
if (subpath.length() == 2 && subpath[1] == ':') {
continue;
}

const bool success = CreateDirectoryW(subpath.c_str(), NULL);

if (!success) {
const DWORD error = GetLastError();

@@ -800,8 +810,6 @@
return false;
}
}

pos_slash += 1;
}

return true;
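For reference, the change above skips the bare drive prefix (e.g. "C:") while walking the path components, instead of treating an access-denied error on it as a failure. Below is a minimal sketch of a caller exercising this on Windows; the path is a placeholder and the prototype is copied from the definition above, so this is illustrative rather than a drop-in test.

#include <string>

// declaration from llama.cpp's common library (defined in common/common.cpp above)
bool fs_create_directory_with_parents(const std::string & path);

int main() {
    // placeholder path whose first component is a bare drive letter ("C:"),
    // which the fixed loop now skips before calling CreateDirectoryW
    const std::string path = "C:\\llama.cpp\\models\\cache";
    return fs_create_directory_with_parents(path) ? 0 : 1;
}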
3 changes: 3 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -503,6 +503,9 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) {
// TODO: move to ggml-threading
void ggml_barrier(struct ggml_threadpool * tp);

void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);

#ifdef __cplusplus
}
#endif
8 changes: 8 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu.c
@@ -559,6 +559,14 @@ void ggml_barrier(struct ggml_threadpool * tp) {
#endif
}

void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value) {
atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed);
}

int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed);
}

#if defined(__gnu_linux__)
static cpu_set_t ggml_get_numa_affinity(void) {
cpu_set_t cpuset;
8 changes: 2 additions & 6 deletions ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -53,7 +53,6 @@
#include "ggml-cpu-impl.h"
#include "ggml-quants.h"

#include <atomic>
#include <array>
#include <type_traits>

@@ -394,8 +393,6 @@ class tinyBLAS {

template <int RM, int RN, int BM>
NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
static std::atomic<int64_t> current_chunk;

GGML_ASSERT(m % (RM * BM) == 0);
const int64_t ytiles = m / (RM * BM);
const int64_t xtiles = (n + RN -1) / RN;
@@ -410,7 +407,7 @@ class tinyBLAS {
if (params->ith == 0) {
GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
// Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
ggml_threadpool_chunk_set(params->threadpool, params->nth);
}

ggml_barrier(params->threadpool);
@@ -439,8 +436,7 @@ class tinyBLAS {
GGML_ASSERT(jj == jj2);
}

// next step.
job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
job = ggml_threadpool_chunk_add(params->threadpool, 1);
}

ggml_barrier(params->threadpool);
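The gemm() scheduler that now uses these helpers boils down to the pattern sketched below: thread ith of nth processes chunk ith first, the shared counter starts at nth, and every subsequent chunk is claimed with a relaxed fetch-add. This is a standalone illustration using std::thread and std::atomic rather than ggml's threadpool API (the names worker and num_chunks are made up for the example); in the PR the counter lives in ggml_threadpool and is accessed via ggml_threadpool_chunk_set/ggml_threadpool_chunk_add, so concurrent graph computations no longer share the old function-local static.

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

// each of the nth threads starts on the chunk equal to its own index ith;
// afterwards it claims the next chunk with a relaxed fetch-add
static void worker(int ith, int nth, int num_chunks, std::atomic<int> & current_chunk) {
    for (int job = ith; job < num_chunks; ) {
        std::printf("thread %d/%d processing chunk %d\n", ith, nth, job);
        job = current_chunk.fetch_add(1, std::memory_order_relaxed); // claim the next chunk
    }
}

int main() {
    const int nth        = 4;  // number of worker threads
    const int num_chunks = 16; // total number of work chunks

    // the first unprocessed chunk after every thread has taken its initial one is nth
    std::atomic<int> current_chunk{nth};

    std::vector<std::thread> threads;
    for (int ith = 0; ith < nth; ++ith) {
        threads.emplace_back(worker, ith, nth, num_chunks, std::ref(current_chunk));
    }
    for (auto & t : threads) {
        t.join();
    }
    return 0;
}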
18 changes: 11 additions & 7 deletions src/llama.cpp
@@ -198,14 +198,18 @@ static struct llama_model * llama_model_load_from_file_impl(

// if using single GPU mode, remove all except the main GPU
if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) {
LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size());
llama_model_free(model);
return nullptr;
if (params.main_gpu < 0) {
model->devices.clear();
} else {
if (params.main_gpu >= (int)model->devices.size()) {
LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size());
llama_model_free(model);
return nullptr;
}
ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
model->devices.clear();
model->devices.push_back(main_gpu);
}
ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
model->devices.clear();
model->devices.push_back(main_gpu);
}

for (auto * dev : model->devices) {
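With this change, a caller can request a CPU-only copy of a model by combining LLAMA_SPLIT_MODE_NONE with main_gpu = -1, which is exactly what the new test does below. A minimal sketch against the public C API (the model path is a placeholder for this example):

#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
    mparams.main_gpu   = -1; // with the change above, all devices are cleared -> CPU only

    // "model.gguf" is a placeholder path for this sketch
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model != NULL) {
        llama_model_free(model);
    }

    llama_backend_free();
    return 0;
}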
2 changes: 2 additions & 0 deletions tests/CMakeLists.txt
@@ -185,6 +185,8 @@ llama_build_and_test(test-json-partial.cpp)
llama_build_and_test(test-log.cpp)
llama_build_and_test(test-regex-partial.cpp)

llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4)

# this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135)
if (NOT WIN32)
llama_build_and_test(test-arg-parser.cpp)
152 changes: 152 additions & 0 deletions tests/test-thread-safety.cpp
@@ -0,0 +1,152 @@
// thread safety test
// - Loads a copy of the same model on each GPU, plus one copy on the CPU and one with layer split
// - Creates n_parallel (--parallel) contexts per model
// - Runs inference in parallel on each context

#include <thread>
#include <vector>
#include <atomic>
#include "llama.h"
#include "arg.h"
#include "common.h"
#include "log.h"
#include "sampling.h"

int main(int argc, char ** argv) {
common_params params;

if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}

common_init();

llama_backend_init();
llama_numa_init(params.numa);

LOG_INF("%s\n", common_params_get_system_info(params).c_str());

//llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
// if (level == GGML_LOG_LEVEL_ERROR) {
// common_log_add(common_log_main(), level, "%s", text);
// }
//}, NULL);

auto cparams = common_context_params_to_llama(params);

int dev_count = ggml_backend_dev_count();
int gpu_dev_count = 0;
for (int i = 0; i < dev_count; ++i) {
auto * dev = ggml_backend_dev_get(i);
if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
gpu_dev_count++;
}
}
const int num_models = gpu_dev_count + 1 + 1; // GPUs + 1 CPU model + 1 layer split
//const int num_models = std::max(1, gpu_dev_count);
const int num_contexts = std::max(1, params.n_parallel);

std::vector<llama_model_ptr> models;
std::vector<std::thread> threads;
std::atomic<bool> failed = false;

for (int m = 0; m < num_models; ++m) {
auto mparams = common_model_params_to_llama(params);

if (m < gpu_dev_count) {
mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
mparams.main_gpu = m;
} else if (m == gpu_dev_count) {
mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
mparams.main_gpu = -1; // CPU model
Inline review comment (Contributor):

I personally think this line will bring an unexpected issue to a specific backend, because an unexpected issue introduced by test-thread-safety can be reproduced in my forked project; please refer to: zhouwg@5dba384

@qnixsynapse, sorry to bother you, just FYI: I'm not sure whether this is related to the weird issue in the ggml-sycl backend:

#5277 (comment)

#5277 (reply in thread)

} else {
mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;
}

llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
return 1;
}

models.emplace_back(model);
}

for (int m = 0; m < num_models; ++m) {
auto * model = models[m].get();
for (int c = 0; c < num_contexts; ++c) {
threads.emplace_back([&, m, c, model]() {
LOG_INF("Creating context %d/%d for model %d/%d\n", c + 1, num_contexts, m + 1, num_models);

llama_context_ptr ctx { llama_init_from_model(model, cparams) };
if (ctx == NULL) {
LOG_ERR("failed to create context\n");
failed.store(true);
return;
}

std::unique_ptr<common_sampler, decltype(&common_sampler_free)> sampler { common_sampler_init(model, params.sampling), common_sampler_free };
if (sampler == NULL) {
LOG_ERR("failed to create sampler\n");
failed.store(true);
return;
}

llama_batch batch = {};
{
auto prompt = common_tokenize(ctx.get(), params.prompt, true);
if (prompt.empty()) {
LOG_ERR("failed to tokenize prompt\n");
failed.store(true);
return;
}
batch = llama_batch_get_one(prompt.data(), prompt.size());
if (llama_decode(ctx.get(), batch)) {
LOG_ERR("failed to decode prompt\n");
failed.store(true);
return;
}
}

const auto * vocab = llama_model_get_vocab(model);
std::string result = params.prompt;

for (int i = 0; i < params.n_predict; i++) {
llama_token token;
if (batch.n_tokens > 0) {
token = common_sampler_sample(sampler.get(), ctx.get(), batch.n_tokens - 1);
} else {
token = llama_vocab_bos(vocab);
}

result += common_token_to_piece(ctx.get(), token);

if (llama_vocab_is_eog(vocab, token)) {
break;
}

batch = llama_batch_get_one(&token, 1);
if (llama_decode(ctx.get(), batch)) {
LOG_ERR("Model %d/%d, Context %d/%d: failed to decode\n", m + 1, num_models, c + 1, num_contexts);
failed.store(true);
return;
}
}

LOG_INF("Model %d/%d, Context %d/%d: %s\n\n", m + 1, num_models, c + 1, num_contexts, result.c_str());
});
}
}

for (auto & thread : threads) {
thread.join();
}

if (failed) {
LOG_ERR("One or more threads failed.\n");
return 1;
}

LOG_INF("All threads finished without errors.\n");
return 0;
}