Commit ff65f9a

CUDA: use 1 thread if model is fully offloaded
1 parent 71d6975 commit ff65f9a

6 files changed: +32 −4 lines changed

common/common.cpp

Lines changed: 9 additions & 0 deletions
@@ -726,6 +726,15 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         return std::make_tuple(nullptr, nullptr);
     }
 
+    if (params.n_threads == -2) {
+        params.n_threads = get_num_physical_cores();
+#ifdef GGML_USE_CUBLAS
+        if (params.n_gpu_layers >= llama_model_n_layer(model) + 3) {
+            params.n_threads = 1;
+        }
+#endif // GGML_USE_CUBLAS
+    }
+
     llama_context * lctx = llama_new_context_with_model(model, lparams);
     if (lctx == NULL) {
         fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
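
The rule added above resolves the new -2 default at model-init time rather than at argument-parsing time, since whether the model is fully offloaded is only known after the model has been loaded. Below is a minimal, self-contained sketch of the same rule for clarity; the helper name resolve_n_threads and the hard-coded core/layer counts are illustrative only, not part of this commit.

    // Illustrative sketch of the resolution rule; not part of the patch.
    #include <cstdio>

    static int resolve_n_threads(int requested, int n_gpu_layers, int n_layer,
                                 int physical_cores, bool cublas_build) {
        if (requested != -2) {
            return requested;             // an explicit -t value is left untouched
        }
        int n_threads = physical_cores;   // default: one thread per physical core
        if (cublas_build && n_gpu_layers >= n_layer + 3) {
            n_threads = 1;                // fully offloaded: the CPU only drives the GPU
        }
        return n_threads;
    }

    int main() {
        // 32-layer model on a machine with 8 physical cores (illustrative numbers)
        printf("%d\n", resolve_n_threads(-2,  0, 32, 8, true)); // 8: CPU inference
        printf("%d\n", resolve_n_threads(-2, 35, 32, 8, true)); // 1: fully offloaded
        printf("%d\n", resolve_n_threads( 4, 35, 32, 8, true)); // 4: explicit override
        return 0;
    }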

common/common.h

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ int32_t get_num_physical_cores();
 
 struct gpt_params {
     uint32_t seed = -1; // RNG seed
-    int32_t n_threads = get_num_physical_cores();
+    int32_t n_threads = -2; // -1 == num logical cores; -2 == num physical cores if at least 1 CPU layer, 1 otherwise
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 512; // context size
     int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)

examples/llama-bench/llama-bench.cpp

Lines changed: 16 additions & 3 deletions
@@ -153,7 +153,7 @@ static const cmd_params cmd_params_defaults = {
     /* n_gen        */ {128},
     /* n_batch      */ {512},
     /* f32_kv       */ {false},
-    /* n_threads    */ {get_num_physical_cores()},
+    /* n_threads    */ {-2},
     /* n_gpu_layers */ {99},
     /* main_gpu     */ {0},
     /* mul_mat_q    */ {true},
@@ -174,7 +174,7 @@ static void print_usage(int /* argc */, char ** argv) {
     fprintf(stdout, "  -n, --n-gen <n>             (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
     fprintf(stdout, "  -b, --batch-size <n>        (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
     fprintf(stdout, "  --memory-f32 <0|1>          (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
-    fprintf(stdout, "  -t, --threads <n>           (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
+    fprintf(stdout, "  -t, --threads <n>           (default: 1 if full GPU, %d otherwise)\n", get_num_physical_cores());
     fprintf(stdout, "  -ngl N, --n-gpu-layers <n>  (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     fprintf(stdout, "  -mg i, --main-gpu <n>       (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     fprintf(stdout, "  -lv, --low-vram <0|1>       (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
@@ -466,7 +466,20 @@ struct test {
         model_size = llama_model_size(lmodel);
         model_n_params = llama_model_n_params(lmodel);
         n_batch = inst.n_batch;
-        n_threads = inst.n_threads;
+
+        if (inst.n_threads == -1) {
+            n_threads = std::thread::hardware_concurrency();
+        } else if (inst.n_threads == -2) {
+            n_threads = get_num_physical_cores();
+#ifdef GGML_USE_CUBLAS
+            if (inst.n_gpu_layers >= llama_model_n_layer(lmodel) + 3) {
+                n_threads = 1;
+            }
+#endif // GGML_USE_CUBLAS
+        } else {
+            n_threads = inst.n_threads;
+        }
+
         f32_kv = inst.f32_kv;
         n_gpu_layers = inst.n_gpu_layers;
         main_gpu = inst.main_gpu;
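
In practice, each llama-bench test instance now derives its thread count from its own GPU-layer setting unless -t is given explicitly: a fully offloaded run is measured with a single CPU thread, while CPU-only or partially offloaded runs keep one thread per physical core. Illustrative invocations (the model path is a placeholder):

    ./llama-bench -m model.gguf -ngl 99        # fully offloaded: defaults to 1 thread
    ./llama-bench -m model.gguf -ngl 0 -t 8    # CPU run with an explicit thread count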

examples/simple/simple.cpp

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@
 
 int main(int argc, char ** argv) {
     gpt_params params;
+    params.n_threads = get_num_physical_cores();
 
     if (argc == 1 || argv[1][0] == '-') {
         printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);

llama.cpp

Lines changed: 4 additions & 0 deletions
@@ -5586,6 +5586,10 @@ int llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
 
+int llama_model_n_layer(const struct llama_model * model) {
+    return model->hparams.n_layer;
+}
+
 int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
             model->name.c_str(),

llama.h

Lines changed: 1 addition & 0 deletions
@@ -253,6 +253,7 @@ extern "C" {
     LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
     LLAMA_API int llama_model_n_ctx  (const struct llama_model * model);
     LLAMA_API int llama_model_n_embd (const struct llama_model * model);
+    LLAMA_API int llama_model_n_layer(const struct llama_model * model);
 
     // Get a string describing the model type
     LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
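
The new accessor also lets code outside of common.cpp apply the same fully-offloaded test against a loaded model. A minimal sketch, assuming the C API of this period (llama_backend_init, llama_context_default_params, llama_load_model_from_file, llama_free_model); the model path and n_gpu_layers value are placeholders:

    // Sketch: query the layer count of a loaded model and apply the commit's heuristic.
    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_backend_init(false /* numa */);

        struct llama_context_params cparams = llama_context_default_params();
        cparams.n_gpu_layers = 99; // request full offload (effective in CUDA builds)

        struct llama_model * model = llama_load_model_from_file("model.gguf", cparams);
        if (model == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        // same threshold as the commit: all repeating layers plus 3 non-repeating tensors
        const bool fully_offloaded = cparams.n_gpu_layers >= llama_model_n_layer(model) + 3;
        printf("n_layer = %d, fully offloaded: %s\n",
               llama_model_n_layer(model), fully_offloaded ? "yes" : "no");

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }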
