Commit ff65f9a

CUDA: use 1 thread if model is fully offloaded
1 parent 71d6975 commit ff65f9a

6 files changed: +32 −4 lines changed

common/common.cpp

Lines changed: 9 additions & 0 deletions
@@ -726,6 +726,15 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         return std::make_tuple(nullptr, nullptr);
     }
 
+    if (params.n_threads == -2) {
+        params.n_threads = get_num_physical_cores();
+#ifdef GGML_USE_CUBLAS
+        if (params.n_gpu_layers >= llama_model_n_layer(model) + 3) {
+            params.n_threads = 1;
+        }
+#endif // GGML_USE_CUBLAS
+    }
+
     llama_context * lctx = llama_new_context_with_model(model, lparams);
     if (lctx == NULL) {
         fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
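
The rule added above resolves the new -2 default at model-init time rather than at argument-parsing time, since whether the model is fully offloaded is only known after the model has been loaded. Below is a minimal, self-contained sketch of the same rule for clarity; the helper name resolve_n_threads and the hard-coded core/layer counts are illustrative only, not part of this commit.

    // Illustrative sketch of the resolution rule; not part of the patch.
    #include <cstdio>

    static int resolve_n_threads(int requested, int n_gpu_layers, int n_layer,
                                 int physical_cores, bool cublas_build) {
        if (requested != -2) {
            return requested;             // an explicit -t value is left untouched
        }
        int n_threads = physical_cores;   // default: one thread per physical core
        if (cublas_build && n_gpu_layers >= n_layer + 3) {
            n_threads = 1;                // fully offloaded: the CPU only drives the GPU
        }
        return n_threads;
    }

    int main() {
        // 32-layer model on a machine with 8 physical cores (illustrative numbers)
        printf("%d\n", resolve_n_threads(-2,  0, 32, 8, true)); // 8: CPU inference
        printf("%d\n", resolve_n_threads(-2, 35, 32, 8, true)); // 1: fully offloaded
        printf("%d\n", resolve_n_threads( 4, 35, 32, 8, true)); // 4: explicit override
        return 0;
    }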

common/common.h

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ int32_t get_num_physical_cores();
 
 struct gpt_params {
     uint32_t seed = -1; // RNG seed
-    int32_t n_threads = get_num_physical_cores();
+    int32_t n_threads = -2; // -1 == num logical cores; -2 == num physical cores if at least 1 CPU layer, 1 otherwise
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 512; // context size
     int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)

examples/llama-bench/llama-bench.cpp

Lines changed: 16 additions & 3 deletions
@@ -153,7 +153,7 @@ static const cmd_params cmd_params_defaults = {
     /* n_gen        */ {128},
     /* n_batch      */ {512},
     /* f32_kv       */ {false},
-    /* n_threads    */ {get_num_physical_cores()},
+    /* n_threads    */ {-2},
     /* n_gpu_layers */ {99},
     /* main_gpu     */ {0},
     /* mul_mat_q    */ {true},
@@ -174,7 +174,7 @@ static void print_usage(int /* argc */, char ** argv) {
     fprintf(stdout, "  -n, --n-gen <n>             (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
     fprintf(stdout, "  -b, --batch-size <n>        (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
     fprintf(stdout, "  --memory-f32 <0|1>          (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
-    fprintf(stdout, "  -t, --threads <n>           (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
+    fprintf(stdout, "  -t, --threads <n>           (default: 1 if full GPU, %d otherwise)\n", get_num_physical_cores());
     fprintf(stdout, "  -ngl N, --n-gpu-layers <n>  (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     fprintf(stdout, "  -mg i, --main-gpu <n>       (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     fprintf(stdout, "  -lv, --low-vram <0|1>       (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
@@ -466,7 +466,20 @@ struct test {
         model_size = llama_model_size(lmodel);
         model_n_params = llama_model_n_params(lmodel);
         n_batch = inst.n_batch;
-        n_threads = inst.n_threads;
+
+        if (inst.n_threads == -1) {
+            n_threads = std::thread::hardware_concurrency();
+        } else if (inst.n_threads == -2) {
+            n_threads = get_num_physical_cores();
+#ifdef GGML_USE_CUBLAS
+            if (inst.n_gpu_layers >= llama_model_n_layer(lmodel) + 3) {
+                n_threads = 1;
+            }
+#endif // GGML_USE_CUBLAS
+        } else {
+            n_threads = inst.n_threads;
+        }
+
         f32_kv = inst.f32_kv;
         n_gpu_layers = inst.n_gpu_layers;
         main_gpu = inst.main_gpu;
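
In practice, each llama-bench test instance now derives its thread count from its own GPU-layer setting unless -t is given explicitly: a fully offloaded run is measured with a single CPU thread, while CPU-only or partially offloaded runs keep one thread per physical core. Illustrative invocations (the model path is a placeholder):

    ./llama-bench -m model.gguf -ngl 99        # fully offloaded: defaults to 1 thread
    ./llama-bench -m model.gguf -ngl 0 -t 8    # CPU run with an explicit thread count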

examples/simple/simple.cpp

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@
 
 int main(int argc, char ** argv) {
     gpt_params params;
+    params.n_threads = get_num_physical_cores();
 
     if (argc == 1 || argv[1][0] == '-') {
         printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);

llama.cpp

Lines changed: 4 additions & 0 deletions
@@ -5586,6 +5586,10 @@ int llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
 
+int llama_model_n_layer(const struct llama_model * model) {
+    return model->hparams.n_layer;
+}
+
 int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
             model->name.c_str(),

llama.h

Lines changed: 1 addition & 0 deletions
@@ -253,6 +253,7 @@ extern "C" {
     LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
     LLAMA_API int llama_model_n_ctx  (const struct llama_model * model);
     LLAMA_API int llama_model_n_embd (const struct llama_model * model);
+    LLAMA_API int llama_model_n_layer(const struct llama_model * model);
 
     // Get a string describing the model type
     LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
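
The new accessor also lets code outside of common.cpp apply the same fully-offloaded test against a loaded model. A minimal sketch, assuming the C API of this period (llama_backend_init, llama_context_default_params, llama_load_model_from_file, llama_free_model); the model path and n_gpu_layers value are placeholders:

    // Sketch: query the layer count of a loaded model and apply the commit's heuristic.
    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_backend_init(false /* numa */);

        struct llama_context_params cparams = llama_context_default_params();
        cparams.n_gpu_layers = 99; // request full offload (effective in CUDA builds)

        struct llama_model * model = llama_load_model_from_file("model.gguf", cparams);
        if (model == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        // same threshold as the commit: all repeating layers plus 3 non-repeating tensors
        const bool fully_offloaded = cparams.n_gpu_layers >= llama_model_n_layer(model) + 3;
        printf("n_layer = %d, fully offloaded: %s\n",
               llama_model_n_layer(model), fully_offloaded ? "yes" : "no");

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }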
