From 712b5d6344955621b6467bc1089f7614549ab60a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 20 Mar 2024 16:36:17 +0200
Subject: [PATCH 1/2] metal : require ne00 >= 128 for mat-mat kernels

ggml-ci
---
 examples/batched/batched.cpp | 2 +-
 ggml-metal.m                 | 4 ++--
 tests/test-backend-ops.cpp   | 7 +++++++
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index ee1f8f1bf5dd2..5fb2cb6035d68 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -78,7 +78,7 @@ int main(int argc, char ** argv) {
     llama_context_params ctx_params = llama_context_default_params();
 
     ctx_params.seed  = 1234;
-    ctx_params.n_ctx = n_kv_req;
+    ctx_params.n_ctx = n_kv_req;
     ctx_params.n_batch = std::max(n_len, n_parallel);
     ctx_params.n_seq_max = n_parallel;
     ctx_params.n_threads = params.n_threads;
diff --git a/ggml-metal.m b/ggml-metal.m
index 109e5fe6ba13d..e9598ddff1f0a 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1383,7 +1383,7 @@ static enum ggml_status ggml_metal_graph_compute(
                 !ggml_is_transposed(src0) &&
                 !ggml_is_transposed(src1) &&
                 src1t == GGML_TYPE_F32 &&
-                ne00 % 32 == 0 && ne00 >= 64 &&
+                ne00 % 32 == 0 && ne00 >= 128 &&
                 (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
                 //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
 
@@ -1698,7 +1698,7 @@ static enum ggml_status ggml_metal_graph_compute(
         // indirect matrix multiplication
        // !!!
        if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
-            ne20 % 32 == 0 && ne20 >= 64 &&
+            ne20 % 32 == 0 && ne20 >= 128 &&
            ne11 > ne11_mm_min) {
 
            id<MTLComputePipelineState> pipeline = nil;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index c2916c3e480e0..1998e1cbc4703 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2091,6 +2091,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         }
     }
 
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  64,  2, 128, { 8, 1}, {1, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  83,  2, 128, { 8, 1}, {4, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  64,  2,  64, { 8, 1}, {4, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  83,  2,  64, { 8, 1}, {4, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  64, 45, 128, { 8, 1}, {4, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45,  64, { 8, 1}, {4, 1}));
+
    for (ggml_type type_a : all_types) {
        for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
            for (int n_mats : {2, 4, 8}) {
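Note on the first patch: it tightens the condition under which the Metal backend dispatches its simdgroup matrix-matrix kernels — the shared dimension ne00 must be a multiple of 32 and now at least 128 (previously 64), otherwise the matrix-vector kernels are used — and adds test_mul_mat cases on both sides of that threshold. The snippet below is a minimal, simplified sketch of that gating logic; use_mat_mat_kernel is a hypothetical standalone helper, not ggml API, and it omits the quantized-src0 exception present in the real check:

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical sketch of the ggml-metal.m gating condition after patch 1:
// the simdgroup matrix-matrix path requires ne00 (the shared dimension) to be
// a multiple of 32 and at least 128, plus enough rows in src1; the real check
// additionally allows quantized src0 when ne12 > 1.
static bool use_mat_mat_kernel(int64_t ne00, int64_t ne11, int64_t ne11_mm_min) {
    return ne00 % 32 == 0 && ne00 >= 128 && ne11 > ne11_mm_min;
}

int main() {
    // shapes mirroring the new test_mul_mat cases: k = 128 stays on the
    // matrix-matrix path, k = 64 now falls back to the matrix-vector kernels
    assert( use_mat_mat_kernel(128, 45, 1));
    assert(!use_mat_mat_kernel( 64, 45, 1));
    return 0;
}
```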
From 1d6112bace46c383aab758f31df10441007fb70d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 21 Mar 2024 14:55:15 +0200
Subject: [PATCH 2/2] llama : pad n_ctx by 32

ggml-ci
---
 common/common.cpp            | 2 +-
 examples/batched/batched.cpp | 2 ++
 ggml-metal.m                 | 4 ++--
 llama.cpp                    | 3 +++
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 5f10718ece90c..85c8292e4a1f1 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -101,7 +101,7 @@ int32_t get_num_physical_cores() {
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }
 
-void process_escapes(std::string& input) {
+void process_escapes(std::string & input) {
     std::size_t input_len = input.length();
     std::size_t output_idx = 0;
 
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 5fb2cb6035d68..7aaf63ceb1a7c 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -48,6 +48,8 @@ int main(int argc, char ** argv) {
         params.prompt = "Hello my name is";
     }
 
+    process_escapes(params.prompt);
+
     // init LLM
 
     llama_backend_init();
diff --git a/ggml-metal.m b/ggml-metal.m
index e9598ddff1f0a..109e5fe6ba13d 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1383,7 +1383,7 @@ static enum ggml_status ggml_metal_graph_compute(
                 !ggml_is_transposed(src0) &&
                 !ggml_is_transposed(src1) &&
                 src1t == GGML_TYPE_F32 &&
-                ne00 % 32 == 0 && ne00 >= 128 &&
+                ne00 % 32 == 0 && ne00 >= 64 &&
                 (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
                 //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
 
@@ -1698,7 +1698,7 @@ static enum ggml_status ggml_metal_graph_compute(
         // indirect matrix multiplication
        // !!!
        if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
-            ne20 % 32 == 0 && ne20 >= 128 &&
+            ne20 % 32 == 0 && ne20 >= 64 &&
            ne11 > ne11_mm_min) {
 
            id<MTLComputePipelineState> pipeline = nil;
diff --git a/llama.cpp b/llama.cpp
index 1a9fe0c4d2cea..9de4a86022b1a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -13044,6 +13044,9 @@ struct llama_context * llama_new_context_with_model(
     cparams.rope_freq_base  = params.rope_freq_base  == 0.0f ? hparams.rope_freq_base_train  : params.rope_freq_base;
     cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
 
+    // this is necessary due to kv_self.n being padded later during inference
+    cparams.n_ctx = GGML_PAD(cparams.n_ctx, 32);
+
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
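Note on the second patch: it pads cparams.n_ctx up to a multiple of 32 (matching the padding applied to kv_self.n later during inference) and, with that guarantee in place, restores the Metal ne00 >= 64 threshold from before the first patch. Below is a minimal sketch of the round-up semantics of GGML_PAD(cparams.n_ctx, 32); pad_to_multiple is a hypothetical helper assuming the usual round-up-to-multiple behaviour, not the macro's actual definition:

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical helper illustrating what GGML_PAD(cparams.n_ctx, 32) is
// expected to do: round x up to the next multiple of n, leaving values that
// are already aligned unchanged.
static int64_t pad_to_multiple(int64_t x, int64_t n) {
    return ((x + n - 1) / n) * n;
}

int main() {
    assert(pad_to_multiple(4096, 32) == 4096); // already a multiple of 32: unchanged
    assert(pad_to_multiple(4097, 32) == 4128); // rounded up to the next multiple of 32
    assert(pad_to_multiple(  50, 32) ==   64); // small requested contexts are padded too
    return 0;
}
```

One consequence of this change is that the effective context size at runtime can be slightly larger than the value requested in llama_context_params, since it is rounded up to the next multiple of 32.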