diff --git a/llama.cpp b/llama.cpp index 8c906a22f0ba9..c3c94347e2f67 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8001,10 +8001,16 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; }; - if (k == (int) candidates->size) { + if (k >= (int) (3*candidates->size /4)) { std::sort(candidates->data, candidates->data + candidates->size, comp); } else { - std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp); + if (k > 3000) { + // TODO: this threshold needs a closer look and benchmarks on multiple platforms. On a 13th-gen Intel i7 with VC compilers, nth_element + sort and partial_sort perform equally at top-k ~2500; below that, partial_sort is faster. + std::nth_element(candidates->data, candidates->data + k, candidates->data + candidates->size, comp); // partition so the k highest-logit candidates come first (unordered) + std::sort(candidates->data, candidates->data + k, comp); // sort the top-k partition by descending logit + } else { + std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp); + } } candidates->sorted = true; }