From e31c8790ff8a8364f6b4fce8748e0bdd7baad6d2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Oct 2024 14:24:05 +0300 Subject: [PATCH 1/7] llama : deprecate softmax sampler + fix dist sampler ggml-ci --- common/sampling.cpp | 2 - .../llama.cpp.swift/LibLlama.swift | 1 - examples/save-load-state/save-load-state.cpp | 3 - examples/speculative/speculative.cpp | 3 - include/llama.h | 4 +- src/llama-sampling.cpp | 3 + tests/test-sampling.cpp | 107 +++++------------- 7 files changed, 35 insertions(+), 88 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 56cd0df6b81bc..f536c1e0ae667 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -203,7 +203,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co GGML_ASSERT(false && "unknown sampler type"); } } - llama_sampler_chain_add(result->chain, llama_sampler_init_softmax()); llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); } else if (params.mirostat == 1) { llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); @@ -222,7 +221,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co // the following will not produce exactly the same probs as applyging softmax to the full vocabulary, but // it is much faster, since we avoid sorting all tokens and should give a good approximation llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs)); - llama_sampler_chain_add(result->chain, llama_sampler_init_softmax()); } llama_sampler_chain_add(result->chain, llama_sampler_init_greedy()); } diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index dcd9803a2adc2..65cd4eb515c7f 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -46,7 +46,6 @@ actor LlamaContext { let sparams = llama_sampler_chain_default_params() 
self.sampling = llama_sampler_chain_init(sparams) llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4)) - llama_sampler_chain_add(self.sampling, llama_sampler_init_softmax()) llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234)) } diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 3866cfa27e13e..89d60ec2ec227 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -42,7 +42,6 @@ int main(int argc, char ** argv) { llama_sampler * smpl = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl, llama_sampler_init_softmax()); llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed)); // tokenize prompt @@ -96,7 +95,6 @@ int main(int argc, char ** argv) { llama_sampler * smpl2 = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl2, llama_sampler_init_softmax()); llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed)); printf("\nsecond run: %s", params.prompt.c_str()); @@ -156,7 +154,6 @@ int main(int argc, char ** argv) { llama_sampler * smpl3 = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl3, llama_sampler_init_softmax()); llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed)); printf("\nsingle seq run: %s", params.prompt.c_str()); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 5a7b3084fd7c5..df84af4a1083e 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -180,8 +180,6 @@ int main(int argc, char ** argv) { // target model sampling context (reuse the llama_context's sampling instance) struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams); - struct llama_sampler * softmax = llama_sampler_init_softmax(); - // draft sequence data std::vector drafts(n_seq_dft); @@ -624,7 +622,6 @@ int main(int argc, char ** 
argv) { common_sampler_free(drafts[s].smpl); } - llama_sampler_free(softmax); llama_batch_free(batch_dft); llama_free(ctx_tgt); diff --git a/include/llama.h b/include/llama.h index 02bc7f087c62b..2206ef27d0bc0 100644 --- a/include/llama.h +++ b/include/llama.h @@ -217,6 +217,7 @@ extern "C" { typedef struct llama_token_data_array { // TODO: consider SoA + // NOTE: this pointer can be modified by the samplers llama_token_data * data; size_t size; int64_t selected; // this is the index in the data array (i.e. not the token id) @@ -1086,7 +1087,8 @@ extern "C" { /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first. - LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void); + DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void), + "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)"); /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 2e655068272b8..af5117e88a241 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -427,6 +427,9 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl* static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_dist *) smpl->ctx; + + llama_sampler_softmax_impl(cur_p); + cur_p->selected = llama_sample_dist(cur_p, ctx->rng); } diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 1372bdf13f2f6..7868aaa7a0ddd 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -24,20 +24,22 @@ static void dump(const 
llama_token_data_array * cur_p) { llama_sampler_free(cnstr); \ } while(0) -static void test_top_k(const std::vector & probs, const std::vector & expected_probs, int k) { - const size_t n_vocab = probs.size(); +#define CUR_P_FROM_PROBS() \ + const size_t n_vocab = probs.size(); \ + std::vector cur; \ + cur.reserve(n_vocab); \ + for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { \ + const float logit = logf(probs[token_id]); \ + cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); \ + } \ + llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false } - std::vector cur; - cur.reserve(n_vocab); - for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - const float logit = logf(probs[token_id]); - cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); - } +static void test_top_k(const std::vector & probs, const std::vector & expected_probs, int k) { + CUR_P_FROM_PROBS(); - llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; - APPLY(llama_sampler_init_softmax(), &cur_p); DUMP(&cur_p); APPLY(llama_sampler_init_top_k(k), &cur_p); + APPLY(llama_sampler_init_dist (0), &cur_p); DUMP(&cur_p); GGML_ASSERT(cur_p.size == expected_probs.size()); @@ -47,19 +49,12 @@ static void test_top_k(const std::vector & probs, const std::vector & probs, const std::vector & expected_probs, float p) { - const size_t n_vocab = probs.size(); - - std::vector cur; - cur.reserve(n_vocab); - for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - const float logit = logf(probs[token_id]); - cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); - } + CUR_P_FROM_PROBS(); - llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; - APPLY(llama_sampler_init_softmax(), &cur_p); DUMP(&cur_p); APPLY(llama_sampler_init_top_p(p, 1), &cur_p); + APPLY(llama_sampler_init_dist (0), &cur_p); + DUMP(&cur_p); DUMP(&cur_p); GGML_ASSERT(cur_p.size == expected_probs.size()); @@ -69,16 +64,8 @@ 
static void test_top_p(const std::vector & probs, const std::vector & probs, const std::vector & expected_probs, float z) { - const size_t n_vocab = probs.size(); + CUR_P_FROM_PROBS(); - std::vector cur; - cur.reserve(n_vocab); - for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - const float logit = logf(probs[token_id]); - cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); - } - - llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; DUMP(&cur_p); APPLY(llama_sampler_init_tail_free(z, 1), &cur_p); DUMP(&cur_p); @@ -90,20 +77,12 @@ static void test_tfs(const std::vector & probs, const std::vector } static void test_min_p(const std::vector & probs, const std::vector & expected_probs, float p) { - const size_t n_vocab = probs.size(); - - std::vector cur; - cur.reserve(n_vocab); - for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - const float logit = logf(probs[token_id]); - cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); - } + CUR_P_FROM_PROBS(); - llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; DUMP(&cur_p); APPLY(llama_sampler_init_min_p(p, 1), &cur_p); + APPLY(llama_sampler_init_dist (0), &cur_p); DUMP(&cur_p); - APPLY(llama_sampler_init_softmax(), &cur_p); GGML_ASSERT(cur_p.size == expected_probs.size()); for (size_t i = 0; i < cur_p.size; i++) { @@ -112,17 +91,8 @@ static void test_min_p(const std::vector & probs, const std::vector & probs, const std::vector & expected_probs, float p, float t) { - const size_t n_vocab = probs.size(); + CUR_P_FROM_PROBS(); - std::vector cur; - cur.reserve(n_vocab); - for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - const float logit = logf(probs[token_id]); - cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); - } - - llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; - APPLY(llama_sampler_init_softmax(), &cur_p); DUMP(&cur_p); APPLY(llama_sampler_init_xtc(p, 
t, 0, 0), &cur_p); DUMP(&cur_p); @@ -134,16 +104,8 @@ static void test_xtc(const std::vector & probs, const std::vector } static void test_typical(const std::vector & probs, const std::vector & expected_probs, float p) { - const size_t n_vocab = probs.size(); - - std::vector cur; - cur.reserve(n_vocab); - for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - const float logit = logf(probs[token_id]); - cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); - } + CUR_P_FROM_PROBS(); - llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; DUMP(&cur_p); APPLY(llama_sampler_init_typical(p, 1), &cur_p); DUMP(&cur_p); @@ -160,16 +122,7 @@ static void test_penalties( ) { GGML_ASSERT(probs.size() == expected_probs.size()); - const size_t n_vocab = probs.size(); - - std::vector cur; - cur.reserve(n_vocab); - for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - const float logit = logf(probs[token_id]); - cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); - } - - llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; + CUR_P_FROM_PROBS(); auto * sampler = llama_sampler_init_penalties(n_vocab, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence, false, false); @@ -177,10 +130,9 @@ static void test_penalties( llama_sampler_accept(sampler, last_tokens[i]); } - APPLY(llama_sampler_init_softmax(), &cur_p); DUMP(&cur_p); APPLY(sampler, &cur_p); - APPLY(llama_sampler_init_softmax(), &cur_p); + APPLY(llama_sampler_init_dist(0), &cur_p); DUMP(&cur_p); GGML_ASSERT(cur_p.size == expected_probs.size()); @@ -214,7 +166,7 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler default : GGML_ABORT("Unknown sampler"); } - APPLY(llama_sampler_init_softmax(), &cur_p); // make sure tokens are sorted for tests + APPLY(llama_sampler_init_dist(0), &cur_p); const int size = cur_p.size; @@ -307,21 +259,20 @@ static void 
test_perf() { BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32); BENCH(llama_sampler_init_typical (0.5f, 1), data, 32); BENCH(llama_sampler_init_xtc (1.0f, 0.1f, 1, 1), data, 32); - BENCH(llama_sampler_init_softmax (), data, 32); } int main(void) { ggml_time_init(); - test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1); - test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3); + test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 1); + test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 3); test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4); test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0); - test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0); - test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f); - test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f); - test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1); + test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 0); + test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.571429f, 0.428571f}, 0.7f); + test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 0.8f); + test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f); test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.00f); test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.24f); From 33a69ec742fb469b188a382bd49f1595eaab1482 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 17 Oct 2024 17:51:54 +0300 Subject: [PATCH 2/7] tests : replace macros with functions ggml-ci --- tests/test-sampling.cpp | 206 +++++++++++++++++++++------------------- 1 file changed, 108 insertions(+), 98 deletions(-) diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 7868aaa7a0ddd..df62c8bec89e6 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -18,155 +18,165 @@ static void dump(const llama_token_data_array * cur_p) { #define DUMP(__cur_p) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); 
dump((__cur_p)); printf("-\n"); } while(0) -#define APPLY(__cnstr, __cur_p) do { \ - auto * cnstr = (__cnstr); \ - llama_sampler_apply(cnstr, (__cur_p)); \ - llama_sampler_free(cnstr); \ -} while(0) - -#define CUR_P_FROM_PROBS() \ - const size_t n_vocab = probs.size(); \ - std::vector cur; \ - cur.reserve(n_vocab); \ - for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { \ - const float logit = logf(probs[token_id]); \ - cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); \ - } \ - llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false } - -static void test_top_k(const std::vector & probs, const std::vector & expected_probs, int k) { - CUR_P_FROM_PROBS(); - - DUMP(&cur_p); - APPLY(llama_sampler_init_top_k(k), &cur_p); - APPLY(llama_sampler_init_dist (0), &cur_p); - DUMP(&cur_p); - - GGML_ASSERT(cur_p.size == expected_probs.size()); - for (size_t i = 0; i < cur_p.size; i++) { - GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5); +struct sampler_tester { + sampler_tester(size_t n_vocab) { + cur.reserve(n_vocab); + for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { + const float logit = logf(token_id); + cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); + } + + cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false }; } -} -static void test_top_p(const std::vector & probs, const std::vector & expected_probs, float p) { - CUR_P_FROM_PROBS(); + sampler_tester(const std::vector & probs, const std::vector & probs_expected) : probs_expected(probs_expected) { + cur.reserve(probs.size()); + for (llama_token token_id = 0; token_id < (llama_token)probs.size(); token_id++) { + const float logit = logf(probs[token_id]); + cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); + } - DUMP(&cur_p); - APPLY(llama_sampler_init_top_p(p, 1), &cur_p); - APPLY(llama_sampler_init_dist (0), &cur_p); - DUMP(&cur_p); - DUMP(&cur_p); + cur_p = llama_token_data_array { cur.data(), 
cur.size(), -1, false }; + } - GGML_ASSERT(cur_p.size == expected_probs.size()); - for (size_t i = 0; i < cur_p.size; i++) { - GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3); + void apply(llama_sampler * sampler) { + llama_sampler_apply(sampler, &cur_p); + llama_sampler_free(sampler); } + + void check() { + GGML_ASSERT(cur_p.size == probs_expected.size()); + for (size_t i = 0; i < cur_p.size; i++) { + GGML_ASSERT(fabs(cur_p.data[i].p - probs_expected[i]) < 1e-5); + } + } + + llama_token_data_array cur_p; + +private: + const std::vector probs_expected; + + std::vector cur; +}; + +static void test_temp(const std::vector & probs, const std::vector & probs_expected, float temp) { + sampler_tester tester(probs, probs_expected); + + DUMP(&tester.cur_p); + tester.apply(llama_sampler_init_temp(temp)); + tester.apply(llama_sampler_init_dist(0)); + DUMP(&tester.cur_p); + + tester.check(); } -static void test_tfs(const std::vector & probs, const std::vector & expected_probs, float z) { - CUR_P_FROM_PROBS(); +static void test_top_k(const std::vector & probs, const std::vector & probs_expected, int k) { + sampler_tester tester(probs, probs_expected); - DUMP(&cur_p); - APPLY(llama_sampler_init_tail_free(z, 1), &cur_p); - DUMP(&cur_p); + DUMP(&tester.cur_p); + tester.apply(llama_sampler_init_top_k(k)); + tester.apply(llama_sampler_init_dist (0)); + DUMP(&tester.cur_p); - GGML_ASSERT(cur_p.size == expected_probs.size()); - for (size_t i = 0; i < cur_p.size; i++) { - GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3); - } + tester.check(); } -static void test_min_p(const std::vector & probs, const std::vector & expected_probs, float p) { - CUR_P_FROM_PROBS(); +static void test_top_p(const std::vector & probs, const std::vector & probs_expected, float p) { + sampler_tester tester(probs, probs_expected); - DUMP(&cur_p); - APPLY(llama_sampler_init_min_p(p, 1), &cur_p); - APPLY(llama_sampler_init_dist (0), &cur_p); - DUMP(&cur_p); + DUMP(&tester.cur_p); + 
tester.apply(llama_sampler_init_top_p(p, 1)); + tester.apply(llama_sampler_init_dist (0)); + DUMP(&tester.cur_p); - GGML_ASSERT(cur_p.size == expected_probs.size()); - for (size_t i = 0; i < cur_p.size; i++) { - GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3); - } + tester.check(); } -static void test_xtc(const std::vector & probs, const std::vector & expected_probs, float p, float t) { - CUR_P_FROM_PROBS(); +static void test_tfs(const std::vector & probs, const std::vector & probs_expected, float z) { + sampler_tester tester(probs, probs_expected); - DUMP(&cur_p); - APPLY(llama_sampler_init_xtc(p, t, 0, 0), &cur_p); - DUMP(&cur_p); + DUMP(&tester.cur_p); + tester.apply(llama_sampler_init_tail_free(z, 1)); + DUMP(&tester.cur_p); - GGML_ASSERT(cur_p.size == expected_probs.size()); - for (size_t i = 0; i < cur_p.size; i++) { - GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5); - } + tester.check(); } -static void test_typical(const std::vector & probs, const std::vector & expected_probs, float p) { - CUR_P_FROM_PROBS(); +static void test_min_p(const std::vector & probs, const std::vector & probs_expected, float p) { + sampler_tester tester(probs, probs_expected); - DUMP(&cur_p); - APPLY(llama_sampler_init_typical(p, 1), &cur_p); - DUMP(&cur_p); + DUMP(&tester.cur_p); + tester.apply(llama_sampler_init_min_p(p, 1)); + tester.apply(llama_sampler_init_dist (0)); + DUMP(&tester.cur_p); - GGML_ASSERT(cur_p.size == expected_probs.size()); - for (size_t i = 0; i < cur_p.size; i++) { - GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3); - } + tester.check(); +} + +static void test_xtc(const std::vector & probs, const std::vector & probs_expected, float p, float t) { + sampler_tester tester(probs, probs_expected); + + DUMP(&tester.cur_p); + tester.apply(llama_sampler_init_xtc(p, t, 0, 0)); + DUMP(&tester.cur_p); + + tester.check(); +} + +static void test_typical(const std::vector & probs, const std::vector & probs_expected, float p) { + 
sampler_tester tester(probs, probs_expected); + + DUMP(&tester.cur_p); + tester.apply(llama_sampler_init_typical(p, 1)); + DUMP(&tester.cur_p); + + tester.check(); } static void test_penalties( const std::vector & probs, const std::vector & last_tokens, - const std::vector & expected_probs, float repeat_penalty, float alpha_frequency, float alpha_presence + const std::vector & probs_expected, float repeat_penalty, float alpha_frequency, float alpha_presence ) { - GGML_ASSERT(probs.size() == expected_probs.size()); + GGML_ASSERT(probs.size() == probs_expected.size()); - CUR_P_FROM_PROBS(); + sampler_tester tester(probs, probs_expected); + const size_t n_vocab = probs.size(); auto * sampler = llama_sampler_init_penalties(n_vocab, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence, false, false); for (size_t i = 0; i < last_tokens.size(); i++) { llama_sampler_accept(sampler, last_tokens[i]); } - DUMP(&cur_p); - APPLY(sampler, &cur_p); - APPLY(llama_sampler_init_dist(0), &cur_p); - DUMP(&cur_p); + DUMP(&tester.cur_p); + tester.apply(sampler); + tester.apply(llama_sampler_init_dist(0)); + DUMP(&tester.cur_p); - GGML_ASSERT(cur_p.size == expected_probs.size()); - for (size_t i = 0; i < cur_p.size; i++) { - GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3); - } + tester.check(); } static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p ) { - std::vector cur; - cur.reserve(n_vocab); - for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - const float logit = logf(token_id); - cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); - } - - llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; + sampler_tester tester(n_vocab); llama_token min_token_id = 0; const llama_token max_token_id = n_vocab-1; for (auto s : samplers_sequence) { switch (s){ - case 'k': 
APPLY(llama_sampler_init_top_k(top_k), &cur_p); break; + case 'k': tester.apply(llama_sampler_init_top_k(top_k)); break; case 'f': GGML_ABORT("tail_free test not implemented"); case 'y': GGML_ABORT("typical test not implemented"); - case 'p': APPLY(llama_sampler_init_top_p(top_p, 1), &cur_p); break; - case 'm': APPLY(llama_sampler_init_min_p(min_p, 1), &cur_p); break; + case 'p': tester.apply(llama_sampler_init_top_p(top_p, 1)); break; + case 'm': tester.apply(llama_sampler_init_min_p(min_p, 1)); break; case 't': GGML_ABORT("temperature test not implemented"); default : GGML_ABORT("Unknown sampler"); } - APPLY(llama_sampler_init_dist(0), &cur_p); + tester.apply(llama_sampler_init_dist(0)); + + auto & cur_p = tester.cur_p; const int size = cur_p.size; From cb75bebcad8b4f06bf4a03c23a5d9ad1d625ae7d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 17 Oct 2024 17:19:23 +0300 Subject: [PATCH 3/7] sampling : change temperature sampler logic For t <= 0.0f, keep the max logit intact and set the rest to -inf --- common/sampling.cpp | 3 ++- include/llama.h | 6 ++++-- src/llama-sampling.cpp | 23 +++++++++++++++++++++++ tests/test-sampling.cpp | 3 +++ 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index f536c1e0ae667..8d9a39ef052b8 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -171,7 +171,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co params.penalize_nl, params.ignore_eos)); - if (params.temp > 0.0f) { + if (params.temp >= 0.0f) { if (params.mirostat == 0) { for (const auto & cnstr : params.samplers) { switch (cnstr) { @@ -214,6 +214,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co GGML_ASSERT(false && "unknown mirostat version"); } } else { + // negative temperatures will trigger "greedy" sampling: simply take the most likely token each time if (params.n_probs > 0) { // some use cases require to sample greedily, but 
still obtain the probabilities of the top tokens // ref: https://github.com/ggerganov/llama.cpp/pull/9605 diff --git a/include/llama.h b/include/llama.h index 2206ef27d0bc0..581469034a862 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1082,8 +1082,8 @@ extern "C" { // available samplers: - LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void); - LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed); + LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void); + LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed); /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first. @@ -1104,6 +1104,8 @@ extern "C" { /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep); + + /// @details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at its original value, the rest are set to -inf LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t); /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772. 
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index af5117e88a241..fb6668facd0a3 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -915,6 +915,28 @@ static const char * llama_sampler_temp_name(const struct llama_sampler * /*smpl* static void llama_sampler_temp_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { const auto * ctx = (llama_sampler_temp *) smpl->ctx; + + if (ctx->temp <= 0.0f) { + // find the token with the highest logit and set the rest to -inf + llama_token max_id = cur_p->data[0].id; + float max_logit = cur_p->data[0].logit; + + for (size_t i = 1; i < cur_p->size; ++i) { + if (cur_p->data[i].logit > max_logit) { + max_id = cur_p->data[i].id; + max_logit = cur_p->data[i].logit; + } + } + + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].id != max_id) { + cur_p->data[i].logit = -INFINITY; + } + } + + return; + } + for (size_t i = 0; i < cur_p->size; ++i) { cur_p->data[i].logit /= ctx->temp; } @@ -964,6 +986,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke if (ctx->delta > 0) { const float min_temp = std::max(0.0f, ctx->temp - ctx->delta); const float max_temp = ctx->temp + ctx->delta; + float exponent_val = ctx->exponent; // no need to do anything if there is only one (or zero) candidates diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index df62c8bec89e6..8960ced8f0ea1 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -274,6 +274,9 @@ static void test_perf() { int main(void) { ggml_time_init(); + test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f); + test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f); + test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 1); test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 3); test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4); From 57fb835e5ba14772678315e9b0cefc8f9e499b74 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov 
Date: Thu, 17 Oct 2024 18:09:57 +0300 Subject: [PATCH 4/7] cont : no need for special "greedy" logic top-k == 1 is the same --- common/sampling.cpp | 87 +++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 50 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 8d9a39ef052b8..4ab3eface3384 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -171,59 +171,46 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co params.penalize_nl, params.ignore_eos)); - if (params.temp >= 0.0f) { - if (params.mirostat == 0) { - for (const auto & cnstr : params.samplers) { - switch (cnstr) { - case COMMON_SAMPLER_TYPE_TOP_K: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); - break; - case COMMON_SAMPLER_TYPE_TOP_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); - break; - case COMMON_SAMPLER_TYPE_MIN_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); - break; - case COMMON_SAMPLER_TYPE_XTC: - llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); - break; - case COMMON_SAMPLER_TYPE_TFS_Z: - llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep)); - break; - case COMMON_SAMPLER_TYPE_TYPICAL_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); - break; - case COMMON_SAMPLER_TYPE_TEMPERATURE: - llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); - break; - case COMMON_SAMPLER_TYPE_INFILL: - llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model)); - break; - default: - GGML_ASSERT(false && "unknown sampler type"); - } + if (params.mirostat == 0) { + for (const auto & cnstr : params.samplers) 
{ + switch (cnstr) { + case COMMON_SAMPLER_TYPE_TOP_K: + llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); + break; + case COMMON_SAMPLER_TYPE_TOP_P: + llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); + break; + case COMMON_SAMPLER_TYPE_MIN_P: + llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); + break; + case COMMON_SAMPLER_TYPE_XTC: + llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); + break; + case COMMON_SAMPLER_TYPE_TFS_Z: + llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep)); + break; + case COMMON_SAMPLER_TYPE_TYPICAL_P: + llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); + break; + case COMMON_SAMPLER_TYPE_TEMPERATURE: + llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); + break; + case COMMON_SAMPLER_TYPE_INFILL: + llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model)); + break; + default: + GGML_ASSERT(false && "unknown sampler type"); } - llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); - } else if (params.mirostat == 1) { - llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); - llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); - } else if (params.mirostat == 2) { - llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); - llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); - } else { - GGML_ASSERT(false && "unknown mirostat version"); } + llama_sampler_chain_add(result->chain, 
llama_sampler_init_dist(params.seed)); + } else if (params.mirostat == 1) { + llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); + llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); + } else if (params.mirostat == 2) { + llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); + llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); } else { - // negative temperatures will trigger "greedy" sampling: simply take the most likely token each time - if (params.n_probs > 0) { - // some use cases require to sample greedily, but still obtain the probabilities of the top tokens - // ref: https://github.com/ggerganov/llama.cpp/pull/9605 - // - // the following will not produce exactly the same probs as applyging softmax to the full vocabulary, but - // it is much faster, since we avoid sorting all tokens and should give a good approximation - llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs)); - } - llama_sampler_chain_add(result->chain, llama_sampler_init_greedy()); + GGML_ASSERT(false && "unknown mirostat version"); } return result; From cd978508acfda216d2c946b2b5d5b532b5ab6be3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 17 Oct 2024 18:23:02 +0300 Subject: [PATCH 5/7] tests : init prob correctly --- tests/test-sampling.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 8960ced8f0ea1..e9dada79598ff 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -33,7 +33,7 @@ struct sampler_tester { cur.reserve(probs.size()); for (llama_token token_id = 0; token_id < (llama_token)probs.size(); token_id++) { const float logit = logf(probs[token_id]); - cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); + 
cur.emplace_back(llama_token_data{token_id, logit, probs[token_id]}); } cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false }; From 4a5b5870f191aa8fd938046e0fba3de8dc3c2279 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 17 Oct 2024 22:53:22 +0300 Subject: [PATCH 6/7] llama : handle temp <= 0.0 in the temp_ext sampler too ggml-ci --- src/llama-sampling.cpp | 60 ++++++++++++++++++++--------------------- tests/test-sampling.cpp | 14 ++++++++++ 2 files changed, 44 insertions(+), 30 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index fb6668facd0a3..3b2dcfbfc50eb 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -63,6 +63,33 @@ static void llama_log_softmax(float * array, size_t size) { } */ +static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp) { + if (temp <= 0.0f) { + // find the token with the highest logit and set the rest to -inf + llama_token max_id = cur_p->data[0].id; + float max_logit = cur_p->data[0].logit; + + for (size_t i = 1; i < cur_p->size; ++i) { + if (cur_p->data[i].logit > max_logit) { + max_id = cur_p->data[i].id; + max_logit = cur_p->data[i].logit; + } + } + + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].id != max_id) { + cur_p->data[i].logit = -INFINITY; + } + } + + return; + } + + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].logit /= temp; + } +} + static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) { GGML_ASSERT(cur_p->size > 0); @@ -916,30 +943,7 @@ static const char * llama_sampler_temp_name(const struct llama_sampler * /*smpl* static void llama_sampler_temp_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { const auto * ctx = (llama_sampler_temp *) smpl->ctx; - if (ctx->temp <= 0.0f) { - // find the token with the highest logit and set the rest to -inf - llama_token max_id = cur_p->data[0].id; - float max_logit = cur_p->data[0].logit; - - for (size_t i = 1; i < cur_p->size; 
++i) { - if (cur_p->data[i].logit > max_logit) { - max_id = cur_p->data[i].id; - max_logit = cur_p->data[i].logit; - } - } - - for (size_t i = 0; i < cur_p->size; ++i) { - if (cur_p->data[i].id != max_id) { - cur_p->data[i].logit = -INFINITY; - } - } - - return; - } - - for (size_t i = 0; i < cur_p->size; ++i) { - cur_p->data[i].logit /= ctx->temp; - } + llama_sampler_temp_impl(cur_p, ctx->temp); } static struct llama_sampler * llama_sampler_temp_clone(const struct llama_sampler * smpl) { @@ -1024,9 +1028,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke #endif // Apply the dynamically calculated temperature scaling - for (size_t i = 0; i < cur_p->size; ++i) { - cur_p->data[i].logit /= dyn_temp; - } + llama_sampler_temp_impl(cur_p, dyn_temp); // Re-compute softmax probabilities after scaling logits with dynamic temperature const double max_l_double = cur_p->data[0].logit; @@ -1050,9 +1052,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke } #endif } else { - for (size_t i = 0; i < cur_p->size; ++i) { - cur_p->data[i].logit /= ctx->temp; - } + llama_sampler_temp_impl(cur_p, ctx->temp); } } diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index e9dada79598ff..05600e6f54e90 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -70,6 +70,17 @@ static void test_temp(const std::vector & probs, const std::vector tester.check(); } +static void test_temp_ext(const std::vector & probs, const std::vector & probs_expected, float temp, float delta, float exponent) { + sampler_tester tester(probs, probs_expected); + + DUMP(&tester.cur_p); + tester.apply(llama_sampler_init_temp_ext(temp, delta, exponent)); + tester.apply(llama_sampler_init_dist (0)); + DUMP(&tester.cur_p); + + tester.check(); +} + static void test_top_k(const std::vector & probs, const std::vector & probs_expected, int k) { sampler_tester tester(probs, probs_expected); @@ -277,6 +288,9 @@ int main(void) { 
test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f); test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f); + test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f, 0.0f, 1.0f); + test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f, 0.0f, 1.0f); + test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 1); test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 3); test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4); From 06159898e10c6281aeca45fb65398d94f532a887 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 18 Oct 2024 15:58:52 +0300 Subject: [PATCH 7/7] cont : avoid extra loop in temperature sampler for sub-zero temp ggml-ci --- src/llama-sampling.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 3b2dcfbfc50eb..29852ddf398d6 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -66,18 +66,15 @@ static void llama_log_softmax(float * array, size_t size) { static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp) { if (temp <= 0.0f) { // find the token with the highest logit and set the rest to -inf - llama_token max_id = cur_p->data[0].id; - float max_logit = cur_p->data[0].logit; + size_t max_i = 0; + float max_l = cur_p->data[0].logit; for (size_t i = 1; i < cur_p->size; ++i) { - if (cur_p->data[i].logit > max_logit) { - max_id = cur_p->data[i].id; - max_logit = cur_p->data[i].logit; - } - } - - for (size_t i = 0; i < cur_p->size; ++i) { - if (cur_p->data[i].id != max_id) { + if (cur_p->data[i ].logit > max_l) { + cur_p->data[max_i].logit = -INFINITY; + max_i = i; + max_l = cur_p->data[i].logit; + } else { cur_p->data[i].logit = -INFINITY; } }