From d6f35c7ca52939a05d1eed5ddee83a0fc7c17639 Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Mon, 9 Oct 2023 18:54:16 -0600 Subject: [PATCH 1/6] Layer skipping demo --- examples/perplexity/perplexity.cpp | 130 ++++++++++++++++++++++++++++- llama.cpp | 79 +++++++++++------- llama.h | 1 + 3 files changed, 179 insertions(+), 31 deletions(-) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 7d0038bd40757..7559c02873a07 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -2,6 +2,7 @@ #include "common.h" #include "llama.h" +#include #include #include #include @@ -320,6 +321,31 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_batch = params.n_batch; + llama_batch batch = llama_batch_get_one(NULL, 0, 0, 0); + + const int32_t n_layers = 32; // model layer count + const int test_count = 6; // num perplexity chunks to run for each test + const size_t prune_target = 4; // prune this many of the worst results each pass + // end tunables + + // 1 = attn, 2 = mlp, 3 = both + int32_t test_skip_type = 0; // but don't mess with this, it's set automatically. + std::vector layers; + layers.resize(n_layers + 1); + std::fill(layers.begin(), layers.end(), 0); + batch.run_layers = layers.data(); + int32_t skip_layer = -1; + std::vector skips; + std::vector skip_types; + skip_types.resize(n_layers); + std::fill(skip_types.begin(), skip_types.end(), 0); + std::vector> pass_results; + std::vector worsts; + worsts.resize(n_layers); + std::fill(worsts.begin(), worsts.end(), 0); + int32_t curr_best_layer = -1, curr_best_type = 0; + double curr_best_ppl = -1, ref_ppl = -1; + int count = 0; double nll = 0.0; double nll2 = 0.0; @@ -327,8 +353,88 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); std::vector workers(std::thread::hardware_concurrency() - 1); + static const char * label = "?AMB"; + auto test_t_start = std::chrono::high_resolution_clock::now(); for (int i = 0; i < n_chunk; ++i) { + if (i > 0 && i % test_count == 0) { + auto test_t_end = std::chrono::high_resolution_clock::now(); + float test_t_total = std::chrono::duration(test_t_end - test_t_start).count(); + + skip_layer = n_layers; + for (int32_t new_sl = 0; new_sl < n_layers; new_sl++) { + int32_t curr_skipped = (skip_types[new_sl] >> 2) | (skip_types[new_sl] & 3); + // printf("##%d, %d\n", new_sl, curr_skipped); + if (curr_skipped == 3) continue; // Already tested or perm skip. + skip_layer = new_sl; + test_skip_type = (curr_skipped & 1) != 0 ? 
2 : 1; + break; + } + if (skip_layer >= n_layers) { + if (curr_best_layer == -1) break; + if (pass_results.size() >= prune_target * 2) { + std::sort(pass_results.begin(), pass_results.end(), + [](const std::tuple & a, const std::tuple & b) { + return std::get<2>(a) > std::get<2>(b); + } + ); + const size_t num_prune = std::min(pass_results.size(), prune_target); + for (size_t temp = 0; temp < num_prune; temp++) { + int32_t lidx = std::get<0>(pass_results[temp]); + if (lidx == curr_best_layer && std::get<1>(pass_results[temp]) == curr_best_type) continue; + worsts[lidx] |= std::get<1>(pass_results[temp]); + printf("\nPrune[%zu]: %d (%d) - %.2f\n", temp, lidx, std::get<1>(pass_results[temp]), std::get<2>(pass_results[temp])); + } + } + pass_results.clear(); + printf("\n\nADD SKIP %c%3d - ppl vs ref %.4f", + int(label[curr_best_type]), curr_best_layer, + curr_best_ppl - ref_ppl); + if (curr_best_ppl > ref_ppl * 1.75) break; + skip_types[curr_best_layer] += curr_best_type; + if (std::find(skips.begin(), skips.end(), curr_best_layer) == skips.end()) { + skips.push_back(curr_best_layer); + } + curr_best_layer = -1; + curr_best_ppl = -1; + curr_best_type = 0; + skip_layer = n_layers; + for (int32_t new_sl = 0; new_sl < n_layers; new_sl++) { + skip_types[new_sl] = (skip_types[new_sl] & 3) | (worsts[new_sl] << 2); + } + for (int32_t new_sl = 0; new_sl < n_layers; new_sl++) { + int32_t curr_skipped = (skip_types[new_sl] >> 2) | (skip_types[new_sl] & 3); + // printf("||%d, %d\n", new_sl, curr_skipped); + if (curr_skipped == 3) continue; // Already tested or perm skip. + skip_layer = new_sl; + test_skip_type = (curr_skipped & 1) != 0 ? 2 : 1; + break; + } + if (skip_layer == -1 || skip_layer == n_layers) break; + } + + i = 0; + count = 0; + nll = 0; + nll2 = 0; + logit_history.clear(); + prob_history.clear(); + + for (int32_t i = 0; i < n_layers; i++) { + layers[i] = (skip_types[i] & 3) | (i == skip_layer ? test_skip_type : 0); + } + layers[n_layers] = -1; + printf("\nTEST %c%3d + [", int(label[test_skip_type]), skip_layer); + for (const auto l : skips) { + printf("%c%d, ", int(label[skip_types[l] & 3]), l); + } + printf("] - len: %3zu, best:(%c%3d @ %.3f), last took %.2f sec\n", + skips.size() + 1, + int(label[curr_best_type]), curr_best_layer, + curr_best_ppl != -1 ? 
curr_best_ppl - ref_ppl : 0, + test_t_total); + test_t_start = std::chrono::high_resolution_clock::now(); + } const int start = i * n_ctx; const int end = start + n_ctx; @@ -353,7 +459,11 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par tokens[batch_start] = llama_token_bos(ctx); } - if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { + batch.n_tokens = batch_size; + batch.token = tokens.data() + batch_start; + batch.all_pos_0 = j * n_batch; + + if (llama_decode(ctx, batch)) { fprintf(stderr, "%s : failed to eval\n", __func__); return {tokens, -1, logit_history, prob_history}; } @@ -367,7 +477,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const auto t_end = std::chrono::high_resolution_clock::now(); - if (i == 0) { + if (i == 0 && skip_layer < 0 && skips.empty()) { const float t_total = std::chrono::duration(t_end - t_start).count(); fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); @@ -396,8 +506,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par count += n_ctx - first - 1; // perplexity is e^(average negative log-likelihood) + double ppl = std::exp(nll / count); if (params.ppl_output_type == 0) { - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + printf("[%d]%.4lf,", i + 1, ppl); } else { double av = nll/count; double av2 = nll2/count - av*av; @@ -405,6 +516,19 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); } fflush(stdout); + if (skip_layer >= 0 && (i + 1 == test_count || (i > 1 && ppl > ref_ppl * 3))) { + i = test_count - 1; + skip_types[skip_layer] |= test_skip_type << 2; + if (curr_best_layer == -1 || ppl < curr_best_ppl) { + curr_best_layer = skip_layer; + curr_best_ppl = ppl; + curr_best_type = test_skip_type; + } + printf(" -- %.3f", ppl - ref_ppl); + pass_results.push_back({skip_layer, test_skip_type, ppl}); + } else if (skip_layer < 0) { + ref_ppl = ppl; + } } printf("\n"); diff --git a/llama.cpp b/llama.cpp index c63e6251c7676..ff47e632024dd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3252,7 +3252,31 @@ static struct ggml_cgraph * llm_build_llama( } } + int32_t * run_layer = batch.run_layers; + bool run_attn = false, run_mlp = false; + cur = inpL; + for (int il = 0; il < n_layer; ++il) { + run_attn = run_mlp = true; + if (run_layer != NULL) { + if (*run_layer >= 0) { + run_attn = (*run_layer & 1) == 0; + run_mlp = (*run_layer & 2) == 0; + run_layer++; + } else { + run_layer = NULL; + } + } else if (ggml_allocr_is_measure(lctx.alloc) && il == n_layer - 1) { + // No idea why this is needed, but otherwise we run out of space + // when skipping attn or mlp (but not both) on the last layer + run_mlp = false; + } else if (ggml_allocr_is_measure(lctx.alloc) && il == n_layer - 2) { + // No idea why this is needed, but otherwise we run out of space + // when skipping attn or mlp (but not both) on the last layer + run_attn = false; + } + if (!run_attn && !run_mlp) continue; + ggml_format_name(inpL, "layer_inp_%d", il); offload_func_t offload_func = llama_nop; @@ -3263,10 +3287,11 @@ static struct ggml_cgraph * llm_build_llama( } #endif // GGML_USE_CUBLAS - struct ggml_tensor * inpSA = inpL; + struct ggml_tensor * inpFF = nullptr; - // norm - { + // self-attention + if (run_attn) { + // norm cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); 
offload_func(cur); ggml_set_name(cur, "rms_norm_0"); @@ -3275,10 +3300,7 @@ static struct ggml_cgraph * llm_build_llama( cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); offload_func(cur); ggml_set_name(cur, "attention_norm_0"); - } - // self-attention - { // compute Q and K and RoPE them struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); offload_func_kq(tmpk); @@ -3395,25 +3417,25 @@ static struct ggml_cgraph * llm_build_llama( cur); offload_func(cur); ggml_set_name(cur, "result_wo"); - } - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); - ggml_set_name(inpFF, "inpFF"); + inpFF = ggml_add(ctx0, cur, inpL); + offload_func(inpFF); + ggml_set_name(inpFF, "inpFF"); + } else { + inpFF = inpL; + } // feed-forward network - { + if (run_mlp) { // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_1"); + cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); + offload_func(cur); + ggml_set_name(cur, "rms_norm_1"); - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - ggml_set_name(cur, "ffn_norm"); - } + // cur = cur*ffn_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); + offload_func(cur); + ggml_set_name(cur, "ffn_norm"); struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, @@ -3441,18 +3463,18 @@ static struct ggml_cgraph * llm_build_llama( cur); offload_func(cur); ggml_set_name(cur, "result_w2"); - } - cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); - ggml_set_name(cur, "inpFF_+_result_w2"); + cur = ggml_add(ctx0, cur, inpFF); + offload_func(cur); + ggml_set_name(cur, "inpFF_+_result_w2"); + } else { + cur = inpFF; + } // input for next layer inpL = cur; } - cur = inpL; - // norm { cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); @@ -9582,7 +9604,7 @@ int llama_eval_embd( int n_past) { llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1); - llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, }; + llama_batch batch = { n_tokens, nullptr, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, }; const int ret = llama_decode_internal(*ctx, batch); if (ret < 0) { @@ -9604,6 +9626,7 @@ struct llama_batch llama_batch_get_one( llama_seq_id seq_id) { return { /*n_tokens =*/ n_tokens, + /*run_layers =*/ nullptr, /*tokens =*/ tokens, /*embd =*/ nullptr, /*pos =*/ nullptr, @@ -9617,7 +9640,7 @@ struct llama_batch llama_batch_get_one( } struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) { - llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, }; + llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, }; if (embd) { batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd); diff --git a/llama.h b/llama.h index 306f5b383cb11..71d0b3e498e2b 100644 --- a/llama.h +++ b/llama.h @@ -132,6 +132,7 @@ extern "C" { // typedef struct llama_batch { int32_t n_tokens; + int32_t *run_layers; // end marked by negative value. llama_token * token; float * embd; From 0abf0064ca3eb08757ee890f04804dcb8df7d78e Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Thu, 19 Oct 2023 18:00:15 -0600 Subject: [PATCH 2/6] What if we do something crazy like add layers instead of removing them? 
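For readers following along: this patch and the previous one drive the same per-layer skip list, so here is a minimal sketch of how llama_batch.run_layers is interpreted. It is illustrative only; the helper name and the 4-layer example are made up, while the bit meanings and the negative terminator come from the run_layers handling added to llm_build_llama() and the comment in llama.h in Patch 1.

#include <cstdint>
#include <vector>
#include "llama.h"

// Hypothetical helper: build a skip list for a 4-layer model.
// Each entry is a per-layer bitmask consumed in order by llm_build_llama():
//   bit 0 (value 1) set -> skip that layer's self-attention block
//   bit 1 (value 2) set -> skip that layer's feed-forward (MLP) block
//   0 -> run the layer normally, 3 -> skip the layer entirely
// Any negative entry ends the list; layers after it run normally.
static std::vector<int32_t> example_run_layers() {
    return {
        0,  // layer 0: run attention + MLP
        1,  // layer 1: skip attention only
        2,  // layer 2: skip MLP only
        3,  // layer 3: skip the whole layer
        -1, // terminator
    };
}

// Usage sketch (assumed, not part of the patch): keep the vector alive for
// the duration of the decode call.
//   std::vector<int32_t> skips = example_run_layers();
//   batch.run_layers = skips.data();
//   llama_decode(ctx, batch);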
--- examples/perplexity/perplexity.cpp | 72 ++++++---- examples/speculative/speculative.cpp | 196 +++++++++++++++++++++++---- 2 files changed, 219 insertions(+), 49 deletions(-) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 7559c02873a07..fb3b018f21864 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -323,10 +323,19 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par llama_batch batch = llama_batch_get_one(NULL, 0, 0, 0); - const int32_t n_layers = 32; // model layer count - const int test_count = 6; // num perplexity chunks to run for each test - const size_t prune_target = 4; // prune this many of the worst results each pass - // end tunables + // model layer count + const int32_t n_layers = 32; + + // num perplexity chunks to run for each test + const int test_count = 4; + + // prune this many of the worst results each pass + const size_t prune_target = 2; + + // start with all but first/last layers disabled and start adding them back + const bool anti_mode = true; + + // **** end tunables *** // 1 = attn, 2 = mlp, 3 = both int32_t test_skip_type = 0; // but don't mess with this, it's set automatically. @@ -340,11 +349,19 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par skip_types.resize(n_layers); std::fill(skip_types.begin(), skip_types.end(), 0); std::vector> pass_results; - std::vector worsts; - worsts.resize(n_layers); - std::fill(worsts.begin(), worsts.end(), 0); + std::vector extremes; + extremes.resize(n_layers); + std::fill(extremes.begin(), extremes.end(), 0); + if (anti_mode) { + // No pointing in starting with first/last layer disabled. + skip_types[0] = 15; + skip_types[n_layers - 1] = 15; + skips.push_back(0); skips.push_back(0 + n_layers); + skips.push_back(n_layers - 1); skips.push_back(n_layers - 1 + n_layers); + } int32_t curr_best_layer = -1, curr_best_type = 0; double curr_best_ppl = -1, ref_ppl = -1; + const int32_t mask = anti_mode ? 3 : 0; int count = 0; double nll = 0.0; @@ -372,35 +389,40 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } if (skip_layer >= n_layers) { if (curr_best_layer == -1) break; - if (pass_results.size() >= prune_target * 2) { + if (prune_target > 0 && pass_results.size() >= prune_target * 2) { std::sort(pass_results.begin(), pass_results.end(), [](const std::tuple & a, const std::tuple & b) { + if (anti_mode) return std::get<2>(b) > std::get<2>(a); return std::get<2>(a) > std::get<2>(b); } ); const size_t num_prune = std::min(pass_results.size(), prune_target); - for (size_t temp = 0; temp < num_prune; temp++) { + for (size_t temp = 0, pruned = 0; temp < pass_results.size(); temp++) { int32_t lidx = std::get<0>(pass_results[temp]); if (lidx == curr_best_layer && std::get<1>(pass_results[temp]) == curr_best_type) continue; - worsts[lidx] |= std::get<1>(pass_results[temp]); - printf("\nPrune[%zu]: %d (%d) - %.2f\n", temp, lidx, std::get<1>(pass_results[temp]), std::get<2>(pass_results[temp])); + extremes[lidx] |= std::get<1>(pass_results[temp]); + printf("\nPrune[%zu]: %d (%d) - %.2f\n", pruned + 1, lidx, + std::get<1>(pass_results[temp]), std::get<2>(pass_results[temp])); + if (anti_mode) { + skip_types[lidx] |= std::get<1>(pass_results[temp]); + skips.push_back(std::get<1>(pass_results[temp]) == 1 ? 
lidx : -lidx); + } + if (++pruned >= num_prune) break; } } pass_results.clear(); - printf("\n\nADD SKIP %c%3d - ppl vs ref %.4f", + printf("\n\nADD %c%3d - ppl vs ref %.4f", int(label[curr_best_type]), curr_best_layer, curr_best_ppl - ref_ppl); - if (curr_best_ppl > ref_ppl * 1.75) break; + if (!anti_mode && curr_best_ppl > ref_ppl * 1.75) break; skip_types[curr_best_layer] += curr_best_type; - if (std::find(skips.begin(), skips.end(), curr_best_layer) == skips.end()) { - skips.push_back(curr_best_layer); - } + skips.push_back(curr_best_type == 1 ? curr_best_layer : curr_best_layer + n_layers); curr_best_layer = -1; curr_best_ppl = -1; curr_best_type = 0; skip_layer = n_layers; for (int32_t new_sl = 0; new_sl < n_layers; new_sl++) { - skip_types[new_sl] = (skip_types[new_sl] & 3) | (worsts[new_sl] << 2); + skip_types[new_sl] = (skip_types[new_sl] & 3) | (extremes[new_sl] << 2); } for (int32_t new_sl = 0; new_sl < n_layers; new_sl++) { int32_t curr_skipped = (skip_types[new_sl] >> 2) | (skip_types[new_sl] & 3); @@ -420,16 +442,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par logit_history.clear(); prob_history.clear(); + int alive = 0; for (int32_t i = 0; i < n_layers; i++) { - layers[i] = (skip_types[i] & 3) | (i == skip_layer ? test_skip_type : 0); + layers[i] = mask ^ ((skip_types[i] & 3) | (i == skip_layer ? test_skip_type : 0)); + alive += !(layers[i] & 1) + !(layers[i] & 2); } layers[n_layers] = -1; printf("\nTEST %c%3d + [", int(label[test_skip_type]), skip_layer); - for (const auto l : skips) { - printf("%c%d, ", int(label[skip_types[l] & 3]), l); + for (auto l : skips) { + printf("%c%d, ", int(label[skip_types[l % n_layers] & 3]), l % n_layers); } - printf("] - len: %3zu, best:(%c%3d @ %.3f), last took %.2f sec\n", - skips.size() + 1, + printf("] - live: %3d/%3d, best:(%c%3d @ %.3f), last took %.2f sec\n", + alive, n_layers * 2, int(label[curr_best_type]), curr_best_layer, curr_best_ppl != -1 ? 
curr_best_ppl - ref_ppl : 0, test_t_total); @@ -477,7 +501,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const auto t_end = std::chrono::high_resolution_clock::now(); - if (i == 0 && skip_layer < 0 && skips.empty()) { + if (i == 0 && skip_layer < 0 && ref_ppl < 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); @@ -516,7 +540,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); } fflush(stdout); - if (skip_layer >= 0 && (i + 1 == test_count || (i > 1 && ppl > ref_ppl * 3))) { + if (skip_layer >= 0 && (i + 1 == test_count || (i > 1 && ppl > ref_ppl * 30))) { i = test_count - 1; skip_types[skip_layer] |= test_skip_type << 2; if (curr_best_layer == -1 || ppl < curr_best_ppl) { diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 894321ce9648c..5830b4fb36560 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -8,6 +8,8 @@ #include #include +#define DOFFS 10000 + struct seq_draft { bool active = false; bool drafting = false; @@ -17,10 +19,31 @@ struct seq_draft { std::vector i_batch_tgt; std::vector tokens; + std::vector tokens_p; struct llama_sampling_context * ctx_sampling; }; +static void save_logits(llama_context * ctx, std::vector & v, const int n_vocab, const int count = 1, const int soffs = 0, const int doffs = 0) { + // printf("SAVE %p: %d, %d, %d\n", (void *)ctx, count, soffs, doffs); + // printf(""); + GGML_ASSERT(doffs + count <= 30); + memcpy( + v.data() + doffs * n_vocab, + llama_get_logits(ctx) + soffs * n_vocab, + sizeof(float) * size_t(n_vocab) * count); +} + +static void restore_logits(llama_context * ctx, std::vector & v, const int n_vocab, const int count = 1, const int soffs = 0, const int doffs = 0) { + // printf(""); + // printf("REST %p: %d, %d, %d\n", (void *)ctx, count, soffs, doffs); + GGML_ASSERT(soffs + count <= 30); + memcpy( + llama_get_logits(ctx) + doffs * n_vocab, + v.data() + soffs * n_vocab, + sizeof(float) * size_t(n_vocab) * count); +} + int main(int argc, char ** argv) { gpt_params params; @@ -37,8 +60,10 @@ int main(int argc, char ** argv) { const int n_seq_dft = params.n_parallel; // TODO: make this configurable - const float p_accept = 0.80f; - const float p_split = 0.10f; + // const float p_accept = 0.80f; + // const float p_split = 0.10f; + const float p_accept = 0.5f; // 0.80f; + const float p_split = p_accept / 8; // 0.10f; #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("speculative", "log")); @@ -46,6 +71,8 @@ int main(int argc, char ** argv) { log_dump_cmdline(argc, argv); #endif // LOG_DISABLE_LOGS + bool self_speculation = false; + // init llama.cpp llama_backend_init(params.numa); @@ -60,9 +87,18 @@ int main(int argc, char ** argv) { std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params); // load the draft model - params.model = params.model_draft; - params.n_gpu_layers = params.n_gpu_layers_draft; - std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); + if (params.model != params.model_draft) { + params.model = params.model_draft; + params.n_gpu_layers = params.n_gpu_layers_draft; + std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); + } else { + self_speculation = true; + model_dft = model_tgt; + ctx_dft = ctx_tgt; + } + 
+ const int n_ctx = llama_n_ctx(ctx_tgt); + const int n_vocab = llama_n_vocab(model_tgt); // tokenize the prompt std::vector inp; @@ -84,14 +120,33 @@ int main(int argc, char ** argv) { fflush(stderr); + llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); + llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft); + std::vector logits_tgt, logits_dft; + const int n_input = inp.size(); const auto t_enc_start = ggml_time_us(); // eval the prompt with both models - llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0, 0)); - llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0)); - llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0)); + llama_batch_clear(batch_tgt); + logits_tgt.resize(n_vocab * 30); + logits_dft.resize(n_vocab * 30); + for (int i = 0; i < n_input - 1; i++) { + llama_batch_add(batch_tgt, inp[i], i, { 0 }, false); + } + llama_decode(ctx_tgt, batch_tgt); + llama_batch_clear(batch_tgt); + llama_batch_add(batch_tgt, inp.back(), n_input - 1, { 0 }, true); + llama_decode(ctx_tgt, batch_tgt); + save_logits(ctx_tgt, logits_tgt, n_vocab); + if (!self_speculation) { + llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0 + DOFFS)); + } else { + // llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0 + DOFFS)); + llama_kv_cache_seq_cp(ctx_dft, 0, 0 + DOFFS, 0, -1); + } + // save_logits(ctx_dft, logits_dft, n_vocab, n_input); const auto t_enc_end = ggml_time_us(); @@ -104,6 +159,8 @@ int main(int argc, char ** argv) { int n_predict = 0; int n_drafted = 0; int n_accept = 0; + int n_split = 0; + int n_bad_split = 0; int n_past_tgt = inp.size(); int n_past_dft = inp.size(); @@ -124,8 +181,16 @@ int main(int argc, char ** argv) { drafts[s].ctx_sampling = llama_sampling_init(params.sparams); } - llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); - llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft); + // std::vector run_layers_dft = { + // 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 3, 1, 0, 3, 3, 0, 3, 0, 1, 1, + // 3, 3, 3, 0, 2, 3, 2, 3, 3, 3, 1, 3, 0, 0, 2, 1, 0, 2, 0, 0, + // 0, 3, 0, 1, 0, 1, 1, 3, 3, 3, 3, 1, 1, 3, 3, 3, 1, 3, 3, 0, + // 3, 1, 3, 3, 0, 1, 3, 3, 3, 1, 3, 0, 0, 0, 1, 1, 2, 0, 1, 1, -1, }; + std::vector run_layers_dft = { + 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 1, 1, 0, 1, 1, 0, 2, 0, 1, 1, + 1, 0, 1, 0, 0, 0, -1, }; + + batch_dft.run_layers = run_layers_dft.data(); const auto t_dec_start = ggml_time_us(); @@ -133,7 +198,11 @@ int main(int argc, char ** argv) { drafts[0].i_batch_tgt.resize(1); drafts[0].i_batch_tgt[0] = 0; + double avg_accepted = 0, avg_rejected = 0; + float min_accepted = 0, max_rejected = 0; + while (true) { + LOG("*** Draft start\n"); // print current draft sequences for (int s = 0; s < n_seq_dft; ++s) { if (!drafts[s].active) { @@ -152,9 +221,11 @@ int main(int argc, char ** argv) { LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); // sample from the target model + restore_logits(ctx_tgt, logits_tgt, n_vocab, 1, drafts[s_keep].i_batch_tgt[i_dft], drafts[s_keep].i_batch_tgt[i_dft]); llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); llama_sampling_accept(ctx_sampling, ctx_tgt, id, true); + save_logits(ctx_tgt, logits_tgt, n_vocab, 1, drafts[s_keep].i_batch_tgt[i_dft], drafts[s_keep].i_batch_tgt[i_dft]); //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); @@ -179,11 +250,26 @@ int 
main(int argc, char ** argv) { } if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) { - LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str()); + LOG("the sampled target token matches drafted token %d of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str()); s_keep = s; matches = true; + LOG("Derp[%d]: %6d (%5.4f)\n", s, drafts[s].tokens[i_dft], drafts[s].tokens_p[i_dft]); + if (min_accepted == 0) min_accepted = drafts[s].tokens_p[i_dft]; + else min_accepted = std::min(min_accepted, drafts[s].tokens_p[i_dft]); + avg_accepted += drafts[s].tokens_p[i_dft] * (avg_accepted == 0 ? 2 : 1); + avg_accepted /= 2; } else { + if (i_dft < (int) drafts[s].tokens.size() && id != drafts[s].tokens[i_dft]) { + if (i_dft == 0 && s > 0) n_bad_split++; + max_rejected = std::max(max_rejected, drafts[s].tokens_p[i_dft]); + avg_rejected += drafts[s].tokens_p[i_dft] * (avg_rejected == 0 ? 2 : 1); + avg_rejected /= 2; + LOG("-- Terminate sequence %d+%d: (%d, '%s') != target (%d, '%s') - rejected\n", + s, i_dft, drafts[s].tokens[i_dft], + llama_token_to_piece(ctx_dft, drafts[s].tokens[i_dft]).c_str(), + id, token_str.c_str()); + } drafts[s].active = false; } } @@ -204,6 +290,18 @@ int main(int argc, char ** argv) { { LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); + llama_kv_cache_seq_rm(ctx_dft, s_keep + DOFFS, n_past_dft, -1); + llama_kv_cache_seq_rm(ctx_tgt, s_keep, n_past_tgt, -1); + if (s_keep != 0) { + llama_kv_cache_seq_cp(ctx_dft, s_keep + DOFFS, 0 + DOFFS, -1, -1); + llama_kv_cache_seq_cp(ctx_tgt, s_keep, 0, -1, -1); + } + for (int s = 1; s < n_seq_dft; ++s) { + llama_kv_cache_seq_rm(ctx_dft, s + DOFFS, -1, -1); + llama_kv_cache_seq_rm(ctx_tgt, s, -1, -1); + } + + /* llama_kv_cache_seq_keep(ctx_dft, s_keep); llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); llama_kv_cache_seq_keep(ctx_dft, 0); @@ -212,22 +310,28 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_keep(ctx_tgt, s_keep); llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1); llama_kv_cache_seq_keep(ctx_tgt, 0); + */ + } for (int s = 0; s < n_seq_dft; ++s) { drafts[s].active = false; drafts[s].tokens.clear(); + drafts[s].tokens_p.clear(); drafts[s].i_batch_tgt.clear(); } // note: will be erased after the speculation phase drafts[0].tokens.push_back(id); + drafts[0].tokens_p.push_back(0); drafts[0].i_batch_tgt.push_back(0); llama_batch_clear(batch_dft); - llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true); + llama_batch_add (batch_dft, id, n_past_dft, { 0 + DOFFS }, true); + + LOG("=== EVAL: DRAFT ACCEPTED ===: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_dft).c_str()); - llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); llama_decode (ctx_dft, batch_dft); + save_logits(ctx_dft, logits_dft, n_vocab, batch_dft.n_tokens); ++n_past_dft; @@ -254,6 +358,10 @@ int main(int argc, char ** argv) { llama_batch_clear(batch_tgt); llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true); + // double avg_accepted = n_accept > 0 ? 
avg_accepted / double(n_accept) : 0; + LOG("Average accepted/rejected: %3.5f / %3.5f -- Min accepted/max rejected: %3.5f / %3.5f\n", + avg_accepted, avg_rejected, min_accepted, max_rejected); + // sample n_draft tokens from the draft model using tree-based sampling for (int i = 0; i < n_draft; ++i) { batch_dft.n_tokens = 0; @@ -267,17 +375,24 @@ int main(int argc, char ** argv) { continue; } + restore_logits(ctx_dft, logits_dft, n_vocab, 1, drafts[s].i_batch_dft, drafts[s].i_batch_dft); llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft); + save_logits(ctx_dft, logits_dft, n_vocab, 1, drafts[s].i_batch_dft, drafts[s].i_batch_dft); const auto & cur_p = drafts[s].ctx_sampling->cur; for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) { + if (cur_p[k].p < 1e-5f) continue; LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); } - if (cur_p[0].p < p_accept) { - LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept); + double accept_threshold = avg_rejected == 0 || avg_rejected == 0 || n_drafted < 16 + ? p_accept + : std::max(double(min_accepted * 0.98), avg_accepted * 0.75f); + // accept_threshold = 0.8; + if (cur_p[0].p < accept_threshold) { + LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, accept_threshold); drafts[s].drafting = false; continue; } @@ -286,11 +401,20 @@ int main(int argc, char ** argv) { // attempt to split the branch if the probability is high enough for (int f = 1; f < 8; ++f) { - if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) { - LOG("splitting seq %3d into %3d\n", s, n_seq_cur); - - llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1); - llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); + // if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) { + // if (n_seq_cur < n_seq_dft && cur_p[f].p > cur_p[0].p / 5) { + double split_threshold = avg_accepted == 0 || avg_rejected == 0 || n_drafted < 16 + ? p_split + : ( std::max(double(min_accepted * 0.7), avg_accepted * 0.4) + * (n_seq_cur >= 2 ? 
0.75 : 1.0) ); + // split_threshold = 0.1; + if (n_seq_cur < n_seq_dft && cur_p[f].p >= split_threshold) { + n_split++; + LOG(">>>%d<<< splitting seq %3d into %3d on %6d (%8.3f) '%s'\n", f, s, n_seq_cur, + cur_p[f].id, cur_p[f].p, llama_token_to_piece(ctx_dft, cur_p[f].id).c_str()); + + llama_kv_cache_seq_rm(ctx_dft, n_seq_cur + DOFFS, -1, -1); + llama_kv_cache_seq_cp(ctx_dft, s + DOFFS, n_seq_cur + DOFFS, -1, -1); // all previous tokens from this branch are now also part of the new branch for (int t = 0; t < batch_tgt.n_tokens; ++t) { @@ -309,6 +433,7 @@ int main(int argc, char ** argv) { drafts[n_seq_cur].skip = true; drafts[n_seq_cur].tokens = drafts[s].tokens; + drafts[n_seq_cur].tokens_p = drafts[s].tokens_p; drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft; drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; @@ -331,6 +456,7 @@ int main(int argc, char ** argv) { llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true); drafts[s].tokens.push_back(id); + drafts[s].tokens_p.push_back(cur_p[is].p); // add unique drafted tokens to the target batch drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); @@ -340,7 +466,7 @@ int main(int argc, char ** argv) { // add the token to the batch for batched decoding with the draft model drafts[s].i_batch_dft = batch_dft.n_tokens; - llama_batch_add(batch_dft, id, n_past_cur, { s }, true); + llama_batch_add(batch_dft, id, n_past_cur, { s + DOFFS }, true); if (batch_tgt.n_tokens > n_draft) { drafts[s].drafting = false; @@ -352,9 +478,18 @@ int main(int argc, char ** argv) { if (batch_dft.n_tokens == 0) { break; } + // LOG("Draft eval: %d\n", batch_dft.n_tokens); + // for (int x = 0; x < batch_dft.n_tokens; x++) { + // LOG("* %03d: seq %3d, pos %4d, token %6d '%s'", x, + // batch_dft.seq_id[x][0], batch_dft.pos[x], + // batch_dft.token[x], llama_token_to_piece(ctx_dft, batch_dft.token[x]).c_str()); + // } + + LOG("=== EVAL: DRAFTED ===: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_dft).c_str()); // evaluate the drafted tokens on the draft model llama_decode(ctx_dft, batch_dft); + save_logits(ctx_dft, logits_dft, n_vocab, batch_dft.n_tokens); ++n_past_cur; ++n_drafted; @@ -365,13 +500,17 @@ int main(int argc, char ** argv) { // evaluate the target model on the drafted tokens { - llama_kv_cache_seq_keep(ctx_tgt, 0); + // llama_kv_cache_seq_keep(ctx_tgt, 0); + for (int s = 1; s < n_seq_dft; ++s) { + llama_kv_cache_seq_rm(ctx_tgt, s, -1, -1); + } for (int s = 1; s < n_seq_dft; ++s) { llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); } - //LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt)); + LOG("=== EVAL: TARGET ===: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); llama_decode(ctx_tgt, batch_tgt); + save_logits(ctx_tgt, logits_tgt, n_vocab, batch_tgt.n_tokens); ++n_past_tgt; } @@ -382,6 +521,7 @@ int main(int argc, char ** argv) { } drafts[s].tokens.erase(drafts[s].tokens.begin()); + drafts[s].tokens_p.erase(drafts[s].tokens_p.begin()); } } @@ -395,9 +535,13 @@ int main(int argc, char ** argv) { LOG_TEE("\n"); LOG_TEE("n_draft = %d\n", n_draft); LOG_TEE("n_predict = %d\n", n_predict); + LOG_TEE("drafted = %.3f%%\n", 100.0f * n_drafted / n_predict); LOG_TEE("n_drafted = %d\n", n_drafted); LOG_TEE("n_accept = %d\n", n_accept); LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + LOG_TEE("n_draft = %d\n", n_draft); + LOG_TEE("n_split = %d\n", n_split); + LOG_TEE("n_badsplit= %d\n", n_bad_split); LOG_TEE("\ndraft:\n"); llama_print_timings(ctx_dft); @@ -415,8 +559,10 @@ int main(int argc, char ** argv) { 
llama_free(ctx_tgt); llama_free_model(model_tgt); - llama_free(ctx_dft); - llama_free_model(model_dft); + if (!self_speculation) { + llama_free(ctx_dft); + llama_free_model(model_dft); + } llama_backend_free(); From fae6d9c70d14bedf2a274801dca78199cce27917 Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Thu, 19 Oct 2023 18:11:15 -0600 Subject: [PATCH 3/6] Fix pushing in wrong halflayer idx --- examples/perplexity/perplexity.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index fb3b018f21864..bb416b459245a 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -405,7 +405,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par std::get<1>(pass_results[temp]), std::get<2>(pass_results[temp])); if (anti_mode) { skip_types[lidx] |= std::get<1>(pass_results[temp]); - skips.push_back(std::get<1>(pass_results[temp]) == 1 ? lidx : -lidx); + skips.push_back(std::get<1>(pass_results[temp]) == 1 ? lidx : lidx + n_layers); } if (++pruned >= num_prune) break; } From d6b44fb3aef1e81c4075412280e888d205eeebae Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Thu, 19 Oct 2023 21:14:23 -0600 Subject: [PATCH 4/6] Force measure to allocate more memory for 70Bs --- llama.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/llama.cpp b/llama.cpp index ff47e632024dd..a4bd3932e9f91 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3266,14 +3266,11 @@ static struct ggml_cgraph * llm_build_llama( } else { run_layer = NULL; } - } else if (ggml_allocr_is_measure(lctx.alloc) && il == n_layer - 1) { - // No idea why this is needed, but otherwise we run out of space - // when skipping attn or mlp (but not both) on the last layer - run_mlp = false; - } else if (ggml_allocr_is_measure(lctx.alloc) && il == n_layer - 2) { - // No idea why this is needed, but otherwise we run out of space - // when skipping attn or mlp (but not both) on the last layer - run_attn = false; + } else if (ggml_allocr_is_measure(lctx.alloc)) { + if (il == 0 || il == n_layer - 1) run_mlp = false; + else if (il == 1 || il == n_layer - 2) run_attn = false; + else if (il & 1) run_mlp = false; + else run_attn = false; } if (!run_attn && !run_mlp) continue; From 8a569cfee5eecc4262d5fcb40440669cf98959c5 Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Sat, 21 Oct 2023 04:26:47 -0600 Subject: [PATCH 5/6] perplexity anti-mode improvements --- examples/perplexity/perplexity.cpp | 32 ++++++++++++++++-------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index bb416b459245a..62d55fee51f30 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -352,13 +352,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par std::vector extremes; extremes.resize(n_layers); std::fill(extremes.begin(), extremes.end(), 0); - if (anti_mode) { - // No pointing in starting with first/last layer disabled. - skip_types[0] = 15; - skip_types[n_layers - 1] = 15; - skips.push_back(0); skips.push_back(0 + n_layers); - skips.push_back(n_layers - 1); skips.push_back(n_layers - 1 + n_layers); - } + // if (anti_mode) { + // // No point in starting with first/last layer disabled. 
+ // skip_types[0] = 15; + // skip_types[n_layers - 1] = 15; + // skips.push_back(0); skips.push_back(0 + n_layers); + // skips.push_back(n_layers - 1); skips.push_back(n_layers - 1 + n_layers); + // } int32_t curr_best_layer = -1, curr_best_type = 0; double curr_best_ppl = -1, ref_ppl = -1; const int32_t mask = anti_mode ? 3 : 0; @@ -389,7 +389,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } if (skip_layer >= n_layers) { if (curr_best_layer == -1) break; - if (prune_target > 0 && pass_results.size() >= prune_target * 2) { + if (anti_mode || (prune_target > 0 && pass_results.size() >= prune_target * 2)) { std::sort(pass_results.begin(), pass_results.end(), [](const std::tuple & a, const std::tuple & b) { if (anti_mode) return std::get<2>(b) > std::get<2>(a); @@ -399,14 +399,14 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const size_t num_prune = std::min(pass_results.size(), prune_target); for (size_t temp = 0, pruned = 0; temp < pass_results.size(); temp++) { int32_t lidx = std::get<0>(pass_results[temp]); - if (lidx == curr_best_layer && std::get<1>(pass_results[temp]) == curr_best_type) continue; - extremes[lidx] |= std::get<1>(pass_results[temp]); - printf("\nPrune[%zu]: %d (%d) - %.2f\n", pruned + 1, lidx, - std::get<1>(pass_results[temp]), std::get<2>(pass_results[temp])); if (anti_mode) { skip_types[lidx] |= std::get<1>(pass_results[temp]); skips.push_back(std::get<1>(pass_results[temp]) == 1 ? lidx : lidx + n_layers); } + if (lidx == curr_best_layer && std::get<1>(pass_results[temp]) == curr_best_type) continue; + extremes[lidx] |= std::get<1>(pass_results[temp]); + printf("\nPrune[%zu]: %d (%d) - %.2f\n", pruned + 1, lidx, + std::get<1>(pass_results[temp]), std::get<2>(pass_results[temp])); if (++pruned >= num_prune) break; } } @@ -414,9 +414,11 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par printf("\n\nADD %c%3d - ppl vs ref %.4f", int(label[curr_best_type]), curr_best_layer, curr_best_ppl - ref_ppl); - if (!anti_mode && curr_best_ppl > ref_ppl * 1.75) break; - skip_types[curr_best_layer] += curr_best_type; - skips.push_back(curr_best_type == 1 ? curr_best_layer : curr_best_layer + n_layers); + if (!anti_mode) { + if (curr_best_ppl > ref_ppl * 1.75) break; + skip_types[curr_best_layer] += curr_best_type; + skips.push_back(curr_best_type == 1 ? 
curr_best_layer : curr_best_layer + n_layers); + } curr_best_layer = -1; curr_best_ppl = -1; curr_best_type = 0; From 13e08d0efa8f1803cd71a8413eb531f9cc3dfb2e Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Mon, 23 Oct 2023 02:40:37 -0600 Subject: [PATCH 6/6] Sync latest changes --- examples/perplexity/perplexity.cpp | 11 +- examples/speculative/speculative.cpp | 301 ++++++++++++++++++++++----- 2 files changed, 258 insertions(+), 54 deletions(-) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 62d55fee51f30..c9b393caa9dca 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -397,6 +397,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } ); const size_t num_prune = std::min(pass_results.size(), prune_target); + if (num_prune > 0) printf("\nPruning: "); for (size_t temp = 0, pruned = 0; temp < pass_results.size(); temp++) { int32_t lidx = std::get<0>(pass_results[temp]); if (anti_mode) { @@ -405,17 +406,17 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } if (lidx == curr_best_layer && std::get<1>(pass_results[temp]) == curr_best_type) continue; extremes[lidx] |= std::get<1>(pass_results[temp]); - printf("\nPrune[%zu]: %d (%d) - %.2f\n", pruned + 1, lidx, + printf("[%zu: %d (%d) - %.2f], ", pruned + 1, lidx, std::get<1>(pass_results[temp]), std::get<2>(pass_results[temp])); if (++pruned >= num_prune) break; } } pass_results.clear(); - printf("\n\nADD %c%3d - ppl vs ref %.4f", + printf("\n\nADD %c%3d - ppl vs ref %.4f - cur:[", int(label[curr_best_type]), curr_best_layer, curr_best_ppl - ref_ppl); if (!anti_mode) { - if (curr_best_ppl > ref_ppl * 1.75) break; + // if (curr_best_ppl > ref_ppl * 1.75) break; skip_types[curr_best_layer] += curr_best_type; skips.push_back(curr_best_type == 1 ? curr_best_layer : curr_best_layer + n_layers); } @@ -426,6 +427,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par for (int32_t new_sl = 0; new_sl < n_layers; new_sl++) { skip_types[new_sl] = (skip_types[new_sl] & 3) | (extremes[new_sl] << 2); } + for (int32_t i = 0; i < n_layers; i++) { + const int val = mask ^ (skip_types[i] & 3); + printf("%d%s", val, i < n_layers - 1 ? 
", " : "]"); + } for (int32_t new_sl = 0; new_sl < n_layers; new_sl++) { int32_t curr_skipped = (skip_types[new_sl] >> 2) | (skip_types[new_sl] & 3); // printf("||%d, %d\n", new_sl, curr_skipped); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 5830b4fb36560..3d8dc13477b62 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -15,6 +15,8 @@ struct seq_draft { bool drafting = false; bool skip = false; + int split_pos = 0; + int i_batch_dft = 0; std::vector i_batch_tgt; @@ -27,7 +29,7 @@ struct seq_draft { static void save_logits(llama_context * ctx, std::vector & v, const int n_vocab, const int count = 1, const int soffs = 0, const int doffs = 0) { // printf("SAVE %p: %d, %d, %d\n", (void *)ctx, count, soffs, doffs); // printf(""); - GGML_ASSERT(doffs + count <= 30); + GGML_ASSERT(doffs + count < 64); memcpy( v.data() + doffs * n_vocab, llama_get_logits(ctx) + soffs * n_vocab, @@ -37,13 +39,47 @@ static void save_logits(llama_context * ctx, std::vector & v, const int n static void restore_logits(llama_context * ctx, std::vector & v, const int n_vocab, const int count = 1, const int soffs = 0, const int doffs = 0) { // printf(""); // printf("REST %p: %d, %d, %d\n", (void *)ctx, count, soffs, doffs); - GGML_ASSERT(soffs + count <= 30); + GGML_ASSERT(soffs + count < 64); memcpy( llama_get_logits(ctx) + doffs * n_vocab, v.data() + soffs * n_vocab, sizeof(float) * size_t(n_vocab) * count); } +static llama_token_data_array normalize_candidates(const float * logits, const int n_vocab, std::vector & cur) { + cur.reserve(n_vocab); + cur.clear(); + + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array cur_p = { cur.data(), cur.size(), false }; + llama_sample_top_k(NULL, &cur_p, 100, 1); + llama_sample_softmax(NULL, &cur_p); + cur.resize(cur_p.size); + return cur_p; +} + +static int32_t find_normalized(const llama_token_data_array & tda, const llama_token id) { + llama_token_data *item = tda.data; + + for (int32_t i = 0; i < tda.size; i++, item++) + if (item->id == id) return i; + return -1; +} + +static double running_average(double & cur, double val, double n = 20) { + if (cur < 1e-5f) { + cur = val; + return cur; + } + // New average = old average * (n-1)/n + new value /n + cur = cur * (n - 1) / n + val / n; + return cur; +} + + int main(int argc, char ** argv) { gpt_params params; @@ -62,8 +98,8 @@ int main(int argc, char ** argv) { // TODO: make this configurable // const float p_accept = 0.80f; // const float p_split = 0.10f; - const float p_accept = 0.5f; // 0.80f; - const float p_split = p_accept / 8; // 0.10f; + const float p_accept = 0.75f; // 0.80f; + const float p_split = 0.6f; // p_accept / 8; // 0.10f; #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("speculative", "log")); @@ -130,8 +166,8 @@ int main(int argc, char ** argv) { // eval the prompt with both models llama_batch_clear(batch_tgt); - logits_tgt.resize(n_vocab * 30); - logits_dft.resize(n_vocab * 30); + logits_tgt.resize(n_vocab * 64); + logits_dft.resize(n_vocab * 64); for (int i = 0; i < n_input - 1; i++) { llama_batch_add(batch_tgt, inp[i], i, { 0 }, false); } @@ -146,7 +182,7 @@ int main(int argc, char ** argv) { // llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0 + DOFFS)); llama_kv_cache_seq_cp(ctx_dft, 0, 0 + DOFFS, 0, -1); } - // save_logits(ctx_dft, logits_dft, n_vocab, n_input); + 
save_logits(ctx_dft, logits_dft, n_vocab, n_input); const auto t_enc_end = ggml_time_us(); @@ -161,6 +197,11 @@ int main(int argc, char ** argv) { int n_accept = 0; int n_split = 0; int n_bad_split = 0; + int n_dup_split = 0; + int n_eff_split = 0; + int max_streak = 0; + + int64_t t_dft_sample = 0, t_dft_gen = 0, t_dft_accept = 0, t_tgt_predict = 0; int n_past_tgt = inp.size(); int n_past_dft = inp.size(); @@ -170,26 +211,35 @@ int main(int argc, char ** argv) { // target model sampling context struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams); + struct llama_sampling_context * ctx_dft_sampling = llama_sampling_init(params.sparams); + std::vector normalized_candidates; + normalized_candidates.reserve(n_vocab); + llama_token_data_array normalized_p; // draft sequence data std::vector drafts(n_seq_dft); params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar - params.sparams.temp = std::max(0.01f, params.sparams.temp); + // params.sparams.temp = std::max(0.01f, params.sparams.temp); for (int s = 0; s < n_seq_dft; ++s) { drafts[s].ctx_sampling = llama_sampling_init(params.sparams); } - // std::vector run_layers_dft = { - // 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 3, 1, 0, 3, 3, 0, 3, 0, 1, 1, - // 3, 3, 3, 0, 2, 3, 2, 3, 3, 3, 1, 3, 0, 0, 2, 1, 0, 2, 0, 0, - // 0, 3, 0, 1, 0, 1, 1, 3, 3, 3, 3, 1, 1, 3, 3, 3, 1, 3, 3, 0, - // 3, 1, 3, 3, 0, 1, 3, 3, 3, 1, 3, 0, 0, 0, 1, 1, 2, 0, 1, 1, -1, }; + // 70B (80 layers) skips example std::vector run_layers_dft = { - 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 1, 1, 0, 1, 1, 0, 2, 0, 1, 1, - 1, 0, 1, 0, 0, 0, -1, }; + 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 3, 1, 0, 3, 3, 0, 3, 0, 1, 1, + 3, 3, 3, 0, 2, 3, 2, 3, 3, 3, 1, 3, 0, 0, 2, 1, 0, 2, 0, 0, + 0, 3, 0, 1, 0, 1, 1, 3, 3, 3, 3, 1, 1, 3, 3, 3, 1, 3, 3, 0, + 3, 1, 3, 3, 0, 1, 3, 3, 3, 1, 3, 0, 0, 0, 1, 1, 2, 0, 1, 1, -1, }; + + // 3B (26 layers) skips example + // std::vector run_layers_dft = { + // 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 2, 1, 3, 0, 2, 3, 3, 1, 0, 2, 0, 1, 1, 2, 0, 0, + // // 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 2, 1, 3, 0, 2, 3, 3, 1, 1, 2, 1, 1, 1, 2, 0, 1, + // -1, }; + // NOTE: Comment this line out to disable skipping. 
batch_dft.run_layers = run_layers_dft.data(); const auto t_dec_start = ggml_time_us(); @@ -198,8 +248,13 @@ int main(int argc, char ** argv) { drafts[0].i_batch_tgt.resize(1); drafts[0].i_batch_tgt[0] = 0; - double avg_accepted = 0, avg_rejected = 0; - float min_accepted = 0, max_rejected = 0; + double avg_accepted = 0, avg_rejected = 0, tgt_avg_accepted = 0; + double avg_accept_delta = 0; + float min_accepted = 0, max_rejected = 0, tgt_min_accepted = 0; + + int64_t t_cur; + + std::vector>> doubt; while (true) { LOG("*** Draft start\n"); @@ -217,15 +272,37 @@ int main(int argc, char ** argv) { int i_dft = 0; int s_keep = 0; + float tgt_last_norm = 0, tgt_last_best_norm = 0, tgt_last_orig = 0; + while (true) { LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); // sample from the target model restore_logits(ctx_tgt, logits_tgt, n_vocab, 1, drafts[s_keep].i_batch_tgt[i_dft], drafts[s_keep].i_batch_tgt[i_dft]); + normalized_p = normalize_candidates(llama_get_logits_ith(ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]), n_vocab, normalized_candidates); llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); + save_logits(ctx_tgt, logits_tgt, n_vocab, 1, drafts[s_keep].i_batch_tgt[i_dft], drafts[s_keep].i_batch_tgt[i_dft]); + int32_t norm_pos = find_normalized(normalized_p, id); + int32_t orig_pos = find_normalized({ctx_sampling->cur.data(), ctx_sampling->cur.size(), false}, id); + if (norm_pos >= 0) { + tgt_last_norm = normalized_candidates[norm_pos].p; + tgt_last_best_norm = normalized_candidates[0].p; + running_average(tgt_avg_accepted, tgt_last_norm); + tgt_min_accepted = tgt_min_accepted < 1e-4 + ? tgt_last_norm + : std::min(tgt_min_accepted, tgt_last_norm); + } else { + tgt_last_norm = tgt_last_best_norm = tgt_avg_accepted; + } + if (orig_pos >= 0) { + tgt_last_orig = ctx_sampling->cur[orig_pos].p; + } + LOG("target sampled (%d, '%s') orig_p=%5.4f, norm_p=%5.4f\n", + id, llama_token_to_piece(ctx_tgt, id).c_str(), + orig_pos >= 0 ? ctx_sampling->cur[orig_pos].p : -1, + norm_pos >= 0 ? normalized_candidates[norm_pos].p : -1); llama_sampling_accept(ctx_sampling, ctx_tgt, id, true); - save_logits(ctx_tgt, logits_tgt, n_vocab, 1, drafts[s_keep].i_batch_tgt[i_dft], drafts[s_keep].i_batch_tgt[i_dft]); //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); @@ -245,26 +322,30 @@ int main(int argc, char ** argv) { bool matches = false; for (int s = 0; s < n_seq_dft; ++s) { - if (!drafts[s].active) { + if (!drafts[s].active || i_dft < drafts[s].split_pos) { continue; } if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) { - LOG("the sampled target token matches drafted token %d of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str()); + LOG("the sampled target token matches drafted token %d of sequence %d (%d, '%s') - accepted\n", + i_dft, s, id, token_str.c_str()); + if (i_dft == 0 && s > 0) { + if (matches) n_dup_split++; + else n_eff_split++; + } s_keep = s; matches = true; LOG("Derp[%d]: %6d (%5.4f)\n", s, drafts[s].tokens[i_dft], drafts[s].tokens_p[i_dft]); if (min_accepted == 0) min_accepted = drafts[s].tokens_p[i_dft]; else min_accepted = std::min(min_accepted, drafts[s].tokens_p[i_dft]); - avg_accepted += drafts[s].tokens_p[i_dft] * (avg_accepted == 0 ? 
2 : 1); - avg_accepted /= 2; + running_average(avg_accepted, drafts[s].tokens_p[i_dft]); + running_average(avg_accept_delta, tgt_last_norm - drafts[s].tokens_p[i_dft]); } else { if (i_dft < (int) drafts[s].tokens.size() && id != drafts[s].tokens[i_dft]) { if (i_dft == 0 && s > 0) n_bad_split++; max_rejected = std::max(max_rejected, drafts[s].tokens_p[i_dft]); - avg_rejected += drafts[s].tokens_p[i_dft] * (avg_rejected == 0 ? 2 : 1); - avg_rejected /= 2; + running_average(avg_rejected, drafts[s].tokens_p[i_dft]); LOG("-- Terminate sequence %d+%d: (%d, '%s') != target (%d, '%s') - rejected\n", s, i_dft, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_dft, drafts[s].tokens[i_dft]).c_str(), @@ -279,8 +360,27 @@ int main(int argc, char ** argv) { ++n_past_tgt; ++n_past_dft; ++i_dft; - + max_streak = std::max(max_streak, i_dft); continue; + } else { + for (size_t seqnum = 0; seqnum < doubt.size(); seqnum++) { + const std::vector> & sdoubt = doubt[seqnum]; + if (sdoubt.size() <= i_dft) continue; + const std::vector & sidoubt = sdoubt[i_dft]; + for (size_t cidx = 0; cidx < sidoubt.size(); cidx++) { + if (sidoubt[cidx].id == id) { + LOG("Shoulda picked seq %3zu, pos %4d, candidate %2zu @ p %5.4f: %6d '%s'\n", + seqnum, i_dft, cidx, sidoubt[cidx].p, + id, token_str.c_str()); + running_average(avg_accepted, sidoubt[cidx].p); + if (cidx < 2) { + running_average(avg_accept_delta, tgt_last_norm - sidoubt[cidx].p); + min_accepted = min_accepted < 1e-5f ? sidoubt[cidx].p : std::min(min_accepted, sidoubt[cidx].p); + } + break; + } + } + } } } @@ -315,6 +415,7 @@ int main(int argc, char ** argv) { } for (int s = 0; s < n_seq_dft; ++s) { + drafts[s].split_pos = 0; drafts[s].active = false; drafts[s].tokens.clear(); drafts[s].tokens_p.clear(); @@ -327,10 +428,18 @@ int main(int argc, char ** argv) { llama_batch_clear(batch_dft); llama_batch_add (batch_dft, id, n_past_dft, { 0 + DOFFS }, true); + if (self_speculation) { + // Copy KV items from non-brain-damaged model... Doesn't seem to help. + llama_kv_cache_seq_rm(ctx_dft, 0 + DOFFS, 0, n_past_dft - 2); + llama_kv_cache_seq_cp(ctx_dft, 0, 0 + DOFFS, 0, n_past_dft - 2); + // llama_kv_cache_seq_rm(ctx_dft, 0 + DOFFS, n_past_dft - 1, -1); + // llama_kv_cache_seq_cp(ctx_dft, 0, 0 + DOFFS, n_past_dft - 1, -1); + } LOG("=== EVAL: DRAFT ACCEPTED ===: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_dft).c_str()); - + t_cur = ggml_time_us(); llama_decode (ctx_dft, batch_dft); + t_dft_accept += ggml_time_us() - t_cur; save_logits(ctx_dft, logits_dft, n_vocab, batch_dft.n_tokens); ++n_past_dft; @@ -358,9 +467,14 @@ int main(int argc, char ** argv) { llama_batch_clear(batch_tgt); llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true); + avg_rejected = std::max(0.05, std::min(avg_accepted - 0.05, avg_rejected)); + avg_accepted = std::max(0.05, std::max(avg_rejected + 0.05, avg_accepted)); // double avg_accepted = n_accept > 0 ? 
avg_accepted / double(n_accept) : 0; - LOG("Average accepted/rejected: %3.5f / %3.5f -- Min accepted/max rejected: %3.5f / %3.5f\n", - avg_accepted, avg_rejected, min_accepted, max_rejected); + LOG("STATS: Avg tacc/dacc/drej: %3.5f / %3.5f / %3.5f | Min dacc/min tacc/max drej: %3.5f / %3.5f / %3.5f | delta %3.5f | max streak %d | n_dft/pred/acc: %d / %d / %d\n", + tgt_avg_accepted, avg_accepted, avg_rejected, min_accepted, tgt_min_accepted, max_rejected, avg_accept_delta, max_streak, + n_drafted, n_predict, n_accept); + doubt.clear(); + doubt.resize(n_seq_dft); // sample n_draft tokens from the draft model using tree-based sampling for (int i = 0; i < n_draft; ++i) { @@ -371,43 +485,116 @@ int main(int argc, char ** argv) { } for (int s = 0; s < n_seq_dft; ++s) { + double accept_threshold, split_threshold; + if (!drafts[s].drafting || drafts[s].skip) { continue; } + doubt[s].push_back({}); + + if (avg_rejected == 0 || avg_rejected == 0 || n_drafted + n_predict < 6) { + accept_threshold = std::max(0.6f, tgt_last_norm); + } else { + + accept_threshold = (tgt_avg_accepted - avg_accept_delta) * 0.3; + accept_threshold *= std::min(0.8, std::max(0.1, double(tgt_last_norm * 1.0))); + accept_threshold = std::max(double(min_accepted) * 1.1, accept_threshold); + accept_threshold = std::max(std::max(avg_accepted * 0.9, avg_rejected * 1.1), accept_threshold); + accept_threshold += 1.0 - (1.2 * n_accept / n_drafted); + accept_threshold *= (1.3 - (std::min(n_seq_cur + i, 6) * 0.1)); + // + // accept_threshold = (tgt_avg_accepted - avg_accept_delta) * 0.3; + // accept_threshold *= std::min(0.8, std::max(0.1, double(tgt_last_norm * 1.0))); + // accept_threshold = std::max(double(min_accepted) * 1.1, accept_threshold); + // accept_threshold = std::max(std::max(avg_accepted * 0.9, avg_rejected * 1.1), accept_threshold); + // accept_threshold += 1.0 - (1.2 * n_accept / n_drafted); + // accept_threshold *= (0.7 + (std::min(n_seq_cur + i, 5) * 0.1)); + + } - restore_logits(ctx_dft, logits_dft, n_vocab, 1, drafts[s].i_batch_dft, drafts[s].i_batch_dft); - llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft); - save_logits(ctx_dft, logits_dft, n_vocab, 1, drafts[s].i_batch_dft, drafts[s].i_batch_dft); + std::vector cur_p; + { + llama_token d_id; + std::vector already_picked; + float * logits = NULL; + + t_cur = ggml_time_us(); + for (int cidx = 0; cidx < 9; cidx++) { + llama_sampling_cp(drafts[s].ctx_sampling, ctx_dft_sampling); + restore_logits(ctx_dft, logits_dft, n_vocab, 1, drafts[s].i_batch_dft); + logits = llama_get_logits(ctx_dft); + normalized_p = normalize_candidates(logits, n_vocab, normalized_candidates); + for (size_t x = 0; x < std::min(normalized_p.size, size_t(10)); x++) + doubt[s].back().push_back(normalized_p.data[x]); + for (const auto & tid : already_picked) + logits[tid] = std::numeric_limits::infinity() * -1; + d_id = llama_sampling_sample(ctx_dft_sampling, ctx_dft, NULL); + already_picked.push_back(d_id); + int32_t norm_pos = find_normalized(normalized_p, d_id); + if (norm_pos < 0) continue; + llama_token_data norm = normalized_candidates[norm_pos]; + if (norm.p < 0.2) continue; + if (ctx_dft_sampling->params.temp <= 0) { + llama_token_data_array tda = { ctx_dft_sampling->cur.data(), ctx_dft_sampling->cur.size(), false }; + llama_sample_top_k(ctx_dft, &tda, 100, 1); + llama_sample_softmax(ctx_dft, &tda); + ctx_dft_sampling->cur.resize(tda.size); + } - const auto & cur_p = drafts[s].ctx_sampling->cur; - for (int k = 0; k < std::min(n_seq_dft + 3, (int) 
cur_p.size()); ++k) { - if (cur_p[k].p < 1e-5f) continue; - LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", - k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); + llama_token_data found; + found.id = -1; + for (const llama_token_data & td : ctx_dft_sampling->cur) { + if (td.id == d_id) { + found = td; + break; + } + } + GGML_ASSERT(found.id != -1); + LOG(" ** draft candidate %3d for seq %3d, pos %3d: %6d (%4.3f, norm %4.3f) '%s'\n", + cidx, s, i, found.id, found.p, norm_pos >= 0 ? normalized_candidates[norm_pos].p : -1, + llama_token_to_piece(ctx_dft, found.id).c_str()); + if (found.p < 0.3) continue; + if (norm.p < 1e-2f) break; + cur_p.push_back(normalized_candidates[norm_pos]); + } + + if (cur_p.size() > 1) { + std::sort(cur_p.begin() + 1, cur_p.end(), + [](const llama_token_data & a, const llama_token_data & b) { + return a.p > b.p; + } + ); + } + } - double accept_threshold = avg_rejected == 0 || avg_rejected == 0 || n_drafted < 16 - ? p_accept - : std::max(double(min_accepted * 0.98), avg_accepted * 0.75f); - // accept_threshold = 0.8; - if (cur_p[0].p < accept_threshold) { - LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, accept_threshold); + t_dft_sample += ggml_time_us() - t_cur; + + if (cur_p.empty()) { + LOG("stopping drafting for seq %3d, no viable candidates (%5.3f) \n", s, accept_threshold); + drafts[s].drafting = false; + continue; + } else if (cur_p[0].p < accept_threshold && (cur_p[0].p + (cur_p.size() < 2 ? 0 : cur_p[1].p)) < accept_threshold * 1.3) { + LOG("stopping drafting for seq %3d, pos %3d - probability too low: %.3f < %.3f\n", s, i, cur_p[0].p, accept_threshold); drafts[s].drafting = false; continue; } + if (cur_p[0].p < accept_threshold) { + split_threshold = 0.0; + } else { + split_threshold = cur_p[0].p / 10.0; + // split_threshold = std::max(0.01, cur_p[0].p * (n_seq_cur + i > 1 ? 0.15 : 0.2)); + } + std::vector sa(1, s); + + + // LOG("Check splits: %zu\n", cur_p.size()); // attempt to split the branch if the probability is high enough - for (int f = 1; f < 8; ++f) { - // if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) { - // if (n_seq_cur < n_seq_dft && cur_p[f].p > cur_p[0].p / 5) { - double split_threshold = avg_accepted == 0 || avg_rejected == 0 || n_drafted < 16 - ? p_split - : ( std::max(double(min_accepted * 0.7), avg_accepted * 0.4) - * (n_seq_cur >= 2 ? 
0.75 : 1.0) ); - // split_threshold = 0.1; + for (int f = 1; f < std::min(8, int(cur_p.size()) - 1); ++f) { if (n_seq_cur < n_seq_dft && cur_p[f].p >= split_threshold) { n_split++; LOG(">>>%d<<< splitting seq %3d into %3d on %6d (%8.3f) '%s'\n", f, s, n_seq_cur, @@ -428,6 +615,7 @@ int main(int argc, char ** argv) { } // copy the draft state + drafts[n_seq_cur].split_pos = i; drafts[n_seq_cur].active = true; drafts[n_seq_cur].drafting = true; drafts[n_seq_cur].skip = true; @@ -443,6 +631,8 @@ int main(int argc, char ** argv) { n_seq_cur++; } else { + LOG("Not splitting seq %3d into %3d, choice %2d @ %6d (%8.3f) '%s'\n", s, n_seq_cur, f, + cur_p[f].id, cur_p[f].p, llama_token_to_piece(ctx_dft, cur_p[f].id).c_str()); break; } } @@ -488,7 +678,9 @@ int main(int argc, char ** argv) { LOG("=== EVAL: DRAFTED ===: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_dft).c_str()); // evaluate the drafted tokens on the draft model + t_cur = ggml_time_us(); llama_decode(ctx_dft, batch_dft); + t_dft_gen += ggml_time_us() - t_cur; save_logits(ctx_dft, logits_dft, n_vocab, batch_dft.n_tokens); ++n_past_cur; ++n_drafted; @@ -509,7 +701,9 @@ int main(int argc, char ** argv) { } LOG("=== EVAL: TARGET ===: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); + t_cur = ggml_time_us(); llama_decode(ctx_tgt, batch_tgt); + t_tgt_predict += ggml_time_us() - t_cur; save_logits(ctx_tgt, logits_tgt, n_vocab, batch_tgt.n_tokens); ++n_past_tgt; } @@ -531,7 +725,9 @@ int main(int argc, char ** argv) { LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - + LOG_TEE("times: target predict: %5.3f, draft gen/accept/sample: %5.3f / %5.3f / %5.3f\n", + t_tgt_predict / 1e6f, t_dft_gen / 1e6f, t_dft_accept / 1e6f, t_dft_sample / 1e6f); +// int64_t t_dft_sample = 0, t_dft_gen = 0, t_dft_accept = 0, t_tgt_predict = 0; LOG_TEE("\n"); LOG_TEE("n_draft = %d\n", n_draft); LOG_TEE("n_predict = %d\n", n_predict); @@ -541,7 +737,10 @@ int main(int argc, char ** argv) { LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); LOG_TEE("n_draft = %d\n", n_draft); LOG_TEE("n_split = %d\n", n_split); + LOG_TEE("n_effsplit= %d\n", n_eff_split); LOG_TEE("n_badsplit= %d\n", n_bad_split); + LOG_TEE("n_dupsplit= %d\n", n_dup_split); + LOG_TEE("max streak= %d\n", max_streak); LOG_TEE("\ndraft:\n"); llama_print_timings(ctx_dft);