diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 7d0038bd40757..c9b393caa9dca 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -2,6 +2,7 @@ #include "common.h" #include "llama.h" +#include #include #include #include @@ -320,6 +321,48 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_batch = params.n_batch; + llama_batch batch = llama_batch_get_one(NULL, 0, 0, 0); + + // model layer count + const int32_t n_layers = 32; + + // num perplexity chunks to run for each test + const int test_count = 4; + + // prune this many of the worst results each pass + const size_t prune_target = 2; + + // start with all but first/last layers disabled and start adding them back + const bool anti_mode = true; + + // **** end tunables *** + + // 1 = attn, 2 = mlp, 3 = both + int32_t test_skip_type = 0; // but don't mess with this, it's set automatically. + std::vector layers; + layers.resize(n_layers + 1); + std::fill(layers.begin(), layers.end(), 0); + batch.run_layers = layers.data(); + int32_t skip_layer = -1; + std::vector skips; + std::vector skip_types; + skip_types.resize(n_layers); + std::fill(skip_types.begin(), skip_types.end(), 0); + std::vector> pass_results; + std::vector extremes; + extremes.resize(n_layers); + std::fill(extremes.begin(), extremes.end(), 0); + // if (anti_mode) { + // // No point in starting with first/last layer disabled. + // skip_types[0] = 15; + // skip_types[n_layers - 1] = 15; + // skips.push_back(0); skips.push_back(0 + n_layers); + // skips.push_back(n_layers - 1); skips.push_back(n_layers - 1 + n_layers); + // } + int32_t curr_best_layer = -1, curr_best_type = 0; + double curr_best_ppl = -1, ref_ppl = -1; + const int32_t mask = anti_mode ? 3 : 0; + int count = 0; double nll = 0.0; double nll2 = 0.0; @@ -327,8 +370,102 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); std::vector workers(std::thread::hardware_concurrency() - 1); + static const char * label = "?AMB"; + auto test_t_start = std::chrono::high_resolution_clock::now(); for (int i = 0; i < n_chunk; ++i) { + if (i > 0 && i % test_count == 0) { + auto test_t_end = std::chrono::high_resolution_clock::now(); + float test_t_total = std::chrono::duration(test_t_end - test_t_start).count(); + + skip_layer = n_layers; + for (int32_t new_sl = 0; new_sl < n_layers; new_sl++) { + int32_t curr_skipped = (skip_types[new_sl] >> 2) | (skip_types[new_sl] & 3); + // printf("##%d, %d\n", new_sl, curr_skipped); + if (curr_skipped == 3) continue; // Already tested or perm skip. + skip_layer = new_sl; + test_skip_type = (curr_skipped & 1) != 0 ? 
2 : 1; + break; + } + if (skip_layer >= n_layers) { + if (curr_best_layer == -1) break; + if (anti_mode || (prune_target > 0 && pass_results.size() >= prune_target * 2)) { + std::sort(pass_results.begin(), pass_results.end(), + [](const std::tuple & a, const std::tuple & b) { + if (anti_mode) return std::get<2>(b) > std::get<2>(a); + return std::get<2>(a) > std::get<2>(b); + } + ); + const size_t num_prune = std::min(pass_results.size(), prune_target); + if (num_prune > 0) printf("\nPruning: "); + for (size_t temp = 0, pruned = 0; temp < pass_results.size(); temp++) { + int32_t lidx = std::get<0>(pass_results[temp]); + if (anti_mode) { + skip_types[lidx] |= std::get<1>(pass_results[temp]); + skips.push_back(std::get<1>(pass_results[temp]) == 1 ? lidx : lidx + n_layers); + } + if (lidx == curr_best_layer && std::get<1>(pass_results[temp]) == curr_best_type) continue; + extremes[lidx] |= std::get<1>(pass_results[temp]); + printf("[%zu: %d (%d) - %.2f], ", pruned + 1, lidx, + std::get<1>(pass_results[temp]), std::get<2>(pass_results[temp])); + if (++pruned >= num_prune) break; + } + } + pass_results.clear(); + printf("\n\nADD %c%3d - ppl vs ref %.4f - cur:[", + int(label[curr_best_type]), curr_best_layer, + curr_best_ppl - ref_ppl); + if (!anti_mode) { + // if (curr_best_ppl > ref_ppl * 1.75) break; + skip_types[curr_best_layer] += curr_best_type; + skips.push_back(curr_best_type == 1 ? curr_best_layer : curr_best_layer + n_layers); + } + curr_best_layer = -1; + curr_best_ppl = -1; + curr_best_type = 0; + skip_layer = n_layers; + for (int32_t new_sl = 0; new_sl < n_layers; new_sl++) { + skip_types[new_sl] = (skip_types[new_sl] & 3) | (extremes[new_sl] << 2); + } + for (int32_t i = 0; i < n_layers; i++) { + const int val = mask ^ (skip_types[i] & 3); + printf("%d%s", val, i < n_layers - 1 ? ", " : "]"); + } + for (int32_t new_sl = 0; new_sl < n_layers; new_sl++) { + int32_t curr_skipped = (skip_types[new_sl] >> 2) | (skip_types[new_sl] & 3); + // printf("||%d, %d\n", new_sl, curr_skipped); + if (curr_skipped == 3) continue; // Already tested or perm skip. + skip_layer = new_sl; + test_skip_type = (curr_skipped & 1) != 0 ? 2 : 1; + break; + } + if (skip_layer == -1 || skip_layer == n_layers) break; + } + + i = 0; + count = 0; + nll = 0; + nll2 = 0; + logit_history.clear(); + prob_history.clear(); + + int alive = 0; + for (int32_t i = 0; i < n_layers; i++) { + layers[i] = mask ^ ((skip_types[i] & 3) | (i == skip_layer ? test_skip_type : 0)); + alive += !(layers[i] & 1) + !(layers[i] & 2); + } + layers[n_layers] = -1; + printf("\nTEST %c%3d + [", int(label[test_skip_type]), skip_layer); + for (auto l : skips) { + printf("%c%d, ", int(label[skip_types[l % n_layers] & 3]), l % n_layers); + } + printf("] - live: %3d/%3d, best:(%c%3d @ %.3f), last took %.2f sec\n", + alive, n_layers * 2, + int(label[curr_best_type]), curr_best_layer, + curr_best_ppl != -1 ? 
curr_best_ppl - ref_ppl : 0, + test_t_total); + test_t_start = std::chrono::high_resolution_clock::now(); + } const int start = i * n_ctx; const int end = start + n_ctx; @@ -353,7 +490,11 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par tokens[batch_start] = llama_token_bos(ctx); } - if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { + batch.n_tokens = batch_size; + batch.token = tokens.data() + batch_start; + batch.all_pos_0 = j * n_batch; + + if (llama_decode(ctx, batch)) { fprintf(stderr, "%s : failed to eval\n", __func__); return {tokens, -1, logit_history, prob_history}; } @@ -367,7 +508,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const auto t_end = std::chrono::high_resolution_clock::now(); - if (i == 0) { + if (i == 0 && skip_layer < 0 && ref_ppl < 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); @@ -396,8 +537,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par count += n_ctx - first - 1; // perplexity is e^(average negative log-likelihood) + double ppl = std::exp(nll / count); if (params.ppl_output_type == 0) { - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + printf("[%d]%.4lf,", i + 1, ppl); } else { double av = nll/count; double av2 = nll2/count - av*av; @@ -405,6 +547,19 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); } fflush(stdout); + if (skip_layer >= 0 && (i + 1 == test_count || (i > 1 && ppl > ref_ppl * 30))) { + i = test_count - 1; + skip_types[skip_layer] |= test_skip_type << 2; + if (curr_best_layer == -1 || ppl < curr_best_ppl) { + curr_best_layer = skip_layer; + curr_best_ppl = ppl; + curr_best_type = test_skip_type; + } + printf(" -- %.3f", ppl - ref_ppl); + pass_results.push_back({skip_layer, test_skip_type, ppl}); + } else if (skip_layer < 0) { + ref_ppl = ppl; + } } printf("\n"); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 894321ce9648c..3d8dc13477b62 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -8,19 +8,78 @@ #include #include +#define DOFFS 10000 + struct seq_draft { bool active = false; bool drafting = false; bool skip = false; + int split_pos = 0; + int i_batch_dft = 0; std::vector i_batch_tgt; std::vector tokens; + std::vector tokens_p; struct llama_sampling_context * ctx_sampling; }; +static void save_logits(llama_context * ctx, std::vector & v, const int n_vocab, const int count = 1, const int soffs = 0, const int doffs = 0) { + // printf("SAVE %p: %d, %d, %d\n", (void *)ctx, count, soffs, doffs); + // printf(""); + GGML_ASSERT(doffs + count < 64); + memcpy( + v.data() + doffs * n_vocab, + llama_get_logits(ctx) + soffs * n_vocab, + sizeof(float) * size_t(n_vocab) * count); +} + +static void restore_logits(llama_context * ctx, std::vector & v, const int n_vocab, const int count = 1, const int soffs = 0, const int doffs = 0) { + // printf(""); + // printf("REST %p: %d, %d, %d\n", (void *)ctx, count, soffs, doffs); + GGML_ASSERT(soffs + count < 64); + memcpy( + llama_get_logits(ctx) + doffs * n_vocab, + v.data() + soffs * n_vocab, + sizeof(float) * size_t(n_vocab) * count); +} + +static llama_token_data_array normalize_candidates(const float * 
logits, const int n_vocab, std::vector & cur) { + cur.reserve(n_vocab); + cur.clear(); + + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array cur_p = { cur.data(), cur.size(), false }; + llama_sample_top_k(NULL, &cur_p, 100, 1); + llama_sample_softmax(NULL, &cur_p); + cur.resize(cur_p.size); + return cur_p; +} + +static int32_t find_normalized(const llama_token_data_array & tda, const llama_token id) { + llama_token_data *item = tda.data; + + for (int32_t i = 0; i < tda.size; i++, item++) + if (item->id == id) return i; + return -1; +} + +static double running_average(double & cur, double val, double n = 20) { + if (cur < 1e-5f) { + cur = val; + return cur; + } + // New average = old average * (n-1)/n + new value /n + cur = cur * (n - 1) / n + val / n; + return cur; +} + + int main(int argc, char ** argv) { gpt_params params; @@ -37,8 +96,10 @@ int main(int argc, char ** argv) { const int n_seq_dft = params.n_parallel; // TODO: make this configurable - const float p_accept = 0.80f; - const float p_split = 0.10f; + // const float p_accept = 0.80f; + // const float p_split = 0.10f; + const float p_accept = 0.75f; // 0.80f; + const float p_split = 0.6f; // p_accept / 8; // 0.10f; #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("speculative", "log")); @@ -46,6 +107,8 @@ int main(int argc, char ** argv) { log_dump_cmdline(argc, argv); #endif // LOG_DISABLE_LOGS + bool self_speculation = false; + // init llama.cpp llama_backend_init(params.numa); @@ -60,9 +123,18 @@ int main(int argc, char ** argv) { std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params); // load the draft model - params.model = params.model_draft; - params.n_gpu_layers = params.n_gpu_layers_draft; - std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); + if (params.model != params.model_draft) { + params.model = params.model_draft; + params.n_gpu_layers = params.n_gpu_layers_draft; + std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); + } else { + self_speculation = true; + model_dft = model_tgt; + ctx_dft = ctx_tgt; + } + + const int n_ctx = llama_n_ctx(ctx_tgt); + const int n_vocab = llama_n_vocab(model_tgt); // tokenize the prompt std::vector inp; @@ -84,14 +156,33 @@ int main(int argc, char ** argv) { fflush(stderr); + llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); + llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft); + std::vector logits_tgt, logits_dft; + const int n_input = inp.size(); const auto t_enc_start = ggml_time_us(); // eval the prompt with both models - llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0, 0)); - llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0)); - llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0)); + llama_batch_clear(batch_tgt); + logits_tgt.resize(n_vocab * 64); + logits_dft.resize(n_vocab * 64); + for (int i = 0; i < n_input - 1; i++) { + llama_batch_add(batch_tgt, inp[i], i, { 0 }, false); + } + llama_decode(ctx_tgt, batch_tgt); + llama_batch_clear(batch_tgt); + llama_batch_add(batch_tgt, inp.back(), n_input - 1, { 0 }, true); + llama_decode(ctx_tgt, batch_tgt); + save_logits(ctx_tgt, logits_tgt, n_vocab); + if (!self_speculation) { + llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0 + DOFFS)); + } else { + // llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0 + DOFFS)); + 
llama_kv_cache_seq_cp(ctx_dft, 0, 0 + DOFFS, 0, -1); + } + save_logits(ctx_dft, logits_dft, n_vocab, n_input); const auto t_enc_end = ggml_time_us(); @@ -104,6 +195,13 @@ int main(int argc, char ** argv) { int n_predict = 0; int n_drafted = 0; int n_accept = 0; + int n_split = 0; + int n_bad_split = 0; + int n_dup_split = 0; + int n_eff_split = 0; + int max_streak = 0; + + int64_t t_dft_sample = 0, t_dft_gen = 0, t_dft_accept = 0, t_tgt_predict = 0; int n_past_tgt = inp.size(); int n_past_dft = inp.size(); @@ -113,19 +211,36 @@ int main(int argc, char ** argv) { // target model sampling context struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams); + struct llama_sampling_context * ctx_dft_sampling = llama_sampling_init(params.sparams); + std::vector normalized_candidates; + normalized_candidates.reserve(n_vocab); + llama_token_data_array normalized_p; // draft sequence data std::vector drafts(n_seq_dft); params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar - params.sparams.temp = std::max(0.01f, params.sparams.temp); + // params.sparams.temp = std::max(0.01f, params.sparams.temp); for (int s = 0; s < n_seq_dft; ++s) { drafts[s].ctx_sampling = llama_sampling_init(params.sparams); } - llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); - llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft); + // 70B (80 layers) skips example + std::vector run_layers_dft = { + 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 3, 1, 0, 3, 3, 0, 3, 0, 1, 1, + 3, 3, 3, 0, 2, 3, 2, 3, 3, 3, 1, 3, 0, 0, 2, 1, 0, 2, 0, 0, + 0, 3, 0, 1, 0, 1, 1, 3, 3, 3, 3, 1, 1, 3, 3, 3, 1, 3, 3, 0, + 3, 1, 3, 3, 0, 1, 3, 3, 3, 1, 3, 0, 0, 0, 1, 1, 2, 0, 1, 1, -1, }; + + // 3B (26 layers) skips example + // std::vector run_layers_dft = { + // 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 2, 1, 3, 0, 2, 3, 3, 1, 0, 2, 0, 1, 1, 2, 0, 0, + // // 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 2, 1, 3, 0, 2, 3, 3, 1, 1, 2, 1, 1, 1, 2, 0, 1, + // -1, }; + + // NOTE: Comment this line out to disable skipping. 
+ batch_dft.run_layers = run_layers_dft.data(); const auto t_dec_start = ggml_time_us(); @@ -133,7 +248,16 @@ int main(int argc, char ** argv) { drafts[0].i_batch_tgt.resize(1); drafts[0].i_batch_tgt[0] = 0; + double avg_accepted = 0, avg_rejected = 0, tgt_avg_accepted = 0; + double avg_accept_delta = 0; + float min_accepted = 0, max_rejected = 0, tgt_min_accepted = 0; + + int64_t t_cur; + + std::vector>> doubt; + while (true) { + LOG("*** Draft start\n"); // print current draft sequences for (int s = 0; s < n_seq_dft; ++s) { if (!drafts[s].active) { @@ -148,11 +272,35 @@ int main(int argc, char ** argv) { int i_dft = 0; int s_keep = 0; + float tgt_last_norm = 0, tgt_last_best_norm = 0, tgt_last_orig = 0; + while (true) { LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); // sample from the target model + restore_logits(ctx_tgt, logits_tgt, n_vocab, 1, drafts[s_keep].i_batch_tgt[i_dft], drafts[s_keep].i_batch_tgt[i_dft]); + normalized_p = normalize_candidates(llama_get_logits_ith(ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]), n_vocab, normalized_candidates); llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); + save_logits(ctx_tgt, logits_tgt, n_vocab, 1, drafts[s_keep].i_batch_tgt[i_dft], drafts[s_keep].i_batch_tgt[i_dft]); + int32_t norm_pos = find_normalized(normalized_p, id); + int32_t orig_pos = find_normalized({ctx_sampling->cur.data(), ctx_sampling->cur.size(), false}, id); + if (norm_pos >= 0) { + tgt_last_norm = normalized_candidates[norm_pos].p; + tgt_last_best_norm = normalized_candidates[0].p; + running_average(tgt_avg_accepted, tgt_last_norm); + tgt_min_accepted = tgt_min_accepted < 1e-4 + ? tgt_last_norm + : std::min(tgt_min_accepted, tgt_last_norm); + } else { + tgt_last_norm = tgt_last_best_norm = tgt_avg_accepted; + } + if (orig_pos >= 0) { + tgt_last_orig = ctx_sampling->cur[orig_pos].p; + } + LOG("target sampled (%d, '%s') orig_p=%5.4f, norm_p=%5.4f\n", + id, llama_token_to_piece(ctx_tgt, id).c_str(), + orig_pos >= 0 ? ctx_sampling->cur[orig_pos].p : -1, + norm_pos >= 0 ? 
normalized_candidates[norm_pos].p : -1); llama_sampling_accept(ctx_sampling, ctx_tgt, id, true); @@ -174,16 +322,35 @@ int main(int argc, char ** argv) { bool matches = false; for (int s = 0; s < n_seq_dft; ++s) { - if (!drafts[s].active) { + if (!drafts[s].active || i_dft < drafts[s].split_pos) { continue; } if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) { - LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str()); + LOG("the sampled target token matches drafted token %d of sequence %d (%d, '%s') - accepted\n", + i_dft, s, id, token_str.c_str()); + if (i_dft == 0 && s > 0) { + if (matches) n_dup_split++; + else n_eff_split++; + } s_keep = s; matches = true; + LOG("Derp[%d]: %6d (%5.4f)\n", s, drafts[s].tokens[i_dft], drafts[s].tokens_p[i_dft]); + if (min_accepted == 0) min_accepted = drafts[s].tokens_p[i_dft]; + else min_accepted = std::min(min_accepted, drafts[s].tokens_p[i_dft]); + running_average(avg_accepted, drafts[s].tokens_p[i_dft]); + running_average(avg_accept_delta, tgt_last_norm - drafts[s].tokens_p[i_dft]); } else { + if (i_dft < (int) drafts[s].tokens.size() && id != drafts[s].tokens[i_dft]) { + if (i_dft == 0 && s > 0) n_bad_split++; + max_rejected = std::max(max_rejected, drafts[s].tokens_p[i_dft]); + running_average(avg_rejected, drafts[s].tokens_p[i_dft]); + LOG("-- Terminate sequence %d+%d: (%d, '%s') != target (%d, '%s') - rejected\n", + s, i_dft, drafts[s].tokens[i_dft], + llama_token_to_piece(ctx_dft, drafts[s].tokens[i_dft]).c_str(), + id, token_str.c_str()); + } drafts[s].active = false; } } @@ -193,8 +360,27 @@ int main(int argc, char ** argv) { ++n_past_tgt; ++n_past_dft; ++i_dft; - + max_streak = std::max(max_streak, i_dft); continue; + } else { + for (size_t seqnum = 0; seqnum < doubt.size(); seqnum++) { + const std::vector> & sdoubt = doubt[seqnum]; + if (sdoubt.size() <= i_dft) continue; + const std::vector & sidoubt = sdoubt[i_dft]; + for (size_t cidx = 0; cidx < sidoubt.size(); cidx++) { + if (sidoubt[cidx].id == id) { + LOG("Shoulda picked seq %3zu, pos %4d, candidate %2zu @ p %5.4f: %6d '%s'\n", + seqnum, i_dft, cidx, sidoubt[cidx].p, + id, token_str.c_str()); + running_average(avg_accepted, sidoubt[cidx].p); + if (cidx < 2) { + running_average(avg_accept_delta, tgt_last_norm - sidoubt[cidx].p); + min_accepted = min_accepted < 1e-5f ? 
sidoubt[cidx].p : std::min(min_accepted, sidoubt[cidx].p); + } + break; + } + } + } } } @@ -204,6 +390,18 @@ int main(int argc, char ** argv) { { LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); + llama_kv_cache_seq_rm(ctx_dft, s_keep + DOFFS, n_past_dft, -1); + llama_kv_cache_seq_rm(ctx_tgt, s_keep, n_past_tgt, -1); + if (s_keep != 0) { + llama_kv_cache_seq_cp(ctx_dft, s_keep + DOFFS, 0 + DOFFS, -1, -1); + llama_kv_cache_seq_cp(ctx_tgt, s_keep, 0, -1, -1); + } + for (int s = 1; s < n_seq_dft; ++s) { + llama_kv_cache_seq_rm(ctx_dft, s + DOFFS, -1, -1); + llama_kv_cache_seq_rm(ctx_tgt, s, -1, -1); + } + + /* llama_kv_cache_seq_keep(ctx_dft, s_keep); llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); llama_kv_cache_seq_keep(ctx_dft, 0); @@ -212,22 +410,37 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_keep(ctx_tgt, s_keep); llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1); llama_kv_cache_seq_keep(ctx_tgt, 0); + */ + } for (int s = 0; s < n_seq_dft; ++s) { + drafts[s].split_pos = 0; drafts[s].active = false; drafts[s].tokens.clear(); + drafts[s].tokens_p.clear(); drafts[s].i_batch_tgt.clear(); } // note: will be erased after the speculation phase drafts[0].tokens.push_back(id); + drafts[0].tokens_p.push_back(0); drafts[0].i_batch_tgt.push_back(0); llama_batch_clear(batch_dft); - llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true); + llama_batch_add (batch_dft, id, n_past_dft, { 0 + DOFFS }, true); + if (self_speculation) { + // Copy KV items from non-brain-damaged model... Doesn't seem to help. + llama_kv_cache_seq_rm(ctx_dft, 0 + DOFFS, 0, n_past_dft - 2); + llama_kv_cache_seq_cp(ctx_dft, 0, 0 + DOFFS, 0, n_past_dft - 2); + // llama_kv_cache_seq_rm(ctx_dft, 0 + DOFFS, n_past_dft - 1, -1); + // llama_kv_cache_seq_cp(ctx_dft, 0, 0 + DOFFS, n_past_dft - 1, -1); + } - llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); + LOG("=== EVAL: DRAFT ACCEPTED ===: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_dft).c_str()); + t_cur = ggml_time_us(); llama_decode (ctx_dft, batch_dft); + t_dft_accept += ggml_time_us() - t_cur; + save_logits(ctx_dft, logits_dft, n_vocab, batch_dft.n_tokens); ++n_past_dft; @@ -254,6 +467,15 @@ int main(int argc, char ** argv) { llama_batch_clear(batch_tgt); llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true); + avg_rejected = std::max(0.05, std::min(avg_accepted - 0.05, avg_rejected)); + avg_accepted = std::max(0.05, std::max(avg_rejected + 0.05, avg_accepted)); + // double avg_accepted = n_accept > 0 ? 
avg_accepted / double(n_accept) : 0; + LOG("STATS: Avg tacc/dacc/drej: %3.5f / %3.5f / %3.5f | Min dacc/min tacc/max drej: %3.5f / %3.5f / %3.5f | delta %3.5f | max streak %d | n_dft/pred/acc: %d / %d / %d\n", + tgt_avg_accepted, avg_accepted, avg_rejected, min_accepted, tgt_min_accepted, max_rejected, avg_accept_delta, max_streak, + n_drafted, n_predict, n_accept); + doubt.clear(); + doubt.resize(n_seq_dft); + // sample n_draft tokens from the draft model using tree-based sampling for (int i = 0; i < n_draft; ++i) { batch_dft.n_tokens = 0; @@ -263,34 +485,123 @@ int main(int argc, char ** argv) { } for (int s = 0; s < n_seq_dft; ++s) { + double accept_threshold, split_threshold; + if (!drafts[s].drafting || drafts[s].skip) { continue; } + doubt[s].push_back({}); + + if (avg_rejected == 0 || avg_rejected == 0 || n_drafted + n_predict < 6) { + accept_threshold = std::max(0.6f, tgt_last_norm); + } else { + + accept_threshold = (tgt_avg_accepted - avg_accept_delta) * 0.3; + accept_threshold *= std::min(0.8, std::max(0.1, double(tgt_last_norm * 1.0))); + accept_threshold = std::max(double(min_accepted) * 1.1, accept_threshold); + accept_threshold = std::max(std::max(avg_accepted * 0.9, avg_rejected * 1.1), accept_threshold); + accept_threshold += 1.0 - (1.2 * n_accept / n_drafted); + accept_threshold *= (1.3 - (std::min(n_seq_cur + i, 6) * 0.1)); + // + // accept_threshold = (tgt_avg_accepted - avg_accept_delta) * 0.3; + // accept_threshold *= std::min(0.8, std::max(0.1, double(tgt_last_norm * 1.0))); + // accept_threshold = std::max(double(min_accepted) * 1.1, accept_threshold); + // accept_threshold = std::max(std::max(avg_accepted * 0.9, avg_rejected * 1.1), accept_threshold); + // accept_threshold += 1.0 - (1.2 * n_accept / n_drafted); + // accept_threshold *= (0.7 + (std::min(n_seq_cur + i, 5) * 0.1)); - llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft); + } + + std::vector cur_p; + { + llama_token d_id; + std::vector already_picked; + float * logits = NULL; + + t_cur = ggml_time_us(); + for (int cidx = 0; cidx < 9; cidx++) { + llama_sampling_cp(drafts[s].ctx_sampling, ctx_dft_sampling); + restore_logits(ctx_dft, logits_dft, n_vocab, 1, drafts[s].i_batch_dft); + logits = llama_get_logits(ctx_dft); + normalized_p = normalize_candidates(logits, n_vocab, normalized_candidates); + for (size_t x = 0; x < std::min(normalized_p.size, size_t(10)); x++) + doubt[s].back().push_back(normalized_p.data[x]); + for (const auto & tid : already_picked) + logits[tid] = std::numeric_limits::infinity() * -1; + d_id = llama_sampling_sample(ctx_dft_sampling, ctx_dft, NULL); + already_picked.push_back(d_id); + int32_t norm_pos = find_normalized(normalized_p, d_id); + if (norm_pos < 0) continue; + llama_token_data norm = normalized_candidates[norm_pos]; + if (norm.p < 0.2) continue; + if (ctx_dft_sampling->params.temp <= 0) { + llama_token_data_array tda = { ctx_dft_sampling->cur.data(), ctx_dft_sampling->cur.size(), false }; + llama_sample_top_k(ctx_dft, &tda, 100, 1); + llama_sample_softmax(ctx_dft, &tda); + ctx_dft_sampling->cur.resize(tda.size); + } - const auto & cur_p = drafts[s].ctx_sampling->cur; - for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) { - LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", - k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); + llama_token_data found; + found.id = -1; + for (const llama_token_data & td : ctx_dft_sampling->cur) { + if (td.id == d_id) { + found = td; + 
break; + } + } + GGML_ASSERT(found.id != -1); + LOG(" ** draft candidate %3d for seq %3d, pos %3d: %6d (%4.3f, norm %4.3f) '%s'\n", + cidx, s, i, found.id, found.p, norm_pos >= 0 ? normalized_candidates[norm_pos].p : -1, + llama_token_to_piece(ctx_dft, found.id).c_str()); + if (found.p < 0.3) continue; + if (norm.p < 1e-2f) break; + cur_p.push_back(normalized_candidates[norm_pos]); + } + + if (cur_p.size() > 1) { + std::sort(cur_p.begin() + 1, cur_p.end(), + [](const llama_token_data & a, const llama_token_data & b) { + return a.p > b.p; + } + ); + } + } - if (cur_p[0].p < p_accept) { - LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept); + t_dft_sample += ggml_time_us() - t_cur; + + if (cur_p.empty()) { + LOG("stopping drafting for seq %3d, no viable candidates (%5.3f) \n", s, accept_threshold); + drafts[s].drafting = false; + continue; + } else if (cur_p[0].p < accept_threshold && (cur_p[0].p + (cur_p.size() < 2 ? 0 : cur_p[1].p)) < accept_threshold * 1.3) { + LOG("stopping drafting for seq %3d, pos %3d - probability too low: %.3f < %.3f\n", s, i, cur_p[0].p, accept_threshold); drafts[s].drafting = false; continue; } + if (cur_p[0].p < accept_threshold) { + split_threshold = 0.0; + } else { + split_threshold = cur_p[0].p / 10.0; + // split_threshold = std::max(0.01, cur_p[0].p * (n_seq_cur + i > 1 ? 0.15 : 0.2)); + } + std::vector sa(1, s); + + + // LOG("Check splits: %zu\n", cur_p.size()); // attempt to split the branch if the probability is high enough - for (int f = 1; f < 8; ++f) { - if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) { - LOG("splitting seq %3d into %3d\n", s, n_seq_cur); + for (int f = 1; f < std::min(8, int(cur_p.size()) - 1); ++f) { + if (n_seq_cur < n_seq_dft && cur_p[f].p >= split_threshold) { + n_split++; + LOG(">>>%d<<< splitting seq %3d into %3d on %6d (%8.3f) '%s'\n", f, s, n_seq_cur, + cur_p[f].id, cur_p[f].p, llama_token_to_piece(ctx_dft, cur_p[f].id).c_str()); - llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1); - llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); + llama_kv_cache_seq_rm(ctx_dft, n_seq_cur + DOFFS, -1, -1); + llama_kv_cache_seq_cp(ctx_dft, s + DOFFS, n_seq_cur + DOFFS, -1, -1); // all previous tokens from this branch are now also part of the new branch for (int t = 0; t < batch_tgt.n_tokens; ++t) { @@ -304,11 +615,13 @@ int main(int argc, char ** argv) { } // copy the draft state + drafts[n_seq_cur].split_pos = i; drafts[n_seq_cur].active = true; drafts[n_seq_cur].drafting = true; drafts[n_seq_cur].skip = true; drafts[n_seq_cur].tokens = drafts[s].tokens; + drafts[n_seq_cur].tokens_p = drafts[s].tokens_p; drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft; drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; @@ -318,6 +631,8 @@ int main(int argc, char ** argv) { n_seq_cur++; } else { + LOG("Not splitting seq %3d into %3d, choice %2d @ %6d (%8.3f) '%s'\n", s, n_seq_cur, f, + cur_p[f].id, cur_p[f].p, llama_token_to_piece(ctx_dft, cur_p[f].id).c_str()); break; } } @@ -331,6 +646,7 @@ int main(int argc, char ** argv) { llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true); drafts[s].tokens.push_back(id); + drafts[s].tokens_p.push_back(cur_p[is].p); // add unique drafted tokens to the target batch drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); @@ -340,7 +656,7 @@ int main(int argc, char ** argv) { // add the token to the batch for batched decoding with the draft model drafts[s].i_batch_dft = batch_dft.n_tokens; - llama_batch_add(batch_dft, id, n_past_cur, { s }, true); + 
llama_batch_add(batch_dft, id, n_past_cur, { s + DOFFS }, true); if (batch_tgt.n_tokens > n_draft) { drafts[s].drafting = false; @@ -352,9 +668,20 @@ int main(int argc, char ** argv) { if (batch_dft.n_tokens == 0) { break; } + // LOG("Draft eval: %d\n", batch_dft.n_tokens); + // for (int x = 0; x < batch_dft.n_tokens; x++) { + // LOG("* %03d: seq %3d, pos %4d, token %6d '%s'", x, + // batch_dft.seq_id[x][0], batch_dft.pos[x], + // batch_dft.token[x], llama_token_to_piece(ctx_dft, batch_dft.token[x]).c_str()); + // } + + LOG("=== EVAL: DRAFTED ===: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_dft).c_str()); // evaluate the drafted tokens on the draft model + t_cur = ggml_time_us(); llama_decode(ctx_dft, batch_dft); + t_dft_gen += ggml_time_us() - t_cur; + save_logits(ctx_dft, logits_dft, n_vocab, batch_dft.n_tokens); ++n_past_cur; ++n_drafted; @@ -365,13 +692,19 @@ int main(int argc, char ** argv) { // evaluate the target model on the drafted tokens { - llama_kv_cache_seq_keep(ctx_tgt, 0); + // llama_kv_cache_seq_keep(ctx_tgt, 0); + for (int s = 1; s < n_seq_dft; ++s) { + llama_kv_cache_seq_rm(ctx_tgt, s, -1, -1); + } for (int s = 1; s < n_seq_dft; ++s) { llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); } - //LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt)); + LOG("=== EVAL: TARGET ===: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); + t_cur = ggml_time_us(); llama_decode(ctx_tgt, batch_tgt); + t_tgt_predict += ggml_time_us() - t_cur; + save_logits(ctx_tgt, logits_tgt, n_vocab, batch_tgt.n_tokens); ++n_past_tgt; } @@ -382,6 +715,7 @@ int main(int argc, char ** argv) { } drafts[s].tokens.erase(drafts[s].tokens.begin()); + drafts[s].tokens_p.erase(drafts[s].tokens_p.begin()); } } @@ -391,13 +725,22 @@ int main(int argc, char ** argv) { LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - + LOG_TEE("times: target predict: %5.3f, draft gen/accept/sample: %5.3f / %5.3f / %5.3f\n", + t_tgt_predict / 1e6f, t_dft_gen / 1e6f, t_dft_accept / 1e6f, t_dft_sample / 1e6f); +// int64_t t_dft_sample = 0, t_dft_gen = 0, t_dft_accept = 0, t_tgt_predict = 0; LOG_TEE("\n"); LOG_TEE("n_draft = %d\n", n_draft); LOG_TEE("n_predict = %d\n", n_predict); + LOG_TEE("drafted = %.3f%%\n", 100.0f * n_drafted / n_predict); LOG_TEE("n_drafted = %d\n", n_drafted); LOG_TEE("n_accept = %d\n", n_accept); LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + LOG_TEE("n_draft = %d\n", n_draft); + LOG_TEE("n_split = %d\n", n_split); + LOG_TEE("n_effsplit= %d\n", n_eff_split); + LOG_TEE("n_badsplit= %d\n", n_bad_split); + LOG_TEE("n_dupsplit= %d\n", n_dup_split); + LOG_TEE("max streak= %d\n", max_streak); LOG_TEE("\ndraft:\n"); llama_print_timings(ctx_dft); @@ -415,8 +758,10 @@ int main(int argc, char ** argv) { llama_free(ctx_tgt); llama_free_model(model_tgt); - llama_free(ctx_dft); - llama_free_model(model_dft); + if (!self_speculation) { + llama_free(ctx_dft); + llama_free_model(model_dft); + } llama_backend_free(); diff --git a/llama.cpp b/llama.cpp index c63e6251c7676..a4bd3932e9f91 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3252,7 +3252,28 @@ static struct ggml_cgraph * llm_build_llama( } } + int32_t * run_layer = batch.run_layers; + bool run_attn = false, run_mlp = false; + cur = inpL; + for (int il = 0; il < n_layer; 
++il) { + run_attn = run_mlp = true; + if (run_layer != NULL) { + if (*run_layer >= 0) { + run_attn = (*run_layer & 1) == 0; + run_mlp = (*run_layer & 2) == 0; + run_layer++; + } else { + run_layer = NULL; + } + } else if (ggml_allocr_is_measure(lctx.alloc)) { + if (il == 0 || il == n_layer - 1) run_mlp = false; + else if (il == 1 || il == n_layer - 2) run_attn = false; + else if (il & 1) run_mlp = false; + else run_attn = false; + } + if (!run_attn && !run_mlp) continue; + ggml_format_name(inpL, "layer_inp_%d", il); offload_func_t offload_func = llama_nop; @@ -3263,10 +3284,11 @@ static struct ggml_cgraph * llm_build_llama( } #endif // GGML_USE_CUBLAS - struct ggml_tensor * inpSA = inpL; + struct ggml_tensor * inpFF = nullptr; - // norm - { + // self-attention + if (run_attn) { + // norm cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); offload_func(cur); ggml_set_name(cur, "rms_norm_0"); @@ -3275,10 +3297,7 @@ static struct ggml_cgraph * llm_build_llama( cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); offload_func(cur); ggml_set_name(cur, "attention_norm_0"); - } - // self-attention - { // compute Q and K and RoPE them struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); offload_func_kq(tmpk); @@ -3395,25 +3414,25 @@ static struct ggml_cgraph * llm_build_llama( cur); offload_func(cur); ggml_set_name(cur, "result_wo"); - } - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); - ggml_set_name(inpFF, "inpFF"); + inpFF = ggml_add(ctx0, cur, inpL); + offload_func(inpFF); + ggml_set_name(inpFF, "inpFF"); + } else { + inpFF = inpL; + } // feed-forward network - { + if (run_mlp) { // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_1"); + cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); + offload_func(cur); + ggml_set_name(cur, "rms_norm_1"); - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - ggml_set_name(cur, "ffn_norm"); - } + // cur = cur*ffn_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); + offload_func(cur); + ggml_set_name(cur, "ffn_norm"); struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, @@ -3441,18 +3460,18 @@ static struct ggml_cgraph * llm_build_llama( cur); offload_func(cur); ggml_set_name(cur, "result_w2"); - } - cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); - ggml_set_name(cur, "inpFF_+_result_w2"); + cur = ggml_add(ctx0, cur, inpFF); + offload_func(cur); + ggml_set_name(cur, "inpFF_+_result_w2"); + } else { + cur = inpFF; + } // input for next layer inpL = cur; } - cur = inpL; - // norm { cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); @@ -9582,7 +9601,7 @@ int llama_eval_embd( int n_past) { llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1); - llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, }; + llama_batch batch = { n_tokens, nullptr, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, }; const int ret = llama_decode_internal(*ctx, batch); if (ret < 0) { @@ -9604,6 +9623,7 @@ struct llama_batch llama_batch_get_one( llama_seq_id seq_id) { return { /*n_tokens =*/ n_tokens, + /*run_layers =*/ nullptr, /*tokens =*/ tokens, /*embd =*/ nullptr, /*pos =*/ nullptr, @@ -9617,7 +9637,7 @@ struct llama_batch llama_batch_get_one( } struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) { - llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 
0, }; + llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, }; if (embd) { batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd); diff --git a/llama.h b/llama.h index 306f5b383cb11..71d0b3e498e2b 100644 --- a/llama.h +++ b/llama.h @@ -132,6 +132,7 @@ extern "C" { // typedef struct llama_batch { int32_t n_tokens; + int32_t *run_layers; // end marked by negative value. llama_token * token; float * embd;
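---

Reviewer note (not part of the patch): the sketch below shows how a caller might drive the new `llama_batch::run_layers` field, based on the semantics in `llm_build_llama` above — each entry is a per-layer bitmask (bit 0 set = skip that layer's attention block, bit 1 set = skip its MLP block), the array needs one slot per layer plus a negative terminator, and a null pointer means run every layer. The helper name `make_skip_list` is illustrative, not from the patch.

```cpp
// Sketch only: assumes llama.cpp built with this patch (llama_batch gains run_layers).
#include "llama.h"

#include <vector>

// Build a run_layers list: 0 = run both blocks, 1 = skip attention, 2 = skip MLP, 3 = skip both.
// A negative value terminates the list; layers past it run normally.
static std::vector<int32_t> make_skip_list(int n_layers, const std::vector<int> & skip_mlp) {
    std::vector<int32_t> run_layers(n_layers + 1, 0);
    for (int il : skip_mlp) {
        if (il >= 0 && il < n_layers) {
            run_layers[il] |= 2;   // drop the MLP block of layer il
        }
    }
    run_layers[n_layers] = -1;     // terminator
    return run_layers;
}

// Usage (hypothetical): attach the list to a batch before decoding and keep the vector
// alive for as long as the batch is used.
//
//   std::vector<int32_t> skips = make_skip_list(n_layers, {10, 11, 12});
//   batch.run_layers = skips.data();
//   llama_decode(ctx, batch);
```

This mirrors how perplexity.cpp in this patch searches for skippable layers and how speculative.cpp feeds `run_layers_dft` to the draft batch.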
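A second sketch (also not part of the patch) of the self-speculation path: when the draft model is the same as the target model, the patch reuses the target's KV cache by copying sequence 0 into sequence `0 + DOFFS` instead of re-evaluating the prompt, and all draft-side decodes then use seq ids offset by `DOFFS` together with the skip list. The llama.cpp/common helpers called below are the ones used in the patch; the wrapper function itself is illustrative.

```cpp
// Sketch only: prompt handling for self-speculation, assuming this patch.
#include "common.h"
#include "llama.h"

#include <vector>

#define DOFFS 10000 // draft sequences live at seq_id + DOFFS in the shared context

static void eval_prompt_self_spec(llama_context * ctx, const std::vector<llama_token> & inp) {
    llama_batch batch = llama_batch_init((int32_t) inp.size(), 0, 1);

    // Evaluate the prompt once as the target model on seq 0 (logits only for the last token).
    for (size_t i = 0; i < inp.size(); ++i) {
        llama_batch_add(batch, inp[i], (llama_pos) i, { 0 }, i + 1 == inp.size());
    }
    llama_decode(ctx, batch);

    // Instead of a second prompt pass for the draft, copy the prompt's KV entries
    // from seq 0 into the draft sequence 0 + DOFFS.
    llama_kv_cache_seq_cp(ctx, 0, 0 + DOFFS, 0, -1);

    // Draft decodes then go through the same context, e.g.:
    //   llama_batch_clear(batch_dft);
    //   llama_batch_add(batch_dft, id, n_past_dft, { 0 + DOFFS }, true);
    //   batch_dft.run_layers = run_layers_dft.data(); // layer skipping makes the "draft" cheap
    //   llama_decode(ctx, batch_dft);

    llama_batch_free(batch);
}
```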