From d6f35c7ca52939a05d1eed5ddee83a0fc7c17639 Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Mon, 9 Oct 2023 18:54:16 -0600 Subject: [PATCH 1/6] Layer skipping demo --- examples/perplexity/perplexity.cpp | 130 ++++++++++++++++++++++++++++- llama.cpp | 79 +++++++++++------- llama.h | 1 + 3 files changed, 179 insertions(+), 31 deletions(-) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 7d0038bd40757..7559c02873a07 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -2,6 +2,7 @@ #include "common.h" #include "llama.h" +#include #include #include #include @@ -320,6 +321,31 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_batch = params.n_batch; + llama_batch batch = llama_batch_get_one(NULL, 0, 0, 0); + + const int32_t n_layers = 32; // model layer count + const int test_count = 6; // num perplexity chunks to run for each test + const size_t prune_target = 4; // prune this many of the worst results each pass + // end tunables + + // 1 = attn, 2 = mlp, 3 = both + int32_t test_skip_type = 0; // but don't mess with this, it's set automatically. + std::vector layers; + layers.resize(n_layers + 1); + std::fill(layers.begin(), layers.end(), 0); + batch.run_layers = layers.data(); + int32_t skip_layer = -1; + std::vector skips; + std::vector skip_types; + skip_types.resize(n_layers); + std::fill(skip_types.begin(), skip_types.end(), 0); + std::vector> pass_results; + std::vector worsts; + worsts.resize(n_layers); + std::fill(worsts.begin(), worsts.end(), 0); + int32_t curr_best_layer = -1, curr_best_type = 0; + double curr_best_ppl = -1, ref_ppl = -1; + int count = 0; double nll = 0.0; double nll2 = 0.0; @@ -327,8 +353,88 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); std::vector workers(std::thread::hardware_concurrency() - 1); + static const char * label = "?AMB"; + auto test_t_start = std::chrono::high_resolution_clock::now(); for (int i = 0; i < n_chunk; ++i) { + if (i > 0 && i % test_count == 0) { + auto test_t_end = std::chrono::high_resolution_clock::now(); + float test_t_total = std::chrono::duration(test_t_end - test_t_start).count(); + + skip_layer = n_layers; + for (int32_t new_sl = 0; new_sl < n_layers; new_sl++) { + int32_t curr_skipped = (skip_types[new_sl] >> 2) | (skip_types[new_sl] & 3); + // printf("##%d, %d\n", new_sl, curr_skipped); + if (curr_skipped == 3) continue; // Already tested or perm skip. + skip_layer = new_sl; + test_skip_type = (curr_skipped & 1) != 0 ? 
2 : 1; + break; + } + if (skip_layer >= n_layers) { + if (curr_best_layer == -1) break; + if (pass_results.size() >= prune_target * 2) { + std::sort(pass_results.begin(), pass_results.end(), + [](const std::tuple & a, const std::tuple & b) { + return std::get<2>(a) > std::get<2>(b); + } + ); + const size_t num_prune = std::min(pass_results.size(), prune_target); + for (size_t temp = 0; temp < num_prune; temp++) { + int32_t lidx = std::get<0>(pass_results[temp]); + if (lidx == curr_best_layer && std::get<1>(pass_results[temp]) == curr_best_type) continue; + worsts[lidx] |= std::get<1>(pass_results[temp]); + printf("\nPrune[%zu]: %d (%d) - %.2f\n", temp, lidx, std::get<1>(pass_results[temp]), std::get<2>(pass_results[temp])); + } + } + pass_results.clear(); + printf("\n\nADD SKIP %c%3d - ppl vs ref %.4f", + int(label[curr_best_type]), curr_best_layer, + curr_best_ppl - ref_ppl); + if (curr_best_ppl > ref_ppl * 1.75) break; + skip_types[curr_best_layer] += curr_best_type; + if (std::find(skips.begin(), skips.end(), curr_best_layer) == skips.end()) { + skips.push_back(curr_best_layer); + } + curr_best_layer = -1; + curr_best_ppl = -1; + curr_best_type = 0; + skip_layer = n_layers; + for (int32_t new_sl = 0; new_sl < n_layers; new_sl++) { + skip_types[new_sl] = (skip_types[new_sl] & 3) | (worsts[new_sl] << 2); + } + for (int32_t new_sl = 0; new_sl < n_layers; new_sl++) { + int32_t curr_skipped = (skip_types[new_sl] >> 2) | (skip_types[new_sl] & 3); + // printf("||%d, %d\n", new_sl, curr_skipped); + if (curr_skipped == 3) continue; // Already tested or perm skip. + skip_layer = new_sl; + test_skip_type = (curr_skipped & 1) != 0 ? 2 : 1; + break; + } + if (skip_layer == -1 || skip_layer == n_layers) break; + } + + i = 0; + count = 0; + nll = 0; + nll2 = 0; + logit_history.clear(); + prob_history.clear(); + + for (int32_t i = 0; i < n_layers; i++) { + layers[i] = (skip_types[i] & 3) | (i == skip_layer ? test_skip_type : 0); + } + layers[n_layers] = -1; + printf("\nTEST %c%3d + [", int(label[test_skip_type]), skip_layer); + for (const auto l : skips) { + printf("%c%d, ", int(label[skip_types[l] & 3]), l); + } + printf("] - len: %3zu, best:(%c%3d @ %.3f), last took %.2f sec\n", + skips.size() + 1, + int(label[curr_best_type]), curr_best_layer, + curr_best_ppl != -1 ? 
curr_best_ppl - ref_ppl : 0, + test_t_total); + test_t_start = std::chrono::high_resolution_clock::now(); + } const int start = i * n_ctx; const int end = start + n_ctx; @@ -353,7 +459,11 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par tokens[batch_start] = llama_token_bos(ctx); } - if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { + batch.n_tokens = batch_size; + batch.token = tokens.data() + batch_start; + batch.all_pos_0 = j * n_batch; + + if (llama_decode(ctx, batch)) { fprintf(stderr, "%s : failed to eval\n", __func__); return {tokens, -1, logit_history, prob_history}; } @@ -367,7 +477,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const auto t_end = std::chrono::high_resolution_clock::now(); - if (i == 0) { + if (i == 0 && skip_layer < 0 && skips.empty()) { const float t_total = std::chrono::duration(t_end - t_start).count(); fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); @@ -396,8 +506,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par count += n_ctx - first - 1; // perplexity is e^(average negative log-likelihood) + double ppl = std::exp(nll / count); if (params.ppl_output_type == 0) { - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + printf("[%d]%.4lf,", i + 1, ppl); } else { double av = nll/count; double av2 = nll2/count - av*av; @@ -405,6 +516,19 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); } fflush(stdout); + if (skip_layer >= 0 && (i + 1 == test_count || (i > 1 && ppl > ref_ppl * 3))) { + i = test_count - 1; + skip_types[skip_layer] |= test_skip_type << 2; + if (curr_best_layer == -1 || ppl < curr_best_ppl) { + curr_best_layer = skip_layer; + curr_best_ppl = ppl; + curr_best_type = test_skip_type; + } + printf(" -- %.3f", ppl - ref_ppl); + pass_results.push_back({skip_layer, test_skip_type, ppl}); + } else if (skip_layer < 0) { + ref_ppl = ppl; + } } printf("\n"); diff --git a/llama.cpp b/llama.cpp index c63e6251c7676..ff47e632024dd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3252,7 +3252,31 @@ static struct ggml_cgraph * llm_build_llama( } } + int32_t * run_layer = batch.run_layers; + bool run_attn = false, run_mlp = false; + cur = inpL; + for (int il = 0; il < n_layer; ++il) { + run_attn = run_mlp = true; + if (run_layer != NULL) { + if (*run_layer >= 0) { + run_attn = (*run_layer & 1) == 0; + run_mlp = (*run_layer & 2) == 0; + run_layer++; + } else { + run_layer = NULL; + } + } else if (ggml_allocr_is_measure(lctx.alloc) && il == n_layer - 1) { + // No idea why this is needed, but otherwise we run out of space + // when skipping attn or mlp (but not both) on the last layer + run_mlp = false; + } else if (ggml_allocr_is_measure(lctx.alloc) && il == n_layer - 2) { + // No idea why this is needed, but otherwise we run out of space + // when skipping attn or mlp (but not both) on the last layer + run_attn = false; + } + if (!run_attn && !run_mlp) continue; + ggml_format_name(inpL, "layer_inp_%d", il); offload_func_t offload_func = llama_nop; @@ -3263,10 +3287,11 @@ static struct ggml_cgraph * llm_build_llama( } #endif // GGML_USE_CUBLAS - struct ggml_tensor * inpSA = inpL; + struct ggml_tensor * inpFF = nullptr; - // norm - { + // self-attention + if (run_attn) { + // norm cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); 
offload_func(cur); ggml_set_name(cur, "rms_norm_0"); @@ -3275,10 +3300,7 @@ static struct ggml_cgraph * llm_build_llama( cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); offload_func(cur); ggml_set_name(cur, "attention_norm_0"); - } - // self-attention - { // compute Q and K and RoPE them struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); offload_func_kq(tmpk); @@ -3395,25 +3417,25 @@ static struct ggml_cgraph * llm_build_llama( cur); offload_func(cur); ggml_set_name(cur, "result_wo"); - } - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); - ggml_set_name(inpFF, "inpFF"); + inpFF = ggml_add(ctx0, cur, inpL); + offload_func(inpFF); + ggml_set_name(inpFF, "inpFF"); + } else { + inpFF = inpL; + } // feed-forward network - { + if (run_mlp) { // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_1"); + cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); + offload_func(cur); + ggml_set_name(cur, "rms_norm_1"); - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - ggml_set_name(cur, "ffn_norm"); - } + // cur = cur*ffn_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); + offload_func(cur); + ggml_set_name(cur, "ffn_norm"); struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, @@ -3441,18 +3463,18 @@ static struct ggml_cgraph * llm_build_llama( cur); offload_func(cur); ggml_set_name(cur, "result_w2"); - } - cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); - ggml_set_name(cur, "inpFF_+_result_w2"); + cur = ggml_add(ctx0, cur, inpFF); + offload_func(cur); + ggml_set_name(cur, "inpFF_+_result_w2"); + } else { + cur = inpFF; + } // input for next layer inpL = cur; } - cur = inpL; - // norm { cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); @@ -9582,7 +9604,7 @@ int llama_eval_embd( int n_past) { llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1); - llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, }; + llama_batch batch = { n_tokens, nullptr, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, }; const int ret = llama_decode_internal(*ctx, batch); if (ret < 0) { @@ -9604,6 +9626,7 @@ struct llama_batch llama_batch_get_one( llama_seq_id seq_id) { return { /*n_tokens =*/ n_tokens, + /*run_layers =*/ nullptr, /*tokens =*/ tokens, /*embd =*/ nullptr, /*pos =*/ nullptr, @@ -9617,7 +9640,7 @@ struct llama_batch llama_batch_get_one( } struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) { - llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, }; + llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, }; if (embd) { batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd); diff --git a/llama.h b/llama.h index 306f5b383cb11..71d0b3e498e2b 100644 --- a/llama.h +++ b/llama.h @@ -132,6 +132,7 @@ extern "C" { // typedef struct llama_batch { int32_t n_tokens; + int32_t *run_layers; // end marked by negative value. llama_token * token; float * embd; From 0abf0064ca3eb08757ee890f04804dcb8df7d78e Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Thu, 19 Oct 2023 18:00:15 -0600 Subject: [PATCH 2/6] What if we do something crazy like add layers instead of removing them? 
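For readers following along: this patch and the previous one drive the same per-layer skip list, so here is a minimal sketch of how llama_batch.run_layers is interpreted. It is illustrative only; the helper name and the 4-layer example are made up, while the bit meanings and the negative terminator come from the run_layers handling added to llm_build_llama() and the comment in llama.h in Patch 1.

#include <cstdint>
#include <vector>
#include "llama.h"

// Hypothetical helper: build a skip list for a 4-layer model.
// Each entry is a per-layer bitmask consumed in order by llm_build_llama():
//   bit 0 (value 1) set -> skip that layer's self-attention block
//   bit 1 (value 2) set -> skip that layer's feed-forward (MLP) block
//   0 -> run the layer normally, 3 -> skip the layer entirely
// Any negative entry ends the list; layers after it run normally.
static std::vector<int32_t> example_run_layers() {
    return {
        0,  // layer 0: run attention + MLP
        1,  // layer 1: skip attention only
        2,  // layer 2: skip MLP only
        3,  // layer 3: skip the whole layer
        -1, // terminator
    };
}

// Usage sketch (assumed, not part of the patch): keep the vector alive for
// the duration of the decode call.
//   std::vector<int32_t> skips = example_run_layers();
//   batch.run_layers = skips.data();
//   llama_decode(ctx, batch);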
--- examples/perplexity/perplexity.cpp | 72 ++++++---- examples/speculative/speculative.cpp | 196 +++++++++++++++++++++++---- 2 files changed, 219 insertions(+), 49 deletions(-) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 7559c02873a07..fb3b018f21864 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -323,10 +323,19 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par llama_batch batch = llama_batch_get_one(NULL, 0, 0, 0); - const int32_t n_layers = 32; // model layer count - const int test_count = 6; // num perplexity chunks to run for each test - const size_t prune_target = 4; // prune this many of the worst results each pass - // end tunables + // model layer count + const int32_t n_layers = 32; + + // num perplexity chunks to run for each test + const int test_count = 4; + + // prune this many of the worst results each pass + const size_t prune_target = 2; + + // start with all but first/last layers disabled and start adding them back + const bool anti_mode = true; + + // **** end tunables *** // 1 = attn, 2 = mlp, 3 = both int32_t test_skip_type = 0; // but don't mess with this, it's set automatically. @@ -340,11 +349,19 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par skip_types.resize(n_layers); std::fill(skip_types.begin(), skip_types.end(), 0); std::vector> pass_results; - std::vector worsts; - worsts.resize(n_layers); - std::fill(worsts.begin(), worsts.end(), 0); + std::vector extremes; + extremes.resize(n_layers); + std::fill(extremes.begin(), extremes.end(), 0); + if (anti_mode) { + // No pointing in starting with first/last layer disabled. + skip_types[0] = 15; + skip_types[n_layers - 1] = 15; + skips.push_back(0); skips.push_back(0 + n_layers); + skips.push_back(n_layers - 1); skips.push_back(n_layers - 1 + n_layers); + } int32_t curr_best_layer = -1, curr_best_type = 0; double curr_best_ppl = -1, ref_ppl = -1; + const int32_t mask = anti_mode ? 3 : 0; int count = 0; double nll = 0.0; @@ -372,35 +389,40 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } if (skip_layer >= n_layers) { if (curr_best_layer == -1) break; - if (pass_results.size() >= prune_target * 2) { + if (prune_target > 0 && pass_results.size() >= prune_target * 2) { std::sort(pass_results.begin(), pass_results.end(), [](const std::tuple & a, const std::tuple & b) { + if (anti_mode) return std::get<2>(b) > std::get<2>(a); return std::get<2>(a) > std::get<2>(b); } ); const size_t num_prune = std::min(pass_results.size(), prune_target); - for (size_t temp = 0; temp < num_prune; temp++) { + for (size_t temp = 0, pruned = 0; temp < pass_results.size(); temp++) { int32_t lidx = std::get<0>(pass_results[temp]); if (lidx == curr_best_layer && std::get<1>(pass_results[temp]) == curr_best_type) continue; - worsts[lidx] |= std::get<1>(pass_results[temp]); - printf("\nPrune[%zu]: %d (%d) - %.2f\n", temp, lidx, std::get<1>(pass_results[temp]), std::get<2>(pass_results[temp])); + extremes[lidx] |= std::get<1>(pass_results[temp]); + printf("\nPrune[%zu]: %d (%d) - %.2f\n", pruned + 1, lidx, + std::get<1>(pass_results[temp]), std::get<2>(pass_results[temp])); + if (anti_mode) { + skip_types[lidx] |= std::get<1>(pass_results[temp]); + skips.push_back(std::get<1>(pass_results[temp]) == 1 ? 
lidx : -lidx); + } + if (++pruned >= num_prune) break; } } pass_results.clear(); - printf("\n\nADD SKIP %c%3d - ppl vs ref %.4f", + printf("\n\nADD %c%3d - ppl vs ref %.4f", int(label[curr_best_type]), curr_best_layer, curr_best_ppl - ref_ppl); - if (curr_best_ppl > ref_ppl * 1.75) break; + if (!anti_mode && curr_best_ppl > ref_ppl * 1.75) break; skip_types[curr_best_layer] += curr_best_type; - if (std::find(skips.begin(), skips.end(), curr_best_layer) == skips.end()) { - skips.push_back(curr_best_layer); - } + skips.push_back(curr_best_type == 1 ? curr_best_layer : curr_best_layer + n_layers); curr_best_layer = -1; curr_best_ppl = -1; curr_best_type = 0; skip_layer = n_layers; for (int32_t new_sl = 0; new_sl < n_layers; new_sl++) { - skip_types[new_sl] = (skip_types[new_sl] & 3) | (worsts[new_sl] << 2); + skip_types[new_sl] = (skip_types[new_sl] & 3) | (extremes[new_sl] << 2); } for (int32_t new_sl = 0; new_sl < n_layers; new_sl++) { int32_t curr_skipped = (skip_types[new_sl] >> 2) | (skip_types[new_sl] & 3); @@ -420,16 +442,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par logit_history.clear(); prob_history.clear(); + int alive = 0; for (int32_t i = 0; i < n_layers; i++) { - layers[i] = (skip_types[i] & 3) | (i == skip_layer ? test_skip_type : 0); + layers[i] = mask ^ ((skip_types[i] & 3) | (i == skip_layer ? test_skip_type : 0)); + alive += !(layers[i] & 1) + !(layers[i] & 2); } layers[n_layers] = -1; printf("\nTEST %c%3d + [", int(label[test_skip_type]), skip_layer); - for (const auto l : skips) { - printf("%c%d, ", int(label[skip_types[l] & 3]), l); + for (auto l : skips) { + printf("%c%d, ", int(label[skip_types[l % n_layers] & 3]), l % n_layers); } - printf("] - len: %3zu, best:(%c%3d @ %.3f), last took %.2f sec\n", - skips.size() + 1, + printf("] - live: %3d/%3d, best:(%c%3d @ %.3f), last took %.2f sec\n", + alive, n_layers * 2, int(label[curr_best_type]), curr_best_layer, curr_best_ppl != -1 ? 
curr_best_ppl - ref_ppl : 0, test_t_total); @@ -477,7 +501,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const auto t_end = std::chrono::high_resolution_clock::now(); - if (i == 0 && skip_layer < 0 && skips.empty()) { + if (i == 0 && skip_layer < 0 && ref_ppl < 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); @@ -516,7 +540,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); } fflush(stdout); - if (skip_layer >= 0 && (i + 1 == test_count || (i > 1 && ppl > ref_ppl * 3))) { + if (skip_layer >= 0 && (i + 1 == test_count || (i > 1 && ppl > ref_ppl * 30))) { i = test_count - 1; skip_types[skip_layer] |= test_skip_type << 2; if (curr_best_layer == -1 || ppl < curr_best_ppl) { diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 894321ce9648c..5830b4fb36560 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -8,6 +8,8 @@ #include #include +#define DOFFS 10000 + struct seq_draft { bool active = false; bool drafting = false; @@ -17,10 +19,31 @@ struct seq_draft { std::vector i_batch_tgt; std::vector tokens; + std::vector tokens_p; struct llama_sampling_context * ctx_sampling; }; +static void save_logits(llama_context * ctx, std::vector & v, const int n_vocab, const int count = 1, const int soffs = 0, const int doffs = 0) { + // printf("SAVE %p: %d, %d, %d\n", (void *)ctx, count, soffs, doffs); + // printf(""); + GGML_ASSERT(doffs + count <= 30); + memcpy( + v.data() + doffs * n_vocab, + llama_get_logits(ctx) + soffs * n_vocab, + sizeof(float) * size_t(n_vocab) * count); +} + +static void restore_logits(llama_context * ctx, std::vector & v, const int n_vocab, const int count = 1, const int soffs = 0, const int doffs = 0) { + // printf(""); + // printf("REST %p: %d, %d, %d\n", (void *)ctx, count, soffs, doffs); + GGML_ASSERT(soffs + count <= 30); + memcpy( + llama_get_logits(ctx) + doffs * n_vocab, + v.data() + soffs * n_vocab, + sizeof(float) * size_t(n_vocab) * count); +} + int main(int argc, char ** argv) { gpt_params params; @@ -37,8 +60,10 @@ int main(int argc, char ** argv) { const int n_seq_dft = params.n_parallel; // TODO: make this configurable - const float p_accept = 0.80f; - const float p_split = 0.10f; + // const float p_accept = 0.80f; + // const float p_split = 0.10f; + const float p_accept = 0.5f; // 0.80f; + const float p_split = p_accept / 8; // 0.10f; #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("speculative", "log")); @@ -46,6 +71,8 @@ int main(int argc, char ** argv) { log_dump_cmdline(argc, argv); #endif // LOG_DISABLE_LOGS + bool self_speculation = false; + // init llama.cpp llama_backend_init(params.numa); @@ -60,9 +87,18 @@ int main(int argc, char ** argv) { std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params); // load the draft model - params.model = params.model_draft; - params.n_gpu_layers = params.n_gpu_layers_draft; - std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); + if (params.model != params.model_draft) { + params.model = params.model_draft; + params.n_gpu_layers = params.n_gpu_layers_draft; + std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); + } else { + self_speculation = true; + model_dft = model_tgt; + ctx_dft = ctx_tgt; + } + 
+ const int n_ctx = llama_n_ctx(ctx_tgt); + const int n_vocab = llama_n_vocab(model_tgt); // tokenize the prompt std::vector inp; @@ -84,14 +120,33 @@ int main(int argc, char ** argv) { fflush(stderr); + llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); + llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft); + std::vector logits_tgt, logits_dft; + const int n_input = inp.size(); const auto t_enc_start = ggml_time_us(); // eval the prompt with both models - llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0, 0)); - llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0)); - llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0)); + llama_batch_clear(batch_tgt); + logits_tgt.resize(n_vocab * 30); + logits_dft.resize(n_vocab * 30); + for (int i = 0; i < n_input - 1; i++) { + llama_batch_add(batch_tgt, inp[i], i, { 0 }, false); + } + llama_decode(ctx_tgt, batch_tgt); + llama_batch_clear(batch_tgt); + llama_batch_add(batch_tgt, inp.back(), n_input - 1, { 0 }, true); + llama_decode(ctx_tgt, batch_tgt); + save_logits(ctx_tgt, logits_tgt, n_vocab); + if (!self_speculation) { + llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0 + DOFFS)); + } else { + // llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0 + DOFFS)); + llama_kv_cache_seq_cp(ctx_dft, 0, 0 + DOFFS, 0, -1); + } + // save_logits(ctx_dft, logits_dft, n_vocab, n_input); const auto t_enc_end = ggml_time_us(); @@ -104,6 +159,8 @@ int main(int argc, char ** argv) { int n_predict = 0; int n_drafted = 0; int n_accept = 0; + int n_split = 0; + int n_bad_split = 0; int n_past_tgt = inp.size(); int n_past_dft = inp.size(); @@ -124,8 +181,16 @@ int main(int argc, char ** argv) { drafts[s].ctx_sampling = llama_sampling_init(params.sparams); } - llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); - llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft); + // std::vector run_layers_dft = { + // 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 3, 1, 0, 3, 3, 0, 3, 0, 1, 1, + // 3, 3, 3, 0, 2, 3, 2, 3, 3, 3, 1, 3, 0, 0, 2, 1, 0, 2, 0, 0, + // 0, 3, 0, 1, 0, 1, 1, 3, 3, 3, 3, 1, 1, 3, 3, 3, 1, 3, 3, 0, + // 3, 1, 3, 3, 0, 1, 3, 3, 3, 1, 3, 0, 0, 0, 1, 1, 2, 0, 1, 1, -1, }; + std::vector run_layers_dft = { + 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 1, 1, 0, 1, 1, 0, 2, 0, 1, 1, + 1, 0, 1, 0, 0, 0, -1, }; + + batch_dft.run_layers = run_layers_dft.data(); const auto t_dec_start = ggml_time_us(); @@ -133,7 +198,11 @@ int main(int argc, char ** argv) { drafts[0].i_batch_tgt.resize(1); drafts[0].i_batch_tgt[0] = 0; + double avg_accepted = 0, avg_rejected = 0; + float min_accepted = 0, max_rejected = 0; + while (true) { + LOG("*** Draft start\n"); // print current draft sequences for (int s = 0; s < n_seq_dft; ++s) { if (!drafts[s].active) { @@ -152,9 +221,11 @@ int main(int argc, char ** argv) { LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); // sample from the target model + restore_logits(ctx_tgt, logits_tgt, n_vocab, 1, drafts[s_keep].i_batch_tgt[i_dft], drafts[s_keep].i_batch_tgt[i_dft]); llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); llama_sampling_accept(ctx_sampling, ctx_tgt, id, true); + save_logits(ctx_tgt, logits_tgt, n_vocab, 1, drafts[s_keep].i_batch_tgt[i_dft], drafts[s_keep].i_batch_tgt[i_dft]); //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); @@ -179,11 +250,26 @@ int 
main(int argc, char ** argv) { } if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) { - LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str()); + LOG("the sampled target token matches drafted token %d of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str()); s_keep = s; matches = true; + LOG("Derp[%d]: %6d (%5.4f)\n", s, drafts[s].tokens[i_dft], drafts[s].tokens_p[i_dft]); + if (min_accepted == 0) min_accepted = drafts[s].tokens_p[i_dft]; + else min_accepted = std::min(min_accepted, drafts[s].tokens_p[i_dft]); + avg_accepted += drafts[s].tokens_p[i_dft] * (avg_accepted == 0 ? 2 : 1); + avg_accepted /= 2; } else { + if (i_dft < (int) drafts[s].tokens.size() && id != drafts[s].tokens[i_dft]) { + if (i_dft == 0 && s > 0) n_bad_split++; + max_rejected = std::max(max_rejected, drafts[s].tokens_p[i_dft]); + avg_rejected += drafts[s].tokens_p[i_dft] * (avg_rejected == 0 ? 2 : 1); + avg_rejected /= 2; + LOG("-- Terminate sequence %d+%d: (%d, '%s') != target (%d, '%s') - rejected\n", + s, i_dft, drafts[s].tokens[i_dft], + llama_token_to_piece(ctx_dft, drafts[s].tokens[i_dft]).c_str(), + id, token_str.c_str()); + } drafts[s].active = false; } } @@ -204,6 +290,18 @@ int main(int argc, char ** argv) { { LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); + llama_kv_cache_seq_rm(ctx_dft, s_keep + DOFFS, n_past_dft, -1); + llama_kv_cache_seq_rm(ctx_tgt, s_keep, n_past_tgt, -1); + if (s_keep != 0) { + llama_kv_cache_seq_cp(ctx_dft, s_keep + DOFFS, 0 + DOFFS, -1, -1); + llama_kv_cache_seq_cp(ctx_tgt, s_keep, 0, -1, -1); + } + for (int s = 1; s < n_seq_dft; ++s) { + llama_kv_cache_seq_rm(ctx_dft, s + DOFFS, -1, -1); + llama_kv_cache_seq_rm(ctx_tgt, s, -1, -1); + } + + /* llama_kv_cache_seq_keep(ctx_dft, s_keep); llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); llama_kv_cache_seq_keep(ctx_dft, 0); @@ -212,22 +310,28 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_keep(ctx_tgt, s_keep); llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1); llama_kv_cache_seq_keep(ctx_tgt, 0); + */ + } for (int s = 0; s < n_seq_dft; ++s) { drafts[s].active = false; drafts[s].tokens.clear(); + drafts[s].tokens_p.clear(); drafts[s].i_batch_tgt.clear(); } // note: will be erased after the speculation phase drafts[0].tokens.push_back(id); + drafts[0].tokens_p.push_back(0); drafts[0].i_batch_tgt.push_back(0); llama_batch_clear(batch_dft); - llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true); + llama_batch_add (batch_dft, id, n_past_dft, { 0 + DOFFS }, true); + + LOG("=== EVAL: DRAFT ACCEPTED ===: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_dft).c_str()); - llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); llama_decode (ctx_dft, batch_dft); + save_logits(ctx_dft, logits_dft, n_vocab, batch_dft.n_tokens); ++n_past_dft; @@ -254,6 +358,10 @@ int main(int argc, char ** argv) { llama_batch_clear(batch_tgt); llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true); + // double avg_accepted = n_accept > 0 ? 
avg_accepted / double(n_accept) : 0; + LOG("Average accepted/rejected: %3.5f / %3.5f -- Min accepted/max rejected: %3.5f / %3.5f\n", + avg_accepted, avg_rejected, min_accepted, max_rejected); + // sample n_draft tokens from the draft model using tree-based sampling for (int i = 0; i < n_draft; ++i) { batch_dft.n_tokens = 0; @@ -267,17 +375,24 @@ int main(int argc, char ** argv) { continue; } + restore_logits(ctx_dft, logits_dft, n_vocab, 1, drafts[s].i_batch_dft, drafts[s].i_batch_dft); llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft); + save_logits(ctx_dft, logits_dft, n_vocab, 1, drafts[s].i_batch_dft, drafts[s].i_batch_dft); const auto & cur_p = drafts[s].ctx_sampling->cur; for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) { + if (cur_p[k].p < 1e-5f) continue; LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); } - if (cur_p[0].p < p_accept) { - LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept); + double accept_threshold = avg_rejected == 0 || avg_rejected == 0 || n_drafted < 16 + ? p_accept + : std::max(double(min_accepted * 0.98), avg_accepted * 0.75f); + // accept_threshold = 0.8; + if (cur_p[0].p < accept_threshold) { + LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, accept_threshold); drafts[s].drafting = false; continue; } @@ -286,11 +401,20 @@ int main(int argc, char ** argv) { // attempt to split the branch if the probability is high enough for (int f = 1; f < 8; ++f) { - if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) { - LOG("splitting seq %3d into %3d\n", s, n_seq_cur); - - llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1); - llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); + // if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) { + // if (n_seq_cur < n_seq_dft && cur_p[f].p > cur_p[0].p / 5) { + double split_threshold = avg_accepted == 0 || avg_rejected == 0 || n_drafted < 16 + ? p_split + : ( std::max(double(min_accepted * 0.7), avg_accepted * 0.4) + * (n_seq_cur >= 2 ? 
0.75 : 1.0) ); + // split_threshold = 0.1; + if (n_seq_cur < n_seq_dft && cur_p[f].p >= split_threshold) { + n_split++; + LOG(">>>%d<<< splitting seq %3d into %3d on %6d (%8.3f) '%s'\n", f, s, n_seq_cur, + cur_p[f].id, cur_p[f].p, llama_token_to_piece(ctx_dft, cur_p[f].id).c_str()); + + llama_kv_cache_seq_rm(ctx_dft, n_seq_cur + DOFFS, -1, -1); + llama_kv_cache_seq_cp(ctx_dft, s + DOFFS, n_seq_cur + DOFFS, -1, -1); // all previous tokens from this branch are now also part of the new branch for (int t = 0; t < batch_tgt.n_tokens; ++t) { @@ -309,6 +433,7 @@ int main(int argc, char ** argv) { drafts[n_seq_cur].skip = true; drafts[n_seq_cur].tokens = drafts[s].tokens; + drafts[n_seq_cur].tokens_p = drafts[s].tokens_p; drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft; drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; @@ -331,6 +456,7 @@ int main(int argc, char ** argv) { llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true); drafts[s].tokens.push_back(id); + drafts[s].tokens_p.push_back(cur_p[is].p); // add unique drafted tokens to the target batch drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); @@ -340,7 +466,7 @@ int main(int argc, char ** argv) { // add the token to the batch for batched decoding with the draft model drafts[s].i_batch_dft = batch_dft.n_tokens; - llama_batch_add(batch_dft, id, n_past_cur, { s }, true); + llama_batch_add(batch_dft, id, n_past_cur, { s + DOFFS }, true); if (batch_tgt.n_tokens > n_draft) { drafts[s].drafting = false; @@ -352,9 +478,18 @@ int main(int argc, char ** argv) { if (batch_dft.n_tokens == 0) { break; } + // LOG("Draft eval: %d\n", batch_dft.n_tokens); + // for (int x = 0; x < batch_dft.n_tokens; x++) { + // LOG("* %03d: seq %3d, pos %4d, token %6d '%s'", x, + // batch_dft.seq_id[x][0], batch_dft.pos[x], + // batch_dft.token[x], llama_token_to_piece(ctx_dft, batch_dft.token[x]).c_str()); + // } + + LOG("=== EVAL: DRAFTED ===: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_dft).c_str()); // evaluate the drafted tokens on the draft model llama_decode(ctx_dft, batch_dft); + save_logits(ctx_dft, logits_dft, n_vocab, batch_dft.n_tokens); ++n_past_cur; ++n_drafted; @@ -365,13 +500,17 @@ int main(int argc, char ** argv) { // evaluate the target model on the drafted tokens { - llama_kv_cache_seq_keep(ctx_tgt, 0); + // llama_kv_cache_seq_keep(ctx_tgt, 0); + for (int s = 1; s < n_seq_dft; ++s) { + llama_kv_cache_seq_rm(ctx_tgt, s, -1, -1); + } for (int s = 1; s < n_seq_dft; ++s) { llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); } - //LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt)); + LOG("=== EVAL: TARGET ===: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); llama_decode(ctx_tgt, batch_tgt); + save_logits(ctx_tgt, logits_tgt, n_vocab, batch_tgt.n_tokens); ++n_past_tgt; } @@ -382,6 +521,7 @@ int main(int argc, char ** argv) { } drafts[s].tokens.erase(drafts[s].tokens.begin()); + drafts[s].tokens_p.erase(drafts[s].tokens_p.begin()); } } @@ -395,9 +535,13 @@ int main(int argc, char ** argv) { LOG_TEE("\n"); LOG_TEE("n_draft = %d\n", n_draft); LOG_TEE("n_predict = %d\n", n_predict); + LOG_TEE("drafted = %.3f%%\n", 100.0f * n_drafted / n_predict); LOG_TEE("n_drafted = %d\n", n_drafted); LOG_TEE("n_accept = %d\n", n_accept); LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + LOG_TEE("n_draft = %d\n", n_draft); + LOG_TEE("n_split = %d\n", n_split); + LOG_TEE("n_badsplit= %d\n", n_bad_split); LOG_TEE("\ndraft:\n"); llama_print_timings(ctx_dft); @@ -415,8 +559,10 @@ int main(int argc, char ** argv) { 
llama_free(ctx_tgt); llama_free_model(model_tgt); - llama_free(ctx_dft); - llama_free_model(model_dft); + if (!self_speculation) { + llama_free(ctx_dft); + llama_free_model(model_dft); + } llama_backend_free(); From fae6d9c70d14bedf2a274801dca78199cce27917 Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Thu, 19 Oct 2023 18:11:15 -0600 Subject: [PATCH 3/6] Fix pushing in wrong halflayer idx --- examples/perplexity/perplexity.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index fb3b018f21864..bb416b459245a 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -405,7 +405,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par std::get<1>(pass_results[temp]), std::get<2>(pass_results[temp])); if (anti_mode) { skip_types[lidx] |= std::get<1>(pass_results[temp]); - skips.push_back(std::get<1>(pass_results[temp]) == 1 ? lidx : -lidx); + skips.push_back(std::get<1>(pass_results[temp]) == 1 ? lidx : lidx + n_layers); } if (++pruned >= num_prune) break; } From d6b44fb3aef1e81c4075412280e888d205eeebae Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Thu, 19 Oct 2023 21:14:23 -0600 Subject: [PATCH 4/6] Force measure to allocate more memory for 70Bs --- llama.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/llama.cpp b/llama.cpp index ff47e632024dd..a4bd3932e9f91 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3266,14 +3266,11 @@ static struct ggml_cgraph * llm_build_llama( } else { run_layer = NULL; } - } else if (ggml_allocr_is_measure(lctx.alloc) && il == n_layer - 1) { - // No idea why this is needed, but otherwise we run out of space - // when skipping attn or mlp (but not both) on the last layer - run_mlp = false; - } else if (ggml_allocr_is_measure(lctx.alloc) && il == n_layer - 2) { - // No idea why this is needed, but otherwise we run out of space - // when skipping attn or mlp (but not both) on the last layer - run_attn = false; + } else if (ggml_allocr_is_measure(lctx.alloc)) { + if (il == 0 || il == n_layer - 1) run_mlp = false; + else if (il == 1 || il == n_layer - 2) run_attn = false; + else if (il & 1) run_mlp = false; + else run_attn = false; } if (!run_attn && !run_mlp) continue; From 8a569cfee5eecc4262d5fcb40440669cf98959c5 Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Sat, 21 Oct 2023 04:26:47 -0600 Subject: [PATCH 5/6] perplexity anti-mode improvements --- examples/perplexity/perplexity.cpp | 32 ++++++++++++++++-------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index bb416b459245a..62d55fee51f30 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -352,13 +352,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par std::vector extremes; extremes.resize(n_layers); std::fill(extremes.begin(), extremes.end(), 0); - if (anti_mode) { - // No pointing in starting with first/last layer disabled. - skip_types[0] = 15; - skip_types[n_layers - 1] = 15; - skips.push_back(0); skips.push_back(0 + n_layers); - skips.push_back(n_layers - 1); skips.push_back(n_layers - 1 + n_layers); - } + // if (anti_mode) { + // // No point in starting with first/last layer disabled. 
+ // skip_types[0] = 15; + // skip_types[n_layers - 1] = 15; + // skips.push_back(0); skips.push_back(0 + n_layers); + // skips.push_back(n_layers - 1); skips.push_back(n_layers - 1 + n_layers); + // } int32_t curr_best_layer = -1, curr_best_type = 0; double curr_best_ppl = -1, ref_ppl = -1; const int32_t mask = anti_mode ? 3 : 0; @@ -389,7 +389,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } if (skip_layer >= n_layers) { if (curr_best_layer == -1) break; - if (prune_target > 0 && pass_results.size() >= prune_target * 2) { + if (anti_mode || (prune_target > 0 && pass_results.size() >= prune_target * 2)) { std::sort(pass_results.begin(), pass_results.end(), [](const std::tuple & a, const std::tuple & b) { if (anti_mode) return std::get<2>(b) > std::get<2>(a); @@ -399,14 +399,14 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const size_t num_prune = std::min(pass_results.size(), prune_target); for (size_t temp = 0, pruned = 0; temp < pass_results.size(); temp++) { int32_t lidx = std::get<0>(pass_results[temp]); - if (lidx == curr_best_layer && std::get<1>(pass_results[temp]) == curr_best_type) continue; - extremes[lidx] |= std::get<1>(pass_results[temp]); - printf("\nPrune[%zu]: %d (%d) - %.2f\n", pruned + 1, lidx, - std::get<1>(pass_results[temp]), std::get<2>(pass_results[temp])); if (anti_mode) { skip_types[lidx] |= std::get<1>(pass_results[temp]); skips.push_back(std::get<1>(pass_results[temp]) == 1 ? lidx : lidx + n_layers); } + if (lidx == curr_best_layer && std::get<1>(pass_results[temp]) == curr_best_type) continue; + extremes[lidx] |= std::get<1>(pass_results[temp]); + printf("\nPrune[%zu]: %d (%d) - %.2f\n", pruned + 1, lidx, + std::get<1>(pass_results[temp]), std::get<2>(pass_results[temp])); if (++pruned >= num_prune) break; } } @@ -414,9 +414,11 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par printf("\n\nADD %c%3d - ppl vs ref %.4f", int(label[curr_best_type]), curr_best_layer, curr_best_ppl - ref_ppl); - if (!anti_mode && curr_best_ppl > ref_ppl * 1.75) break; - skip_types[curr_best_layer] += curr_best_type; - skips.push_back(curr_best_type == 1 ? curr_best_layer : curr_best_layer + n_layers); + if (!anti_mode) { + if (curr_best_ppl > ref_ppl * 1.75) break; + skip_types[curr_best_layer] += curr_best_type; + skips.push_back(curr_best_type == 1 ? 
curr_best_layer : curr_best_layer + n_layers); + } curr_best_layer = -1; curr_best_ppl = -1; curr_best_type = 0; From 13e08d0efa8f1803cd71a8413eb531f9cc3dfb2e Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Mon, 23 Oct 2023 02:40:37 -0600 Subject: [PATCH 6/6] Sync latest changes --- examples/perplexity/perplexity.cpp | 11 +- examples/speculative/speculative.cpp | 301 ++++++++++++++++++++++----- 2 files changed, 258 insertions(+), 54 deletions(-) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 62d55fee51f30..c9b393caa9dca 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -397,6 +397,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } ); const size_t num_prune = std::min(pass_results.size(), prune_target); + if (num_prune > 0) printf("\nPruning: "); for (size_t temp = 0, pruned = 0; temp < pass_results.size(); temp++) { int32_t lidx = std::get<0>(pass_results[temp]); if (anti_mode) { @@ -405,17 +406,17 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } if (lidx == curr_best_layer && std::get<1>(pass_results[temp]) == curr_best_type) continue; extremes[lidx] |= std::get<1>(pass_results[temp]); - printf("\nPrune[%zu]: %d (%d) - %.2f\n", pruned + 1, lidx, + printf("[%zu: %d (%d) - %.2f], ", pruned + 1, lidx, std::get<1>(pass_results[temp]), std::get<2>(pass_results[temp])); if (++pruned >= num_prune) break; } } pass_results.clear(); - printf("\n\nADD %c%3d - ppl vs ref %.4f", + printf("\n\nADD %c%3d - ppl vs ref %.4f - cur:[", int(label[curr_best_type]), curr_best_layer, curr_best_ppl - ref_ppl); if (!anti_mode) { - if (curr_best_ppl > ref_ppl * 1.75) break; + // if (curr_best_ppl > ref_ppl * 1.75) break; skip_types[curr_best_layer] += curr_best_type; skips.push_back(curr_best_type == 1 ? curr_best_layer : curr_best_layer + n_layers); } @@ -426,6 +427,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par for (int32_t new_sl = 0; new_sl < n_layers; new_sl++) { skip_types[new_sl] = (skip_types[new_sl] & 3) | (extremes[new_sl] << 2); } + for (int32_t i = 0; i < n_layers; i++) { + const int val = mask ^ (skip_types[i] & 3); + printf("%d%s", val, i < n_layers - 1 ? 
", " : "]"); + } for (int32_t new_sl = 0; new_sl < n_layers; new_sl++) { int32_t curr_skipped = (skip_types[new_sl] >> 2) | (skip_types[new_sl] & 3); // printf("||%d, %d\n", new_sl, curr_skipped); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 5830b4fb36560..3d8dc13477b62 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -15,6 +15,8 @@ struct seq_draft { bool drafting = false; bool skip = false; + int split_pos = 0; + int i_batch_dft = 0; std::vector i_batch_tgt; @@ -27,7 +29,7 @@ struct seq_draft { static void save_logits(llama_context * ctx, std::vector & v, const int n_vocab, const int count = 1, const int soffs = 0, const int doffs = 0) { // printf("SAVE %p: %d, %d, %d\n", (void *)ctx, count, soffs, doffs); // printf(""); - GGML_ASSERT(doffs + count <= 30); + GGML_ASSERT(doffs + count < 64); memcpy( v.data() + doffs * n_vocab, llama_get_logits(ctx) + soffs * n_vocab, @@ -37,13 +39,47 @@ static void save_logits(llama_context * ctx, std::vector & v, const int n static void restore_logits(llama_context * ctx, std::vector & v, const int n_vocab, const int count = 1, const int soffs = 0, const int doffs = 0) { // printf(""); // printf("REST %p: %d, %d, %d\n", (void *)ctx, count, soffs, doffs); - GGML_ASSERT(soffs + count <= 30); + GGML_ASSERT(soffs + count < 64); memcpy( llama_get_logits(ctx) + doffs * n_vocab, v.data() + soffs * n_vocab, sizeof(float) * size_t(n_vocab) * count); } +static llama_token_data_array normalize_candidates(const float * logits, const int n_vocab, std::vector & cur) { + cur.reserve(n_vocab); + cur.clear(); + + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array cur_p = { cur.data(), cur.size(), false }; + llama_sample_top_k(NULL, &cur_p, 100, 1); + llama_sample_softmax(NULL, &cur_p); + cur.resize(cur_p.size); + return cur_p; +} + +static int32_t find_normalized(const llama_token_data_array & tda, const llama_token id) { + llama_token_data *item = tda.data; + + for (int32_t i = 0; i < tda.size; i++, item++) + if (item->id == id) return i; + return -1; +} + +static double running_average(double & cur, double val, double n = 20) { + if (cur < 1e-5f) { + cur = val; + return cur; + } + // New average = old average * (n-1)/n + new value /n + cur = cur * (n - 1) / n + val / n; + return cur; +} + + int main(int argc, char ** argv) { gpt_params params; @@ -62,8 +98,8 @@ int main(int argc, char ** argv) { // TODO: make this configurable // const float p_accept = 0.80f; // const float p_split = 0.10f; - const float p_accept = 0.5f; // 0.80f; - const float p_split = p_accept / 8; // 0.10f; + const float p_accept = 0.75f; // 0.80f; + const float p_split = 0.6f; // p_accept / 8; // 0.10f; #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("speculative", "log")); @@ -130,8 +166,8 @@ int main(int argc, char ** argv) { // eval the prompt with both models llama_batch_clear(batch_tgt); - logits_tgt.resize(n_vocab * 30); - logits_dft.resize(n_vocab * 30); + logits_tgt.resize(n_vocab * 64); + logits_dft.resize(n_vocab * 64); for (int i = 0; i < n_input - 1; i++) { llama_batch_add(batch_tgt, inp[i], i, { 0 }, false); } @@ -146,7 +182,7 @@ int main(int argc, char ** argv) { // llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0 + DOFFS)); llama_kv_cache_seq_cp(ctx_dft, 0, 0 + DOFFS, 0, -1); } - // save_logits(ctx_dft, logits_dft, n_vocab, n_input); + 
save_logits(ctx_dft, logits_dft, n_vocab, n_input); const auto t_enc_end = ggml_time_us(); @@ -161,6 +197,11 @@ int main(int argc, char ** argv) { int n_accept = 0; int n_split = 0; int n_bad_split = 0; + int n_dup_split = 0; + int n_eff_split = 0; + int max_streak = 0; + + int64_t t_dft_sample = 0, t_dft_gen = 0, t_dft_accept = 0, t_tgt_predict = 0; int n_past_tgt = inp.size(); int n_past_dft = inp.size(); @@ -170,26 +211,35 @@ int main(int argc, char ** argv) { // target model sampling context struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams); + struct llama_sampling_context * ctx_dft_sampling = llama_sampling_init(params.sparams); + std::vector normalized_candidates; + normalized_candidates.reserve(n_vocab); + llama_token_data_array normalized_p; // draft sequence data std::vector drafts(n_seq_dft); params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar - params.sparams.temp = std::max(0.01f, params.sparams.temp); + // params.sparams.temp = std::max(0.01f, params.sparams.temp); for (int s = 0; s < n_seq_dft; ++s) { drafts[s].ctx_sampling = llama_sampling_init(params.sparams); } - // std::vector run_layers_dft = { - // 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 3, 1, 0, 3, 3, 0, 3, 0, 1, 1, - // 3, 3, 3, 0, 2, 3, 2, 3, 3, 3, 1, 3, 0, 0, 2, 1, 0, 2, 0, 0, - // 0, 3, 0, 1, 0, 1, 1, 3, 3, 3, 3, 1, 1, 3, 3, 3, 1, 3, 3, 0, - // 3, 1, 3, 3, 0, 1, 3, 3, 3, 1, 3, 0, 0, 0, 1, 1, 2, 0, 1, 1, -1, }; + // 70B (80 layers) skips example std::vector run_layers_dft = { - 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 1, 1, 0, 1, 1, 0, 2, 0, 1, 1, - 1, 0, 1, 0, 0, 0, -1, }; + 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 3, 1, 0, 3, 3, 0, 3, 0, 1, 1, + 3, 3, 3, 0, 2, 3, 2, 3, 3, 3, 1, 3, 0, 0, 2, 1, 0, 2, 0, 0, + 0, 3, 0, 1, 0, 1, 1, 3, 3, 3, 3, 1, 1, 3, 3, 3, 1, 3, 3, 0, + 3, 1, 3, 3, 0, 1, 3, 3, 3, 1, 3, 0, 0, 0, 1, 1, 2, 0, 1, 1, -1, }; + + // 3B (26 layers) skips example + // std::vector run_layers_dft = { + // 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 2, 1, 3, 0, 2, 3, 3, 1, 0, 2, 0, 1, 1, 2, 0, 0, + // // 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 2, 1, 3, 0, 2, 3, 3, 1, 1, 2, 1, 1, 1, 2, 0, 1, + // -1, }; + // NOTE: Comment this line out to disable skipping. 
batch_dft.run_layers = run_layers_dft.data(); const auto t_dec_start = ggml_time_us(); @@ -198,8 +248,13 @@ int main(int argc, char ** argv) { drafts[0].i_batch_tgt.resize(1); drafts[0].i_batch_tgt[0] = 0; - double avg_accepted = 0, avg_rejected = 0; - float min_accepted = 0, max_rejected = 0; + double avg_accepted = 0, avg_rejected = 0, tgt_avg_accepted = 0; + double avg_accept_delta = 0; + float min_accepted = 0, max_rejected = 0, tgt_min_accepted = 0; + + int64_t t_cur; + + std::vector>> doubt; while (true) { LOG("*** Draft start\n"); @@ -217,15 +272,37 @@ int main(int argc, char ** argv) { int i_dft = 0; int s_keep = 0; + float tgt_last_norm = 0, tgt_last_best_norm = 0, tgt_last_orig = 0; + while (true) { LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); // sample from the target model restore_logits(ctx_tgt, logits_tgt, n_vocab, 1, drafts[s_keep].i_batch_tgt[i_dft], drafts[s_keep].i_batch_tgt[i_dft]); + normalized_p = normalize_candidates(llama_get_logits_ith(ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]), n_vocab, normalized_candidates); llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); + save_logits(ctx_tgt, logits_tgt, n_vocab, 1, drafts[s_keep].i_batch_tgt[i_dft], drafts[s_keep].i_batch_tgt[i_dft]); + int32_t norm_pos = find_normalized(normalized_p, id); + int32_t orig_pos = find_normalized({ctx_sampling->cur.data(), ctx_sampling->cur.size(), false}, id); + if (norm_pos >= 0) { + tgt_last_norm = normalized_candidates[norm_pos].p; + tgt_last_best_norm = normalized_candidates[0].p; + running_average(tgt_avg_accepted, tgt_last_norm); + tgt_min_accepted = tgt_min_accepted < 1e-4 + ? tgt_last_norm + : std::min(tgt_min_accepted, tgt_last_norm); + } else { + tgt_last_norm = tgt_last_best_norm = tgt_avg_accepted; + } + if (orig_pos >= 0) { + tgt_last_orig = ctx_sampling->cur[orig_pos].p; + } + LOG("target sampled (%d, '%s') orig_p=%5.4f, norm_p=%5.4f\n", + id, llama_token_to_piece(ctx_tgt, id).c_str(), + orig_pos >= 0 ? ctx_sampling->cur[orig_pos].p : -1, + norm_pos >= 0 ? normalized_candidates[norm_pos].p : -1); llama_sampling_accept(ctx_sampling, ctx_tgt, id, true); - save_logits(ctx_tgt, logits_tgt, n_vocab, 1, drafts[s_keep].i_batch_tgt[i_dft], drafts[s_keep].i_batch_tgt[i_dft]); //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); @@ -245,26 +322,30 @@ int main(int argc, char ** argv) { bool matches = false; for (int s = 0; s < n_seq_dft; ++s) { - if (!drafts[s].active) { + if (!drafts[s].active || i_dft < drafts[s].split_pos) { continue; } if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) { - LOG("the sampled target token matches drafted token %d of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str()); + LOG("the sampled target token matches drafted token %d of sequence %d (%d, '%s') - accepted\n", + i_dft, s, id, token_str.c_str()); + if (i_dft == 0 && s > 0) { + if (matches) n_dup_split++; + else n_eff_split++; + } s_keep = s; matches = true; LOG("Derp[%d]: %6d (%5.4f)\n", s, drafts[s].tokens[i_dft], drafts[s].tokens_p[i_dft]); if (min_accepted == 0) min_accepted = drafts[s].tokens_p[i_dft]; else min_accepted = std::min(min_accepted, drafts[s].tokens_p[i_dft]); - avg_accepted += drafts[s].tokens_p[i_dft] * (avg_accepted == 0 ? 
2 : 1); - avg_accepted /= 2; + running_average(avg_accepted, drafts[s].tokens_p[i_dft]); + running_average(avg_accept_delta, tgt_last_norm - drafts[s].tokens_p[i_dft]); } else { if (i_dft < (int) drafts[s].tokens.size() && id != drafts[s].tokens[i_dft]) { if (i_dft == 0 && s > 0) n_bad_split++; max_rejected = std::max(max_rejected, drafts[s].tokens_p[i_dft]); - avg_rejected += drafts[s].tokens_p[i_dft] * (avg_rejected == 0 ? 2 : 1); - avg_rejected /= 2; + running_average(avg_rejected, drafts[s].tokens_p[i_dft]); LOG("-- Terminate sequence %d+%d: (%d, '%s') != target (%d, '%s') - rejected\n", s, i_dft, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_dft, drafts[s].tokens[i_dft]).c_str(), @@ -279,8 +360,27 @@ int main(int argc, char ** argv) { ++n_past_tgt; ++n_past_dft; ++i_dft; - + max_streak = std::max(max_streak, i_dft); continue; + } else { + for (size_t seqnum = 0; seqnum < doubt.size(); seqnum++) { + const std::vector> & sdoubt = doubt[seqnum]; + if (sdoubt.size() <= i_dft) continue; + const std::vector & sidoubt = sdoubt[i_dft]; + for (size_t cidx = 0; cidx < sidoubt.size(); cidx++) { + if (sidoubt[cidx].id == id) { + LOG("Shoulda picked seq %3zu, pos %4d, candidate %2zu @ p %5.4f: %6d '%s'\n", + seqnum, i_dft, cidx, sidoubt[cidx].p, + id, token_str.c_str()); + running_average(avg_accepted, sidoubt[cidx].p); + if (cidx < 2) { + running_average(avg_accept_delta, tgt_last_norm - sidoubt[cidx].p); + min_accepted = min_accepted < 1e-5f ? sidoubt[cidx].p : std::min(min_accepted, sidoubt[cidx].p); + } + break; + } + } + } } } @@ -315,6 +415,7 @@ int main(int argc, char ** argv) { } for (int s = 0; s < n_seq_dft; ++s) { + drafts[s].split_pos = 0; drafts[s].active = false; drafts[s].tokens.clear(); drafts[s].tokens_p.clear(); @@ -327,10 +428,18 @@ int main(int argc, char ** argv) { llama_batch_clear(batch_dft); llama_batch_add (batch_dft, id, n_past_dft, { 0 + DOFFS }, true); + if (self_speculation) { + // Copy KV items from non-brain-damaged model... Doesn't seem to help. + llama_kv_cache_seq_rm(ctx_dft, 0 + DOFFS, 0, n_past_dft - 2); + llama_kv_cache_seq_cp(ctx_dft, 0, 0 + DOFFS, 0, n_past_dft - 2); + // llama_kv_cache_seq_rm(ctx_dft, 0 + DOFFS, n_past_dft - 1, -1); + // llama_kv_cache_seq_cp(ctx_dft, 0, 0 + DOFFS, n_past_dft - 1, -1); + } LOG("=== EVAL: DRAFT ACCEPTED ===: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_dft).c_str()); - + t_cur = ggml_time_us(); llama_decode (ctx_dft, batch_dft); + t_dft_accept += ggml_time_us() - t_cur; save_logits(ctx_dft, logits_dft, n_vocab, batch_dft.n_tokens); ++n_past_dft; @@ -358,9 +467,14 @@ int main(int argc, char ** argv) { llama_batch_clear(batch_tgt); llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true); + avg_rejected = std::max(0.05, std::min(avg_accepted - 0.05, avg_rejected)); + avg_accepted = std::max(0.05, std::max(avg_rejected + 0.05, avg_accepted)); // double avg_accepted = n_accept > 0 ? 
avg_accepted / double(n_accept) : 0; - LOG("Average accepted/rejected: %3.5f / %3.5f -- Min accepted/max rejected: %3.5f / %3.5f\n", - avg_accepted, avg_rejected, min_accepted, max_rejected); + LOG("STATS: Avg tacc/dacc/drej: %3.5f / %3.5f / %3.5f | Min dacc/min tacc/max drej: %3.5f / %3.5f / %3.5f | delta %3.5f | max streak %d | n_dft/pred/acc: %d / %d / %d\n", + tgt_avg_accepted, avg_accepted, avg_rejected, min_accepted, tgt_min_accepted, max_rejected, avg_accept_delta, max_streak, + n_drafted, n_predict, n_accept); + doubt.clear(); + doubt.resize(n_seq_dft); // sample n_draft tokens from the draft model using tree-based sampling for (int i = 0; i < n_draft; ++i) { @@ -371,43 +485,116 @@ int main(int argc, char ** argv) { } for (int s = 0; s < n_seq_dft; ++s) { + double accept_threshold, split_threshold; + if (!drafts[s].drafting || drafts[s].skip) { continue; } + doubt[s].push_back({}); + + if (avg_rejected == 0 || avg_rejected == 0 || n_drafted + n_predict < 6) { + accept_threshold = std::max(0.6f, tgt_last_norm); + } else { + + accept_threshold = (tgt_avg_accepted - avg_accept_delta) * 0.3; + accept_threshold *= std::min(0.8, std::max(0.1, double(tgt_last_norm * 1.0))); + accept_threshold = std::max(double(min_accepted) * 1.1, accept_threshold); + accept_threshold = std::max(std::max(avg_accepted * 0.9, avg_rejected * 1.1), accept_threshold); + accept_threshold += 1.0 - (1.2 * n_accept / n_drafted); + accept_threshold *= (1.3 - (std::min(n_seq_cur + i, 6) * 0.1)); + // + // accept_threshold = (tgt_avg_accepted - avg_accept_delta) * 0.3; + // accept_threshold *= std::min(0.8, std::max(0.1, double(tgt_last_norm * 1.0))); + // accept_threshold = std::max(double(min_accepted) * 1.1, accept_threshold); + // accept_threshold = std::max(std::max(avg_accepted * 0.9, avg_rejected * 1.1), accept_threshold); + // accept_threshold += 1.0 - (1.2 * n_accept / n_drafted); + // accept_threshold *= (0.7 + (std::min(n_seq_cur + i, 5) * 0.1)); + + } - restore_logits(ctx_dft, logits_dft, n_vocab, 1, drafts[s].i_batch_dft, drafts[s].i_batch_dft); - llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft); - save_logits(ctx_dft, logits_dft, n_vocab, 1, drafts[s].i_batch_dft, drafts[s].i_batch_dft); + std::vector cur_p; + { + llama_token d_id; + std::vector already_picked; + float * logits = NULL; + + t_cur = ggml_time_us(); + for (int cidx = 0; cidx < 9; cidx++) { + llama_sampling_cp(drafts[s].ctx_sampling, ctx_dft_sampling); + restore_logits(ctx_dft, logits_dft, n_vocab, 1, drafts[s].i_batch_dft); + logits = llama_get_logits(ctx_dft); + normalized_p = normalize_candidates(logits, n_vocab, normalized_candidates); + for (size_t x = 0; x < std::min(normalized_p.size, size_t(10)); x++) + doubt[s].back().push_back(normalized_p.data[x]); + for (const auto & tid : already_picked) + logits[tid] = std::numeric_limits::infinity() * -1; + d_id = llama_sampling_sample(ctx_dft_sampling, ctx_dft, NULL); + already_picked.push_back(d_id); + int32_t norm_pos = find_normalized(normalized_p, d_id); + if (norm_pos < 0) continue; + llama_token_data norm = normalized_candidates[norm_pos]; + if (norm.p < 0.2) continue; + if (ctx_dft_sampling->params.temp <= 0) { + llama_token_data_array tda = { ctx_dft_sampling->cur.data(), ctx_dft_sampling->cur.size(), false }; + llama_sample_top_k(ctx_dft, &tda, 100, 1); + llama_sample_softmax(ctx_dft, &tda); + ctx_dft_sampling->cur.resize(tda.size); + } - const auto & cur_p = drafts[s].ctx_sampling->cur; - for (int k = 0; k < std::min(n_seq_dft + 3, (int) 
cur_p.size()); ++k) { - if (cur_p[k].p < 1e-5f) continue; - LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", - k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); + llama_token_data found; + found.id = -1; + for (const llama_token_data & td : ctx_dft_sampling->cur) { + if (td.id == d_id) { + found = td; + break; + } + } + GGML_ASSERT(found.id != -1); + LOG(" ** draft candidate %3d for seq %3d, pos %3d: %6d (%4.3f, norm %4.3f) '%s'\n", + cidx, s, i, found.id, found.p, norm_pos >= 0 ? normalized_candidates[norm_pos].p : -1, + llama_token_to_piece(ctx_dft, found.id).c_str()); + if (found.p < 0.3) continue; + if (norm.p < 1e-2f) break; + cur_p.push_back(normalized_candidates[norm_pos]); + } + + if (cur_p.size() > 1) { + std::sort(cur_p.begin() + 1, cur_p.end(), + [](const llama_token_data & a, const llama_token_data & b) { + return a.p > b.p; + } + ); + } + } - double accept_threshold = avg_rejected == 0 || avg_rejected == 0 || n_drafted < 16 - ? p_accept - : std::max(double(min_accepted * 0.98), avg_accepted * 0.75f); - // accept_threshold = 0.8; - if (cur_p[0].p < accept_threshold) { - LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, accept_threshold); + t_dft_sample += ggml_time_us() - t_cur; + + if (cur_p.empty()) { + LOG("stopping drafting for seq %3d, no viable candidates (%5.3f) \n", s, accept_threshold); + drafts[s].drafting = false; + continue; + } else if (cur_p[0].p < accept_threshold && (cur_p[0].p + (cur_p.size() < 2 ? 0 : cur_p[1].p)) < accept_threshold * 1.3) { + LOG("stopping drafting for seq %3d, pos %3d - probability too low: %.3f < %.3f\n", s, i, cur_p[0].p, accept_threshold); drafts[s].drafting = false; continue; } + if (cur_p[0].p < accept_threshold) { + split_threshold = 0.0; + } else { + split_threshold = cur_p[0].p / 10.0; + // split_threshold = std::max(0.01, cur_p[0].p * (n_seq_cur + i > 1 ? 0.15 : 0.2)); + } + std::vector sa(1, s); + + + // LOG("Check splits: %zu\n", cur_p.size()); // attempt to split the branch if the probability is high enough - for (int f = 1; f < 8; ++f) { - // if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) { - // if (n_seq_cur < n_seq_dft && cur_p[f].p > cur_p[0].p / 5) { - double split_threshold = avg_accepted == 0 || avg_rejected == 0 || n_drafted < 16 - ? p_split - : ( std::max(double(min_accepted * 0.7), avg_accepted * 0.4) - * (n_seq_cur >= 2 ? 
0.75 : 1.0) ); - // split_threshold = 0.1; + for (int f = 1; f < std::min(8, int(cur_p.size()) - 1); ++f) { if (n_seq_cur < n_seq_dft && cur_p[f].p >= split_threshold) { n_split++; LOG(">>>%d<<< splitting seq %3d into %3d on %6d (%8.3f) '%s'\n", f, s, n_seq_cur, @@ -428,6 +615,7 @@ int main(int argc, char ** argv) { } // copy the draft state + drafts[n_seq_cur].split_pos = i; drafts[n_seq_cur].active = true; drafts[n_seq_cur].drafting = true; drafts[n_seq_cur].skip = true; @@ -443,6 +631,8 @@ int main(int argc, char ** argv) { n_seq_cur++; } else { + LOG("Not splitting seq %3d into %3d, choice %2d @ %6d (%8.3f) '%s'\n", s, n_seq_cur, f, + cur_p[f].id, cur_p[f].p, llama_token_to_piece(ctx_dft, cur_p[f].id).c_str()); break; } } @@ -488,7 +678,9 @@ int main(int argc, char ** argv) { LOG("=== EVAL: DRAFTED ===: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_dft).c_str()); // evaluate the drafted tokens on the draft model + t_cur = ggml_time_us(); llama_decode(ctx_dft, batch_dft); + t_dft_gen += ggml_time_us() - t_cur; save_logits(ctx_dft, logits_dft, n_vocab, batch_dft.n_tokens); ++n_past_cur; ++n_drafted; @@ -509,7 +701,9 @@ int main(int argc, char ** argv) { } LOG("=== EVAL: TARGET ===: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); + t_cur = ggml_time_us(); llama_decode(ctx_tgt, batch_tgt); + t_tgt_predict += ggml_time_us() - t_cur; save_logits(ctx_tgt, logits_tgt, n_vocab, batch_tgt.n_tokens); ++n_past_tgt; } @@ -531,7 +725,9 @@ int main(int argc, char ** argv) { LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - + LOG_TEE("times: target predict: %5.3f, draft gen/accept/sample: %5.3f / %5.3f / %5.3f\n", + t_tgt_predict / 1e6f, t_dft_gen / 1e6f, t_dft_accept / 1e6f, t_dft_sample / 1e6f); +// int64_t t_dft_sample = 0, t_dft_gen = 0, t_dft_accept = 0, t_tgt_predict = 0; LOG_TEE("\n"); LOG_TEE("n_draft = %d\n", n_draft); LOG_TEE("n_predict = %d\n", n_predict); @@ -541,7 +737,10 @@ int main(int argc, char ** argv) { LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); LOG_TEE("n_draft = %d\n", n_draft); LOG_TEE("n_split = %d\n", n_split); + LOG_TEE("n_effsplit= %d\n", n_eff_split); LOG_TEE("n_badsplit= %d\n", n_bad_split); + LOG_TEE("n_dupsplit= %d\n", n_dup_split); + LOG_TEE("max streak= %d\n", max_streak); LOG_TEE("\ndraft:\n"); llama_print_timings(ctx_dft);