Commit 4a368ed

Exhaustively test skipping attn and MLP layers
Cover an extra alloc case where skipping could fail
1 parent e58816b commit 4a368ed

2 files changed, +42 -19 lines changed


examples/perplexity/perplexity.cpp

Lines changed: 38 additions & 19 deletions
@@ -324,16 +324,19 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     llama_batch batch = llama_batch_get_one(NULL, 0, 0, 0);
 
     const int32_t n_layers = 26;
-    const int test_count = 15;
+    const int test_count = 10;
     // 1 = attn, 2 = mlp, 3 = both
-    int32_t test_skip_type = 1;
+    int32_t test_skip_type = 0;
     std::vector<int32_t> layers;
     layers.resize(n_layers + 1);
     std::fill(layers.begin(), layers.end(), 0);
     batch.run_layers = layers.data();
     int32_t skip_layer = -1;
     std::vector<int32_t> skips;
-    int32_t curr_best_layer = -1;
+    std::vector<int32_t> skip_types;
+    skip_types.resize(n_layers);
+    std::fill(skip_types.begin(), skip_types.end(), 0);
+    int32_t curr_best_layer = -1, curr_best_type = 0;
     double curr_best_ppl = -1, ref_ppl = -1;
 
     int count = 0;
@@ -343,32 +346,47 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
 
     std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
+    static const char * label = "?AMB";
 
     auto test_t_start = std::chrono::high_resolution_clock::now();
     for (int i = 0; i < n_chunk; ++i) {
         if (i > 0 && i % test_count == 0) {
             auto test_t_end = std::chrono::high_resolution_clock::now();
             float test_t_total = std::chrono::duration<float>(test_t_end - test_t_start).count();
-            for (int32_t new_sl = std::max(0, skip_layer + 1); new_sl <= n_layers ; new_sl++) {
-                if (std::find(skips.begin(), skips.end(), new_sl) != skips.end()) continue;
+
+            skip_layer = n_layers;
+            for (int32_t new_sl = 0; new_sl < n_layers; new_sl++) {
+                int32_t curr_skipped = (skip_types[new_sl] >> 2) | (skip_types[new_sl] & 3);
+                if (curr_skipped == 3) continue; // Already tested or perm skip.
                 skip_layer = new_sl;
+                test_skip_type = (curr_skipped & 1) != 0 ? 2 : 1;
                 break;
             }
             if (skip_layer >= n_layers) {
                 if (curr_best_layer == -1) break;
-                printf("\n\nADD SKIP %3d - ppl vs ref %.4f", curr_best_layer, curr_best_ppl - ref_ppl);
+                printf("\n\nADD SKIP %c%3d - ppl vs ref %.4f",
+                    int(label[curr_best_type]), curr_best_layer,
+                    curr_best_ppl - ref_ppl);
                 if (curr_best_ppl >= ref_ppl * 5) break;
-                skips.push_back(curr_best_layer);
+                skip_types[curr_best_layer] += curr_best_type;
+                if (std::find(skips.begin(), skips.end(), curr_best_layer) == skips.end()) {
+                    skips.push_back(curr_best_layer);
+                }
+                for (int i = 0; i < n_layers; i++) skip_types[i] &= 3;
                 curr_best_layer = -1;
                 curr_best_ppl = -1;
-                skip_layer = -1;
-                for (int32_t new_sl = skip_layer + 1; new_sl <= n_layers; new_sl++) {
-                    if (std::find(skips.begin(), skips.end(), new_sl) != skips.end()) continue;
+                curr_best_type = 0;
+                skip_layer = n_layers;
+                for (int32_t new_sl = 0; new_sl < n_layers; new_sl++) {
+                    skip_types[new_sl] &= 3;
+                    if (skip_types[new_sl] == 3) continue; // Already tested or perm skip.
                     skip_layer = new_sl;
+                    test_skip_type = (skip_types[new_sl] & 1) != 0 ? 2 : 1;
                     break;
                 }
                 if (skip_layer == -1 || skip_layer == n_layers) break;
             }
+
             i = 0;
             count = 0;
             nll = 0;
@@ -377,18 +395,16 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
             prob_history.clear();
 
             for (int32_t i = 0; i < n_layers; i++) {
-                if (i == skip_layer || std::find(skips.begin(), skips.end(), i) != skips.end()) {
-                    layers[i] = test_skip_type;
-                } else {
-                    layers[i] = 0;
-                }
+                layers[i] = (skip_types[i] & 3) | (i == skip_layer ? test_skip_type : 0);
             }
             layers[n_layers] = -1;
-            printf("\nSKIP %3d + [", skip_layer);
-            for (const auto l : skips) printf("%d,", l);
-            printf("] - len: %3zu, best:(%3d: %.3f), took %.2f sec\n",
+            printf("\nTEST %c%3d + [", int(label[test_skip_type]), skip_layer);
+            for (const auto l : skips) {
+                printf("%c%d, ", int(label[skip_types[l] & 3]), l);
+            }
+            printf("] - len: %3zu, best:(%c%3d @ %.3f), last took %.2f sec\n",
                 skips.size() + 1,
-                curr_best_layer,
+                int(label[curr_best_type]), curr_best_layer,
                 curr_best_ppl != -1 ? curr_best_ppl - ref_ppl : 0,
                 test_t_total);
             test_t_start = std::chrono::high_resolution_clock::now();
@@ -475,10 +491,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
         fflush(stdout);
         if (skip_layer >= 0 && i + 1 == test_count) {
             double ppl = std::exp(nll / count);
+            skip_types[skip_layer] |= test_skip_type << 2;
             if (curr_best_layer == -1 || ppl < curr_best_ppl) {
                 curr_best_layer = skip_layer;
                 curr_best_ppl = ppl;
+                curr_best_type = test_skip_type;
             }
+            printf(" -- %.3f", ppl - ref_ppl);
         } else if (skip_layer < 0) {
             ref_ppl = std::exp(nll / count);
         }
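
The new skip_types vector packs two 2-bit fields per layer: the low bits record skips that have been made permanent (1 = attn, 2 = mlp, 3 = both, matching the existing comment), and the high bits record which variants have already been measured in the current round, which is why (skip_types[l] >> 2) | (skip_types[l] & 3) reaching 3 marks a layer as exhausted. The self-contained sketch below only illustrates that convention; the helper next_test_type is an illustrative name, not part of the patch.

#include <cstdint>
#include <cstdio>
#include <vector>

// Skip-type codes used by the patch: 0 = none, 1 = attn, 2 = mlp, 3 = both.
// Low 2 bits of a skip_types entry  -> skips made permanent for that layer.
// High 2 bits of a skip_types entry -> variants already tested this round.
static const char * label = "?AMB";

// Illustrative helper (not in the patch): pick the next variant to test for
// one layer, or 0 if attention and MLP are both already covered.
static int32_t next_test_type(int32_t skip_type) {
    const int32_t covered = (skip_type >> 2) | (skip_type & 3);
    if (covered == 3) return 0;         // already tested or permanently skipped
    return (covered & 1) != 0 ? 2 : 1;  // attn covered -> try mlp, else try attn
}

int main() {
    // Layer 0: untouched; layer 1: attn permanently skipped; layer 2: attn
    // permanently skipped and mlp already tested; layer 3: both skipped.
    std::vector<int32_t> skip_types = { 0, 1, 1 | (2 << 2), 3 };
    for (size_t l = 0; l < skip_types.size(); l++) {
        const int32_t t = next_test_type(skip_types[l]);
        printf("layer %zu: permanent=%c next_test=%c\n",
               l, label[skip_types[l] & 3], label[t]);
    }
    return 0;
}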

llama.cpp

Lines changed: 4 additions & 0 deletions
@@ -3246,6 +3246,10 @@ static struct ggml_cgraph * llm_build_llama(
             // No idea why this is needed, but otherwise we run out of space
             // when skipping attn or mlp (but not both) on the last layer
             run_mlp = false;
+        } else if (ggml_allocr_is_measure(lctx.alloc) && il == n_layer - 2) {
+            // No idea why this is needed, but otherwise we run out of space
+            // when skipping attn or mlp (but not both) on the last layer
+            run_attn = false;
         }
         if (!run_attn && !run_mlp) continue;
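
The run_layers array that perplexity.cpp hands to the batch uses the same 1 = attn, 2 = mlp, 3 = both codes, and llm_build_llama turns each entry into the run_attn/run_mlp flags seen in the hunk above. A rough sketch of that decoding, assuming the two codes act as independent bits (decode_run_layer is a hypothetical stand-in, not the fork's actual code):

#include <cstdint>
#include <cstdio>

// Hypothetical decoder for one run_layers entry, mirroring the
// 1 = skip attn, 2 = skip mlp, 3 = skip both convention from the patch.
static void decode_run_layer(int32_t entry, bool & run_attn, bool & run_mlp) {
    run_attn = (entry & 1) == 0;  // bit 0 set -> skip attention for this layer
    run_mlp  = (entry & 2) == 0;  // bit 1 set -> skip the MLP for this layer
}

int main() {
    for (int32_t entry = 0; entry <= 3; entry++) {
        bool run_attn = true, run_mlp = true;
        decode_run_layer(entry, run_attn, run_mlp);
        printf("run_layers entry %d: run_attn=%d run_mlp=%d\n",
               entry, int(run_attn), int(run_mlp));
    }
    return 0;
}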