@@ -324,16 +324,19 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
324
324
llama_batch batch = llama_batch_get_one (NULL , 0 , 0 , 0 );
325
325
326
326
const int32_t n_layers = 26 ;
327
- const int test_count = 15 ;
327
+ const int test_count = 10 ;
328
328
// 1 = attn, 2 = mlp, 3 = both
329
- int32_t test_skip_type = 1 ;
329
+ int32_t test_skip_type = 0 ;
330
330
std::vector<int32_t > layers;
331
331
layers.resize (n_layers + 1 );
332
332
std::fill (layers.begin (), layers.end (), 0 );
333
333
batch.run_layers = layers.data ();
334
334
int32_t skip_layer = -1 ;
335
335
std::vector<int32_t > skips;
336
- int32_t curr_best_layer = -1 ;
336
+ std::vector<int32_t > skip_types;
337
+ skip_types.resize (n_layers);
338
+ std::fill (skip_types.begin (), skip_types.end (), 0 );
339
+ int32_t curr_best_layer = -1 , curr_best_type = 0 ;
337
340
double curr_best_ppl = -1 , ref_ppl = -1 ;
338
341
339
342
int count = 0 ;
@@ -343,32 +346,47 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
343
346
fprintf (stderr, " %s: calculating perplexity over %d chunks, batch_size=%d\n " , __func__, n_chunk, n_batch);
344
347
345
348
std::vector<std::thread> workers (std::thread::hardware_concurrency () - 1 );
349
+ static const char * label = " ?AMB" ;
346
350
347
351
auto test_t_start = std::chrono::high_resolution_clock::now ();
348
352
for (int i = 0 ; i < n_chunk; ++i) {
349
353
if (i > 0 && i % test_count == 0 ) {
350
354
auto test_t_end = std::chrono::high_resolution_clock::now ();
351
355
float test_t_total = std::chrono::duration<float >(test_t_end - test_t_start).count ();
352
- for (int32_t new_sl = std::max (0 , skip_layer + 1 ); new_sl <= n_layers ; new_sl++) {
353
- if (std::find (skips.begin (), skips.end (), new_sl) != skips.end ()) continue ;
356
+
357
+ skip_layer = n_layers;
358
+ for (int32_t new_sl = 0 ; new_sl < n_layers; new_sl++) {
359
+ int32_t curr_skipped = (skip_types[new_sl] >> 2 ) | (skip_types[new_sl] & 3 );
360
+ if (curr_skipped == 3 ) continue ; // Already tested or perm skip.
354
361
skip_layer = new_sl;
362
+ test_skip_type = (curr_skipped & 1 ) != 0 ? 2 : 1 ;
355
363
break ;
356
364
}
357
365
if (skip_layer >= n_layers) {
358
366
if (curr_best_layer == -1 ) break ;
359
- printf (" \n\n ADD SKIP %3d - ppl vs ref %.4f" , curr_best_layer, curr_best_ppl - ref_ppl);
367
+ printf (" \n\n ADD SKIP %c%3d - ppl vs ref %.4f" ,
368
+ int (label[curr_best_type]), curr_best_layer,
369
+ curr_best_ppl - ref_ppl);
360
370
if (curr_best_ppl >= ref_ppl * 5 ) break ;
361
- skips.push_back (curr_best_layer);
371
+ skip_types[curr_best_layer] += curr_best_type;
372
+ if (std::find (skips.begin (), skips.end (), curr_best_layer) == skips.end ()) {
373
+ skips.push_back (curr_best_layer);
374
+ }
375
+ for (int i = 0 ; i < n_layers; i++) skip_types[i] &= 3 ;
362
376
curr_best_layer = -1 ;
363
377
curr_best_ppl = -1 ;
364
- skip_layer = -1 ;
365
- for (int32_t new_sl = skip_layer + 1 ; new_sl <= n_layers; new_sl++) {
366
- if (std::find (skips.begin (), skips.end (), new_sl) != skips.end ()) continue ;
378
+ curr_best_type = 0 ;
379
+ skip_layer = n_layers;
380
+ for (int32_t new_sl = 0 ; new_sl < n_layers; new_sl++) {
381
+ skip_types[new_sl] &= 3 ;
382
+ if (skip_types[new_sl] == 3 ) continue ; // Already tested or perm skip.
367
383
skip_layer = new_sl;
384
+ test_skip_type = (skip_types[new_sl] & 1 ) != 0 ? 2 : 1 ;
368
385
break ;
369
386
}
370
387
if (skip_layer == -1 || skip_layer == n_layers) break ;
371
388
}
389
+
372
390
i = 0 ;
373
391
count = 0 ;
374
392
nll = 0 ;
@@ -377,18 +395,16 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
377
395
prob_history.clear ();
378
396
379
397
for (int32_t i = 0 ; i < n_layers; i++) {
380
- if (i == skip_layer || std::find (skips.begin (), skips.end (), i) != skips.end ()) {
381
- layers[i] = test_skip_type;
382
- } else {
383
- layers[i] = 0 ;
384
- }
398
+ layers[i] = (skip_types[i] & 3 ) | (i == skip_layer ? test_skip_type : 0 );
385
399
}
386
400
layers[n_layers] = -1 ;
387
- printf (" \n SKIP %3d + [" , skip_layer);
388
- for (const auto l : skips) printf (" %d," , l);
389
- printf (" ] - len: %3zu, best:(%3d: %.3f), took %.2f sec\n " ,
401
+ printf (" \n TEST %c%3d + [" , int (label[test_skip_type]), skip_layer);
402
+ for (const auto l : skips) {
403
+ printf (" %c%d, " , int (label[skip_types[l] & 3 ]), l);
404
+ }
405
+ printf (" ] - len: %3zu, best:(%c%3d @ %.3f), last took %.2f sec\n " ,
390
406
skips.size () + 1 ,
391
- curr_best_layer,
407
+ int (label[curr_best_type]), curr_best_layer,
392
408
curr_best_ppl != -1 ? curr_best_ppl - ref_ppl : 0 ,
393
409
test_t_total);
394
410
test_t_start = std::chrono::high_resolution_clock::now ();
@@ -475,10 +491,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
475
491
fflush (stdout);
476
492
if (skip_layer >= 0 && i + 1 == test_count) {
477
493
double ppl = std::exp (nll / count);
494
+ skip_types[skip_layer] |= test_skip_type << 2 ;
478
495
if (curr_best_layer == -1 || ppl < curr_best_ppl) {
479
496
curr_best_layer = skip_layer;
480
497
curr_best_ppl = ppl;
498
+ curr_best_type = test_skip_type;
481
499
}
500
+ printf (" -- %.3f" , ppl - ref_ppl);
482
501
} else if (skip_layer < 0 ) {
483
502
ref_ppl = std::exp (nll / count);
484
503
}
0 commit comments