@@ -364,96 +364,14 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
     return size / ggml_blck_size(type);
 }
 
-struct llama_load_tensor_shard {
-    std::vector<uint32_t> ne;
-    size_t size;
-    enum ggml_type type;
-    size_t file_idx;
-    size_t file_off;
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
-};
-
-enum llama_split_type {
-    SPLIT_NONE,
-    SPLIT_BY_COLUMNS,
-    SPLIT_BY_ROWS
-};
-
 struct llama_load_tensor {
-    std::vector<llama_load_tensor_shard> shards;
-
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;
-    llama_split_type split_type = SPLIT_NONE;
     std::vector<uint32_t> ne;
+    size_t file_off;
     size_t size;
     struct ggml_tensor * ggml_tensor = NULL;
     uint8_t * data;
-
-    llama_load_tensor(const std::string & name) : name(name) {}
-
-    void calc_all() {
-        calc_type();
-        calc_split_type();
-        calc_ne();
-        calc_size();
-    }
-
-    void calc_type() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.type != first_shard.type) {
-                throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
-            }
-        }
-        type = first_shard.type;
-    }
-
-    void calc_split_type() {
-        if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
-            shards.size() == 1) { // only one file?
-            split_type = SPLIT_NONE;
-        } else if (name.find("tok_embeddings.") == 0 ||
-                   name.find(".attention.wo.weight") != std::string::npos ||
-                   name.find(".feed_forward.w2.weight") != std::string::npos) {
-            split_type = SPLIT_BY_COLUMNS;
-        } else {
-            split_type = SPLIT_BY_ROWS;
-        }
-    }
-
-    void calc_ne() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.ne != first_shard.ne) {
-                throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                    name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
-            }
-        }
-        ne = first_shard.ne;
-        LLAMA_ASSERT(shards.size() <= UINT32_MAX);
-        uint32_t n_shards = (uint32_t) shards.size();
-        switch (split_type) {
-            case SPLIT_NONE:
-                ne = first_shard.ne;
-                break;
-            case SPLIT_BY_COLUMNS:
-                ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
-                      first_shard.ne[1]};
-                break;
-            case SPLIT_BY_ROWS:
-                ne = {first_shard.ne[0],
-                      checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
-                break;
-        }
-    }
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
 };
 
 struct llama_load_tensors_map {
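
Both the removed shard machinery and the new single-tensor path size tensors with llama_calc_tensor_size, whose last line appears as context above. For reference, a sketch of that calculation (only the signature and the final division are taken from the diff; the loop body is an assumption based on ggml storing quantized types in fixed-size blocks):

// Sketch, not part of the patch: bytes occupied by a tensor of shape `ne` and
// element type `type`. ggml stores elements in blocks of ggml_blck_size(type)
// elements, each block taking ggml_type_size(type) bytes, so the running
// product is seeded with the per-block byte size and divided by the block
// size at the end.
static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
    size_t size = ggml_type_size(type);
    for (uint32_t dim : ne) {
        size = checked_mul<size_t>(size, dim);   // overflow-checked multiply (llama.cpp helper)
    }
    return size / ggml_blck_size(type);
}
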
@@ -476,13 +394,13 @@ struct llama_file_loader {
     llama_hparams hparams;
     llama_vocab vocab;
 
-    llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
+    llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
         : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
         read_magic();
         read_hparams();
         read_vocab();
-        read_tensor_metadata(file_idx, tensors_map);
+        read_tensor_metadata(tensors_map);
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
@@ -539,19 +457,19 @@ struct llama_file_loader {
             tok_score.score = score;
         }
     }
-    void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
+    void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
         while (file.tell() < file.size) {
-            llama_load_tensor_shard shard;
+            llama_load_tensor tensor;
             uint32_t n_dims = file.read_u32();
             uint32_t name_len = file.read_u32();
-            shard.type = (enum ggml_type) file.read_u32();
-            shard.ne.resize(n_dims);
-            file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
+            tensor.type = (enum ggml_type) file.read_u32();
+            tensor.ne.resize(n_dims);
+            file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
                 throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
             }
-            switch (shard.type) {
+            switch (tensor.type) {
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
@@ -566,30 +484,20 @@ struct llama_file_loader {
                 case GGML_TYPE_Q6_K:
                     break;
                 default: {
-                    throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
+                    throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
                 }
             }
 
-            if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
-                // skip to the next multiple of 32 bytes
-                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
-            }
-            shard.file_idx = file_idx;
-            shard.file_off = file.tell();
+            // skip to the next multiple of 32 bytes
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
 
-            shard.calc_size();
-            file.seek(shard.size, SEEK_CUR);
+            tensor.file_off = file.tell();
+            tensor.name = name;
+            tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+            file.seek(tensor.size, SEEK_CUR);
 
-            auto it = tensors_map.name_to_idx.find(name);
-            size_t idx;
-            if (it != tensors_map.name_to_idx.end()) {
-                idx = it->second;
-            } else {
-                tensors_map.tensors.emplace_back(name);
-                idx = tensors_map.tensors.size() - 1;
-                tensors_map.name_to_idx.emplace(name, idx);
-            }
-            tensors_map.tensors.at(idx).shards.push_back(shard);
+            tensors_map.tensors.push_back(tensor);
+            tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
         }
     }
 };
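
The rewritten read_tensor_metadata now always pads to a 32-byte boundary before recording file_off. The expression -static_cast<ptrdiff_t>(file.tell()) & 31 is the number of padding bytes needed to reach the next multiple of 32; a minimal standalone check of the bit trick follows (hypothetical helper name, not part of the patch):

#include <cassert>
#include <cstddef>

// For any position p, (-p) & 31 == (32 - p % 32) % 32, i.e. the padding
// needed to land on the next multiple of 32 (0 when p is already aligned).
static std::size_t padding_to_32(std::size_t pos) {
    return static_cast<std::size_t>(-static_cast<std::ptrdiff_t>(pos)) & 31;
}

int main() {
    assert(padding_to_32(0)  == 0);
    assert(padding_to_32(1)  == 31);
    assert(padding_to_32(31) == 1);
    assert(padding_to_32(32) == 0);
    assert(padding_to_32(33) == 31);
    return 0;
}
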
@@ -659,56 +567,19 @@ struct llama_file_saver {
 };
 
 struct llama_model_loader {
-    std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
+    std::unique_ptr<llama_file_loader> file_loader;
     llama_load_tensors_map tensors_map;
     bool use_mmap;
     size_t num_ggml_tensors_created = 0;
     struct ggml_context * ggml_ctx = NULL;
     std::unique_ptr<llama_mmap> mapping;
 
-    llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
-        file_loaders.emplace_back(first_file);
-        uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
-        for (uint32_t i = 1; i < n_parts; i++) {
-            std::string fname = fname_base + "." + std::to_string(i);
-            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
-            file_loaders.emplace_back(ith_file);
-            if (ith_file->hparams != first_file->hparams) {
-                throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
-            }
-        }
+    llama_model_loader(const std::string & fname_base, bool use_mmap) {
+        file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
         if (!llama_mmap::SUPPORTED) {
             use_mmap = false;
         }
-        if (use_mmap && alignment_prevents_mmap()) {
-            fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
-            use_mmap = false;
-        }
         this->use_mmap = use_mmap;
-        for (llama_load_tensor & lt : tensors_map.tensors) {
-            lt.calc_all();
-        }
-    }
-
-    bool alignment_prevents_mmap() {
-        for (const llama_load_tensor & lt : tensors_map.tensors) {
-            for (const llama_load_tensor_shard & shard : lt.shards) {
-                if (shard.file_off & 3) {
-                    return true;
-                }
-            }
-        }
-        return false;
-    }
-
-    uint32_t guess_n_parts() const {
-        auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::string("missing tok_embeddings.weight"));
-        }
-        const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
-        return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
     }
 
     void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
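
With the constructor reduced to a single file, a caller builds the loader directly from one model path; a rough usage sketch (the file name and surrounding calls are illustrative, only the constructor arguments and calc_sizes come from the diff):

// Illustrative only; the path is a placeholder.
llama_model_loader ml("ggml-model-q4_0.bin", /*use_mmap*/ true);

size_t ctx_size;
size_t mmapped_size;
ml.calc_sizes(&ctx_size, &mmapped_size);   // sizes for the ggml context and the mmapped region
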
@@ -774,7 +645,7 @@ struct llama_model_loader {
         }
 
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
+            mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
@@ -830,45 +701,13 @@ struct llama_model_loader {
 
     void load_data_for(llama_load_tensor & lt) {
         if (use_mmap) {
-            LLAMA_ASSERT(lt.shards.size() == 1);
-            lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
-        } else if (lt.split_type == SPLIT_NONE) {
-            llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
-            file.seek(lt.shards.at(0).file_off, SEEK_SET);
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            llama_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
             file.read_raw(lt.data, lt.size);
-        } else if (lt.split_type == SPLIT_BY_ROWS) {
-            size_t offset = 0;
-            for (llama_load_tensor_shard & shard : lt.shards) {
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                file.read_raw(lt.data + offset, shard.size);
-                offset += shard.size;
-            }
-            LLAMA_ASSERT(offset == lt.size);
-        } else if (lt.split_type == SPLIT_BY_COLUMNS) {
-            // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
-            for (size_t i = 0; i < lt.shards.size(); i++) {
-                llama_load_tensor_shard & shard = lt.shards.at(i);
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                tmp_bufs.at(i).resize(shard.size);
-                file.read_raw(tmp_bufs.at(i).addr, shard.size);
-            }
-            // Then reshape.
-            size_t num_rows = lt.ne.at(1);
-            size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
-            size_t out_offset = 0;
-            for (size_t row = 0; row < num_rows; row++) {
-                for (llama_buffer & tmp_buf : tmp_bufs) {
-                    memcpy(lt.data + out_offset,
-                           tmp_buf.addr + row * per_shard_row_size,
-                           per_shard_row_size);
-                    out_offset += per_shard_row_size;
-                }
-            }
-            LLAMA_ASSERT(out_offset == lt.size);
         }
+
         if (0) {
             print_checksum(lt);
         }
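
For readers auditing what the simplified load_data_for drops: the deleted SPLIT_BY_COLUMNS branch interleaved per-file row slices back into one contiguous tensor. A self-contained restatement of that reshape, using plain std::vector buffers in place of llama_buffer but otherwise following the deleted lines:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Each of the N shard files holds every row of the tensor, but only a column
// slice of each row. Row r of the joined tensor is the concatenation of row r
// from every shard, in file order.
static void join_column_shards(uint8_t * dst, size_t dst_size,
                               const std::vector<std::vector<uint8_t>> & shard_bufs,
                               size_t num_rows) {
    const size_t per_shard_row_size = shard_bufs.at(0).size() / num_rows;
    size_t out_offset = 0;
    for (size_t row = 0; row < num_rows; row++) {
        for (const auto & buf : shard_bufs) {
            std::memcpy(dst + out_offset, buf.data() + row * per_shard_row_size, per_shard_row_size);
            out_offset += per_shard_row_size;
        }
    }
    assert(out_offset == dst_size);   // mirrors the LLAMA_ASSERT in the removed code
}
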
@@ -1067,12 +906,12 @@ static void llama_model_load_internal(
 
     model.t_start_us = ggml_time_us();
 
-    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
 
-    vocab = std::move(ml->file_loaders.at(0)->vocab);
-    model.hparams = ml->file_loaders.at(0)->hparams;
+    vocab = std::move(ml->file_loader->vocab);
+    model.hparams = ml->file_loader->hparams;
     model.n_gpu_layers = n_gpu_layers;
-    llama_file_version file_version = ml->file_loaders.at(0)->file_version;
+    llama_file_version file_version = ml->file_loader->file_version;
     auto & hparams = model.hparams;
 
     {
@@ -1106,7 +945,6 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
         fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
-        fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }
 
@@ -2461,9 +2299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
-                                                                            /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
@@ -2897,7 +2734,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     llama_buffer base_buf;
     if (path_base_model) {
         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
-        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
 
         size_t ctx_size;
         size_t mmapped_size;
@@ -2915,7 +2752,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }
 