
Commit b922bc3

llama : remove shards weight file support (#2000)
* Remove multiple shards
* Remove multiple file loaders
* Remove the llama_load_tensor_shard class
* Simplify the load logic
* Remove the dead guess_n_parts function
* Remove vocab_only from the llama_model_loader constructor
* Remove alignment_prevents_mmap, which is no longer needed
* Remove a useless check

A standalone sketch of the surviving single-file load path follows below.
1 parent 7f9753f commit b922bc3
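
With sharded weight files gone, the loader scans exactly one file: for each tensor it reads the dimension count, name length, type, shape and name, aligns to a 32-byte boundary, records the data offset and size, and skips ahead. The following is a rough standalone sketch of that surviving logic, not the actual llama.cpp code: the file name "model.bin", the tensor_meta struct, and the F32-only size calculation are simplified stand-ins, though the field order mirrors the new read_tensor_metadata() in the diff below.

    // Standalone sketch of the single-file tensor metadata scan kept by this commit.
    // Toy framing: [n_dims u32][name_len u32][type u32][ne u32 * n_dims][name bytes],
    // then padding to a 32-byte boundary, then the raw tensor data.
    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <vector>

    struct tensor_meta {
        std::string           name;
        std::vector<uint32_t> ne;
        long                  file_off = 0;
        size_t                size     = 0;
    };

    int main() {
        std::FILE * f = std::fopen("model.bin", "rb");   // hypothetical single-part file
        if (!f) { return 1; }
        std::fseek(f, 0, SEEK_END);
        long file_size = std::ftell(f);
        std::fseek(f, 0, SEEK_SET);

        std::vector<tensor_meta> tensors;
        while (std::ftell(f) < file_size) {
            uint32_t n_dims = 0, name_len = 0, type = 0;
            std::fread(&n_dims,   sizeof(n_dims),   1, f);
            std::fread(&name_len, sizeof(name_len), 1, f);
            std::fread(&type,     sizeof(type),     1, f);

            tensor_meta t;
            t.ne.resize(n_dims);
            std::fread(t.ne.data(), sizeof(uint32_t), n_dims, f);
            t.name.resize(name_len);
            std::fread(&t.name[0], 1, name_len, f);

            // skip to the next multiple of 32 bytes, as the loader in the diff does
            std::fseek(f, -std::ftell(f) & 31, SEEK_CUR);

            t.file_off = std::ftell(f);
            t.size = sizeof(float);                      // F32-only stand-in for llama_calc_tensor_size()
            for (uint32_t d : t.ne) { t.size *= d; }

            std::fseek(f, (long) t.size, SEEK_CUR);      // skip the data, keep only the metadata
            tensors.push_back(t);
        }
        std::fclose(f);

        for (const auto & t : tensors) {
            std::printf("%s: off=%ld size=%zu\n", t.name.c_str(), t.file_off, t.size);
        }
    }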

File tree: 1 file changed (+35 −198 lines)

llama.cpp (+35 −198)
@@ -364,96 +364,14 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
     return size / ggml_blck_size(type);
 }

-struct llama_load_tensor_shard {
-    std::vector<uint32_t> ne;
-    size_t size;
-    enum ggml_type type;
-    size_t file_idx;
-    size_t file_off;
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
-};
-
-enum llama_split_type {
-    SPLIT_NONE,
-    SPLIT_BY_COLUMNS,
-    SPLIT_BY_ROWS
-};
-
 struct llama_load_tensor {
-    std::vector<llama_load_tensor_shard> shards;
-
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;
-    llama_split_type split_type = SPLIT_NONE;
     std::vector<uint32_t> ne;
+    size_t file_off;
     size_t size;
     struct ggml_tensor * ggml_tensor = NULL;
     uint8_t * data;
-
-    llama_load_tensor(const std::string & name) : name(name) {}
-
-    void calc_all() {
-        calc_type();
-        calc_split_type();
-        calc_ne();
-        calc_size();
-    }
-
-    void calc_type() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.type != first_shard.type) {
-                throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
-            }
-        }
-        type = first_shard.type;
-    }
-
-    void calc_split_type() {
-        if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
-            shards.size() == 1) {          // only one file?
-            split_type = SPLIT_NONE;
-        } else if (name.find("tok_embeddings.") == 0 ||
-            name.find(".attention.wo.weight") != std::string::npos ||
-            name.find(".feed_forward.w2.weight") != std::string::npos) {
-            split_type = SPLIT_BY_COLUMNS;
-        } else {
-            split_type = SPLIT_BY_ROWS;
-        }
-    }
-
-    void calc_ne() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.ne != first_shard.ne) {
-                throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                    name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
-            }
-        }
-        ne = first_shard.ne;
-        LLAMA_ASSERT(shards.size() <= UINT32_MAX);
-        uint32_t n_shards = (uint32_t) shards.size();
-        switch (split_type) {
-            case SPLIT_NONE:
-                ne = first_shard.ne;
-                break;
-            case SPLIT_BY_COLUMNS:
-                ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
-                      first_shard.ne[1]};
-                break;
-            case SPLIT_BY_ROWS:
-                ne = {first_shard.ne[0],
-                      checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
-                break;
-        }
-    }
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
 };

 struct llama_load_tensors_map {
@@ -476,13 +394,13 @@ struct llama_file_loader {
     llama_hparams hparams;
     llama_vocab vocab;

-    llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
+    llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
         : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
         read_magic();
         read_hparams();
         read_vocab();
-        read_tensor_metadata(file_idx, tensors_map);
+        read_tensor_metadata(tensors_map);
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
@@ -539,19 +457,19 @@ struct llama_file_loader {
             tok_score.score = score;
         }
     }
-    void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
+    void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
         while (file.tell() < file.size) {
-            llama_load_tensor_shard shard;
+            llama_load_tensor tensor;
             uint32_t n_dims = file.read_u32();
             uint32_t name_len = file.read_u32();
-            shard.type = (enum ggml_type) file.read_u32();
-            shard.ne.resize(n_dims);
-            file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
+            tensor.type = (enum ggml_type) file.read_u32();
+            tensor.ne.resize(n_dims);
+            file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
                 throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
             }
-            switch (shard.type) {
+            switch (tensor.type) {
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
@@ -566,30 +484,20 @@ struct llama_file_loader {
                 case GGML_TYPE_Q6_K:
                     break;
                 default: {
-                    throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
+                    throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
                 }
             }

-            if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
-                // skip to the next multiple of 32 bytes
-                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
-            }
-            shard.file_idx = file_idx;
-            shard.file_off = file.tell();
+            // skip to the next multiple of 32 bytes
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);

-            shard.calc_size();
-            file.seek(shard.size, SEEK_CUR);
+            tensor.file_off = file.tell();
+            tensor.name = name;
+            tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+            file.seek(tensor.size, SEEK_CUR);

-            auto it = tensors_map.name_to_idx.find(name);
-            size_t idx;
-            if (it != tensors_map.name_to_idx.end()) {
-                idx = it->second;
-            } else {
-                tensors_map.tensors.emplace_back(name);
-                idx = tensors_map.tensors.size() - 1;
-                tensors_map.name_to_idx.emplace(name, idx);
-            }
-            tensors_map.tensors.at(idx).shards.push_back(shard);
+            tensors_map.tensors.push_back(tensor);
+            tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
         }
     }
 };
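
One detail in the hunk above: alignment is now unconditional, via file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR). For a non-negative offset, -offset & 31 is exactly the number of padding bytes to the next multiple of 32 (0 when already aligned). A tiny standalone check of that identity, not part of llama.cpp:

    // Verify that -offset & 31 equals the padding to the next 32-byte boundary.
    #include <cassert>
    #include <cstddef>
    #include <cstdio>

    int main() {
        for (std::ptrdiff_t offset = 0; offset < 200; offset++) {
            std::ptrdiff_t pad      = -offset & 31;
            std::ptrdiff_t expected = (32 - offset % 32) % 32;
            assert(pad == expected);
            assert((offset + pad) % 32 == 0);
        }
        std::printf("-offset & 31 gives the padding to the next 32-byte boundary\n");
    }
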
@@ -659,56 +567,19 @@ struct llama_file_saver {
 };

 struct llama_model_loader {
-    std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
+    std::unique_ptr<llama_file_loader> file_loader;
     llama_load_tensors_map tensors_map;
     bool use_mmap;
     size_t num_ggml_tensors_created = 0;
     struct ggml_context * ggml_ctx = NULL;
     std::unique_ptr<llama_mmap> mapping;

-    llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
-        file_loaders.emplace_back(first_file);
-        uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
-        for (uint32_t i = 1; i < n_parts; i++) {
-            std::string fname = fname_base + "." + std::to_string(i);
-            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
-            file_loaders.emplace_back(ith_file);
-            if (ith_file->hparams != first_file->hparams) {
-                throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
-            }
-        }
+    llama_model_loader(const std::string & fname_base, bool use_mmap) {
+        file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
         if (!llama_mmap::SUPPORTED) {
             use_mmap = false;
         }
-        if (use_mmap && alignment_prevents_mmap()) {
-            fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
-            use_mmap = false;
-        }
         this->use_mmap = use_mmap;
-        for (llama_load_tensor & lt : tensors_map.tensors) {
-            lt.calc_all();
-        }
-    }
-
-    bool alignment_prevents_mmap() {
-        for (const llama_load_tensor & lt : tensors_map.tensors) {
-            for (const llama_load_tensor_shard & shard : lt.shards) {
-                if (shard.file_off & 3) {
-                    return true;
-                }
-            }
-        }
-        return false;
-    }
-
-    uint32_t guess_n_parts() const {
-        auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::string("missing tok_embeddings.weight"));
-        }
-        const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
-        return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
     }

     void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
@@ -774,7 +645,7 @@ struct llama_model_loader {
         }

         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
+            mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
@@ -830,45 +701,13 @@ struct llama_model_loader {

     void load_data_for(llama_load_tensor & lt) {
         if (use_mmap) {
-            LLAMA_ASSERT(lt.shards.size() == 1);
-            lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
-        } else if (lt.split_type == SPLIT_NONE) {
-            llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
-            file.seek(lt.shards.at(0).file_off, SEEK_SET);
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            llama_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
             file.read_raw(lt.data, lt.size);
-        } else if (lt.split_type == SPLIT_BY_ROWS) {
-            size_t offset = 0;
-            for (llama_load_tensor_shard & shard : lt.shards) {
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                file.read_raw(lt.data + offset, shard.size);
-                offset += shard.size;
-            }
-            LLAMA_ASSERT(offset == lt.size);
-        } else if (lt.split_type == SPLIT_BY_COLUMNS) {
-            // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
-            for (size_t i = 0; i < lt.shards.size(); i++) {
-                llama_load_tensor_shard & shard = lt.shards.at(i);
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                tmp_bufs.at(i).resize(shard.size);
-                file.read_raw(tmp_bufs.at(i).addr, shard.size);
-            }
-            // Then reshape.
-            size_t num_rows = lt.ne.at(1);
-            size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
-            size_t out_offset = 0;
-            for (size_t row = 0; row < num_rows; row++) {
-                for (llama_buffer & tmp_buf : tmp_bufs) {
-                    memcpy(lt.data + out_offset,
-                           tmp_buf.addr + row * per_shard_row_size,
-                           per_shard_row_size);
-                    out_offset += per_shard_row_size;
-                }
-            }
-            LLAMA_ASSERT(out_offset == lt.size);
         }
+
         if (0) {
             print_checksum(lt);
         }
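
After this hunk, load_data_for has only two paths left: point into the mmapped file at lt.file_off, or seek and read from the single file. A minimal POSIX-only sketch of the same idea follows; "model.bin", file_off, size and the use_mmap flag are hypothetical placeholders, and this uses raw mmap/lseek/read rather than llama.cpp's llama_mmap and llama_file wrappers.

    // POSIX-only sketch of the two remaining load paths:
    // mmap -> tensor data is just (base pointer + file offset), no copy;
    // no mmap -> seek to the offset and read into a caller-owned buffer.
    #include <cstdint>
    #include <cstdio>
    #include <fcntl.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>
    #include <vector>

    int main() {
        const char * path     = "model.bin";  // hypothetical single-part model file
        const off_t  file_off = 4096;         // hypothetical tensor offset
        const size_t size     = 1024;         // hypothetical tensor byte size
        const bool   use_mmap = true;         // mirrors llama_model_loader::use_mmap

        int fd = open(path, O_RDONLY);
        if (fd < 0) { return 1; }
        struct stat st;
        if (fstat(fd, &st) != 0) { close(fd); return 1; }

        if (use_mmap) {
            // map the whole file once; every tensor is then addr + its file_off
            void * addr = mmap(NULL, (size_t) st.st_size, PROT_READ, MAP_SHARED, fd, 0);
            if (addr == MAP_FAILED) { close(fd); return 1; }
            const uint8_t * data = (const uint8_t *) addr + file_off;
            std::printf("mmap path: first byte = %u\n", data[0]);
            munmap(addr, (size_t) st.st_size);
        } else {
            // fallback: explicit seek + read, as in the non-mmap branch above
            std::vector<uint8_t> buf(size);
            lseek(fd, file_off, SEEK_SET);
            ssize_t got = read(fd, buf.data(), size);
            std::printf("read path: got %zd bytes, first byte = %u\n", got, buf[0]);
        }
        close(fd);
    }
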
@@ -1067,12 +906,12 @@ static void llama_model_load_internal(

     model.t_start_us = ggml_time_us();

-    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));

-    vocab = std::move(ml->file_loaders.at(0)->vocab);
-    model.hparams = ml->file_loaders.at(0)->hparams;
+    vocab = std::move(ml->file_loader->vocab);
+    model.hparams = ml->file_loader->hparams;
     model.n_gpu_layers = n_gpu_layers;
-    llama_file_version file_version = ml->file_loaders.at(0)->file_version;
+    llama_file_version file_version = ml->file_loader->file_version;
     auto & hparams = model.hparams;

     {
@@ -1106,7 +945,6 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_rot      = %u\n", __func__, hparams.n_rot);
         fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff       = %u\n", __func__, n_ff);
-        fprintf(stderr, "%s: n_parts    = %zu\n", __func__, ml->file_loaders.size());
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }

@@ -2461,9 +2299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }

-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
-                                                                            /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);

 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
@@ -2897,7 +2734,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     llama_buffer base_buf;
     if (path_base_model) {
         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
-        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));

         size_t ctx_size;
         size_t mmapped_size;
@@ -2915,7 +2752,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const

         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }
