Commit 0678318

llama : refactor gguf_buffer (WIP)
1 parent 797088a commit 0678318

File tree

gguf-llama.cpp
gguf-util.h

2 files changed, +30 -49 lines changed

gguf-llama.cpp

Lines changed: 19 additions & 20 deletions
@@ -3012,11 +3012,11 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
 // quantization
 //
 
-static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, gguf_buffer & output, const int nelements, const int nthread) {
-    if (output.size < nelements * sizeof(float)) {
-        output.resize(nelements * sizeof(float));
+static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
+    if (output.size() < nelements) {
+        output.resize(nelements);
     }
-    float * f32_output = (float *) output.addr;
+    float * f32_output = (float *) output.data();
 
     ggml_type_traits_t qtype;
     if (ggml_is_quantized(tensor.type)) {
@@ -3134,10 +3134,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     };
 
     size_t idx = 0;
+
+    std::vector<uint8_t> read_data;
+    std::vector<uint8_t> work;
+
     for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
-        gguf_buffer read_data;
         read_data.resize(tensor.size);
-        tensor.data = read_data.addr;
+        tensor.data = read_data.data();
         model_loader->load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ",
@@ -3156,7 +3159,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
-        gguf_buffer work;
 
         if (!quantize) {
             new_type = tensor.type;
@@ -3214,35 +3216,36 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
 #endif
 
+            const size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
+
             float * f32_data;
-            size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
-            gguf_buffer f32_conv_buf;
+            std::vector<float> f32_conv_buf;
 
             if (tensor.type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor.data;
             } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
                 throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
             } else {
                 llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
-                f32_data = (float *) f32_conv_buf.addr;
+                f32_data = (float *) f32_conv_buf.data();
             }
 
             LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
             fflush(stdout);
 
             work.resize(nelements * 4); // upper bound on size
-            new_data = work.addr;
+            new_data = work.data();
             std::vector<int64_t> hist_cur(1 << 4, 0);
 
-            int chunk_size = 32 * 512;
+            const int chunk_size = 32 * 512;
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
             const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
             if (nthread_use < 2) {
                 new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
             } else {
                 size_t counter = 0;
                 new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements] () {
                     std::vector<int64_t> local_hist;
                     size_t local_size = 0;
                     while (true) {
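
The lambda edited in the hunk above drops chunk_size from its capture list, since a const int initialized with a constant can be read inside a lambda without being captured. For context, the chunk-stealing worker pattern that lambda implements looks roughly like the self-contained sketch below; process_chunk() is a stand-in for ggml_quantize_chunk(), and the sizes and thread count are illustrative, not taken from the repository.

// Sketch of the mutex-guarded chunk counter pattern (illustrative only).
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

// Placeholder for ggml_quantize_chunk(): pretend each element produces one byte.
static size_t process_chunk(const float * src, size_t first, size_t count) {
    (void) src; (void) first;
    return count;
}

int main() {
    const size_t nelements  = 1000000;   // made-up tensor size
    const int    chunk_size = 32 * 512;
    const int    nthread    = 4;

    std::vector<float> data(nelements, 1.0f);

    std::mutex mutex;
    size_t counter  = 0;   // next element index to hand out
    size_t new_size = 0;   // total output size, merged from all workers

    auto compute = [&]() {
        size_t local_size = 0;
        while (true) {
            size_t first;
            {
                // claim the next chunk under the lock
                std::lock_guard<std::mutex> lock(mutex);
                if (counter >= nelements) {
                    new_size += local_size;   // merge before exiting
                    return;
                }
                first    = counter;
                counter += chunk_size;
            }
            const size_t count = std::min((size_t) chunk_size, nelements - first);
            local_size += process_chunk(data.data(), first, count);
        }
    };

    std::vector<std::thread> workers;
    for (int i = 0; i < nthread - 1; ++i) {
        workers.emplace_back(compute);
    }
    compute();                 // the calling thread participates as well
    for (auto & w : workers) {
        w.join();
    }

    printf("new_size = %zu\n", new_size);
    return 0;
}

Each worker accumulates into a thread-local total and merges it under the same mutex that guards the counter, so the lock is only held long enough to claim a chunk or publish a result.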
@@ -3315,8 +3318,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }
 
-
-
 //
 // interface implementation
 //
@@ -3565,7 +3566,6 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
     LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
 
-
     // create a temporary ggml context to store the lora tensors
     // todo: calculate size from biggest possible tensor
     std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
@@ -3583,11 +3583,10 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         model_tensors.insert(kv);
     }
 
-
     // load base model
     std::unique_ptr<llama_model_loader> model_loader;
     ggml_context * base_ctx = NULL;
-    gguf_buffer base_buf;
+    std::vector<uint8_t> base_buf;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
         model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
@@ -3598,8 +3597,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         base_buf.resize(ctx_size);
 
         ggml_init_params base_params;
-        base_params.mem_size = base_buf.size;
-        base_params.mem_buffer = base_buf.addr;
+        base_params.mem_size = base_buf.size();
+        base_params.mem_buffer = base_buf.data();
         base_params.no_alloc = model_loader->use_mmap;
 
         base_ctx = ggml_init(base_params);
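
Taken together, the gguf-llama.cpp changes hoist read_data and work out of the per-tensor loop and switch them, along with f32_conv_buf and base_buf, from gguf_buffer to std::vector, accessed via data()/size() instead of addr/size. A minimal sketch of that hoisted, reused-buffer pattern is shown below; the tensor sizes and the loop body are made up for illustration and are not code from the repository.

// Buffers declared once outside the loop; resize() only reallocates when a
// tensor is larger than anything seen so far (illustrative sketch).
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const size_t tensor_sizes[] = { 4096, 1024, 16384, 8192 };   // made-up sizes

    std::vector<uint8_t> read_data;   // raw tensor bytes, reused every iteration
    std::vector<uint8_t> work;        // quantization scratch, reused as well

    for (size_t size : tensor_sizes) {
        read_data.resize(size);       // grows capacity at most a few times
        work.resize(size * 4);        // upper bound on the output size

        // ... read into read_data.data(), quantize into work.data() ...

        printf("tensor of %6zu bytes, read_data capacity = %6zu\n",
               size, read_data.capacity());
    }
    return 0;
}

One trade-off: std::vector::resize() value-initializes newly grown bytes, which is exactly the overhead the gguf_buffer comment in gguf-util.h exists to avoid, so this WIP appears to accept that one-time zeroing cost in exchange for simpler ownership and the usual data()/size() interface.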

gguf-util.h

Lines changed: 11 additions & 29 deletions
@@ -476,66 +476,48 @@ struct gguf_mlock {
 
 // Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
 struct gguf_buffer {
-    uint8_t * addr = NULL;
+    void * addr = NULL;
     size_t size = 0;
 
-    gguf_buffer() = default;
-
     void resize(size_t len) {
-#ifdef GGML_USE_METAL
         free(addr);
-        int result = posix_memalign((void **) &addr, getpagesize(), len);
-        if (result == 0) {
-            memset(addr, 0, len);
-        }
-        else {
-            addr = NULL;
-        }
+#ifdef GGML_USE_METAL
+        const int result = posix_memalign((void **) &addr, getpagesize(), len);
+        GGML_ASSERT(result == 0);
 #else
-        delete[] addr;
-        addr = new uint8_t[len];
+        addr = malloc(len);
 #endif
+        GGML_ASSERT(addr);
         size = len;
     }
 
     ~gguf_buffer() {
-#ifdef GGML_USE_METAL
         free(addr);
-#else
-        delete[] addr;
-#endif
        addr = NULL;
     }
-
-    // disable copy and move
-    gguf_buffer(const gguf_buffer&) = delete;
-    gguf_buffer(gguf_buffer&&) = delete;
-    gguf_buffer& operator=(const gguf_buffer&) = delete;
-    gguf_buffer& operator=(gguf_buffer&&) = delete;
 };
 
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 struct gguf_ctx_buffer {
     uint8_t * addr = NULL;
-    bool is_cuda;
     size_t size = 0;
 
-    gguf_ctx_buffer() = default;
+    bool is_cuda = false;
 
-    void resize(size_t size) {
+    void resize(size_t len) {
         free();
 
-        addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        addr = (uint8_t *) ggml_cuda_host_malloc(len);
         if (addr) {
            is_cuda = true;
         }
         else {
            // fall back to pageable memory
-            addr = new uint8_t[size];
+            addr = new uint8_t[len];
            is_cuda = false;
         }
-        this->size = size;
+        size = len;
     }
 
     void free() {
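
After this commit, the gguf_buffer that remains in gguf-util.h is a thin owner of an uninitialized allocation: page-aligned via posix_memalign() when GGML_USE_METAL is defined, plain malloc() otherwise, freed unconditionally in the destructor. The sketch below shows a hypothetical caller; the function name and sizes are illustrative and not taken from the repository.

// Hypothetical use of the slimmed-down gguf_buffer (illustrative only).
#include <cstddef>
#include <cstdio>
#include <cstring>

#include "gguf-util.h"   // assumed include providing gguf_buffer

static void fill_and_print(size_t len) {
    gguf_buffer buf;
    buf.resize(len);                    // uninitialized storage of len bytes
    memset(buf.addr, 0xAB, buf.size);   // the caller initializes what it needs
    printf("buffer of %zu bytes at %p\n", buf.size, buf.addr);
}                                       // destructor frees the allocation

int main() {
    fill_and_print(1 << 20);
    return 0;
}

Note that the explicitly deleted copy and move operations are removed in this WIP version, so the sketch assumes the buffer is never copied; copying one would free the same address twice.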
