@@ -3012,11 +3012,11 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
 // quantization
 //
 
-static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, gguf_buffer & output, const int nelements, const int nthread) {
-    if (output.size < nelements * sizeof(float)) {
-        output.resize(nelements * sizeof(float));
+static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
+    if (output.size() < nelements) {
+        output.resize(nelements);
     }
-    float * f32_output = (float *) output.addr;
+    float * f32_output = (float *) output.data();
 
     ggml_type_traits_t qtype;
     if (ggml_is_quantized(tensor.type)) {
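Note: the converted routine above goes on to dispatch the actual dequantization across nthread workers. A minimal single-threaded sketch of the same conversion follows, assuming the ggml type-traits API of this era (ggml_internal_get_type_traits and its to_float callback are assumptions not shown in the hunk and may differ by version):

// Sketch only, not the patched function: dequantize a raw tensor buffer into
// float32. ggml_internal_get_type_traits / to_float are assumed from ggml of
// this era.
#include <stdexcept>
#include <vector>
#include "ggml.h"

static std::vector<float> dequantize_to_f32(ggml_type type, const void * data, size_t nelements) {
    std::vector<float> out(nelements);
    if (type == GGML_TYPE_F32) {
        const float * src = (const float *) data;
        out.assign(src, src + nelements);                    // already f32, just copy
    } else if (ggml_is_quantized(type)) {
        ggml_type_traits_t qtype = ggml_internal_get_type_traits(type);
        if (qtype.to_float == NULL) {
            throw std::runtime_error("type has no to_float conversion");
        }
        qtype.to_float(data, out.data(), (int) nelements);   // dequantize the whole range
    } else {
        throw std::runtime_error("unsupported tensor type");
    }
    return out;
}

The patched signature also widens nelements from int to size_t, so the element count of large tensors cannot overflow.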
@@ -3134,10 +3134,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     };
 
     size_t idx = 0;
+
+    std::vector<uint8_t> read_data;
+    std::vector<uint8_t> work;
+
     for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
-        gguf_buffer read_data;
         read_data.resize(tensor.size);
-        tensor.data = read_data.addr;
+        tensor.data = read_data.data();
         model_loader->load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ",
@@ -3156,7 +3159,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
-        gguf_buffer work;
 
         if (!quantize) {
             new_type = tensor.type;
@@ -3214,35 +3216,36 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
 #endif
 
+            const size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
+
             float * f32_data;
-            size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
-            gguf_buffer f32_conv_buf;
+            std::vector<float> f32_conv_buf;
 
             if (tensor.type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor.data;
             } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
                 throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
             } else {
                 llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
-                f32_data = (float *) f32_conv_buf.addr;
+                f32_data = (float *) f32_conv_buf.data();
             }
 
             LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
             fflush(stdout);
 
             work.resize(nelements * 4); // upper bound on size
-            new_data = work.addr;
+            new_data = work.data();
             std::vector<int64_t> hist_cur(1 << 4, 0);
 
-            int chunk_size = 32 * 512;
+            const int chunk_size = 32 * 512;
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
             const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
             if (nthread_use < 2) {
                 new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
             } else {
                 size_t counter = 0;
                 new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size]() {
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
                     std::vector<int64_t> local_hist;
                     size_t local_size = 0;
                     while (true) {
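Note: the multi-threaded branch above hands out work through a shared counter guarded by a mutex: each worker claims the next chunk_size elements, quantizes them unlocked, then merges its local results back into the shared totals. A stripped-down sketch of just that dispatch pattern, with a hypothetical process_chunk callback standing in for ggml_quantize_chunk:

// Sketch of the shared-counter chunk dispatch used by the quantization loop.
// process_chunk is a hypothetical stand-in for the per-chunk quantization call.
#include <algorithm>
#include <cstddef>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

static void parallel_chunks(size_t nelements, int nthread,
                            const std::function<void(size_t first, size_t count)> & process_chunk) {
    const size_t chunk_size = 32 * 512;
    std::mutex mutex;
    size_t counter = 0;

    auto compute = [&]() {
        while (true) {
            size_t first;
            {
                std::lock_guard<std::mutex> lock(mutex); // claim the next chunk under the lock
                first = counter;
                if (first >= nelements) {
                    return;
                }
                counter += chunk_size;
            }
            process_chunk(first, std::min(chunk_size, nelements - first)); // work runs unlocked
        }
    };

    std::vector<std::thread> workers;
    for (int i = 1; i < nthread; i++) {
        workers.emplace_back(compute);
    }
    compute(); // the calling thread takes part as well
    for (auto & w : workers) {
        w.join();
    }
}

In the patch itself the lambda additionally keeps local_hist and local_size and folds them into hist_cur and new_size under the same mutex, so shared state is only touched briefly per chunk.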
@@ -3315,8 +3318,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }
 
-
-
 //
 // interface implementation
 //
@@ -3565,7 +3566,6 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
     LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
 
-
     // create a temporary ggml context to store the lora tensors
     // todo: calculate size from biggest possible tensor
     std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
@@ -3583,11 +3583,10 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         model_tensors.insert(kv);
     }
 
-
     // load base model
     std::unique_ptr<llama_model_loader> model_loader;
     ggml_context * base_ctx = NULL;
-    gguf_buffer base_buf;
+    std::vector<uint8_t> base_buf;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
         model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
@@ -3598,8 +3597,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         base_buf.resize(ctx_size);
 
         ggml_init_params base_params;
-        base_params.mem_size   = base_buf.size;
-        base_params.mem_buffer = base_buf.addr;
+        base_params.mem_size   = base_buf.size();
+        base_params.mem_buffer = base_buf.data();
         base_params.no_alloc   = model_loader->use_mmap;
 
         base_ctx = ggml_init(base_params);
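Note: the base_buf change works because ggml_init only needs a byte count and a pointer for its arena, which std::vector<uint8_t> provides via size() and data(). A minimal stand-alone sketch of that pattern (the 16 MB arena size is an arbitrary example value, not taken from the patch):

// Sketch: backing a ggml context with a std::vector<uint8_t> instead of a
// custom buffer type. The 16 MB arena size is an arbitrary example value.
#include <cstdint>
#include <vector>
#include "ggml.h"

int main() {
    std::vector<uint8_t> buf(16u * 1024u * 1024u);

    ggml_init_params params;
    params.mem_size   = buf.size(); // bytes available in the arena
    params.mem_buffer = buf.data(); // caller-owned memory; ggml does not free it
    params.no_alloc   = false;      // tensor data is allocated inside the arena

    ggml_context * ctx = ggml_init(params);
    if (ctx == NULL) {
        return 1;
    }

    // ... build tensors in ctx; buf must outlive the context ...

    ggml_free(ctx);
    return 0;
}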