@@ -3012,11 +3012,11 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
 // quantization
 //
 
-static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, gguf_buffer & output, const int nelements, const int nthread) {
-    if (output.size < nelements * sizeof(float)) {
-        output.resize(nelements * sizeof(float));
+static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
+    if (output.size() < nelements) {
+        output.resize(nelements);
     }
-    float * f32_output = (float *) output.addr;
+    float * f32_output = (float *) output.data();
 
     ggml_type_traits_t qtype;
     if (ggml_is_quantized(tensor.type)) {
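Note: the converted routine above goes on to dispatch the actual dequantization across nthread workers. A minimal single-threaded sketch of the same conversion follows, assuming the ggml type-traits API of this era (ggml_internal_get_type_traits and its to_float callback are assumptions not shown in the hunk and may differ by version):

// Sketch only, not the patched function: dequantize a raw tensor buffer into
// float32. ggml_internal_get_type_traits / to_float are assumed from ggml of
// this era.
#include <stdexcept>
#include <vector>
#include "ggml.h"

static std::vector<float> dequantize_to_f32(ggml_type type, const void * data, size_t nelements) {
    std::vector<float> out(nelements);
    if (type == GGML_TYPE_F32) {
        const float * src = (const float *) data;
        out.assign(src, src + nelements);                    // already f32, just copy
    } else if (ggml_is_quantized(type)) {
        ggml_type_traits_t qtype = ggml_internal_get_type_traits(type);
        if (qtype.to_float == NULL) {
            throw std::runtime_error("type has no to_float conversion");
        }
        qtype.to_float(data, out.data(), (int) nelements);   // dequantize the whole range
    } else {
        throw std::runtime_error("unsupported tensor type");
    }
    return out;
}

The patched signature also widens nelements from int to size_t, so the element count of large tensors cannot overflow.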
@@ -3134,10 +3134,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     };
 
     size_t idx = 0;
+
+    std::vector<uint8_t> read_data;
+    std::vector<uint8_t> work;
+
     for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
-        gguf_buffer read_data;
         read_data.resize(tensor.size);
-        tensor.data = read_data.addr;
+        tensor.data = read_data.data();
         model_loader->load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ",
@@ -3156,7 +3159,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
-        gguf_buffer work;
 
         if (!quantize) {
             new_type = tensor.type;
@@ -3214,35 +3216,36 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
 #endif
 
+            const size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
+
             float * f32_data;
-            size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
-            gguf_buffer f32_conv_buf;
+            std::vector<float> f32_conv_buf;
 
             if (tensor.type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor.data;
             } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
                 throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
             } else {
                 llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
-                f32_data = (float *) f32_conv_buf.addr;
+                f32_data = (float *) f32_conv_buf.data();
             }
 
             LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
             fflush(stdout);
 
             work.resize(nelements * 4); // upper bound on size
-            new_data = work.addr;
+            new_data = work.data();
             std::vector<int64_t> hist_cur(1 << 4, 0);
 
-            int chunk_size = 32 * 512;
+            const int chunk_size = 32 * 512;
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
             const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
             if (nthread_use < 2) {
                 new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
             } else {
                 size_t counter = 0;
                 new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size]() {
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
                     std::vector<int64_t> local_hist;
                     size_t local_size = 0;
                     while (true) {
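Note: the multi-threaded branch above hands out work through a shared counter guarded by a mutex: each worker claims the next chunk_size elements, quantizes them unlocked, then merges its local results back into the shared totals. A stripped-down sketch of just that dispatch pattern, with a hypothetical process_chunk callback standing in for ggml_quantize_chunk:

// Sketch of the shared-counter chunk dispatch used by the quantization loop.
// process_chunk is a hypothetical stand-in for the per-chunk quantization call.
#include <algorithm>
#include <cstddef>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

static void parallel_chunks(size_t nelements, int nthread,
                            const std::function<void(size_t first, size_t count)> & process_chunk) {
    const size_t chunk_size = 32 * 512;
    std::mutex mutex;
    size_t counter = 0;

    auto compute = [&]() {
        while (true) {
            size_t first;
            {
                std::lock_guard<std::mutex> lock(mutex); // claim the next chunk under the lock
                first = counter;
                if (first >= nelements) {
                    return;
                }
                counter += chunk_size;
            }
            process_chunk(first, std::min(chunk_size, nelements - first)); // work runs unlocked
        }
    };

    std::vector<std::thread> workers;
    for (int i = 1; i < nthread; i++) {
        workers.emplace_back(compute);
    }
    compute(); // the calling thread takes part as well
    for (auto & w : workers) {
        w.join();
    }
}

In the patch itself the lambda additionally keeps local_hist and local_size and folds them into hist_cur and new_size under the same mutex, so shared state is only touched briefly per chunk.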
@@ -3315,8 +3318,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }
 
-
-
 //
 // interface implementation
 //
@@ -3565,7 +3566,6 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
     LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
 
-
     // create a temporary ggml context to store the lora tensors
     // todo: calculate size from biggest possible tensor
     std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
@@ -3583,11 +3583,10 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         model_tensors.insert(kv);
     }
 
-
     // load base model
     std::unique_ptr<llama_model_loader> model_loader;
     ggml_context * base_ctx = NULL;
-    gguf_buffer base_buf;
+    std::vector<uint8_t> base_buf;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
         model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
@@ -3598,8 +3597,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         base_buf.resize(ctx_size);
 
         ggml_init_params base_params;
-        base_params.mem_size   = base_buf.size;
-        base_params.mem_buffer = base_buf.addr;
+        base_params.mem_size   = base_buf.size();
+        base_params.mem_buffer = base_buf.data();
         base_params.no_alloc   = model_loader->use_mmap;
 
         base_ctx = ggml_init(base_params);
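Note: the base_buf change works because ggml_init only needs a byte count and a pointer for its arena, which std::vector<uint8_t> provides via size() and data(). A minimal stand-alone sketch of that pattern (the 16 MB arena size is an arbitrary example value, not taken from the patch):

// Sketch: backing a ggml context with a std::vector<uint8_t> instead of a
// custom buffer type. The 16 MB arena size is an arbitrary example value.
#include <cstdint>
#include <vector>
#include "ggml.h"

int main() {
    std::vector<uint8_t> buf(16u * 1024u * 1024u);

    ggml_init_params params;
    params.mem_size   = buf.size(); // bytes available in the arena
    params.mem_buffer = buf.data(); // caller-owned memory; ggml does not free it
    params.no_alloc   = false;      // tensor data is allocated inside the arena

    ggml_context * ctx = ggml_init(params);
    if (ctx == NULL) {
        return 1;
    }

    // ... build tensors in ctx; buf must outlive the context ...

    ggml_free(ctx);
    return 0;
}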