From 91e7e0ff175c18debd20149cdbfcf205473810c1 Mon Sep 17 00:00:00 2001 From: ngxson Date: Sun, 25 Feb 2024 12:56:34 +0100 Subject: [PATCH 1/8] refactor work queue related stuff --- examples/server/server.cpp | 66 +++++++++++++++++++++++++------------- examples/server/utils.hpp | 31 +++++++++++------- 2 files changed, 64 insertions(+), 33 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 19a8c1067e72a..e1e6ebc576f1b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1021,13 +1021,23 @@ struct llama_server_context return slot.images.size() > 0; } - void send_error(task_server& task, const std::string &error) + void send_error(task_server &task, const std::string &error) { - LOG_TEE("task %i - error: %s\n", task.id, error.c_str()); + send_error(task.id, task.multitask_id, error); + } + + void send_error(llama_client_slot &slot, const std::string &error) + { + send_error(slot.task_id, slot.multitask_id, error); + } + + void send_error(int task_id, int multitask_id, const std::string &error) + { + LOG_TEE("task %i - error: %s\n", task_id, error.c_str()); task_result res; - res.id = task.id; - res.multitask_id = task.multitask_id; - res.stop = false; + res.id = task_id; + res.multitask_id = multitask_id; + res.stop = true; res.error = true; res.result_json = { { "content", error } }; queue_results.send(res); @@ -1466,7 +1476,9 @@ struct llama_server_context queue_results.send(result); } - bool update_slots() { + void run_slots() { + bool has_next_response = false; // whether to schedule next slot run, to generate next token + if (system_need_update) { LOG_TEE("updating system prompt\n"); @@ -1482,14 +1494,9 @@ struct llama_server_context LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n"); kv_cache_clear(); } - return true; + return; } - task_server task; - task.type = TASK_TYPE_NEXT_RESPONSE; - task.target_id = -1; - queue_tasks.post(task); - for (llama_client_slot &slot : slots) { if (slot.ga_n == 1) @@ -1737,7 +1744,8 @@ struct llama_server_context if (has_images && !ingest_images(slot, n_batch)) { LOG_TEE("failed processing images\n"); - return false; + send_error(slot, "failed processing images"); + continue; } // extract the logits only for the last token @@ -1755,7 +1763,6 @@ struct llama_server_context if (batch.n_tokens == 0) { all_slots_are_idle = true; - return true; } for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) @@ -1812,7 +1819,13 @@ struct llama_server_context { // if you get here, it means the KV cache is full - try increasing it via the context size LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); - return false; + for (auto & slot : slots) + { + send_error(slot, "Input prompt is too big compared to KV size. 
Please try increasing KV size."); + slot.release(); + } + has_next_response = false; + break; } LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2); @@ -1873,14 +1886,23 @@ struct llama_server_context send_final_response(slot); } + // if slot is not yet finish its work, we schedule next run + if (slot.has_next_token) + { + has_next_response = true; + } + slot.i_batch = -1; } } - return true; - } - void run_on_all_tasks_finished() { - update_slots(); + if (has_next_response) { + LOG_VERBOSE("schedule next slot run", {}); + task_server task; + task.type = TASK_TYPE_NEXT_RESPONSE; + task.target_id = -1; + queue_tasks.post(task); + } } }; @@ -3210,7 +3232,7 @@ int main(int argc, char **argv) bool running = true; while (running) { - running = llama.update_slots(); + running = llama.run_slots(); } }*/ //); @@ -3232,8 +3254,8 @@ int main(int argc, char **argv) &llama_server_context::process_single_task, &llama, std::placeholders::_1)); llama.queue_tasks.on_finish_multitask(std::bind( &llama_server_context::on_finish_multitask, &llama, std::placeholders::_1)); - llama.queue_tasks.on_all_tasks_finished(std::bind( - &llama_server_context::run_on_all_tasks_finished, &llama)); + llama.queue_tasks.on_run_slots(std::bind( + &llama_server_context::run_slots, &llama)); llama.queue_results.on_multitask_update(std::bind( &llama_server_queue::update_multitask, &llama.queue_tasks, diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 88545eb6931d0..8cc63e7d43845 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -227,7 +227,7 @@ struct llama_server_queue { // callback functions std::function callback_new_task; std::function callback_finish_multitask; - std::function callback_all_task_finished; + std::function callback_run_slots; // Add a new task to the end of the queue int post(task_server task) { @@ -257,14 +257,14 @@ struct llama_server_queue { callback_new_task = callback; } - // Register function to process a multitask + // Register function to process a multitask when it is finished void on_finish_multitask(std::function callback) { callback_finish_multitask = callback; } - // Register the function to be called when the batch of tasks is finished - void on_all_tasks_finished(std::function callback) { - callback_all_task_finished = callback; + // Register the function to be called when all slots data is ready to be processed + void on_run_slots(std::function callback) { + callback_run_slots = callback; } // Call when the state of one slot is changed @@ -286,7 +286,13 @@ struct llama_server_queue { condition_tasks.notify_all(); } - // Start the main loop. + /** + * Main loop consists of these steps: + * - Wait until a new task arrives + * - Process the task (i.e. 
maybe copy data into slot) + * - Check if multitask is finished + * - Run all slots + */ void start_loop() { running = true; while (true) { @@ -306,8 +312,8 @@ struct llama_server_queue { LOG_VERBOSE("callback_new_task", {}); callback_new_task(task); } - LOG_VERBOSE("callback_all_task_finished", {}); - // process and update all the multitasks + LOG_VERBOSE("update_multitasks", {}); + // check if we have any finished multitasks auto queue_iterator = queue_multitasks.begin(); while (queue_iterator != queue_multitasks.end()) { @@ -324,8 +330,9 @@ struct llama_server_queue { ++queue_iterator; } } - // all tasks in the current loop is finished - callback_all_task_finished(); + // all tasks in the current loop is processed, slots data is now ready + LOG_VERBOSE("callback_run_slots", {}); + callback_run_slots(); } LOG_VERBOSE("wait for new task", {}); // wait for new task @@ -401,7 +408,9 @@ struct llama_server_response { condition_results.wait(lock, [&]{ return !queue_results.empty(); }); - LOG_VERBOSE("condition_results unblock", {}); + LOG_VERBOSE("condition_results unblock", { + {"data", queue_results[0].result_json}, + }); for (int i = 0; i < (int) queue_results.size(); i++) { From 3b2dea18c3a8c7a662bce230587dcbf40237d245 Mon Sep 17 00:00:00 2001 From: ngxson Date: Sun, 25 Feb 2024 13:18:55 +0100 Subject: [PATCH 2/8] condition_results.notify_all --- examples/server/utils.hpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 8cc63e7d43845..0bac4cef25559 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -408,9 +408,7 @@ struct llama_server_response { condition_results.wait(lock, [&]{ return !queue_results.empty(); }); - LOG_VERBOSE("condition_results unblock", { - {"data", queue_results[0].result_json}, - }); + LOG_VERBOSE("condition_results unblock", {}); for (int i = 0; i < (int) queue_results.size(); i++) { @@ -418,6 +416,10 @@ struct llama_server_response { { assert(queue_results[i].multitask_id == -1); task_result res = queue_results[i]; + LOG_VERBOSE("got task result", { + {"task_id", res.id}, + {"data", res.result_json}, + }); queue_results.erase(queue_results.begin() + i); return res; } @@ -450,7 +452,7 @@ struct llama_server_response { { LOG_VERBOSE("queue_results.push_back", {}); queue_results.push_back(result); - condition_results.notify_one(); + condition_results.notify_all(); return; } } From c420e05383e7ac6c654178a8e44c218192a87eb3 Mon Sep 17 00:00:00 2001 From: ngxson Date: Sun, 25 Feb 2024 14:58:38 +0100 Subject: [PATCH 3/8] remove_waiting_task_id: also clean pending results --- examples/server/utils.hpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 0bac4cef25559..7cfad20a8a560 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -390,14 +390,24 @@ struct llama_server_response { std::mutex mutex_results; std::condition_variable condition_results; + // add the task_id to the list of tasks waiting for response void add_waiting_task_id(int task_id) { std::unique_lock lock(mutex_results); waiting_task_ids.insert(task_id); } + // when thr request is finished, we can remove task associated with it void remove_waiting_task_id(int task_id) { std::unique_lock lock(mutex_results); waiting_task_ids.erase(task_id); + // also clear pending results, just in case + for (int i = 0; i < (int) queue_results.size(); i++) + { + if (queue_results[i].id == task_id) + { + 
queue_results.erase(queue_results.begin() + i); + } + } } // This function blocks the thread until there is a response for this task_id @@ -416,10 +426,7 @@ struct llama_server_response { { assert(queue_results[i].multitask_id == -1); task_result res = queue_results[i]; - LOG_VERBOSE("got task result", { - {"task_id", res.id}, - {"data", res.result_json}, - }); + LOG_VERBOSE("got task result", {{"task_id", res.id}}); queue_results.erase(queue_results.begin() + i); return res; } From a5603ded45938d1b3f6e0029dcc978eca90eeae4 Mon Sep 17 00:00:00 2001 From: ngxson Date: Sun, 25 Feb 2024 15:44:03 +0100 Subject: [PATCH 4/8] move llama_client_slot to utils.hpp --- examples/server/server.cpp | 298 +-------------- .../server/tests/features/parallel.feature | 2 +- .../server/tests/features/security.feature | 2 +- examples/server/tests/features/server.feature | 2 +- .../tests/features/wrong_usages.feature | 2 +- examples/server/utils.hpp | 359 ++++++++++++++++-- 6 files changed, 342 insertions(+), 323 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6136c39031631..2a8673d06b457 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -144,238 +144,6 @@ static json probs_vector_to_json(const llama_context *ctx, const std::vector cache_tokens; - std::vector generated_token_probs; - - bool infill = false; - bool embedding = false; - bool has_next_token = true; - bool truncated = false; - bool stopped_eos = false; - bool stopped_word = false; - bool stopped_limit = false; - - bool oaicompat = false; - std::string oaicompat_model; - - std::string stopping_word; - - // sampling - struct llama_sampling_params sparams; - llama_sampling_context *ctx_sampling = nullptr; - - int32_t ga_i = 0; // group-attention state - int32_t ga_n = 1; // group-attention factor - int32_t ga_w = 512; // group-attention width - - int32_t n_past_se = 0; // self-extend - - // multimodal - std::vector images; - - // stats - size_t sent_count = 0; - size_t sent_token_probs_index = 0; - - int64_t t_start_process_prompt; - int64_t t_start_genereration; - - double t_prompt_processing; // ms - double t_token_generation; // ms - - // multitasks - int multitask_id = -1; - - void reset() { - num_prompt_tokens = 0; - generated_text = ""; - truncated = false; - stopped_eos = false; - stopped_word = false; - stopped_limit = false; - stopping_word = ""; - n_past = 0; - sent_count = 0; - sent_token_probs_index = 0; - infill = false; - ga_i = 0; - n_past_se = 0; - - generated_token_probs.clear(); - - for (slot_image & img : images) - { - free(img.image_embedding); - if (img.img_data) { - clip_image_u8_free(img.img_data); - } - img.prefix_prompt = ""; - } - - images.clear(); - } - - bool has_budget(gpt_params &global_params) { - if (params.n_predict == -1 && global_params.n_predict == -1) - { - return true; // limitless - } - - n_remaining = -1; - - if (params.n_predict != -1) - { - n_remaining = params.n_predict - n_decoded; - } - else if (global_params.n_predict != -1) - { - n_remaining = global_params.n_predict - n_decoded; - } - - return n_remaining > 0; // no budget - } - - bool available() const { - return state == IDLE && command == NONE; - } - - bool is_processing() const { - return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING; - } - - void add_token_string(const completion_token_output &token) { - if (command == RELEASE) - { - return; - } - cache_tokens.push_back(token.tok); - generated_token_probs.push_back(token); - } - - void release() { - if (state == 
PROCESSING) - { - t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; - command = RELEASE; - } - } - - json get_formated_timings() { - return json - { - {"prompt_n", num_prompt_tokens_processed}, - {"prompt_ms", t_prompt_processing}, - {"prompt_per_token_ms", t_prompt_processing / num_prompt_tokens_processed}, - {"prompt_per_second", 1e3 / t_prompt_processing * num_prompt_tokens_processed}, - - {"predicted_n", n_decoded}, - {"predicted_ms", t_token_generation}, - {"predicted_per_token_ms", t_token_generation / n_decoded}, - {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, - }; - } - - void print_timings() const { - char buffer[512]; - double t_token = t_prompt_processing / num_prompt_tokens_processed; - double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed; - sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", - t_prompt_processing, num_prompt_tokens_processed, - t_token, n_tokens_second); - LOG_INFO(buffer, { - {"slot_id", id}, - {"task_id", task_id}, - {"t_prompt_processing", t_prompt_processing}, - {"num_prompt_tokens_processed", num_prompt_tokens_processed}, - {"t_token", t_token}, - {"n_tokens_second", n_tokens_second}, - }); - - t_token = t_token_generation / n_decoded; - n_tokens_second = 1e3 / t_token_generation * n_decoded; - sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", - t_token_generation, n_decoded, - t_token, n_tokens_second); - LOG_INFO(buffer, { - {"slot_id", id}, - {"task_id", task_id}, - {"t_token_generation", t_token_generation}, - {"n_decoded", n_decoded}, - {"t_token", t_token}, - {"n_tokens_second", n_tokens_second}, - }); - - sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation); - LOG_INFO(buffer, { - {"slot_id", id}, - {"task_id", task_id}, - {"t_prompt_processing", t_prompt_processing}, - {"t_token_generation", t_token_generation}, - {"t_total", t_prompt_processing + t_token_generation}, - }); - } -}; - -struct llama_metrics { - uint64_t n_prompt_tokens_processed_total = 0; - uint64_t n_tokens_predicted_total = 0; - - uint64_t n_prompt_tokens_processed = 0; - uint64_t t_prompt_processing = 0; - - uint64_t n_tokens_predicted = 0; - uint64_t t_tokens_generation = 0; - - - void on_prompt_eval(const llama_client_slot &slot) { - n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed; - - n_prompt_tokens_processed += slot.num_prompt_tokens_processed; - t_prompt_processing += slot.t_prompt_processing; - } - - void on_prediction(const llama_client_slot &slot) { - n_tokens_predicted_total += slot.n_decoded; - - n_tokens_predicted += slot.n_decoded; - t_tokens_generation += slot.t_token_generation; - } - - void reset_bucket() { - n_prompt_tokens_processed = 0; - t_prompt_processing = 0; - n_tokens_predicted = 0; - t_tokens_generation = 0; - } -}; - struct llama_server_context { llama_model *model = nullptr; @@ -1795,21 +1563,8 @@ struct llama_server_context if (slot.ga_n != 1) { - int ga_i = 0; - int32_t ga_n = slot.ga_n; - int32_t ga_w = slot.ga_w; - int32_t slot_npast = 0; - for (int k = 0; k < slot.n_past; ++k) - { - while (slot_npast >= ga_i + ga_w) { - const int bd = (ga_w/ga_n)*(ga_n - 1); - slot_npast -= bd; - ga_i += ga_w/ga_n; - } - slot_npast++; - } - slot.n_past_se = slot_npast; - slot.ga_i = ga_i; + // context extension via Self-Extend + slot.grp_attn_update_params(); } LOG_INFO("slot progression", { @@ -1855,22 +1610,16 @@ struct 
llama_server_context // process the prefix of first image std::vector prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens; - int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; - - int32_t ga_i = slot.ga_i; - int32_t ga_n = slot.ga_n; - int32_t ga_w = slot.ga_w; + int32_t slot_npast = slot.n_past; for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past) { if (slot.ga_n != 1) { - while (slot_npast >= ga_i + ga_w) { - const int bd = (ga_w/ga_n)*(ga_n - 1); - slot_npast -= bd; - ga_i += ga_w/ga_n; - } + // context extension via Self-Extend + slot_npast = slot.grp_attn_calc_npast(); } + llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false); slot_npast++; } @@ -1902,6 +1651,7 @@ struct llama_server_context all_slots_are_idle = true; } + // loop of n_batch for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); @@ -1911,28 +1661,9 @@ struct llama_server_context if (slot.ga_n != 1) { // context extension via Self-Extend - while (slot.n_past_se >= slot.ga_i + slot.ga_w) - { - const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w; - const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); - const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; - - LOG_TEE("\n"); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); - LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); - - llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd); - llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n); - llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd); - - slot.n_past_se -= bd; - - slot.ga_i += slot.ga_w / slot.ga_n; - - LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i); - } - slot.n_past_se += n_tokens; + // TODO @ngxson: What happen if we're retrying with smaller n_batch? 
+ // By the second time we retry, "grp_attn_shift" has already been called + slot.grp_attn_shift(ctx, n_tokens); } } @@ -1962,7 +1693,7 @@ struct llama_server_context slot.release(); } has_next_response = false; - break; + break; // break loop of n_batch } LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2); @@ -1970,14 +1701,15 @@ struct llama_server_context // retry with half the batch size to try to find a free slot in the KV cache n_batch /= 2; i -= n_batch; - continue; + continue; // continue loop of n_batch } + // loop of slots for (auto & slot : slots) { if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) { - continue; + continue; // continue loop of slots } // prompt evaluated for embedding @@ -1986,7 +1718,7 @@ struct llama_server_context send_embedding(slot); slot.release(); slot.i_batch = -1; - continue; + continue; // continue loop of slots } completion_token_output result; diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature index c85f9de1d9a52..6fe1e05dee113 100644 --- a/examples/server/tests/features/parallel.feature +++ b/examples/server/tests/features/parallel.feature @@ -2,7 +2,7 @@ Feature: Parallel Background: Server startup - Given a server listening on localhost:8080 + Given a server listening on 0.0.0.0:8080 And a model file stories260K.gguf And a model alias tinyllama-2 And 42 as server seed diff --git a/examples/server/tests/features/security.feature b/examples/server/tests/features/security.feature index db06d39775c05..dba0849d12ccc 100644 --- a/examples/server/tests/features/security.feature +++ b/examples/server/tests/features/security.feature @@ -2,7 +2,7 @@ Feature: Security Background: Server startup with an api key defined - Given a server listening on localhost:8080 + Given a server listening on 0.0.0.0:8080 And a model file stories260K.gguf And a server api key llama.cpp Then the server is starting diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index b571582a7857e..10941972ebbc9 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -2,7 +2,7 @@ Feature: llama.cpp server Background: Server startup - Given a server listening on localhost:8080 + Given a server listening on 0.0.0.0:8080 And a model file stories260K.gguf And a model alias tinyllama-2 And 42 as server seed diff --git a/examples/server/tests/features/wrong_usages.feature b/examples/server/tests/features/wrong_usages.feature index e228b2371ccce..f4fc6a8a21cb3 100644 --- a/examples/server/tests/features/wrong_usages.feature +++ b/examples/server/tests/features/wrong_usages.feature @@ -6,7 +6,7 @@ Feature: Wrong usage of llama.cpp server # to cap the number of tokens any completion request can generate # or pass n_predict/max_tokens in the request. Scenario: Infinite loop - Given a server listening on localhost:8080 + Given a server listening on 0.0.0.0:8080 And a model file stories260K.gguf # Uncomment below to fix the issue #And 64 server max tokens to predict diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index bfe00d3a197e4..a5a6ab9f8f07b 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -37,9 +37,49 @@ extern bool server_log_json; #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_INFO( MSG, ...) 
server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) -// -// parallel -// +static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) +{ + std::stringstream ss_tid; + ss_tid << std::this_thread::get_id(); + json log = nlohmann::ordered_json{ + {"tid", ss_tid.str()}, + {"timestamp", time(nullptr)}, + }; + + if (server_log_json) { + log.merge_patch( + { + {"level", level}, + {"function", function}, + {"line", line}, + {"msg", message}, + }); + if (!extra.empty()) { + log.merge_patch(extra); + } + + std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush; + } else { + char buf[1024]; + snprintf(buf, 1024, "%4s [%24s] %s", level, function, message); + + if (!extra.empty()) { + log.merge_patch(extra); + } + std::stringstream ss; + ss << buf << " |"; + for (const auto& el : log.items()) + { + const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); + snprintf(buf, 1024, " %s=%s", el.key().c_str(), value.c_str()); + ss << buf; + } + + const std::string str = ss.str(); + printf("%.*s\n", (int)str.size(), str.data()); + fflush(stdout); + } +} enum server_state { SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet @@ -134,49 +174,296 @@ struct completion_token_output std::string text_to_send; }; -static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) +struct llama_client_slot { - std::stringstream ss_tid; - ss_tid << std::this_thread::get_id(); - json log = nlohmann::ordered_json{ - {"tid", ss_tid.str()}, - {"timestamp", time(nullptr)}, - }; + int id; + int task_id = -1; - if (server_log_json) { - log.merge_patch( - { - {"level", level}, - {"function", function}, - {"line", line}, - {"msg", message}, - }); - if (!extra.empty()) { - log.merge_patch(extra); + struct slot_params params; + + slot_state state = IDLE; + slot_command command = NONE; + + // used to determine the slot that has been used the longest + int64_t t_last_used = -1; + + // generation props + int32_t n_ctx = 0; // context size per slot + int32_t n_past = 0; + int32_t n_decoded = 0; + int32_t n_remaining = -1; + int32_t i_batch = -1; + int32_t n_predict = -1; + + int32_t num_prompt_tokens = 0; + int32_t num_prompt_tokens_processed = 0; + + json prompt; + std::string generated_text; + llama_token sampled; + std::vector cache_tokens; + std::vector generated_token_probs; + + bool infill = false; + bool embedding = false; + bool has_next_token = true; + bool truncated = false; + bool stopped_eos = false; + bool stopped_word = false; + bool stopped_limit = false; + + bool oaicompat = false; + std::string oaicompat_model; + + std::string stopping_word; + + // sampling + struct llama_sampling_params sparams; + llama_sampling_context *ctx_sampling = nullptr; + + int32_t ga_i = 0; // group-attention state + int32_t ga_n = 1; // group-attention factor + int32_t ga_w = 512; // group-attention width + + int32_t n_past_se = 0; // self-extend + + // multimodal + std::vector images; + + // stats + size_t sent_count = 0; + size_t sent_token_probs_index = 0; + + int64_t t_start_process_prompt; + int64_t t_start_genereration; + + double t_prompt_processing; // ms + double t_token_generation; // ms + + // multitasks + int multitask_id = -1; + + void reset() { + num_prompt_tokens = 0; + generated_text = ""; + truncated = false; + stopped_eos = false; + stopped_word = false; + stopped_limit = 
false; + stopping_word = ""; + n_past = 0; + sent_count = 0; + sent_token_probs_index = 0; + infill = false; + ga_i = 0; + n_past_se = 0; + + generated_token_probs.clear(); + + for (slot_image & img : images) + { + free(img.image_embedding); + if (img.img_data) { + clip_image_u8_free(img.img_data); + } + img.prefix_prompt = ""; } - std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush; - } else { - char buf[1024]; - snprintf(buf, 1024, "%4s [%24s] %s", level, function, message); + images.clear(); + } - if (!extra.empty()) { - log.merge_patch(extra); + bool has_budget(gpt_params &global_params) { + if (params.n_predict == -1 && global_params.n_predict == -1) + { + return true; // limitless } - std::stringstream ss; - ss << buf << " |"; - for (const auto& el : log.items()) + + n_remaining = -1; + + if (params.n_predict != -1) { - const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); - snprintf(buf, 1024, " %s=%s", el.key().c_str(), value.c_str()); - ss << buf; + n_remaining = params.n_predict - n_decoded; + } + else if (global_params.n_predict != -1) + { + n_remaining = global_params.n_predict - n_decoded; } - const std::string str = ss.str(); - printf("%.*s\n", (int)str.size(), str.data()); - fflush(stdout); + return n_remaining > 0; // no budget } -} + + bool available() const { + return state == IDLE && command == NONE; + } + + bool is_processing() const { + return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING; + } + + void add_token_string(const completion_token_output &token) { + if (command == RELEASE) + { + return; + } + cache_tokens.push_back(token.tok); + generated_token_probs.push_back(token); + } + + void release() { + if (state == PROCESSING) + { + t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; + command = RELEASE; + } + } + + json get_formated_timings() { + return json + { + {"prompt_n", num_prompt_tokens_processed}, + {"prompt_ms", t_prompt_processing}, + {"prompt_per_token_ms", t_prompt_processing / num_prompt_tokens_processed}, + {"prompt_per_second", 1e3 / t_prompt_processing * num_prompt_tokens_processed}, + + {"predicted_n", n_decoded}, + {"predicted_ms", t_token_generation}, + {"predicted_per_token_ms", t_token_generation / n_decoded}, + {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, + }; + } + + void print_timings() const { + char buffer[512]; + double t_token = t_prompt_processing / num_prompt_tokens_processed; + double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed; + sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", + t_prompt_processing, num_prompt_tokens_processed, + t_token, n_tokens_second); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_prompt_processing", t_prompt_processing}, + {"num_prompt_tokens_processed", num_prompt_tokens_processed}, + {"t_token", t_token}, + {"n_tokens_second", n_tokens_second}, + }); + + t_token = t_token_generation / n_decoded; + n_tokens_second = 1e3 / t_token_generation * n_decoded; + sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", + t_token_generation, n_decoded, + t_token, n_tokens_second); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_token_generation", t_token_generation}, + {"n_decoded", n_decoded}, + {"t_token", t_token}, + {"n_tokens_second", n_tokens_second}, + }); + + sprintf(buffer, " total time = %10.2f 
ms", t_prompt_processing + t_token_generation); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_prompt_processing", t_prompt_processing}, + {"t_token_generation", t_token_generation}, + {"t_total", t_prompt_processing + t_token_generation}, + }); + } + + // context extension via Self-Extend + void grp_attn_update_params() { + int grpa_i = 0; + // copy to local variables + int32_t grpa_n = ga_n; + int32_t grpa_w = ga_w; + int32_t slot_npast = 0; + for (int k = 0; k < n_past; ++k) + { + while (slot_npast >= grpa_i + grpa_w) { + const int bd = (grpa_w/grpa_n)*(grpa_n - 1); + slot_npast -= bd; + grpa_i += grpa_w/grpa_n; + } + slot_npast++; + } + n_past_se = slot_npast; + ga_i = grpa_i; + } + + int32_t grp_attn_calc_npast() { + int32_t slot_npast = n_past_se > 0 ? n_past_se : n_past; + // copy to local variables + int32_t grpa_i = ga_i; + int32_t grpa_n = ga_n; + int32_t grpa_w = ga_w; + while (slot_npast >= grpa_i + grpa_w) { + const int bd = (grpa_w/grpa_n)*(grpa_n - 1); + slot_npast -= bd; + grpa_i += grpa_w/grpa_n; + } + return slot_npast; + } + + void grp_attn_shift(llama_context * ctx, const int32_t n_tokens) { + while (n_past_se >= ga_i + ga_w) + { + const int ib = (ga_n * ga_i) / ga_w; + const int bd = (ga_w / ga_n) * (ga_n - 1); + const int dd = (ga_w / ga_n) - ib * bd - ga_w; + + LOG_TEE("\n"); + LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past_se, ib * bd, ga_i + ib * bd, n_past_se + ib * bd); + LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib * bd, ga_i + ib * bd + ga_w, ga_n, (ga_i + ib * bd) / ga_n, (ga_i + ib * bd + ga_w) / ga_n); + LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib * bd + ga_w, n_past_se + ib * bd, dd, ga_i + ib * bd + ga_w + dd, n_past_se + ib * bd + dd); + + llama_kv_cache_seq_shift(ctx, id, ga_i, n_past_se, ib * bd); + llama_kv_cache_seq_div(ctx, id, ga_i + ib * bd, ga_i + ib * bd + ga_w,ga_n); + llama_kv_cache_seq_shift(ctx, id, ga_i + ib * bd + ga_w,n_past_se + ib * bd, dd); + + n_past_se -= bd; + + ga_i += ga_w / ga_n; + + LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past_se + bd, n_past_se, ga_i); + } + n_past_se += n_tokens; + } +}; + +struct llama_metrics { + uint64_t n_prompt_tokens_processed_total = 0; + uint64_t n_tokens_predicted_total = 0; + + uint64_t n_prompt_tokens_processed = 0; + uint64_t t_prompt_processing = 0; + + uint64_t n_tokens_predicted = 0; + uint64_t t_tokens_generation = 0; + + + void on_prompt_eval(const llama_client_slot &slot) { + n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed; + + n_prompt_tokens_processed += slot.num_prompt_tokens_processed; + t_prompt_processing += slot.t_prompt_processing; + } + + void on_prediction(const llama_client_slot &slot) { + n_tokens_predicted_total += slot.n_decoded; + + n_tokens_predicted += slot.n_decoded; + t_tokens_generation += slot.t_token_generation; + } + + void reset_bucket() { + n_prompt_tokens_processed = 0; + t_prompt_processing = 0; + n_tokens_predicted = 0; + t_tokens_generation = 0; + } +}; // // server utils From 624214aedf524d45313a29f8e66ea9302a1e480b Mon Sep 17 00:00:00 2001 From: ngxson Date: Sun, 25 Feb 2024 15:45:10 +0100 Subject: [PATCH 5/8] revert test addr change --- examples/server/tests/features/parallel.feature | 2 +- examples/server/tests/features/security.feature | 2 +- examples/server/tests/features/server.feature | 2 +- examples/server/tests/features/wrong_usages.feature | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature index 6fe1e05dee113..c85f9de1d9a52 100644 --- a/examples/server/tests/features/parallel.feature +++ b/examples/server/tests/features/parallel.feature @@ -2,7 +2,7 @@ Feature: Parallel Background: Server startup - Given a server listening on 0.0.0.0:8080 + Given a server listening on localhost:8080 And a model file stories260K.gguf And a model alias tinyllama-2 And 42 as server seed diff --git a/examples/server/tests/features/security.feature b/examples/server/tests/features/security.feature index dba0849d12ccc..db06d39775c05 100644 --- a/examples/server/tests/features/security.feature +++ b/examples/server/tests/features/security.feature @@ -2,7 +2,7 @@ Feature: Security Background: Server startup with an api key defined - Given a server listening on 0.0.0.0:8080 + Given a server listening on localhost:8080 And a model file stories260K.gguf And a server api key llama.cpp Then the server is starting diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index 10941972ebbc9..b571582a7857e 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -2,7 +2,7 @@ Feature: llama.cpp server Background: Server startup - Given a server listening on 0.0.0.0:8080 + Given a server listening on localhost:8080 And a model file stories260K.gguf And a model alias tinyllama-2 And 42 as server seed diff --git a/examples/server/tests/features/wrong_usages.feature b/examples/server/tests/features/wrong_usages.feature index f4fc6a8a21cb3..e228b2371ccce 100644 --- a/examples/server/tests/features/wrong_usages.feature +++ b/examples/server/tests/features/wrong_usages.feature @@ -6,7 +6,7 @@ Feature: Wrong usage of llama.cpp server # to cap the number of tokens any completion request can generate # or pass n_predict/max_tokens in the request. 
Scenario: Infinite loop - Given a server listening on 0.0.0.0:8080 + Given a server listening on localhost:8080 And a model file stories260K.gguf # Uncomment below to fix the issue #And 64 server max tokens to predict From 72a8d59d484cd1f5fcdb24ebe8f978c47c91a3ed Mon Sep 17 00:00:00 2001 From: ngxson Date: Sun, 25 Feb 2024 21:41:32 +0100 Subject: [PATCH 6/8] move llama_client_slot back to server.cpp --- examples/server/server.cpp | 316 ++++++++++++++++++++++++++++++++++--- examples/server/utils.hpp | 291 ---------------------------------- 2 files changed, 291 insertions(+), 316 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 84e02ec023203..a647b9e2584e3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -144,6 +144,297 @@ static json probs_vector_to_json(const llama_context *ctx, const std::vector cache_tokens; + std::vector generated_token_probs; + + bool infill = false; + bool embedding = false; + bool has_next_token = true; + bool truncated = false; + bool stopped_eos = false; + bool stopped_word = false; + bool stopped_limit = false; + + bool oaicompat = false; + std::string oaicompat_model; + + std::string stopping_word; + + // sampling + struct llama_sampling_params sparams; + llama_sampling_context *ctx_sampling = nullptr; + + int32_t ga_i = 0; // group-attention state + int32_t ga_n = 1; // group-attention factor + int32_t ga_w = 512; // group-attention width + + int32_t n_past_se = 0; // self-extend + + // multimodal + std::vector images; + + // stats + size_t sent_count = 0; + size_t sent_token_probs_index = 0; + + int64_t t_start_process_prompt; + int64_t t_start_genereration; + + double t_prompt_processing; // ms + double t_token_generation; // ms + + // multitasks + int multitask_id = -1; + + void reset() { + num_prompt_tokens = 0; + generated_text = ""; + truncated = false; + stopped_eos = false; + stopped_word = false; + stopped_limit = false; + stopping_word = ""; + n_past = 0; + sent_count = 0; + sent_token_probs_index = 0; + infill = false; + ga_i = 0; + n_past_se = 0; + + generated_token_probs.clear(); + + for (slot_image & img : images) + { + free(img.image_embedding); + if (img.img_data) { + clip_image_u8_free(img.img_data); + } + img.prefix_prompt = ""; + } + + images.clear(); + } + + bool has_budget(gpt_params &global_params) { + if (params.n_predict == -1 && global_params.n_predict == -1) + { + return true; // limitless + } + + n_remaining = -1; + + if (params.n_predict != -1) + { + n_remaining = params.n_predict - n_decoded; + } + else if (global_params.n_predict != -1) + { + n_remaining = global_params.n_predict - n_decoded; + } + + return n_remaining > 0; // no budget + } + + bool available() const { + return state == IDLE && command == NONE; + } + + bool is_processing() const { + return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING; + } + + void add_token_string(const completion_token_output &token) { + if (command == RELEASE) + { + return; + } + cache_tokens.push_back(token.tok); + generated_token_probs.push_back(token); + } + + void release() { + if (state == PROCESSING) + { + t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; + command = RELEASE; + } + } + + json get_formated_timings() { + return json + { + {"prompt_n", num_prompt_tokens_processed}, + {"prompt_ms", t_prompt_processing}, + {"prompt_per_token_ms", t_prompt_processing / num_prompt_tokens_processed}, + {"prompt_per_second", 1e3 / t_prompt_processing * num_prompt_tokens_processed}, + + 
{"predicted_n", n_decoded}, + {"predicted_ms", t_token_generation}, + {"predicted_per_token_ms", t_token_generation / n_decoded}, + {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, + }; + } + + void print_timings() const { + char buffer[512]; + double t_token = t_prompt_processing / num_prompt_tokens_processed; + double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed; + sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", + t_prompt_processing, num_prompt_tokens_processed, + t_token, n_tokens_second); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_prompt_processing", t_prompt_processing}, + {"num_prompt_tokens_processed", num_prompt_tokens_processed}, + {"t_token", t_token}, + {"n_tokens_second", n_tokens_second}, + }); + + t_token = t_token_generation / n_decoded; + n_tokens_second = 1e3 / t_token_generation * n_decoded; + sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", + t_token_generation, n_decoded, + t_token, n_tokens_second); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_token_generation", t_token_generation}, + {"n_decoded", n_decoded}, + {"t_token", t_token}, + {"n_tokens_second", n_tokens_second}, + }); + + sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_prompt_processing", t_prompt_processing}, + {"t_token_generation", t_token_generation}, + {"t_total", t_prompt_processing + t_token_generation}, + }); + } + + // context extension via Self-Extend + void grp_attn_update_params() { + int grpa_i = 0; + // copy to local variables + int32_t grpa_n = ga_n; + int32_t grpa_w = ga_w; + int32_t slot_npast = 0; + for (int k = 0; k < n_past; ++k) + { + while (slot_npast >= grpa_i + grpa_w) { + const int bd = (grpa_w/grpa_n)*(grpa_n - 1); + slot_npast -= bd; + grpa_i += grpa_w/grpa_n; + } + slot_npast++; + } + n_past_se = slot_npast; + ga_i = grpa_i; + } + + int32_t grp_attn_calc_npast() { + int32_t slot_npast = n_past_se > 0 ? 
n_past_se : n_past; + // copy to local variables + int32_t grpa_i = ga_i; + int32_t grpa_n = ga_n; + int32_t grpa_w = ga_w; + while (slot_npast >= grpa_i + grpa_w) { + const int bd = (grpa_w/grpa_n)*(grpa_n - 1); + slot_npast -= bd; + grpa_i += grpa_w/grpa_n; + } + return slot_npast; + } + + void grp_attn_shift(llama_context * ctx, const int32_t n_tokens) { + while (n_past_se >= ga_i + ga_w) + { + const int ib = (ga_n * ga_i) / ga_w; + const int bd = (ga_w / ga_n) * (ga_n - 1); + const int dd = (ga_w / ga_n) - ib * bd - ga_w; + + LOG_TEE("\n"); + LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past_se, ib * bd, ga_i + ib * bd, n_past_se + ib * bd); + LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib * bd, ga_i + ib * bd + ga_w, ga_n, (ga_i + ib * bd) / ga_n, (ga_i + ib * bd + ga_w) / ga_n); + LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib * bd + ga_w, n_past_se + ib * bd, dd, ga_i + ib * bd + ga_w + dd, n_past_se + ib * bd + dd); + + llama_kv_cache_seq_shift(ctx, id, ga_i, n_past_se, ib * bd); + llama_kv_cache_seq_div(ctx, id, ga_i + ib * bd, ga_i + ib * bd + ga_w,ga_n); + llama_kv_cache_seq_shift(ctx, id, ga_i + ib * bd + ga_w,n_past_se + ib * bd, dd); + + n_past_se -= bd; + + ga_i += ga_w / ga_n; + + LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past_se + bd, n_past_se, ga_i); + } + n_past_se += n_tokens; + } +}; + +struct llama_metrics { + uint64_t n_prompt_tokens_processed_total = 0; + uint64_t n_tokens_predicted_total = 0; + + uint64_t n_prompt_tokens_processed = 0; + uint64_t t_prompt_processing = 0; + + uint64_t n_tokens_predicted = 0; + uint64_t t_tokens_generation = 0; + + + void on_prompt_eval(const llama_client_slot &slot) { + n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed; + + n_prompt_tokens_processed += slot.num_prompt_tokens_processed; + t_prompt_processing += slot.t_prompt_processing; + } + + void on_prediction(const llama_client_slot &slot) { + n_tokens_predicted_total += slot.n_decoded; + + n_tokens_predicted += slot.n_decoded; + t_tokens_generation += slot.t_token_generation; + } + + void reset_bucket() { + n_prompt_tokens_processed = 0; + t_prompt_processing = 0; + n_tokens_predicted = 0; + t_tokens_generation = 0; + } +}; + struct llama_server_context { llama_model *model = nullptr; @@ -1683,34 +1974,9 @@ struct llama_server_context if (slot.ga_n != 1) { // context extension via Self-Extend -<<<<<<< HEAD // TODO @ngxson: What happen if we're retrying with smaller n_batch? 
// By the second time we retry, "grp_attn_shift" has already been called slot.grp_attn_shift(ctx, n_tokens); -======= - while (slot.n_past_se >= slot.ga_i + slot.ga_w) - { - const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w; - const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); - const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; - - LOG_TEE("\n"); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); - LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); - - llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd); - llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n); - llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd); - - slot.n_past_se -= bd; - - slot.ga_i += slot.ga_w / slot.ga_n; - - LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i); - } - slot.n_past_se += n_tokens; ->>>>>>> master } } diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index a5a6ab9f8f07b..280b06a856345 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -174,297 +174,6 @@ struct completion_token_output std::string text_to_send; }; -struct llama_client_slot -{ - int id; - int task_id = -1; - - struct slot_params params; - - slot_state state = IDLE; - slot_command command = NONE; - - // used to determine the slot that has been used the longest - int64_t t_last_used = -1; - - // generation props - int32_t n_ctx = 0; // context size per slot - int32_t n_past = 0; - int32_t n_decoded = 0; - int32_t n_remaining = -1; - int32_t i_batch = -1; - int32_t n_predict = -1; - - int32_t num_prompt_tokens = 0; - int32_t num_prompt_tokens_processed = 0; - - json prompt; - std::string generated_text; - llama_token sampled; - std::vector cache_tokens; - std::vector generated_token_probs; - - bool infill = false; - bool embedding = false; - bool has_next_token = true; - bool truncated = false; - bool stopped_eos = false; - bool stopped_word = false; - bool stopped_limit = false; - - bool oaicompat = false; - std::string oaicompat_model; - - std::string stopping_word; - - // sampling - struct llama_sampling_params sparams; - llama_sampling_context *ctx_sampling = nullptr; - - int32_t ga_i = 0; // group-attention state - int32_t ga_n = 1; // group-attention factor - int32_t ga_w = 512; // group-attention width - - int32_t n_past_se = 0; // self-extend - - // multimodal - std::vector images; - - // stats - size_t sent_count = 0; - size_t sent_token_probs_index = 0; - - int64_t t_start_process_prompt; - int64_t t_start_genereration; - - double t_prompt_processing; // ms - double t_token_generation; // ms - - // multitasks - int multitask_id = -1; - - void reset() { - num_prompt_tokens = 0; - generated_text = ""; - truncated = false; - stopped_eos = false; - stopped_word = false; - stopped_limit = false; - stopping_word = ""; - n_past = 0; - sent_count = 0; - sent_token_probs_index = 0; - infill = false; - ga_i = 0; - n_past_se = 0; - - generated_token_probs.clear(); - - for (slot_image & img : images) - { - 
free(img.image_embedding); - if (img.img_data) { - clip_image_u8_free(img.img_data); - } - img.prefix_prompt = ""; - } - - images.clear(); - } - - bool has_budget(gpt_params &global_params) { - if (params.n_predict == -1 && global_params.n_predict == -1) - { - return true; // limitless - } - - n_remaining = -1; - - if (params.n_predict != -1) - { - n_remaining = params.n_predict - n_decoded; - } - else if (global_params.n_predict != -1) - { - n_remaining = global_params.n_predict - n_decoded; - } - - return n_remaining > 0; // no budget - } - - bool available() const { - return state == IDLE && command == NONE; - } - - bool is_processing() const { - return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING; - } - - void add_token_string(const completion_token_output &token) { - if (command == RELEASE) - { - return; - } - cache_tokens.push_back(token.tok); - generated_token_probs.push_back(token); - } - - void release() { - if (state == PROCESSING) - { - t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; - command = RELEASE; - } - } - - json get_formated_timings() { - return json - { - {"prompt_n", num_prompt_tokens_processed}, - {"prompt_ms", t_prompt_processing}, - {"prompt_per_token_ms", t_prompt_processing / num_prompt_tokens_processed}, - {"prompt_per_second", 1e3 / t_prompt_processing * num_prompt_tokens_processed}, - - {"predicted_n", n_decoded}, - {"predicted_ms", t_token_generation}, - {"predicted_per_token_ms", t_token_generation / n_decoded}, - {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, - }; - } - - void print_timings() const { - char buffer[512]; - double t_token = t_prompt_processing / num_prompt_tokens_processed; - double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed; - sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", - t_prompt_processing, num_prompt_tokens_processed, - t_token, n_tokens_second); - LOG_INFO(buffer, { - {"slot_id", id}, - {"task_id", task_id}, - {"t_prompt_processing", t_prompt_processing}, - {"num_prompt_tokens_processed", num_prompt_tokens_processed}, - {"t_token", t_token}, - {"n_tokens_second", n_tokens_second}, - }); - - t_token = t_token_generation / n_decoded; - n_tokens_second = 1e3 / t_token_generation * n_decoded; - sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", - t_token_generation, n_decoded, - t_token, n_tokens_second); - LOG_INFO(buffer, { - {"slot_id", id}, - {"task_id", task_id}, - {"t_token_generation", t_token_generation}, - {"n_decoded", n_decoded}, - {"t_token", t_token}, - {"n_tokens_second", n_tokens_second}, - }); - - sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation); - LOG_INFO(buffer, { - {"slot_id", id}, - {"task_id", task_id}, - {"t_prompt_processing", t_prompt_processing}, - {"t_token_generation", t_token_generation}, - {"t_total", t_prompt_processing + t_token_generation}, - }); - } - - // context extension via Self-Extend - void grp_attn_update_params() { - int grpa_i = 0; - // copy to local variables - int32_t grpa_n = ga_n; - int32_t grpa_w = ga_w; - int32_t slot_npast = 0; - for (int k = 0; k < n_past; ++k) - { - while (slot_npast >= grpa_i + grpa_w) { - const int bd = (grpa_w/grpa_n)*(grpa_n - 1); - slot_npast -= bd; - grpa_i += grpa_w/grpa_n; - } - slot_npast++; - } - n_past_se = slot_npast; - ga_i = grpa_i; - } - - int32_t grp_attn_calc_npast() { - int32_t slot_npast = n_past_se > 0 ? 
n_past_se : n_past; - // copy to local variables - int32_t grpa_i = ga_i; - int32_t grpa_n = ga_n; - int32_t grpa_w = ga_w; - while (slot_npast >= grpa_i + grpa_w) { - const int bd = (grpa_w/grpa_n)*(grpa_n - 1); - slot_npast -= bd; - grpa_i += grpa_w/grpa_n; - } - return slot_npast; - } - - void grp_attn_shift(llama_context * ctx, const int32_t n_tokens) { - while (n_past_se >= ga_i + ga_w) - { - const int ib = (ga_n * ga_i) / ga_w; - const int bd = (ga_w / ga_n) * (ga_n - 1); - const int dd = (ga_w / ga_n) - ib * bd - ga_w; - - LOG_TEE("\n"); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past_se, ib * bd, ga_i + ib * bd, n_past_se + ib * bd); - LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib * bd, ga_i + ib * bd + ga_w, ga_n, (ga_i + ib * bd) / ga_n, (ga_i + ib * bd + ga_w) / ga_n); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib * bd + ga_w, n_past_se + ib * bd, dd, ga_i + ib * bd + ga_w + dd, n_past_se + ib * bd + dd); - - llama_kv_cache_seq_shift(ctx, id, ga_i, n_past_se, ib * bd); - llama_kv_cache_seq_div(ctx, id, ga_i + ib * bd, ga_i + ib * bd + ga_w,ga_n); - llama_kv_cache_seq_shift(ctx, id, ga_i + ib * bd + ga_w,n_past_se + ib * bd, dd); - - n_past_se -= bd; - - ga_i += ga_w / ga_n; - - LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past_se + bd, n_past_se, ga_i); - } - n_past_se += n_tokens; - } -}; - -struct llama_metrics { - uint64_t n_prompt_tokens_processed_total = 0; - uint64_t n_tokens_predicted_total = 0; - - uint64_t n_prompt_tokens_processed = 0; - uint64_t t_prompt_processing = 0; - - uint64_t n_tokens_predicted = 0; - uint64_t t_tokens_generation = 0; - - - void on_prompt_eval(const llama_client_slot &slot) { - n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed; - - n_prompt_tokens_processed += slot.num_prompt_tokens_processed; - t_prompt_processing += slot.t_prompt_processing; - } - - void on_prediction(const llama_client_slot &slot) { - n_tokens_predicted_total += slot.n_decoded; - - n_tokens_predicted += slot.n_decoded; - t_tokens_generation += slot.t_token_generation; - } - - void reset_bucket() { - n_prompt_tokens_processed = 0; - t_prompt_processing = 0; - n_tokens_predicted = 0; - t_tokens_generation = 0; - } -}; - // // server utils // From bb363b9879aa967fb7d222c5994dee5c9f420035 Mon Sep 17 00:00:00 2001 From: ngxson Date: Sun, 25 Feb 2024 21:46:13 +0100 Subject: [PATCH 7/8] adapt to new api change --- examples/server/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index a647b9e2584e3..8435f44eb3da3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -388,9 +388,9 @@ struct llama_client_slot LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib * bd, ga_i + ib * bd + ga_w, ga_n, (ga_i + ib * bd) / ga_n, (ga_i + ib * bd + ga_w) / ga_n); LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib * bd + ga_w, n_past_se + ib * bd, dd, ga_i + ib * bd + ga_w + dd, n_past_se + ib * bd + dd); - llama_kv_cache_seq_shift(ctx, id, ga_i, n_past_se, ib * bd); + llama_kv_cache_seq_add(ctx, id, ga_i, n_past_se, ib * bd); llama_kv_cache_seq_div(ctx, id, ga_i + ib * bd, ga_i + ib * bd + ga_w,ga_n); - llama_kv_cache_seq_shift(ctx, id, ga_i + ib * bd + ga_w,n_past_se + ib * bd, dd); + llama_kv_cache_seq_add(ctx, id, ga_i + ib * bd + ga_w,n_past_se + ib * bd, dd); n_past_se -= bd; From 92671d71c4d857376cf79f55f6a367d77685d0e0 Mon Sep 17 00:00:00 2001 From: ngxson Date: Sun, 25 Feb 2024 
21:48:32 +0100 Subject: [PATCH 8/8] revert move server_log --- examples/server/utils.hpp | 88 +++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 280b06a856345..2cf50ab689d2d 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -37,50 +37,6 @@ extern bool server_log_json; #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) -static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) -{ - std::stringstream ss_tid; - ss_tid << std::this_thread::get_id(); - json log = nlohmann::ordered_json{ - {"tid", ss_tid.str()}, - {"timestamp", time(nullptr)}, - }; - - if (server_log_json) { - log.merge_patch( - { - {"level", level}, - {"function", function}, - {"line", line}, - {"msg", message}, - }); - if (!extra.empty()) { - log.merge_patch(extra); - } - - std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush; - } else { - char buf[1024]; - snprintf(buf, 1024, "%4s [%24s] %s", level, function, message); - - if (!extra.empty()) { - log.merge_patch(extra); - } - std::stringstream ss; - ss << buf << " |"; - for (const auto& el : log.items()) - { - const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); - snprintf(buf, 1024, " %s=%s", el.key().c_str(), value.c_str()); - ss << buf; - } - - const std::string str = ss.str(); - printf("%.*s\n", (int)str.size(), str.data()); - fflush(stdout); - } -} - enum server_state { SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet SERVER_STATE_READY, // Server is ready and model is loaded @@ -174,6 +130,50 @@ struct completion_token_output std::string text_to_send; }; +static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) +{ + std::stringstream ss_tid; + ss_tid << std::this_thread::get_id(); + json log = nlohmann::ordered_json{ + {"tid", ss_tid.str()}, + {"timestamp", time(nullptr)}, + }; + + if (server_log_json) { + log.merge_patch( + { + {"level", level}, + {"function", function}, + {"line", line}, + {"msg", message}, + }); + if (!extra.empty()) { + log.merge_patch(extra); + } + + std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush; + } else { + char buf[1024]; + snprintf(buf, 1024, "%4s [%24s] %s", level, function, message); + + if (!extra.empty()) { + log.merge_patch(extra); + } + std::stringstream ss; + ss << buf << " |"; + for (const auto& el : log.items()) + { + const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); + snprintf(buf, 1024, " %s=%s", el.key().c_str(), value.c_str()); + ss << buf; + } + + const std::string str = ss.str(); + printf("%.*s\n", (int)str.size(), str.data()); + fflush(stdout); + } +} + // // server utils //
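
For readers following the queue refactor in patch 1, the sketch below shows, in isolation, the control flow that replaces the old update_slots()/on_all_tasks_finished loop: the queue drains its pending tasks, then fires a single on_run_slots callback, and run_slots() keeps generation going by posting a TASK_TYPE_NEXT_RESPONSE task whenever a slot still has tokens to produce. This is a simplified stand-in, not the actual llama.cpp code: the toy_* names, the single mutex, and the task fields are assumptions made for brevity, and the finished-multitask bookkeeping is omitted.

#include <condition_variable>
#include <deque>
#include <functional>
#include <mutex>
#include <vector>

enum toy_task_type { TASK_COMPLETION, TASK_NEXT_RESPONSE };

struct toy_task {
    int           id;
    toy_task_type type;
};

struct toy_queue {
    std::deque<toy_task>            tasks;
    std::mutex                      mtx;
    std::condition_variable         cv;
    std::function<void(toy_task &)> callback_new_task;   // e.g. copy task data into a slot
    std::function<void()>           callback_run_slots;  // decode one batch for all active slots

    void post(toy_task task) {
        std::unique_lock<std::mutex> lock(mtx);
        tasks.push_back(std::move(task));
        cv.notify_one();
    }

    // Same shape as start_loop() after the refactor: wait for tasks,
    // dispatch them, then run the slots exactly once per iteration.
    void start_loop() {
        while (true) {
            std::unique_lock<std::mutex> lock(mtx);
            cv.wait(lock, [&] { return !tasks.empty(); });
            std::deque<toy_task> pending;
            pending.swap(tasks);
            lock.unlock();

            for (toy_task & t : pending) {
                callback_new_task(t);
            }
            // all queued tasks are processed, slot data is now ready
            callback_run_slots();
        }
    }
};

// run_slots() no longer loops on its own; it re-arms itself through the queue.
struct toy_server {
    toy_queue         queue;
    std::vector<bool> slot_has_next_token;  // stand-in for per-slot generation state

    void run_slots() {
        bool has_next_response = false;
        for (bool has_next : slot_has_next_token) {
            // ... sample one token per active slot here ...
            if (has_next) {
                has_next_response = true;
            }
        }
        if (has_next_response) {
            // schedule the next slot run, mirroring TASK_TYPE_NEXT_RESPONSE
            queue.post({ /*id=*/-1, TASK_NEXT_RESPONSE });
        }
    }
};

// Wiring, roughly mirroring main() in server.cpp after the refactor:
//   toy_server server;
//   server.queue.callback_new_task  = [&](toy_task & t) { /* load prompt into a slot */ };
//   server.queue.callback_run_slots = [&]() { server.run_slots(); };
//   server.queue.start_loop();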