From 6051d69e8ee5ec47db6494ba312903b03e1fa769 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 4 Sep 2025 06:25:14 +0700 Subject: [PATCH 1/2] server: add exceed_context_size_error type --- tools/server/server.cpp | 36 ++++++++++++++----- .../server/tests/unit/test_chat_completion.py | 17 +++++++++ 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index e0302e2f2f777..9b522ad9f9748 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -86,6 +86,7 @@ enum error_type { ERROR_TYPE_PERMISSION, ERROR_TYPE_UNAVAILABLE, // custom error ERROR_TYPE_NOT_SUPPORTED, // custom error + ERROR_TYPE_EXCEED_CONTEXT_SIZE, // custom error }; static bool server_task_type_need_embd(server_task_type task_type) { @@ -1224,6 +1225,10 @@ static json format_error_response(const std::string & message, const enum error_ type_str = "unavailable_error"; code = 503; break; + case ERROR_TYPE_EXCEED_CONTEXT_SIZE: + type_str = "exceed_context_size_error"; + code = 500; + break; } return json { {"code", code}, @@ -1237,12 +1242,21 @@ struct server_task_result_error : server_task_result { error_type err_type = ERROR_TYPE_SERVER; std::string err_msg; + // for ERROR_TYPE_EXCEED_CONTEXT_SIZE + int32_t n_prompt_tokens = 0; + int32_t n_ctx = 0; + virtual bool is_error() override { return true; } virtual json to_json() override { - return format_error_response(err_msg, err_type); + json res = format_error_response(err_msg, err_type); + if (err_type == ERROR_TYPE_EXCEED_CONTEXT_SIZE) { + res["n_prompt_tokens"] = n_prompt_tokens; + res["n_ctx"] = n_ctx; + } + return res; } }; @@ -2605,16 +2619,22 @@ struct server_context { } void send_error(const server_slot & slot, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { - send_error(slot.id_task, error, type); + send_error(slot.id_task, error, type, slot.n_prompt_tokens, slot.n_ctx); } - void send_error(const int id_task, const std::string & error, 
const enum error_type type = ERROR_TYPE_SERVER) { + void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER, const int32_t n_prompt_tokens = 0, const int32_t n_ctx = 0) { SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str()); + if (type == ERROR_TYPE_EXCEED_CONTEXT_SIZE) { + GGML_ASSERT(n_ctx > 0 && n_prompt_tokens > 0); + } + auto res = std::make_unique<server_task_result_error>(); - res->id = id_task; - res->err_type = type; - res->err_msg = error; + res->id = id_task; + res->err_type = type; + res->err_msg = error; + res->n_prompt_tokens = n_prompt_tokens; + res->n_ctx = n_ctx; queue_results.send(std::move(res)); } @@ -3286,7 +3306,7 @@ struct server_context { if (slot.n_prompt_tokens > slot.n_ctx) { slot.release(); - send_error(slot, "input is larger than the max context size. skipping", ERROR_TYPE_SERVER); + send_error(slot, "input is larger than the max context size. skipping", ERROR_TYPE_EXCEED_CONTEXT_SIZE); continue; } } else { @@ -3296,7 +3316,7 @@ struct server_context { // context shift should be applied only during the generation phase if (slot.n_prompt_tokens >= slot.n_ctx) { slot.release(); - send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST); + send_error(slot, "the request exceeds the available context size. 
try increasing the context size or enable context shift", ERROR_TYPE_EXCEED_CONTEXT_SIZE); continue; } } diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py index 509c024b75fd3..04604b4295071 100644 --- a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -385,3 +385,20 @@ def test_logit_bias(): output_text = res.choices[0].message.content assert output_text assert all(output_text.find(" " + tok + " ") == -1 for tok in exclude) + +def test_context_size_exceeded(): + global server + server.start() + res = server.make_request("POST", "/chat/completions", data={ + "messages": [ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ] * 100, # make the prompt too long + }) + assert res.status_code == 500 + assert "error" in res.body + assert res.body["error"]["type"] == "exceed_context_size_error" + assert res.body["error"]["n_prompt_tokens"] > 0 + assert server.n_ctx is not None + assert server.n_slots is not None + assert res.body["error"]["n_ctx"] == server.n_ctx // server.n_slots From 6aaf4f3a14135582e00a86848d02a77bd0443551 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 4 Sep 2025 06:35:27 +0700 Subject: [PATCH 2/2] change error code to 400 --- tools/server/server.cpp | 2 +- tools/server/tests/unit/test_chat_completion.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 9b522ad9f9748..44487eca59337 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -1227,7 +1227,7 @@ static json format_error_response(const std::string & message, const enum error_ break; case ERROR_TYPE_EXCEED_CONTEXT_SIZE: type_str = "exceed_context_size_error"; - code = 500; + code = 400; break; } return json { diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py index 
04604b4295071..22dbfdd9b4a89 100644 --- a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -395,7 +395,7 @@ def test_context_size_exceeded(): {"role": "user", "content": "What is the best book"}, ] * 100, # make the prompt too long }) - assert res.status_code == 500 + assert res.status_code == 400 assert "error" in res.body assert res.body["error"]["type"] == "exceed_context_size_error" assert res.body["error"]["n_prompt_tokens"] > 0