From 8a25f79620d9bd3b4e5594cd167a06fbd2573d32 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 25 May 2025 08:10:45 +0100 Subject: [PATCH 01/11] server: fix/test add_generation_prompt --- tools/server/tests/unit/test_template.py | 25 ++++++++++++++++++++++++ tools/server/utils.hpp | 1 + 2 files changed, 26 insertions(+) diff --git a/tools/server/tests/unit/test_template.py b/tools/server/tests/unit/test_template.py index cf9f96a7fbc52..7bb857b335bb6 100644 --- a/tools/server/tests/unit/test_template.py +++ b/tools/server/tests/unit/test_template.py @@ -47,3 +47,28 @@ def test_date_inside_prompt(template_name: str, format: str, tools: list[dict]): today_str = datetime.date.today().strftime(format) assert today_str in prompt, f"Expected today's date ({today_str}) in content ({prompt})" + + +@pytest.mark.parametrize("add_generation_prompt", [False, True]) +@pytest.mark.parametrize("template_name,expected_generation_prompt", [ + ("meta-llama-Llama-3.3-70B-Instruct", "<|start_header_id|>assistant<|end_header_id|>"), +]) +def test_add_generation_prompt(template_name: str, expected_generation_prompt: str, add_generation_prompt: bool): + global server + server.jinja = True + server.chat_template_file = f'../../../models/templates/{template_name}.jinja' + server.start(timeout_seconds=TIMEOUT_SERVER_START) + + res = server.make_request("POST", "/apply-template", data={ + "messages": [ + {"role": "user", "content": "What is today?"}, + ], + "add_generation_prompt": add_generation_prompt, + }) + assert res.status_code == 200 + prompt = res.body["prompt"] + + if add_generation_prompt: + assert expected_generation_prompt in prompt, f"Expected generation prompt ({expected_generation_prompt}) in content ({prompt})" + else: + assert expected_generation_prompt not in prompt, f"Did not expect generation prompt ({expected_generation_prompt}) in content ({prompt})" diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index 91efcfef06772..ee33f76c2b06d 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -731,6 +731,7 @@ static json oaicompat_chat_params_parse( inputs.grammar = grammar; inputs.use_jinja = opt.use_jinja; inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false); + inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true); inputs.reasoning_format = opt.reasoning_format; if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) { throw std::runtime_error("Cannot use custom grammar constraints with tools."); From 43b5626e835783a34360f12cf477f7112492770d Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 25 May 2025 08:50:48 +0100 Subject: [PATCH 02/11] tools: enable hermes2/qwen chat logic even w/o tools --- common/chat.cpp | 178 ++++++++++++++++++++++---------------------- tests/test-chat.cpp | 4 +- 2 files changed, 92 insertions(+), 90 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 78af5eafa40c3..b4b496c75c3ab 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1466,98 +1466,100 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat data.thinking_forced_open = true; } - // (content)?({"name": "foo", "arguments": {"a": 1}})* - data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; - data.grammar = build_grammar([&](const common_grammar_builder & builder) { - std::vector tool_rules; - std::vector tool_call_alts; - std::vector escaped_names; - foreach_function(inputs.tools, [&](const json & tool) { - const auto & 
function = tool.at("function"); - std::string name = function.at("name"); - auto parameters = function.at("parameters"); - builder.resolve_refs(parameters); - tool_rules.push_back(builder.add_schema(name + "-call", { - {"type", "object"}, - {"properties", json { - {"name", json {{"const", name}}}, - {"arguments", parameters}, - }}, - {"required", json::array({"name", "arguments"})}, - })); - tool_call_alts.push_back(builder.add_rule( - name + "-function-tag", - "\"\" space " + - builder.add_schema(name + "-args", parameters) + " " - "\"\" space")); + if (!inputs.tools.is_null()) { + // (content)?({"name": "foo", "arguments": {"a": 1}})* + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + std::vector tool_rules; + std::vector tool_call_alts; + std::vector escaped_names; + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + std::string name = function.at("name"); + auto parameters = function.at("parameters"); + builder.resolve_refs(parameters); + tool_rules.push_back(builder.add_schema(name + "-call", { + {"type", "object"}, + {"properties", json { + {"name", json {{"const", name}}}, + {"arguments", parameters}, + }}, + {"required", json::array({"name", "arguments"})}, + })); + tool_call_alts.push_back(builder.add_rule( + name + "-function-tag", + "\"\" space " + + builder.add_schema(name + "-args", parameters) + " " + "\"\" space")); - data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_WORD, - "", + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, + "", + }); + auto escaped_name = regex_escape(name); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, + " alt_tags { + any_tool_call, + "\"\" space " + any_tool_call + " \"\"", + // The rest is just to accommodate common "good bad" outputs. + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + }; + auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space"); + tool_call_alts.push_back(wrappable_tool_call); + tool_call_alts.push_back( + "( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space "); + auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | ")); + builder.add_rule("root", + std::string(data.thinking_forced_open ? "( \"\" space )? " : "") + + (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call)); + // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives) data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, - " tag in the grammar, + // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar) + std::string(data.thinking_forced_open ? "[\\s\\S]*?(\\s*)" : "(?:[\\s\\S]*?\\s*)?") + ( + "(\\s*" + "(?:" + "||||)?" 
+ "\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\"" + ")" + ")[\\s\\S]*" + ), }); - escaped_names.push_back(escaped_name); - }); - auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space"); - std::vector alt_tags { - any_tool_call, - "\"\" space " + any_tool_call + " \"\"", - // The rest is just to accommodate common "good bad" outputs. - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - }; - auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space"); - tool_call_alts.push_back(wrappable_tool_call); - tool_call_alts.push_back( - "( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space "); - auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | ")); - builder.add_rule("root", - std::string(data.thinking_forced_open ? "( \"\" space )? " : "") + - (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call)); - // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives) - data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, - // If thinking_forced_open, then we capture the tag in the grammar, - // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar) - std::string(data.thinking_forced_open ? "[\\s\\S]*?(\\s*)" : "(?:[\\s\\S]*?\\s*)?") + ( - "(\\s*" - "(?:" - "||||)?" - "\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\"" - ")" - ")[\\s\\S]*" - ), + data.preserved_tokens = { + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "```", + "```json", + "```xml", + }; }); - data.preserved_tokens = { - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "```", - "```json", - "```xml", - }; - }); + } return data; } @@ -1702,7 +1704,7 @@ static common_chat_params common_chat_templates_apply_jinja( } // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools) - if (src.find("") != std::string::npos && params.json_schema.is_null() && params.tools.is_array() && params.json_schema.is_null()) { + if (src.find("") != std::string::npos && params.json_schema.is_null()) { return common_chat_params_init_hermes_2_pro(tmpl, params); } diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index dfcdce350ba86..fb048022a06c4 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -737,14 +737,14 @@ static void test_template_output_parsers() { auto tmpls = read_templates("models/templates/Qwen-QwQ-32B.jinja"); std::vector end_tokens{ "<|im_end|>" }; - assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_tools).format); } { auto tmpls = read_templates("models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja"); std::vector end_tokens{ "<|im_end|>" }; - assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + 
assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_tools).format); assert_equals( COMMON_CHAT_FORMAT_HERMES_2_PRO, From b457f89e72035a7967e8feda2e2f914cde014527 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 25 May 2025 08:52:33 +0100 Subject: [PATCH 03/11] server: add --reasoning-format=disabled to disable thinking (incl. qwen3 w/ enable_thinking:false) --- common/arg.cpp | 1 + common/chat.cpp | 36 +++++++++++++++++++++--- common/chat.h | 2 ++ common/common.h | 1 + tools/server/server.cpp | 2 +- tools/server/tests/unit/test_template.py | 32 +++++++++++++++++++++ tools/server/tests/utils.py | 2 +- 7 files changed, 70 insertions(+), 6 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 62eec8337e033..f64bb0f038f45 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2854,6 +2854,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; } else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; } + else if (value == "disabled") { params.reasoning_format = COMMON_REASONING_FORMAT_DISABLED; } else { std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK")); diff --git a/common/chat.cpp b/common/chat.cpp index b4b496c75c3ab..9dcdde838c26c 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -133,6 +133,7 @@ struct templates_params { bool stream; std::string grammar; bool add_generation_prompt = true; + bool enable_thinking = true; std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); }; @@ -591,6 +592,16 @@ std::string common_chat_format_name(common_chat_format format) { } } +std::string common_reasoning_format_name(common_reasoning_format format) { + switch (format) { + case COMMON_REASONING_FORMAT_NONE: return "none"; + case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek"; + case COMMON_REASONING_FORMAT_DISABLED: return "disabled"; + default: + throw std::runtime_error("Unknown reasoning format"); + } +} + static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) { std::string arguments; if (builder.is_partial()) { @@ -918,7 +929,11 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_ data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? 
json() : inputs.tools, inputs.add_generation_prompt, {}); data.format = COMMON_CHAT_FORMAT_COMMAND_R7B; if (string_ends_with(data.prompt, "<|START_THINKING|>")) { - data.thinking_forced_open = true; + if (!inputs.enable_thinking) { + data.prompt += "<|END_THINKING|>"; + } else { + data.thinking_forced_open = true; + } } data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; @@ -1186,7 +1201,11 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_ data.prompt = prompt; data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1; if (string_ends_with(data.prompt, "\n")) { - data.thinking_forced_open = true; + if (!inputs.enable_thinking) { + data.prompt += ""; + } else { + data.thinking_forced_open = true; + } } if (inputs.tools.is_array() && !inputs.tools.empty()) { @@ -1460,10 +1479,18 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; - data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); + json additional_context = { + {"enable_thinking", inputs.enable_thinking}, + }; + + data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context); data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO; if (string_ends_with(data.prompt, "\n")) { - data.thinking_forced_open = true; + if (!inputs.enable_thinking) { + data.prompt += ""; + } else { + data.thinking_forced_open = true; + } } if (!inputs.tools.is_null()) { @@ -1671,6 +1698,7 @@ static common_chat_params common_chat_templates_apply_jinja( params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content); params.add_generation_prompt = inputs.add_generation_prompt; params.tool_choice = inputs.tool_choice; + params.enable_thinking = inputs.reasoning_format != COMMON_REASONING_FORMAT_DISABLED; params.grammar = inputs.grammar; params.now = inputs.now; if (!inputs.json_schema.empty()) { diff --git a/common/chat.h b/common/chat.h index ce926777ebe91..f5967a0ab7bde 100644 --- a/common/chat.h +++ b/common/chat.h @@ -123,6 +123,7 @@ struct common_chat_templates_inputs { common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO; bool parallel_tool_calls = false; common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; + bool enable_thinking = true; std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); }; @@ -182,6 +183,7 @@ std::string common_chat_format_example( bool use_jinja); std::string common_chat_format_name(common_chat_format format); +std::string common_reasoning_format_name(common_reasoning_format format); common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax); common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice); diff --git a/common/common.h b/common/common.h index f0c52c314b744..087e921da5736 100644 --- a/common/common.h +++ b/common/common.h @@ -216,6 +216,7 @@ struct common_params_vocoder { enum common_reasoning_format { COMMON_REASONING_FORMAT_NONE, COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content` + COMMON_REASONING_FORMAT_DISABLED, // Disable thinking (causes any thinking tag to be closed, or empty 
thinking tags to be inserted, depending on the model) }; struct common_params { diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 9f0b0ffaa6e1e..1ca11fc41b229 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -178,7 +178,7 @@ struct slot_params { {"grammar_triggers", grammar_triggers}, {"preserved_tokens", sampling.preserved_tokens}, {"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)}, - {"reasoning_format", (oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "deepseek" : "none")}, + {"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)}, {"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content}, {"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open}, {"samplers", samplers}, diff --git a/tools/server/tests/unit/test_template.py b/tools/server/tests/unit/test_template.py index cf9f96a7fbc52..c1e4991d27502 100644 --- a/tools/server/tests/unit/test_template.py +++ b/tools/server/tests/unit/test_template.py @@ -25,6 +25,38 @@ def create_server(): server.n_slots = 1 +@pytest.mark.parametrize("template_name,enable_thinking,expected_end", [ + ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", True, "\n"), + ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", False, "\n"), + + ("Qwen-Qwen3-0.6B", True, "<|im_start|>assistant\n"), + ("Qwen-Qwen3-0.6B", False, "<|im_start|>assistant\n\n\n\n\n"), + + ("Qwen-QwQ-32B", True, "<|im_start|>assistant\n\n"), + ("Qwen-QwQ-32B", False, "<|im_start|>assistant\n\n"), + + ("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", True, "<|START_THINKING|>"), + ("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", False, "<|START_THINKING|><|END_THINKING|>"), +]) +def test_enable_thinking(template_name: str, enable_thinking: bool, expected_end: str): + global server + server.jinja = True + server.reasoning_format = 'deepseek' if enable_thinking else 'disabled' + server.chat_template_file = f'../../../models/templates/{template_name}.jinja' + server.start(timeout_seconds=TIMEOUT_SERVER_START) + + res = server.make_request("POST", "/apply-template", data={ + "messages": [ + {"role": "user", "content": "What is today?"}, + ], + "tools": [TEST_TOOL], + }) + assert res.status_code == 200 + prompt = res.body["prompt"] + + assert prompt.endswith(expected_end), f"Expected prompt to end with '{expected_end}', got '{prompt}'" + + @pytest.mark.parametrize("tools", [None, [], [TEST_TOOL]]) @pytest.mark.parametrize("template_name,format", [ ("meta-llama-Llama-3.3-70B-Instruct", "%d %b %Y"), diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index b480801b17abb..dbbca8aafa533 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -84,7 +84,7 @@ class ServerProcess: draft_max: int | None = None no_webui: bool | None = None jinja: bool | None = None - reasoning_format: Literal['deepseek', 'none'] | None = None + reasoning_format: Literal['deepseek', 'none', 'disabled'] | None = None chat_template: str | None = None chat_template_file: str | None = None server_path: str | None = None From df25e6bb09e5bd084fcdbf0ca462f795831bf902 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 25 May 2025 09:01:19 +0100 Subject: [PATCH 04/11] Update README.md --- common/arg.cpp | 5 ++--- tools/server/README.md | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index f64bb0f038f45..b546928a34705 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2848,9 +2848,8 @@ 
common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA")); add_opt(common_arg( {"--reasoning-format"}, "FORMAT", - "reasoning format (default: deepseek; allowed values: deepseek, none)\n" - "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n" - "only supported for non-streamed responses", + "reasoning format (default: deepseek; allowed values: deepseek, none, disabled)\n" + "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only), 'disabled' prevents generation of thoughts (by closing any thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3).", [](common_params & params, const std::string & value) { /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; } else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; } diff --git a/tools/server/README.md b/tools/server/README.md index 0b84966ae86d7..ad588ed301029 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -173,7 +173,7 @@ The project is under active development, and we are [looking for feedback and co | `--no-slots` | disables slots monitoring endpoint
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | reasoning format (default: deepseek; allowed values: deepseek, none)<br/>controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).<br/>only supported for non-streamed responses<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | reasoning format (default: deepseek; allowed values: deepseek, none, disabled)<br/>controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only), 'disabled' prevents generation of thoughts (by closing any thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3).<br/>(env: LLAMA_ARG_THINK) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/>
(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) | From b6eb0a5c97935f18e6fecc53a24149e88cd7f7e7 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 25 May 2025 09:44:53 +0100 Subject: [PATCH 05/11] Add models/templates/Qwen-Qwen3-0.6B.jinja --- models/templates/Qwen-Qwen3-0.6B.jinja | 85 ++++++++++++++++++++++++++ models/templates/README.md | 1 + 2 files changed, 86 insertions(+) create mode 100644 models/templates/Qwen-Qwen3-0.6B.jinja diff --git a/models/templates/Qwen-Qwen3-0.6B.jinja b/models/templates/Qwen-Qwen3-0.6B.jinja new file mode 100644 index 0000000000000..699ff8df401fe --- /dev/null +++ b/models/templates/Qwen-Qwen3-0.6B.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '
</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/models/templates/README.md b/models/templates/README.md
index b8655be9fce95..35b6386dd0649 100644
--- a/models/templates/README.md
+++ b/models/templates/README.md
@@ -20,4 +20,5 @@ These templates can be updated with the following commands:
 ./scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use > models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja
 ./scripts/get_chat_template.py Qwen/Qwen2.5-7B-Instruct > models/templates/Qwen-Qwen2.5-7B-Instruct.jinja
 ./scripts/get_chat_template.py Qwen/QwQ-32B > models/templates/Qwen-QwQ-32B.jinja
+./scripts/get_chat_template.py Qwen/Qwen3-0.6B > models/templates/Qwen-Qwen3-0.6B.jinja
 ```
\ No newline at end of file

From cdea6a9b2496e46fb90e1e4bcf2e9045a7cf91ef Mon Sep 17 00:00:00 2001
From: ochafik
Date: Sun, 25 May 2025 10:01:47 +0100
Subject: [PATCH 06/11] update --reasoning-format={disabled -> nothink} as
 suggested

---
 common/arg.cpp                           |  6 +++---
 common/chat.cpp                          |  6 +++---
 common/common.h                          |  2 +-
 tools/server/README.md                   |  2 +-
 tools/server/tests/unit/test_template.py | 25 ++++++++++++------------
 tools/server/tests/utils.py              |  2 +-
 6 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index b546928a34705..6537ab5c458ee 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2848,12 +2848,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
-        "reasoning format (default: deepseek; allowed values: deepseek, none, disabled)\n"
-        "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned. 
'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only), 'disabled' prevents generation of thoughts (by closing any thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3).", + "reasoning format (default: deepseek; allowed values: deepseek, none, nothink)\n" + "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only), 'nothink' prevents generation of thoughts (by closing any thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3).", [](common_params & params, const std::string & value) { /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; } else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; } - else if (value == "disabled") { params.reasoning_format = COMMON_REASONING_FORMAT_DISABLED; } + else if (value == "nothink") { params.reasoning_format = COMMON_REASONING_FORMAT_NOTHINK; } else { std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK")); diff --git a/common/chat.cpp b/common/chat.cpp index 9dcdde838c26c..67013e4ae7bbf 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -594,9 +594,9 @@ std::string common_chat_format_name(common_chat_format format) { std::string common_reasoning_format_name(common_reasoning_format format) { switch (format) { - case COMMON_REASONING_FORMAT_NONE: return "none"; + case COMMON_REASONING_FORMAT_NONE: return "none"; case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek"; - case COMMON_REASONING_FORMAT_DISABLED: return "disabled"; + case COMMON_REASONING_FORMAT_NOTHINK: return "nothink"; default: throw std::runtime_error("Unknown reasoning format"); } @@ -1698,7 +1698,7 @@ static common_chat_params common_chat_templates_apply_jinja( params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content); params.add_generation_prompt = inputs.add_generation_prompt; params.tool_choice = inputs.tool_choice; - params.enable_thinking = inputs.reasoning_format != COMMON_REASONING_FORMAT_DISABLED; + params.enable_thinking = inputs.reasoning_format != COMMON_REASONING_FORMAT_NOTHINK; params.grammar = inputs.grammar; params.now = inputs.now; if (!inputs.json_schema.empty()) { diff --git a/common/common.h b/common/common.h index 087e921da5736..1f66edb457a62 100644 --- a/common/common.h +++ b/common/common.h @@ -216,7 +216,7 @@ struct common_params_vocoder { enum common_reasoning_format { COMMON_REASONING_FORMAT_NONE, COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content` - COMMON_REASONING_FORMAT_DISABLED, // Disable thinking (causes any thinking tag to be closed, or empty thinking tags to be inserted, depending on the model) + COMMON_REASONING_FORMAT_NOTHINK, // Forcibly disables thinking (causes any thinking tag to be closed, empty thinking tags to be inserted, or template specific variables to be set, depending on the chat format) }; struct common_params { diff --git a/tools/server/README.md b/tools/server/README.md index ad588ed301029..a118636030b89 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -173,7 +173,7 @@ The 
project is under active development, and we are [looking for feedback and co
 | `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | reasoning format (default: deepseek; allowed values: deepseek, none, disabled)<br/>controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only), 'disabled' prevents generation of thoughts (by closing any thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3).<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | reasoning format (default: deepseek; allowed values: deepseek, none, nothink)<br/>controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only), 'nothink' prevents generation of thoughts (by closing any thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3).<br/>(env: LLAMA_ARG_THINK) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/>
(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) | diff --git a/tools/server/tests/unit/test_template.py b/tools/server/tests/unit/test_template.py index c1e4991d27502..4c912d0e5e362 100644 --- a/tools/server/tests/unit/test_template.py +++ b/tools/server/tests/unit/test_template.py @@ -25,23 +25,24 @@ def create_server(): server.n_slots = 1 -@pytest.mark.parametrize("template_name,enable_thinking,expected_end", [ - ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", True, "\n"), - ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", False, "\n"), +@pytest.mark.parametrize("tools", [None, [], [TEST_TOOL]]) +@pytest.mark.parametrize("template_name,nothink,expected_end", [ + ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", False, "\n"), + ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", True, "\n"), - ("Qwen-Qwen3-0.6B", True, "<|im_start|>assistant\n"), - ("Qwen-Qwen3-0.6B", False, "<|im_start|>assistant\n\n\n\n\n"), + ("Qwen-Qwen3-0.6B", False, "<|im_start|>assistant\n"), + ("Qwen-Qwen3-0.6B", True, "<|im_start|>assistant\n\n\n\n\n"), - ("Qwen-QwQ-32B", True, "<|im_start|>assistant\n\n"), - ("Qwen-QwQ-32B", False, "<|im_start|>assistant\n\n"), + ("Qwen-QwQ-32B", False, "<|im_start|>assistant\n\n"), + ("Qwen-QwQ-32B", True, "<|im_start|>assistant\n\n"), - ("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", True, "<|START_THINKING|>"), - ("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", False, "<|START_THINKING|><|END_THINKING|>"), + ("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", False, "<|START_THINKING|>"), + ("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", True, "<|START_THINKING|><|END_THINKING|>"), ]) -def test_enable_thinking(template_name: str, enable_thinking: bool, expected_end: str): +def test_nothink(template_name: str, nothink: bool, expected_end: str, tools: list[dict]): global server server.jinja = True - server.reasoning_format = 'deepseek' if enable_thinking else 'disabled' + server.reasoning_format = 'nothink' if nothink else None server.chat_template_file = f'../../../models/templates/{template_name}.jinja' server.start(timeout_seconds=TIMEOUT_SERVER_START) @@ -49,7 +50,7 @@ def test_enable_thinking(template_name: str, enable_thinking: bool, expected_end "messages": [ {"role": "user", "content": "What is today?"}, ], - "tools": [TEST_TOOL], + "tools": tools, }) assert res.status_code == 200 prompt = res.body["prompt"] diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index dbbca8aafa533..74a0f7a84e48f 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -84,7 +84,7 @@ class ServerProcess: draft_max: int | None = None no_webui: bool | None = None jinja: bool | None = None - reasoning_format: Literal['deepseek', 'none', 'disabled'] | None = None + reasoning_format: Literal['deepseek', 'none', 'nothink'] | None = None chat_template: str | None = None chat_template_file: str | None = None server_path: str | None = None From 473c01e51d298d71e5ae209cacb11fca1e11b3de Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 25 May 2025 10:38:15 +0100 Subject: [PATCH 07/11] fix command r7b's nothink w/ official template --- common/chat.cpp | 2 ++ tools/server/tests/unit/test_template.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 67013e4ae7bbf..0c8796eb49e5f 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -934,6 +934,8 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_ } else { data.thinking_forced_open = true; } + } else if 
(!inputs.enable_thinking && string_ends_with(data.prompt, "<|CHATBOT_TOKEN|>")) { + data.prompt += "<|START_THINKING|><|END_THINKING|>"; } data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; diff --git a/tools/server/tests/unit/test_template.py b/tools/server/tests/unit/test_template.py index 4c912d0e5e362..e465dcded2764 100644 --- a/tools/server/tests/unit/test_template.py +++ b/tools/server/tests/unit/test_template.py @@ -36,8 +36,8 @@ def create_server(): ("Qwen-QwQ-32B", False, "<|im_start|>assistant\n\n"), ("Qwen-QwQ-32B", True, "<|im_start|>assistant\n\n"), - ("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", False, "<|START_THINKING|>"), - ("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", True, "<|START_THINKING|><|END_THINKING|>"), + ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", False, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"), + ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", True, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|><|END_THINKING|>"), ]) def test_nothink(template_name: str, nothink: bool, expected_end: str, tools: list[dict]): global server From 6b9efe7be93144f9d04823af363aace26d9335c4 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 25 May 2025 11:57:56 +0100 Subject: [PATCH 08/11] rewrite docs as list as suggested --- common/arg.cpp | 7 +++++-- tools/server/README.md | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 6537ab5c458ee..36225547e3f3a 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2848,8 +2848,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA")); add_opt(common_arg( {"--reasoning-format"}, "FORMAT", - "reasoning format (default: deepseek; allowed values: deepseek, none, nothink)\n" - "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only), 'nothink' prevents generation of thoughts (by closing any thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3).", + "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n" + "- none: leaves thoughts unparsed in `message.content`\n" + "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n" + "- nothink: prevents generation of thoughts (forcibly closing thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3)\n" + "(default: deepseek)", [](common_params & params, const std::string & value) { /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; } else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; } diff --git a/tools/server/README.md b/tools/server/README.md index a118636030b89..6f2cce83df72c 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -173,7 +173,7 @@ The project is under active development, and we are [looking for feedback and co | `--no-slots` | disables slots monitoring endpoint
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | reasoning format (default: deepseek; allowed values: deepseek, none, nothink)<br/>controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only), 'nothink' prevents generation of thoughts (by closing any thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3).<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>- nothink: prevents generation of thoughts (forcibly closing thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3)<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/>
(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) | From 355b38c9a9dbb0d3ee514fdedf54eb80418f465c Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Sun, 25 May 2025 11:58:16 +0100 Subject: [PATCH 09/11] Update common/chat.cpp Co-authored-by: Xuan-Son Nguyen --- common/chat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/chat.cpp b/common/chat.cpp index 0c8796eb49e5f..b21d11ab60683 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -592,7 +592,7 @@ std::string common_chat_format_name(common_chat_format format) { } } -std::string common_reasoning_format_name(common_reasoning_format format) { +const char * common_reasoning_format_name(common_reasoning_format format) { switch (format) { case COMMON_REASONING_FORMAT_NONE: return "none"; case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek"; From 8547fccf5308f2d66e99ab48743f4c10710afcdc Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 25 May 2025 12:00:51 +0100 Subject: [PATCH 10/11] const char* return for chat enum name helpers --- common/chat.cpp | 6 +++--- common/chat.h | 4 ++-- tools/server/server.cpp | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index b21d11ab60683..64b03d155e291 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -574,7 +574,7 @@ common_chat_templates_ptr common_chat_templates_init( return tmpls; } -std::string common_chat_format_name(common_chat_format format) { +const char * common_chat_format_name(common_chat_format format) { switch (format) { case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only"; case COMMON_CHAT_FORMAT_GENERIC: return "Generic"; @@ -1853,7 +1853,7 @@ static void common_chat_parse_content_only(common_chat_msg_parser & builder) { } static void common_chat_parse(common_chat_msg_parser & builder, common_chat_format format) { - LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(format).c_str(), builder.input().c_str()); + LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(format), builder.input().c_str()); switch (format) { case COMMON_CHAT_FORMAT_CONTENT_ONLY: @@ -1890,7 +1890,7 @@ static void common_chat_parse(common_chat_msg_parser & builder, common_chat_form common_chat_parse_command_r7b(builder); break; default: - throw std::runtime_error("Unsupported format: " + common_chat_format_name(format)); + throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(format)); } builder.finish(); } diff --git a/common/chat.h b/common/chat.h index f5967a0ab7bde..3e2cbbaae3369 100644 --- a/common/chat.h +++ b/common/chat.h @@ -182,8 +182,8 @@ std::string common_chat_format_example( const struct common_chat_templates * tmpls, bool use_jinja); -std::string common_chat_format_name(common_chat_format format); -std::string common_reasoning_format_name(common_reasoning_format format); +const char* common_chat_format_name(common_chat_format format); +const char* common_reasoning_format_name(common_reasoning_format format); common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax); common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice); diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 1ca11fc41b229..8295c455d1b38 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -357,7 +357,7 @@ struct server_task { auto it = data.find("chat_format"); if (it != data.end()) { params.oaicompat_chat_syntax.format = static_cast(it->get()); - 
SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_syntax.format).c_str()); + SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_syntax.format)); } else { params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format; } From 9cdeebe808d721855a7707f2bffe422dc0f0e661 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 25 May 2025 18:52:07 +0100 Subject: [PATCH 11/11] switch to --reasoning-budget flag --- common/arg.cpp | 14 ++++++++++---- common/chat.cpp | 3 +-- common/common.h | 2 +- tools/server/README.md | 3 ++- tools/server/server.cpp | 1 + tools/server/tests/unit/test_template.py | 23 ++++++++++++----------- tools/server/tests/utils.py | 3 +++ tools/server/utils.hpp | 2 ++ 8 files changed, 32 insertions(+), 19 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 36225547e3f3a..5ed5a23903332 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2851,15 +2851,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n" "- none: leaves thoughts unparsed in `message.content`\n" "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n" - "- nothink: prevents generation of thoughts (forcibly closing thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3)\n" "(default: deepseek)", [](common_params & params, const std::string & value) { /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; } else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; } - else if (value == "nothink") { params.reasoning_format = COMMON_REASONING_FORMAT_NOTHINK; } - else { std::invalid_argument("invalid value"); } + else { throw std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK")); + add_opt(common_arg( + {"--reasoning-budget"}, "N", + "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)", + [](common_params & params, int value) { + if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); } + params.reasoning_budget = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET")); add_opt(common_arg( {"--chat-template"}, "JINJA_TEMPLATE", string_format( @@ -2958,7 +2964,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } else if (value == "md") { params.batched_bench_output_jsonl = false; } - else { std::invalid_argument("invalid value"); } + else { throw std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(common_arg( diff --git a/common/chat.cpp b/common/chat.cpp index 64b03d155e291..adfe51db5a770 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -596,7 +596,6 @@ const char * common_reasoning_format_name(common_reasoning_format format) { switch (format) { case COMMON_REASONING_FORMAT_NONE: return "none"; case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek"; - case COMMON_REASONING_FORMAT_NOTHINK: return "nothink"; default: throw std::runtime_error("Unknown reasoning format"); } @@ -1700,7 +1699,7 @@ 
static common_chat_params common_chat_templates_apply_jinja( params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content); params.add_generation_prompt = inputs.add_generation_prompt; params.tool_choice = inputs.tool_choice; - params.enable_thinking = inputs.reasoning_format != COMMON_REASONING_FORMAT_NOTHINK; + params.enable_thinking = inputs.enable_thinking; params.grammar = inputs.grammar; params.now = inputs.now; if (!inputs.json_schema.empty()) { diff --git a/common/common.h b/common/common.h index 1f66edb457a62..92b9533fc2948 100644 --- a/common/common.h +++ b/common/common.h @@ -216,7 +216,6 @@ struct common_params_vocoder { enum common_reasoning_format { COMMON_REASONING_FORMAT_NONE, COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content` - COMMON_REASONING_FORMAT_NOTHINK, // Forcibly disables thinking (causes any thinking tag to be closed, empty thinking tags to be inserted, or template specific variables to be set, depending on the chat format) }; struct common_params { @@ -369,6 +368,7 @@ struct common_params { bool use_jinja = false; // NOLINT bool enable_chat_template = true; common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + int reasoning_budget = -1; bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response std::vector api_keys; diff --git a/tools/server/README.md b/tools/server/README.md index 6f2cce83df72c..06533c172e530 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -173,7 +173,8 @@ The project is under active development, and we are [looking for feedback and co | `--no-slots` | disables slots monitoring endpoint
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>- nothink: prevents generation of thoughts (forcibly closing thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3)<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/>
(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) | diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 8295c455d1b38..07b613122e321 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2089,6 +2089,7 @@ struct server_context { /* common_chat_templates */ chat_templates.get(), /* allow_image */ mctx ? mtmd_support_vision(mctx) : false, /* allow_audio */ mctx ? mtmd_support_audio (mctx) : false, + /* enable_thinking */ params_base.reasoning_budget != 0, }; } diff --git a/tools/server/tests/unit/test_template.py b/tools/server/tests/unit/test_template.py index e465dcded2764..dcc3f0a3e1149 100644 --- a/tools/server/tests/unit/test_template.py +++ b/tools/server/tests/unit/test_template.py @@ -26,23 +26,24 @@ def create_server(): @pytest.mark.parametrize("tools", [None, [], [TEST_TOOL]]) -@pytest.mark.parametrize("template_name,nothink,expected_end", [ - ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", False, "\n"), - ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", True, "\n"), +@pytest.mark.parametrize("template_name,reasoning_budget,expected_end", [ + ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", None, "\n"), + ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", -1, "\n"), + ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", 0, "\n"), - ("Qwen-Qwen3-0.6B", False, "<|im_start|>assistant\n"), - ("Qwen-Qwen3-0.6B", True, "<|im_start|>assistant\n\n\n\n\n"), + ("Qwen-Qwen3-0.6B", -1, "<|im_start|>assistant\n"), + ("Qwen-Qwen3-0.6B", 0, "<|im_start|>assistant\n\n\n\n\n"), - ("Qwen-QwQ-32B", False, "<|im_start|>assistant\n\n"), - ("Qwen-QwQ-32B", True, "<|im_start|>assistant\n\n"), + ("Qwen-QwQ-32B", -1, "<|im_start|>assistant\n\n"), + ("Qwen-QwQ-32B", 0, "<|im_start|>assistant\n\n"), - ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", False, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"), - ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", True, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|><|END_THINKING|>"), + ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", -1, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"), + ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", 0, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|><|END_THINKING|>"), ]) -def test_nothink(template_name: str, nothink: bool, expected_end: str, tools: list[dict]): +def test_reasoning_budget(template_name: str, reasoning_budget: int | None, expected_end: str, tools: list[dict]): global server server.jinja = True - server.reasoning_format = 'nothink' if nothink else None + server.reasoning_budget = reasoning_budget server.chat_template_file = f'../../../models/templates/{template_name}.jinja' server.start(timeout_seconds=TIMEOUT_SERVER_START) diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index 74a0f7a84e48f..11672f515df1d 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -85,6 +85,7 @@ class ServerProcess: no_webui: bool | None = None jinja: bool | None = None reasoning_format: Literal['deepseek', 'none', 'nothink'] | None = None + reasoning_budget: int | None = None chat_template: str | None = None chat_template_file: str | None = None server_path: str | None = None @@ -191,6 +192,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None: server_args.append("--jinja") if self.reasoning_format is not None: server_args.extend(("--reasoning-format", self.reasoning_format)) + if self.reasoning_budget is not None: + server_args.extend(("--reasoning-budget", self.reasoning_budget)) if self.chat_template: server_args.extend(["--chat-template", 
self.chat_template]) if self.chat_template_file: diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index 91efcfef06772..70486e964fac3 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -568,6 +568,7 @@ struct oaicompat_parser_options { common_chat_templates * tmpls; bool allow_image; bool allow_audio; + bool enable_thinking = true; }; // used by /chat/completions endpoint @@ -732,6 +733,7 @@ static json oaicompat_chat_params_parse( inputs.use_jinja = opt.use_jinja; inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false); inputs.reasoning_format = opt.reasoning_format; + inputs.enable_thinking = opt.enable_thinking; if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) { throw std::runtime_error("Cannot use custom grammar constraints with tools."); }
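
---

Usage sketch: a minimal end-to-end check of the two knobs this series introduces, `add_generation_prompt` on `/apply-template` (PATCH 01) and `--reasoning-budget` (PATCH 11). It assumes a llama-server built from this branch, started locally with a Qwen3-style template, e.g.:

    llama-server -m qwen3-0.6b.gguf --jinja --reasoning-budget 0 \
        --chat-template-file models/templates/Qwen-Qwen3-0.6B.jinja

The model filename and port are placeholders; the assertions mirror the expectations in tools/server/tests/unit/test_template.py.

```python
# Smoke test for --reasoning-budget and add_generation_prompt (assumes a local
# llama-server from this branch on port 8080, started as shown above).
import requests

BASE_URL = "http://localhost:8080"

def render(messages: list[dict], add_generation_prompt: bool = True) -> str:
    # /apply-template renders the chat template server-side without running
    # inference; add_generation_prompt is forwarded to the template (PATCH 01).
    res = requests.post(f"{BASE_URL}/apply-template", json={
        "messages": messages,
        "add_generation_prompt": add_generation_prompt,
    })
    res.raise_for_status()
    return res.json()["prompt"]

messages = [{"role": "user", "content": "What is today?"}]

# With --reasoning-budget 0, the Qwen3 template receives enable_thinking=false
# and emits a pre-closed think block, so generation starts with the answer.
prompt = render(messages)
assert prompt.endswith("<think>\n\n</think>\n\n"), prompt

# With add_generation_prompt=false, no assistant header is appended at all.
prompt = render(messages, add_generation_prompt=False)
assert not prompt.endswith("<|im_start|>assistant\n"), prompt
```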