From 8a25f79620d9bd3b4e5594cd167a06fbd2573d32 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 25 May 2025 08:10:45 +0100 Subject: [PATCH 01/11] server: fix/test add_generation_prompt --- tools/server/tests/unit/test_template.py | 25 ++++++++++++++++++++++++ tools/server/utils.hpp | 1 + 2 files changed, 26 insertions(+) diff --git a/tools/server/tests/unit/test_template.py b/tools/server/tests/unit/test_template.py index cf9f96a7fbc52..7bb857b335bb6 100644 --- a/tools/server/tests/unit/test_template.py +++ b/tools/server/tests/unit/test_template.py @@ -47,3 +47,28 @@ def test_date_inside_prompt(template_name: str, format: str, tools: list[dict]): today_str = datetime.date.today().strftime(format) assert today_str in prompt, f"Expected today's date ({today_str}) in content ({prompt})" + + +@pytest.mark.parametrize("add_generation_prompt", [False, True]) +@pytest.mark.parametrize("template_name,expected_generation_prompt", [ + ("meta-llama-Llama-3.3-70B-Instruct", "<|start_header_id|>assistant<|end_header_id|>"), +]) +def test_add_generation_prompt(template_name: str, expected_generation_prompt: str, add_generation_prompt: bool): + global server + server.jinja = True + server.chat_template_file = f'../../../models/templates/{template_name}.jinja' + server.start(timeout_seconds=TIMEOUT_SERVER_START) + + res = server.make_request("POST", "/apply-template", data={ + "messages": [ + {"role": "user", "content": "What is today?"}, + ], + "add_generation_prompt": add_generation_prompt, + }) + assert res.status_code == 200 + prompt = res.body["prompt"] + + if add_generation_prompt: + assert expected_generation_prompt in prompt, f"Expected generation prompt ({expected_generation_prompt}) in content ({prompt})" + else: + assert expected_generation_prompt not in prompt, f"Did not expect generation prompt ({expected_generation_prompt}) in content ({prompt})" diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index 91efcfef06772..ee33f76c2b06d 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -731,6 +731,7 @@ static json oaicompat_chat_params_parse( inputs.grammar = grammar; inputs.use_jinja = opt.use_jinja; inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false); + inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true); inputs.reasoning_format = opt.reasoning_format; if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) { throw std::runtime_error("Cannot use custom grammar constraints with tools."); From 43b5626e835783a34360f12cf477f7112492770d Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 25 May 2025 08:50:48 +0100 Subject: [PATCH 02/11] tools: enable hermes2/qwen chat logic even w/o tools --- common/chat.cpp | 178 ++++++++++++++++++++++---------------------- tests/test-chat.cpp | 4 +- 2 files changed, 92 insertions(+), 90 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 78af5eafa40c3..b4b496c75c3ab 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1466,98 +1466,100 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat data.thinking_forced_open = true; } - // (content)?({"name": "foo", "arguments": {"a": 1}})* - data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; - data.grammar = build_grammar([&](const common_grammar_builder & builder) { - std::vector tool_rules; - std::vector tool_call_alts; - std::vector escaped_names; - foreach_function(inputs.tools, [&](const json & tool) { - const auto & 
function = tool.at("function"); - std::string name = function.at("name"); - auto parameters = function.at("parameters"); - builder.resolve_refs(parameters); - tool_rules.push_back(builder.add_schema(name + "-call", { - {"type", "object"}, - {"properties", json { - {"name", json {{"const", name}}}, - {"arguments", parameters}, - }}, - {"required", json::array({"name", "arguments"})}, - })); - tool_call_alts.push_back(builder.add_rule( - name + "-function-tag", - "\"\" space " + - builder.add_schema(name + "-args", parameters) + " " - "\"\" space")); + if (!inputs.tools.is_null()) { + // (content)?({"name": "foo", "arguments": {"a": 1}})* + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + std::vector tool_rules; + std::vector tool_call_alts; + std::vector escaped_names; + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + std::string name = function.at("name"); + auto parameters = function.at("parameters"); + builder.resolve_refs(parameters); + tool_rules.push_back(builder.add_schema(name + "-call", { + {"type", "object"}, + {"properties", json { + {"name", json {{"const", name}}}, + {"arguments", parameters}, + }}, + {"required", json::array({"name", "arguments"})}, + })); + tool_call_alts.push_back(builder.add_rule( + name + "-function-tag", + "\"\" space " + + builder.add_schema(name + "-args", parameters) + " " + "\"\" space")); - data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_WORD, - "", + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, + "", + }); + auto escaped_name = regex_escape(name); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, + " alt_tags { + any_tool_call, + "\"\" space " + any_tool_call + " \"\"", + // The rest is just to accommodate common "good bad" outputs. + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + }; + auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space"); + tool_call_alts.push_back(wrappable_tool_call); + tool_call_alts.push_back( + "( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space "); + auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | ")); + builder.add_rule("root", + std::string(data.thinking_forced_open ? "( \"\" space )? " : "") + + (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call)); + // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives) data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, - " tag in the grammar, + // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar) + std::string(data.thinking_forced_open ? "[\\s\\S]*?(\\s*)" : "(?:[\\s\\S]*?\\s*)?") + ( + "(\\s*" + "(?:" + "||||)?" 
+ "\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\"" + ")" + ")[\\s\\S]*" + ), }); - escaped_names.push_back(escaped_name); - }); - auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space"); - std::vector alt_tags { - any_tool_call, - "\"\" space " + any_tool_call + " \"\"", - // The rest is just to accommodate common "good bad" outputs. - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - }; - auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space"); - tool_call_alts.push_back(wrappable_tool_call); - tool_call_alts.push_back( - "( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space "); - auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | ")); - builder.add_rule("root", - std::string(data.thinking_forced_open ? "( \"\" space )? " : "") + - (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call)); - // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives) - data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, - // If thinking_forced_open, then we capture the tag in the grammar, - // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar) - std::string(data.thinking_forced_open ? "[\\s\\S]*?(\\s*)" : "(?:[\\s\\S]*?\\s*)?") + ( - "(\\s*" - "(?:" - "||||)?" - "\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\"" - ")" - ")[\\s\\S]*" - ), + data.preserved_tokens = { + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "```", + "```json", + "```xml", + }; }); - data.preserved_tokens = { - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "```", - "```json", - "```xml", - }; - }); + } return data; } @@ -1702,7 +1704,7 @@ static common_chat_params common_chat_templates_apply_jinja( } // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools) - if (src.find("") != std::string::npos && params.json_schema.is_null() && params.tools.is_array() && params.json_schema.is_null()) { + if (src.find("") != std::string::npos && params.json_schema.is_null()) { return common_chat_params_init_hermes_2_pro(tmpl, params); } diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index dfcdce350ba86..fb048022a06c4 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -737,14 +737,14 @@ static void test_template_output_parsers() { auto tmpls = read_templates("models/templates/Qwen-QwQ-32B.jinja"); std::vector end_tokens{ "<|im_end|>" }; - assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_tools).format); } { auto tmpls = read_templates("models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja"); std::vector end_tokens{ "<|im_end|>" }; - assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + 
assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_tools).format); assert_equals( COMMON_CHAT_FORMAT_HERMES_2_PRO, From b457f89e72035a7967e8feda2e2f914cde014527 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 25 May 2025 08:52:33 +0100 Subject: [PATCH 03/11] server: add --reasoning-format=disabled to disable thinking (incl. qwen3 w/ enable_thinking:false) --- common/arg.cpp | 1 + common/chat.cpp | 36 +++++++++++++++++++++--- common/chat.h | 2 ++ common/common.h | 1 + tools/server/server.cpp | 2 +- tools/server/tests/unit/test_template.py | 32 +++++++++++++++++++++ tools/server/tests/utils.py | 2 +- 7 files changed, 70 insertions(+), 6 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 62eec8337e033..f64bb0f038f45 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2854,6 +2854,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; } else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; } + else if (value == "disabled") { params.reasoning_format = COMMON_REASONING_FORMAT_DISABLED; } else { std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK")); diff --git a/common/chat.cpp b/common/chat.cpp index b4b496c75c3ab..9dcdde838c26c 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -133,6 +133,7 @@ struct templates_params { bool stream; std::string grammar; bool add_generation_prompt = true; + bool enable_thinking = true; std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); }; @@ -591,6 +592,16 @@ std::string common_chat_format_name(common_chat_format format) { } } +std::string common_reasoning_format_name(common_reasoning_format format) { + switch (format) { + case COMMON_REASONING_FORMAT_NONE: return "none"; + case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek"; + case COMMON_REASONING_FORMAT_DISABLED: return "disabled"; + default: + throw std::runtime_error("Unknown reasoning format"); + } +} + static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) { std::string arguments; if (builder.is_partial()) { @@ -918,7 +929,11 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_ data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? 
json() : inputs.tools, inputs.add_generation_prompt, {}); data.format = COMMON_CHAT_FORMAT_COMMAND_R7B; if (string_ends_with(data.prompt, "<|START_THINKING|>")) { - data.thinking_forced_open = true; + if (!inputs.enable_thinking) { + data.prompt += "<|END_THINKING|>"; + } else { + data.thinking_forced_open = true; + } } data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; @@ -1186,7 +1201,11 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_ data.prompt = prompt; data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1; if (string_ends_with(data.prompt, "\n")) { - data.thinking_forced_open = true; + if (!inputs.enable_thinking) { + data.prompt += ""; + } else { + data.thinking_forced_open = true; + } } if (inputs.tools.is_array() && !inputs.tools.empty()) { @@ -1460,10 +1479,18 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; - data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); + json additional_context = { + {"enable_thinking", inputs.enable_thinking}, + }; + + data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context); data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO; if (string_ends_with(data.prompt, "\n")) { - data.thinking_forced_open = true; + if (!inputs.enable_thinking) { + data.prompt += ""; + } else { + data.thinking_forced_open = true; + } } if (!inputs.tools.is_null()) { @@ -1671,6 +1698,7 @@ static common_chat_params common_chat_templates_apply_jinja( params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content); params.add_generation_prompt = inputs.add_generation_prompt; params.tool_choice = inputs.tool_choice; + params.enable_thinking = inputs.reasoning_format != COMMON_REASONING_FORMAT_DISABLED; params.grammar = inputs.grammar; params.now = inputs.now; if (!inputs.json_schema.empty()) { diff --git a/common/chat.h b/common/chat.h index ce926777ebe91..f5967a0ab7bde 100644 --- a/common/chat.h +++ b/common/chat.h @@ -123,6 +123,7 @@ struct common_chat_templates_inputs { common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO; bool parallel_tool_calls = false; common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; + bool enable_thinking = true; std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); }; @@ -182,6 +183,7 @@ std::string common_chat_format_example( bool use_jinja); std::string common_chat_format_name(common_chat_format format); +std::string common_reasoning_format_name(common_reasoning_format format); common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax); common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice); diff --git a/common/common.h b/common/common.h index f0c52c314b744..087e921da5736 100644 --- a/common/common.h +++ b/common/common.h @@ -216,6 +216,7 @@ struct common_params_vocoder { enum common_reasoning_format { COMMON_REASONING_FORMAT_NONE, COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content` + COMMON_REASONING_FORMAT_DISABLED, // Disable thinking (causes any thinking tag to be closed, or empty 
thinking tags to be inserted, depending on the model) }; struct common_params { diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 9f0b0ffaa6e1e..1ca11fc41b229 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -178,7 +178,7 @@ struct slot_params { {"grammar_triggers", grammar_triggers}, {"preserved_tokens", sampling.preserved_tokens}, {"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)}, - {"reasoning_format", (oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "deepseek" : "none")}, + {"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)}, {"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content}, {"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open}, {"samplers", samplers}, diff --git a/tools/server/tests/unit/test_template.py b/tools/server/tests/unit/test_template.py index cf9f96a7fbc52..c1e4991d27502 100644 --- a/tools/server/tests/unit/test_template.py +++ b/tools/server/tests/unit/test_template.py @@ -25,6 +25,38 @@ def create_server(): server.n_slots = 1 +@pytest.mark.parametrize("template_name,enable_thinking,expected_end", [ + ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", True, "\n"), + ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", False, "\n"), + + ("Qwen-Qwen3-0.6B", True, "<|im_start|>assistant\n"), + ("Qwen-Qwen3-0.6B", False, "<|im_start|>assistant\n\n\n\n\n"), + + ("Qwen-QwQ-32B", True, "<|im_start|>assistant\n\n"), + ("Qwen-QwQ-32B", False, "<|im_start|>assistant\n\n"), + + ("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", True, "<|START_THINKING|>"), + ("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", False, "<|START_THINKING|><|END_THINKING|>"), +]) +def test_enable_thinking(template_name: str, enable_thinking: bool, expected_end: str): + global server + server.jinja = True + server.reasoning_format = 'deepseek' if enable_thinking else 'disabled' + server.chat_template_file = f'../../../models/templates/{template_name}.jinja' + server.start(timeout_seconds=TIMEOUT_SERVER_START) + + res = server.make_request("POST", "/apply-template", data={ + "messages": [ + {"role": "user", "content": "What is today?"}, + ], + "tools": [TEST_TOOL], + }) + assert res.status_code == 200 + prompt = res.body["prompt"] + + assert prompt.endswith(expected_end), f"Expected prompt to end with '{expected_end}', got '{prompt}'" + + @pytest.mark.parametrize("tools", [None, [], [TEST_TOOL]]) @pytest.mark.parametrize("template_name,format", [ ("meta-llama-Llama-3.3-70B-Instruct", "%d %b %Y"), diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index b480801b17abb..dbbca8aafa533 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -84,7 +84,7 @@ class ServerProcess: draft_max: int | None = None no_webui: bool | None = None jinja: bool | None = None - reasoning_format: Literal['deepseek', 'none'] | None = None + reasoning_format: Literal['deepseek', 'none', 'disabled'] | None = None chat_template: str | None = None chat_template_file: str | None = None server_path: str | None = None From df25e6bb09e5bd084fcdbf0ca462f795831bf902 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 25 May 2025 09:01:19 +0100 Subject: [PATCH 04/11] Update README.md --- common/arg.cpp | 5 ++--- tools/server/README.md | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index f64bb0f038f45..b546928a34705 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2848,9 +2848,8 @@ 
common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA")); add_opt(common_arg( {"--reasoning-format"}, "FORMAT", - "reasoning format (default: deepseek; allowed values: deepseek, none)\n" - "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n" - "only supported for non-streamed responses", + "reasoning format (default: deepseek; allowed values: deepseek, none, disabled)\n" + "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only), 'disabled' prevents generation of thoughts (by closing any thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3).", [](common_params & params, const std::string & value) { /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; } else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; } diff --git a/tools/server/README.md b/tools/server/README.md index 0b84966ae86d7..ad588ed301029 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -173,7 +173,7 @@ The project is under active development, and we are [looking for feedback and co | `--no-slots` | disables slots monitoring endpoint
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | reasoning format (default: deepseek; allowed values: deepseek, none)<br/>controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).<br/>only supported for non-streamed responses<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | reasoning format (default: deepseek; allowed values: deepseek, none, disabled)<br/>controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only), 'disabled' prevents generation of thoughts (by closing any thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3).<br/>(env: LLAMA_ARG_THINK) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/>
(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) | From b6eb0a5c97935f18e6fecc53a24149e88cd7f7e7 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 25 May 2025 09:44:53 +0100 Subject: [PATCH 05/11] Add models/templates/Qwen-Qwen3-0.6B.jinja --- models/templates/Qwen-Qwen3-0.6B.jinja | 85 ++++++++++++++++++++++++++ models/templates/README.md | 1 + 2 files changed, 86 insertions(+) create mode 100644 models/templates/Qwen-Qwen3-0.6B.jinja diff --git a/models/templates/Qwen-Qwen3-0.6B.jinja b/models/templates/Qwen-Qwen3-0.6B.jinja new file mode 100644 index 0000000000000..699ff8df401fe --- /dev/null +++ b/models/templates/Qwen-Qwen3-0.6B.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '
</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/models/templates/README.md b/models/templates/README.md
index b8655be9fce95..35b6386dd0649 100644
--- a/models/templates/README.md
+++ b/models/templates/README.md
@@ -20,4 +20,5 @@ These templates can be updated with the following commands:
 ./scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use > models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja
 ./scripts/get_chat_template.py Qwen/Qwen2.5-7B-Instruct > models/templates/Qwen-Qwen2.5-7B-Instruct.jinja
 ./scripts/get_chat_template.py Qwen/QwQ-32B > models/templates/Qwen-QwQ-32B.jinja
+./scripts/get_chat_template.py Qwen/Qwen3-0.6B > models/templates/Qwen-Qwen3-0.6B.jinja
 ```
\ No newline at end of file

From cdea6a9b2496e46fb90e1e4bcf2e9045a7cf91ef Mon Sep 17 00:00:00 2001
From: ochafik
Date: Sun, 25 May 2025 10:01:47 +0100
Subject: [PATCH 06/11] update --reasoning-format={disabled -> nothink} as
 suggested

---
 common/arg.cpp                           |  6 +++---
 common/chat.cpp                          |  6 +++---
 common/common.h                          |  2 +-
 tools/server/README.md                   |  2 +-
 tools/server/tests/unit/test_template.py | 25 ++++++++++++------------
 tools/server/tests/utils.py              |  2 +-
 6 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index b546928a34705..6537ab5c458ee 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2848,12 +2848,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
-        "reasoning format (default: deepseek; allowed values: deepseek, none, disabled)\n"
-        "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned. 
'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only), 'disabled' prevents generation of thoughts (by closing any thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3).", + "reasoning format (default: deepseek; allowed values: deepseek, none, nothink)\n" + "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only), 'nothink' prevents generation of thoughts (by closing any thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3).", [](common_params & params, const std::string & value) { /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; } else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; } - else if (value == "disabled") { params.reasoning_format = COMMON_REASONING_FORMAT_DISABLED; } + else if (value == "nothink") { params.reasoning_format = COMMON_REASONING_FORMAT_NOTHINK; } else { std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK")); diff --git a/common/chat.cpp b/common/chat.cpp index 9dcdde838c26c..67013e4ae7bbf 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -594,9 +594,9 @@ std::string common_chat_format_name(common_chat_format format) { std::string common_reasoning_format_name(common_reasoning_format format) { switch (format) { - case COMMON_REASONING_FORMAT_NONE: return "none"; + case COMMON_REASONING_FORMAT_NONE: return "none"; case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek"; - case COMMON_REASONING_FORMAT_DISABLED: return "disabled"; + case COMMON_REASONING_FORMAT_NOTHINK: return "nothink"; default: throw std::runtime_error("Unknown reasoning format"); } @@ -1698,7 +1698,7 @@ static common_chat_params common_chat_templates_apply_jinja( params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content); params.add_generation_prompt = inputs.add_generation_prompt; params.tool_choice = inputs.tool_choice; - params.enable_thinking = inputs.reasoning_format != COMMON_REASONING_FORMAT_DISABLED; + params.enable_thinking = inputs.reasoning_format != COMMON_REASONING_FORMAT_NOTHINK; params.grammar = inputs.grammar; params.now = inputs.now; if (!inputs.json_schema.empty()) { diff --git a/common/common.h b/common/common.h index 087e921da5736..1f66edb457a62 100644 --- a/common/common.h +++ b/common/common.h @@ -216,7 +216,7 @@ struct common_params_vocoder { enum common_reasoning_format { COMMON_REASONING_FORMAT_NONE, COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content` - COMMON_REASONING_FORMAT_DISABLED, // Disable thinking (causes any thinking tag to be closed, or empty thinking tags to be inserted, depending on the model) + COMMON_REASONING_FORMAT_NOTHINK, // Forcibly disables thinking (causes any thinking tag to be closed, empty thinking tags to be inserted, or template specific variables to be set, depending on the chat format) }; struct common_params { diff --git a/tools/server/README.md b/tools/server/README.md index ad588ed301029..a118636030b89 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -173,7 +173,7 @@ The 
project is under active development, and we are [looking for feedback and co
 | `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | reasoning format (default: deepseek; allowed values: deepseek, none, disabled)<br/>controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only), 'disabled' prevents generation of thoughts (by closing any thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3).<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | reasoning format (default: deepseek; allowed values: deepseek, none, nothink)<br/>controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only), 'nothink' prevents generation of thoughts (by closing any thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3).<br/>(env: LLAMA_ARG_THINK) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/>
(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) | diff --git a/tools/server/tests/unit/test_template.py b/tools/server/tests/unit/test_template.py index c1e4991d27502..4c912d0e5e362 100644 --- a/tools/server/tests/unit/test_template.py +++ b/tools/server/tests/unit/test_template.py @@ -25,23 +25,24 @@ def create_server(): server.n_slots = 1 -@pytest.mark.parametrize("template_name,enable_thinking,expected_end", [ - ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", True, "\n"), - ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", False, "\n"), +@pytest.mark.parametrize("tools", [None, [], [TEST_TOOL]]) +@pytest.mark.parametrize("template_name,nothink,expected_end", [ + ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", False, "\n"), + ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", True, "\n"), - ("Qwen-Qwen3-0.6B", True, "<|im_start|>assistant\n"), - ("Qwen-Qwen3-0.6B", False, "<|im_start|>assistant\n\n\n\n\n"), + ("Qwen-Qwen3-0.6B", False, "<|im_start|>assistant\n"), + ("Qwen-Qwen3-0.6B", True, "<|im_start|>assistant\n\n\n\n\n"), - ("Qwen-QwQ-32B", True, "<|im_start|>assistant\n\n"), - ("Qwen-QwQ-32B", False, "<|im_start|>assistant\n\n"), + ("Qwen-QwQ-32B", False, "<|im_start|>assistant\n\n"), + ("Qwen-QwQ-32B", True, "<|im_start|>assistant\n\n"), - ("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", True, "<|START_THINKING|>"), - ("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", False, "<|START_THINKING|><|END_THINKING|>"), + ("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", False, "<|START_THINKING|>"), + ("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", True, "<|START_THINKING|><|END_THINKING|>"), ]) -def test_enable_thinking(template_name: str, enable_thinking: bool, expected_end: str): +def test_nothink(template_name: str, nothink: bool, expected_end: str, tools: list[dict]): global server server.jinja = True - server.reasoning_format = 'deepseek' if enable_thinking else 'disabled' + server.reasoning_format = 'nothink' if nothink else None server.chat_template_file = f'../../../models/templates/{template_name}.jinja' server.start(timeout_seconds=TIMEOUT_SERVER_START) @@ -49,7 +50,7 @@ def test_enable_thinking(template_name: str, enable_thinking: bool, expected_end "messages": [ {"role": "user", "content": "What is today?"}, ], - "tools": [TEST_TOOL], + "tools": tools, }) assert res.status_code == 200 prompt = res.body["prompt"] diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index dbbca8aafa533..74a0f7a84e48f 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -84,7 +84,7 @@ class ServerProcess: draft_max: int | None = None no_webui: bool | None = None jinja: bool | None = None - reasoning_format: Literal['deepseek', 'none', 'disabled'] | None = None + reasoning_format: Literal['deepseek', 'none', 'nothink'] | None = None chat_template: str | None = None chat_template_file: str | None = None server_path: str | None = None From 473c01e51d298d71e5ae209cacb11fca1e11b3de Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 25 May 2025 10:38:15 +0100 Subject: [PATCH 07/11] fix command r7b's nothink w/ official template --- common/chat.cpp | 2 ++ tools/server/tests/unit/test_template.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 67013e4ae7bbf..0c8796eb49e5f 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -934,6 +934,8 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_ } else { data.thinking_forced_open = true; } + } else if 
(!inputs.enable_thinking && string_ends_with(data.prompt, "<|CHATBOT_TOKEN|>")) { + data.prompt += "<|START_THINKING|><|END_THINKING|>"; } data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; diff --git a/tools/server/tests/unit/test_template.py b/tools/server/tests/unit/test_template.py index 4c912d0e5e362..e465dcded2764 100644 --- a/tools/server/tests/unit/test_template.py +++ b/tools/server/tests/unit/test_template.py @@ -36,8 +36,8 @@ def create_server(): ("Qwen-QwQ-32B", False, "<|im_start|>assistant\n\n"), ("Qwen-QwQ-32B", True, "<|im_start|>assistant\n\n"), - ("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", False, "<|START_THINKING|>"), - ("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", True, "<|START_THINKING|><|END_THINKING|>"), + ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", False, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"), + ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", True, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|><|END_THINKING|>"), ]) def test_nothink(template_name: str, nothink: bool, expected_end: str, tools: list[dict]): global server From 6b9efe7be93144f9d04823af363aace26d9335c4 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 25 May 2025 11:57:56 +0100 Subject: [PATCH 08/11] rewrite docs as list as suggested --- common/arg.cpp | 7 +++++-- tools/server/README.md | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 6537ab5c458ee..36225547e3f3a 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2848,8 +2848,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA")); add_opt(common_arg( {"--reasoning-format"}, "FORMAT", - "reasoning format (default: deepseek; allowed values: deepseek, none, nothink)\n" - "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only), 'nothink' prevents generation of thoughts (by closing any thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3).", + "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n" + "- none: leaves thoughts unparsed in `message.content`\n" + "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n" + "- nothink: prevents generation of thoughts (forcibly closing thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3)\n" + "(default: deepseek)", [](common_params & params, const std::string & value) { /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; } else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; } diff --git a/tools/server/README.md b/tools/server/README.md index a118636030b89..6f2cce83df72c 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -173,7 +173,7 @@ The project is under active development, and we are [looking for feedback and co | `--no-slots` | disables slots monitoring endpoint
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | reasoning format (default: deepseek; allowed values: deepseek, none, nothink)<br/>controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only), 'nothink' prevents generation of thoughts (by closing any thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3).<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>- nothink: prevents generation of thoughts (forcibly closing thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3)<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/>
(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) | From 355b38c9a9dbb0d3ee514fdedf54eb80418f465c Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Sun, 25 May 2025 11:58:16 +0100 Subject: [PATCH 09/11] Update common/chat.cpp Co-authored-by: Xuan-Son Nguyen --- common/chat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/chat.cpp b/common/chat.cpp index 0c8796eb49e5f..b21d11ab60683 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -592,7 +592,7 @@ std::string common_chat_format_name(common_chat_format format) { } } -std::string common_reasoning_format_name(common_reasoning_format format) { +const char * common_reasoning_format_name(common_reasoning_format format) { switch (format) { case COMMON_REASONING_FORMAT_NONE: return "none"; case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek"; From 8547fccf5308f2d66e99ab48743f4c10710afcdc Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 25 May 2025 12:00:51 +0100 Subject: [PATCH 10/11] const char* return for chat enum name helpers --- common/chat.cpp | 6 +++--- common/chat.h | 4 ++-- tools/server/server.cpp | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index b21d11ab60683..64b03d155e291 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -574,7 +574,7 @@ common_chat_templates_ptr common_chat_templates_init( return tmpls; } -std::string common_chat_format_name(common_chat_format format) { +const char * common_chat_format_name(common_chat_format format) { switch (format) { case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only"; case COMMON_CHAT_FORMAT_GENERIC: return "Generic"; @@ -1853,7 +1853,7 @@ static void common_chat_parse_content_only(common_chat_msg_parser & builder) { } static void common_chat_parse(common_chat_msg_parser & builder, common_chat_format format) { - LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(format).c_str(), builder.input().c_str()); + LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(format), builder.input().c_str()); switch (format) { case COMMON_CHAT_FORMAT_CONTENT_ONLY: @@ -1890,7 +1890,7 @@ static void common_chat_parse(common_chat_msg_parser & builder, common_chat_form common_chat_parse_command_r7b(builder); break; default: - throw std::runtime_error("Unsupported format: " + common_chat_format_name(format)); + throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(format)); } builder.finish(); } diff --git a/common/chat.h b/common/chat.h index f5967a0ab7bde..3e2cbbaae3369 100644 --- a/common/chat.h +++ b/common/chat.h @@ -182,8 +182,8 @@ std::string common_chat_format_example( const struct common_chat_templates * tmpls, bool use_jinja); -std::string common_chat_format_name(common_chat_format format); -std::string common_reasoning_format_name(common_reasoning_format format); +const char* common_chat_format_name(common_chat_format format); +const char* common_reasoning_format_name(common_reasoning_format format); common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax); common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice); diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 1ca11fc41b229..8295c455d1b38 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -357,7 +357,7 @@ struct server_task { auto it = data.find("chat_format"); if (it != data.end()) { params.oaicompat_chat_syntax.format = static_cast(it->get()); - 
SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_syntax.format).c_str()); + SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_syntax.format)); } else { params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format; } From 9cdeebe808d721855a7707f2bffe422dc0f0e661 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 25 May 2025 18:52:07 +0100 Subject: [PATCH 11/11] switch to --reasoning-budget flag --- common/arg.cpp | 14 ++++++++++---- common/chat.cpp | 3 +-- common/common.h | 2 +- tools/server/README.md | 3 ++- tools/server/server.cpp | 1 + tools/server/tests/unit/test_template.py | 23 ++++++++++++----------- tools/server/tests/utils.py | 3 +++ tools/server/utils.hpp | 2 ++ 8 files changed, 32 insertions(+), 19 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 36225547e3f3a..5ed5a23903332 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2851,15 +2851,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n" "- none: leaves thoughts unparsed in `message.content`\n" "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n" - "- nothink: prevents generation of thoughts (forcibly closing thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3)\n" "(default: deepseek)", [](common_params & params, const std::string & value) { /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; } else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; } - else if (value == "nothink") { params.reasoning_format = COMMON_REASONING_FORMAT_NOTHINK; } - else { std::invalid_argument("invalid value"); } + else { throw std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK")); + add_opt(common_arg( + {"--reasoning-budget"}, "N", + "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)", + [](common_params & params, int value) { + if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); } + params.reasoning_budget = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET")); add_opt(common_arg( {"--chat-template"}, "JINJA_TEMPLATE", string_format( @@ -2958,7 +2964,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } else if (value == "md") { params.batched_bench_output_jsonl = false; } - else { std::invalid_argument("invalid value"); } + else { throw std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(common_arg( diff --git a/common/chat.cpp b/common/chat.cpp index 64b03d155e291..adfe51db5a770 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -596,7 +596,6 @@ const char * common_reasoning_format_name(common_reasoning_format format) { switch (format) { case COMMON_REASONING_FORMAT_NONE: return "none"; case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek"; - case COMMON_REASONING_FORMAT_NOTHINK: return "nothink"; default: throw std::runtime_error("Unknown reasoning format"); } @@ -1700,7 +1699,7 @@ 
static common_chat_params common_chat_templates_apply_jinja( params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content); params.add_generation_prompt = inputs.add_generation_prompt; params.tool_choice = inputs.tool_choice; - params.enable_thinking = inputs.reasoning_format != COMMON_REASONING_FORMAT_NOTHINK; + params.enable_thinking = inputs.enable_thinking; params.grammar = inputs.grammar; params.now = inputs.now; if (!inputs.json_schema.empty()) { diff --git a/common/common.h b/common/common.h index 1f66edb457a62..92b9533fc2948 100644 --- a/common/common.h +++ b/common/common.h @@ -216,7 +216,6 @@ struct common_params_vocoder { enum common_reasoning_format { COMMON_REASONING_FORMAT_NONE, COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content` - COMMON_REASONING_FORMAT_NOTHINK, // Forcibly disables thinking (causes any thinking tag to be closed, empty thinking tags to be inserted, or template specific variables to be set, depending on the chat format) }; struct common_params { @@ -369,6 +368,7 @@ struct common_params { bool use_jinja = false; // NOLINT bool enable_chat_template = true; common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + int reasoning_budget = -1; bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response std::vector api_keys; diff --git a/tools/server/README.md b/tools/server/README.md index 6f2cce83df72c..06533c172e530 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -173,7 +173,8 @@ The project is under active development, and we are [looking for feedback and co | `--no-slots` | disables slots monitoring endpoint
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>- nothink: prevents generation of thoughts (forcibly closing thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3)<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/>
(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) | diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 8295c455d1b38..07b613122e321 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2089,6 +2089,7 @@ struct server_context { /* common_chat_templates */ chat_templates.get(), /* allow_image */ mctx ? mtmd_support_vision(mctx) : false, /* allow_audio */ mctx ? mtmd_support_audio (mctx) : false, + /* enable_thinking */ params_base.reasoning_budget != 0, }; } diff --git a/tools/server/tests/unit/test_template.py b/tools/server/tests/unit/test_template.py index e465dcded2764..dcc3f0a3e1149 100644 --- a/tools/server/tests/unit/test_template.py +++ b/tools/server/tests/unit/test_template.py @@ -26,23 +26,24 @@ def create_server(): @pytest.mark.parametrize("tools", [None, [], [TEST_TOOL]]) -@pytest.mark.parametrize("template_name,nothink,expected_end", [ - ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", False, "\n"), - ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", True, "\n"), +@pytest.mark.parametrize("template_name,reasoning_budget,expected_end", [ + ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", None, "\n"), + ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", -1, "\n"), + ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", 0, "\n"), - ("Qwen-Qwen3-0.6B", False, "<|im_start|>assistant\n"), - ("Qwen-Qwen3-0.6B", True, "<|im_start|>assistant\n\n\n\n\n"), + ("Qwen-Qwen3-0.6B", -1, "<|im_start|>assistant\n"), + ("Qwen-Qwen3-0.6B", 0, "<|im_start|>assistant\n\n\n\n\n"), - ("Qwen-QwQ-32B", False, "<|im_start|>assistant\n\n"), - ("Qwen-QwQ-32B", True, "<|im_start|>assistant\n\n"), + ("Qwen-QwQ-32B", -1, "<|im_start|>assistant\n\n"), + ("Qwen-QwQ-32B", 0, "<|im_start|>assistant\n\n"), - ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", False, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"), - ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", True, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|><|END_THINKING|>"), + ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", -1, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"), + ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", 0, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|><|END_THINKING|>"), ]) -def test_nothink(template_name: str, nothink: bool, expected_end: str, tools: list[dict]): +def test_reasoning_budget(template_name: str, reasoning_budget: int | None, expected_end: str, tools: list[dict]): global server server.jinja = True - server.reasoning_format = 'nothink' if nothink else None + server.reasoning_budget = reasoning_budget server.chat_template_file = f'../../../models/templates/{template_name}.jinja' server.start(timeout_seconds=TIMEOUT_SERVER_START) diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index 74a0f7a84e48f..11672f515df1d 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -85,6 +85,7 @@ class ServerProcess: no_webui: bool | None = None jinja: bool | None = None reasoning_format: Literal['deepseek', 'none', 'nothink'] | None = None + reasoning_budget: int | None = None chat_template: str | None = None chat_template_file: str | None = None server_path: str | None = None @@ -191,6 +192,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None: server_args.append("--jinja") if self.reasoning_format is not None: server_args.extend(("--reasoning-format", self.reasoning_format)) + if self.reasoning_budget is not None: + server_args.extend(("--reasoning-budget", self.reasoning_budget)) if self.chat_template: server_args.extend(["--chat-template", 
self.chat_template]) if self.chat_template_file: diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index 91efcfef06772..70486e964fac3 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -568,6 +568,7 @@ struct oaicompat_parser_options { common_chat_templates * tmpls; bool allow_image; bool allow_audio; + bool enable_thinking = true; }; // used by /chat/completions endpoint @@ -732,6 +733,7 @@ static json oaicompat_chat_params_parse( inputs.use_jinja = opt.use_jinja; inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false); inputs.reasoning_format = opt.reasoning_format; + inputs.enable_thinking = opt.enable_thinking; if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) { throw std::runtime_error("Cannot use custom grammar constraints with tools."); }
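
---

Usage sketch: a minimal end-to-end check of the two knobs this series introduces, `add_generation_prompt` on `/apply-template` (PATCH 01) and `--reasoning-budget` (PATCH 11). It assumes a llama-server built from this branch, started locally with a Qwen3-style template, e.g.:

    llama-server -m qwen3-0.6b.gguf --jinja --reasoning-budget 0 \
        --chat-template-file models/templates/Qwen-Qwen3-0.6B.jinja

The model filename and port are placeholders; the assertions mirror the expectations in tools/server/tests/unit/test_template.py.

```python
# Smoke test for --reasoning-budget and add_generation_prompt (assumes a local
# llama-server from this branch on port 8080, started as shown above).
import requests

BASE_URL = "http://localhost:8080"

def render(messages: list[dict], add_generation_prompt: bool = True) -> str:
    # /apply-template renders the chat template server-side without running
    # inference; add_generation_prompt is forwarded to the template (PATCH 01).
    res = requests.post(f"{BASE_URL}/apply-template", json={
        "messages": messages,
        "add_generation_prompt": add_generation_prompt,
    })
    res.raise_for_status()
    return res.json()["prompt"]

messages = [{"role": "user", "content": "What is today?"}]

# With --reasoning-budget 0, the Qwen3 template receives enable_thinking=false
# and emits a pre-closed think block, so generation starts with the answer.
prompt = render(messages)
assert prompt.endswith("<think>\n\n</think>\n\n"), prompt

# With add_generation_prompt=false, no assistant header is appended at all.
prompt = render(messages, add_generation_prompt=False)
assert not prompt.endswith("<|im_start|>assistant\n"), prompt
```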