Skip to content

Commit b457f89

Browse files
author
ochafik
committed
server: add --reasoning-format=disabled to disable thinking (incl. qwen3 w/ enable_thinking:false)
1 parent 43b5626 commit b457f89

File tree

7 files changed

+70
-6
lines changed

7 files changed

+70
-6
lines changed

common/arg.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2854,6 +2854,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
28542854
[](common_params & params, const std::string & value) {
28552855
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
28562856
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
2857+
else if (value == "disabled") { params.reasoning_format = COMMON_REASONING_FORMAT_DISABLED; }
28572858
else { std::invalid_argument("invalid value"); }
28582859
}
28592860
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));

common/chat.cpp

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ struct templates_params {
133133
bool stream;
134134
std::string grammar;
135135
bool add_generation_prompt = true;
136+
bool enable_thinking = true;
136137
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
137138
};
138139

@@ -591,6 +592,16 @@ std::string common_chat_format_name(common_chat_format format) {
591592
}
592593
}
593594

595+
std::string common_reasoning_format_name(common_reasoning_format format) {
596+
switch (format) {
597+
case COMMON_REASONING_FORMAT_NONE: return "none";
598+
case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
599+
case COMMON_REASONING_FORMAT_DISABLED: return "disabled";
600+
default:
601+
throw std::runtime_error("Unknown reasoning format");
602+
}
603+
}
604+
594605
static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
595606
std::string arguments;
596607
if (builder.is_partial()) {
@@ -918,7 +929,11 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
918929
data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
919930
data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
920931
if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
921-
data.thinking_forced_open = true;
932+
if (!inputs.enable_thinking) {
933+
data.prompt += "<|END_THINKING|>";
934+
} else {
935+
data.thinking_forced_open = true;
936+
}
922937
}
923938

924939
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -1186,7 +1201,11 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
11861201
data.prompt = prompt;
11871202
data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1;
11881203
if (string_ends_with(data.prompt, "<think>\n")) {
1189-
data.thinking_forced_open = true;
1204+
if (!inputs.enable_thinking) {
1205+
data.prompt += "</think>";
1206+
} else {
1207+
data.thinking_forced_open = true;
1208+
}
11901209
}
11911210

11921211
if (inputs.tools.is_array() && !inputs.tools.empty()) {
@@ -1460,10 +1479,18 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
14601479
static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
14611480
common_chat_params data;
14621481

1463-
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
1482+
json additional_context = {
1483+
{"enable_thinking", inputs.enable_thinking},
1484+
};
1485+
1486+
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
14641487
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
14651488
if (string_ends_with(data.prompt, "<think>\n")) {
1466-
data.thinking_forced_open = true;
1489+
if (!inputs.enable_thinking) {
1490+
data.prompt += "</think>";
1491+
} else {
1492+
data.thinking_forced_open = true;
1493+
}
14671494
}
14681495

14691496
if (!inputs.tools.is_null()) {
@@ -1671,6 +1698,7 @@ static common_chat_params common_chat_templates_apply_jinja(
16711698
params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
16721699
params.add_generation_prompt = inputs.add_generation_prompt;
16731700
params.tool_choice = inputs.tool_choice;
1701+
params.enable_thinking = inputs.reasoning_format != COMMON_REASONING_FORMAT_DISABLED;
16741702
params.grammar = inputs.grammar;
16751703
params.now = inputs.now;
16761704
if (!inputs.json_schema.empty()) {

common/chat.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ struct common_chat_templates_inputs {
123123
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
124124
bool parallel_tool_calls = false;
125125
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
126+
bool enable_thinking = true;
126127
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
127128
};
128129

@@ -182,6 +183,7 @@ std::string common_chat_format_example(
182183
bool use_jinja);
183184

184185
std::string common_chat_format_name(common_chat_format format);
186+
std::string common_reasoning_format_name(common_reasoning_format format);
185187
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
186188

187189
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);

common/common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ struct common_params_vocoder {
216216
enum common_reasoning_format {
217217
COMMON_REASONING_FORMAT_NONE,
218218
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
219+
COMMON_REASONING_FORMAT_DISABLED, // Disable thinking (causes any thinking tag to be closed, or empty thinking tags to be inserted, depending on the model)
219220
};
220221

221222
struct common_params {

tools/server/server.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ struct slot_params {
178178
{"grammar_triggers", grammar_triggers},
179179
{"preserved_tokens", sampling.preserved_tokens},
180180
{"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)},
181-
{"reasoning_format", (oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "deepseek" : "none")},
181+
{"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)},
182182
{"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content},
183183
{"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open},
184184
{"samplers", samplers},

tools/server/tests/unit/test_template.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,38 @@ def create_server():
2525
server.n_slots = 1
2626

2727

28+
@pytest.mark.parametrize("template_name,enable_thinking,expected_end", [
29+
("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", True, "<think>\n"),
30+
("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", False, "<think>\n</think>"),
31+
32+
("Qwen-Qwen3-0.6B", True, "<|im_start|>assistant\n"),
33+
("Qwen-Qwen3-0.6B", False, "<|im_start|>assistant\n<think>\n\n</think>\n\n"),
34+
35+
("Qwen-QwQ-32B", True, "<|im_start|>assistant\n<think>\n"),
36+
("Qwen-QwQ-32B", False, "<|im_start|>assistant\n<think>\n</think>"),
37+
38+
("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", True, "<|START_THINKING|>"),
39+
("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", False, "<|START_THINKING|><|END_THINKING|>"),
40+
])
41+
def test_enable_thinking(template_name: str, enable_thinking: bool, expected_end: str):
42+
global server
43+
server.jinja = True
44+
server.reasoning_format = 'deepseek' if enable_thinking else 'disabled'
45+
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
46+
server.start(timeout_seconds=TIMEOUT_SERVER_START)
47+
48+
res = server.make_request("POST", "/apply-template", data={
49+
"messages": [
50+
{"role": "user", "content": "What is today?"},
51+
],
52+
"tools": [TEST_TOOL],
53+
})
54+
assert res.status_code == 200
55+
prompt = res.body["prompt"]
56+
57+
assert prompt.endswith(expected_end), f"Expected prompt to end with '{expected_end}', got '{prompt}'"
58+
59+
2860
@pytest.mark.parametrize("tools", [None, [], [TEST_TOOL]])
2961
@pytest.mark.parametrize("template_name,format", [
3062
("meta-llama-Llama-3.3-70B-Instruct", "%d %b %Y"),

tools/server/tests/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ class ServerProcess:
8484
draft_max: int | None = None
8585
no_webui: bool | None = None
8686
jinja: bool | None = None
87-
reasoning_format: Literal['deepseek', 'none'] | None = None
87+
reasoning_format: Literal['deepseek', 'none', 'disabled'] | None = None
8888
chat_template: str | None = None
8989
chat_template_file: str | None = None
9090
server_path: str | None = None

0 commit comments

Comments (0)