From 17853a1053c3c7210a0c89b380f4710b80546415 Mon Sep 17 00:00:00 2001
From: CNE Pierre FICHEPOIL
Date: Thu, 4 Sep 2025 13:47:22 +0000
Subject: [PATCH 1/7] fixed reasoning streaming with tool_choice="required"

Signed-off-by: CNE Pierre FICHEPOIL
---
 vllm/entrypoints/openai/serving_chat.py | 71 +++++++++++++++++--------
 1 file changed, 50 insertions(+), 21 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 35edd2f85cd0..d64b65363e7b 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -512,14 +512,13 @@ async def chat_completion_stream_generator(
 
         # Only one of these will be used, thus previous_texts and
         # all_previous_token_ids will not be used twice in the same iteration.
-        if tool_choice_auto or self.reasoning_parser:
+        if tool_choice_auto or self.reasoning_parser or \
+            request.tool_choice == "required":
             # These are only required in "auto" tool choice case
             all_previous_token_ids = [[]] * num_choices
             # For reasoning parser and tool call all enabled
             added_content_delta_arr = [False] * num_choices
             reasoning_end_arr = [False] * num_choices
-        elif request.tool_choice == "required":
-            all_previous_token_ids = None
         else:
             all_previous_token_ids = None
 
@@ -681,7 +680,8 @@ async def chat_completion_stream_generator(
                 delta_message: Optional[DeltaMessage]
 
                 # just update previous_texts and previous_token_ids
-                if ((tool_choice_auto or self.reasoning_parser)
+                if ((tool_choice_auto or self.reasoning_parser
+                        or request.tool_choice == "required")
                         and not self.use_harmony):
                     assert previous_texts is not None
                     assert all_previous_token_ids is not None
@@ -764,25 +764,53 @@ async def chat_completion_stream_generator(
                     previous_text = previous_texts[i]
                     current_text = previous_text + delta_text
                     fn_name_returned = function_name_returned[i]
+                    output_token_ids = as_list(output.token_ids)
 
-                    if self.reasoning_parser:
-                        _, content = \
-                            reasoning_parser.extract_reasoning_content(
+                    if not reasoning_end_arr[i] and \
+                        res.prompt_token_ids and \
+                        reasoning_parser.is_reasoning_end(res.prompt_token_ids):
+                        reasoning_end_arr[i] = True
+
+                    if self.reasoning_parser and not reasoning_end_arr[i]:
+                        delta_message = (
+                            reasoning_parser.
+                            extract_reasoning_content_streaming(
+                                previous_text,
                                 current_text,
-                                request
-                            )
+                                delta_text,
+                                previous_token_ids,
+                                current_token_ids,
+                                output_token_ids,
+                            ))
+                        if reasoning_parser.is_reasoning_end(
+                                output_token_ids):
+                            reasoning_end_arr[i] = True
+                        if delta_message and delta_message.content:
+                            current_text = delta_message.content
+                            delta_message.content = None
+                        else:
+                            current_text = ""
                     else:
-                        content = current_text
-                        delta_message, function_name_returned[i] = (
-                            self.extract_tool_call_required_streaming(
-                                previous_text=previous_text,
-                                current_text=content,
-                                delta_text=delta_text,
-                                function_name_returned=fn_name_returned,
-                                tool_call_idx=history_tool_call_cnt))
-                        if (delta_message and delta_message.tool_calls and
-                                delta_message.tool_calls[0].id is not None):
-                            history_tool_call_cnt += 1
+                        if self.reasoning_parser:
+                            _, content = \
+                                reasoning_parser.extract_reasoning_content(
+                                    current_text,
+                                    request
+                                )
+                        else:
+                            content = current_text
+
+                        delta_message, function_name_returned[i] = (
+                            self.extract_tool_call_required_streaming(
+                                previous_text=previous_text,
+                                current_text=content,
+                                delta_text=delta_text,
+                                function_name_returned=fn_name_returned,
+                                tool_call_idx=history_tool_call_cnt))
+                        if (delta_message and delta_message.tool_calls
+                                and delta_message.tool_calls[0].id
+                                is not None):
+                            history_tool_call_cnt += 1
 
                     # update the previous values for the next iteration
                     previous_texts[i] = current_text
@@ -888,7 +916,8 @@ async def chat_completion_stream_generator(
                         delta_message = DeltaMessage(content=delta_text)
 
                     # update the previous values for the next iteration
-                    if ((tool_choice_auto or self.reasoning_parser)
+                    if ((tool_choice_auto or self.reasoning_parser
+                            or request.tool_choice == "required")
                             and not self.use_harmony):
                         assert previous_texts is not None
                         assert all_previous_token_ids is not None

From 1a5780e394ca91ea9049bdb267f708d411cef752 Mon Sep 17 00:00:00 2001
From: CNE Pierre FICHEPOIL
Date: Fri, 12 Sep 2025 06:40:47 +0000
Subject: [PATCH 2/7] oops

Signed-off-by: CNE Pierre FICHEPOIL
---
 vllm/entrypoints/openai/serving_chat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 65d9c58d3696..c45b7606f98d 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -681,7 +681,7 @@ async def chat_completion_stream_generator(
 
                 # just update previous_texts and previous_token_ids
                 if (tool_choice_auto or self.reasoning_parser
-                    or request.tool_choice == "required")
+                    or request.tool_choice == "required"):
                     assert previous_texts is not None
                     assert all_previous_token_ids is not None
                     previous_text = previous_texts[i]

From 4d8d81c290a162f82e9d765eff07d5eeb0d0dd1d Mon Sep 17 00:00:00 2001
From: CNE Pierre FICHEPOIL
Date: Fri, 12 Sep 2025 07:34:16 +0000
Subject: [PATCH 3/7] apply forgotten pre-commit formatting

Signed-off-by: CNE Pierre FICHEPOIL
---
 vllm/entrypoints/openai/serving_chat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index c45b7606f98d..bc27b01c7b42 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -681,7 +681,7 @@ async def chat_completion_stream_generator(
 
                 # just update previous_texts and previous_token_ids
                 if (tool_choice_auto or self.reasoning_parser
-                    or request.tool_choice == "required"):
+                        or request.tool_choice == "required"):
                     assert previous_texts is not None
                     assert all_previous_token_ids is not None
                     previous_text = previous_texts[i]

From e89b4846bc0bf07653fdbd8430d2799017055188 Mon Sep 17 00:00:00 2001
From: CNE Pierre FICHEPOIL
Date: Mon, 15 Sep 2025 15:20:41 +0000
Subject: [PATCH 4/7] reverted unnecessary changes

Signed-off-by: CNE Pierre FICHEPOIL
---
 vllm/entrypoints/openai/serving_chat.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index bc27b01c7b42..c651dbdc0b66 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -514,8 +514,7 @@ async def chat_completion_stream_generator(
 
         # Only one of these will be used, thus previous_texts and
         # all_previous_token_ids will not be used twice in the same iteration.
-        if tool_choice_auto or self.reasoning_parser or \
-            request.tool_choice == "required":
+        if tool_choice_auto or self.reasoning_parser:
             # These are only required in "auto" tool choice case
             all_previous_token_ids = [[]] * num_choices
             # For reasoning parser and tool call all enabled
@@ -680,8 +679,7 @@ async def chat_completion_stream_generator(
                 delta_message: Optional[DeltaMessage]
 
                 # just update previous_texts and previous_token_ids
-                if (tool_choice_auto or self.reasoning_parser
-                        or request.tool_choice == "required"):
+                if (tool_choice_auto or self.reasoning_parser):
                     assert previous_texts is not None
                     assert all_previous_token_ids is not None
                     previous_text = previous_texts[i]
@@ -964,8 +962,7 @@ async def chat_completion_stream_generator(
                         delta_message = DeltaMessage(content=delta_text)
 
                     # update the previous values for the next iteration
-                    if ((tool_choice_auto or self.reasoning_parser
-                            or request.tool_choice == "required")
+                    if ((tool_choice_auto or self.reasoning_parser)
                             and not self.use_harmony):
                         assert previous_texts is not None
                         assert all_previous_token_ids is not None

From b62941d15a5c75ad4da3b23707845af64b998990 Mon Sep 17 00:00:00 2001
From: CNE Pierre FICHEPOIL
Date: Mon, 15 Sep 2025 15:51:54 +0000
Subject: [PATCH 5/7] fixed reasoning streaming with tool_choice=required

Signed-off-by: CNE Pierre FICHEPOIL
---
 .../openai/test_completion_with_function_calling.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py
index 4ef5d4e8a699..fdcbc66a8d42 100644
--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@@ -211,11 +211,19 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str,
             })
 
         output = []
+        reasoning = []
         async for chunk in output_stream:
-            if chunk.choices and chunk.choices[0].delta.tool_calls:
-                output.extend(chunk.choices[0].delta.tool_calls)
+            if chunk.choices:
+                if enable_thinking and\
+                        getattr(chunk.choices[0].delta,\
+                        "reasoning_content", None):
+                    reasoning.append(chunk.choices[0].delta.reasoning_content)
+                if chunk.choices[0].delta.tool_calls:
+                    output.extend(chunk.choices[0].delta.tool_calls)
 
         assert len(output) > 0
+        if enable_thinking:
+            assert len(reasoning) > 0
 
 
 @pytest.fixture(scope="module")

From 003fe5932dc9e1b331347894d3cbc5ddfe89e9eb Mon Sep 17 00:00:00 2001
From: CNE Pierre FICHEPOIL
Date: Wed, 17 Sep 2025 11:44:21 +0000
Subject: [PATCH 6/7] fixed streaming tool calls with a non-reasoning model
 (a duplicated delta was added)

Signed-off-by: CNE Pierre FICHEPOIL
---
 vllm/entrypoints/openai/serving_chat.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index c651dbdc0b66..3f44a699e01a 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -522,6 +522,7 @@ async def chat_completion_stream_generator(
             reasoning_end_arr = [False] * num_choices
         else:
             all_previous_token_ids = None
+            reasoning_end_arr = None
 
         try:
             if self.reasoning_parser:
@@ -807,7 +808,8 @@ async def chat_completion_stream_generator(
                     fn_name_returned = function_name_returned[i]
                     output_token_ids = as_list(output.token_ids)
 
-                    if not reasoning_end_arr[i] and \
+                    if self.reasoning_parser is not None and \
+                        not reasoning_end_arr[i] and \
                         res.prompt_token_ids and \
                         reasoning_parser.is_reasoning_end(res.prompt_token_ids):
                         reasoning_end_arr[i] = True
@@ -830,16 +832,11 @@ async def chat_completion_stream_generator(
                             current_text = delta_message.content
                             delta_message.content = None
                         else:
+                            # reasoning ended
                             current_text = ""
                     else:
-                        if self.reasoning_parser:
-                            _, content = \
-                                reasoning_parser.extract_reasoning_content(
-                                    current_text,
-                                    request
-                                )
-                        else:
-                            content = current_text
+                        # either finished reasoning or no reasoning at all
+                        content = current_text
 
                         delta_message, function_name_returned[i] = (
                             self.extract_tool_call_required_streaming(
                                 previous_text=previous_text,
                                 current_text=content,
                                 delta_text=delta_text,
                                 function_name_returned=fn_name_returned,
                                 tool_call_idx=history_tool_call_cnt))
                         if (delta_message and delta_message.tool_calls
                                 and delta_message.tool_calls[0].id
                                 is not None):
                             history_tool_call_cnt += 1
                         tools_streamed[i] = True
 
-                    # update the previous values for the next iteration
-                    previous_texts[i] = current_text
-
                 # handle streaming deltas for tools with "auto" tool choice
                 # and reasoning parser
                 elif tool_choice_auto and self.reasoning_parser:

From 92fda09ed486b1e52bf7f8ac103ca334c8db9429 Mon Sep 17 00:00:00 2001
From: CNE Pierre FICHEPOIL
Date: Thu, 18 Sep 2025 07:11:23 +0000
Subject: [PATCH 7/7] removed leftover reasoning_end_arr from an earlier code
 experiment, and removed the added parentheses

Signed-off-by: CNE Pierre FICHEPOIL
---
 vllm/entrypoints/openai/serving_chat.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index e32362d048f0..da6df49ecf31 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -522,7 +522,6 @@ async def chat_completion_stream_generator(
             reasoning_end_arr = [False] * num_choices
         else:
             all_previous_token_ids = None
-            reasoning_end_arr = None
 
         try:
             if self.reasoning_parser:
@@ -680,7 +679,7 @@ async def chat_completion_stream_generator(
                 delta_message: Optional[DeltaMessage]
 
                 # just update previous_texts and previous_token_ids
-                if (tool_choice_auto or self.reasoning_parser):
+                if tool_choice_auto or self.reasoning_parser:
                     assert previous_texts is not None
                     assert all_previous_token_ids is not None
                     previous_text = previous_texts[i]
@@ -851,7 +850,6 @@ async def chat_completion_stream_generator(
                             history_tool_call_cnt += 1
                         tools_streamed[i] = True
 
-
                 # handle streaming deltas for tools with "auto" tool choice
                 # and reasoning parser
                 elif tool_choice_auto and self.reasoning_parser:
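
Follow-up note (not part of the patches): a minimal end-to-end sketch of the behavior this series fixes. With tool_choice="required" and a reasoning model, the stream should now carry reasoning_content deltas before the tool-call deltas, mirroring the assertion added to test_completion_with_function_calling.py. The base_url, model name, tool definition, and chat_template_kwargs below are illustrative assumptions for a local vLLM deployment, not values taken from the patches.

# sketch_required_reasoning_stream.py
# Assumes a vLLM OpenAI-compatible server on localhost:8000 serving a
# reasoning-capable model; adjust model and URL to your deployment.
import asyncio

import openai


async def main() -> None:
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")
    stream = await client.chat.completions.create(
        model="Qwen/Qwen3-8B",  # illustrative reasoning model
        messages=[{
            "role": "user",
            "content": "What is the weather in Paris today?"
        }],
        tools=[{
            "type": "function",
            "function": {
                "name": "get_current_weather",  # hypothetical tool
                "description": "Get the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "city": {
                            "type": "string"
                        }
                    },
                    "required": ["city"],
                },
            },
        }],
        tool_choice="required",
        stream=True,
        # enable_thinking is how Qwen-style chat templates toggle reasoning
        extra_body={"chat_template_kwargs": {"enable_thinking": True}},
    )

    reasoning, tool_calls = [], []
    async for chunk in stream:
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta
        # reasoning_content is the vLLM extension field on streamed deltas
        if getattr(delta, "reasoning_content", None):
            reasoning.append(delta.reasoning_content)
        if delta.tool_calls:
            tool_calls.extend(delta.tool_calls)

    # Before this series, reasoning stayed empty whenever
    # tool_choice="required"; both lists should now be populated.
    assert reasoning, "expected reasoning_content deltas in the stream"
    assert tool_calls, "expected tool-call deltas in the stream"
    print(len(reasoning), "reasoning deltas,", len(tool_calls), "tool-call deltas")


asyncio.run(main())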