From 17853a1053c3c7210a0c89b380f4710b80546415 Mon Sep 17 00:00:00 2001
From: CNE Pierre FICHEPOIL
Date: Thu, 4 Sep 2025 13:47:22 +0000
Subject: [PATCH 1/7] fixed reasoning streaming with tool_choice="required"

Signed-off-by: CNE Pierre FICHEPOIL
---
 vllm/entrypoints/openai/serving_chat.py | 71 +++++++++++++++++--------
 1 file changed, 50 insertions(+), 21 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 35edd2f85cd0..d64b65363e7b 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -512,14 +512,13 @@ async def chat_completion_stream_generator(
 
         # Only one of these will be used, thus previous_texts and
         # all_previous_token_ids will not be used twice in the same iteration.
-        if tool_choice_auto or self.reasoning_parser:
+        if tool_choice_auto or self.reasoning_parser or \
+            request.tool_choice == "required":
             # These are only required in "auto" tool choice case
             all_previous_token_ids = [[]] * num_choices
             # For reasoning parser and tool call all enabled
             added_content_delta_arr = [False] * num_choices
             reasoning_end_arr = [False] * num_choices
-        elif request.tool_choice == "required":
-            all_previous_token_ids = None
         else:
             all_previous_token_ids = None
 
@@ -681,7 +680,8 @@ async def chat_completion_stream_generator(
                 delta_message: Optional[DeltaMessage]
 
                 # just update previous_texts and previous_token_ids
-                if ((tool_choice_auto or self.reasoning_parser)
+                if ((tool_choice_auto or self.reasoning_parser
+                        or request.tool_choice == "required")
                         and not self.use_harmony):
                     assert previous_texts is not None
                     assert all_previous_token_ids is not None
@@ -764,25 +764,53 @@ async def chat_completion_stream_generator(
                     previous_text = previous_texts[i]
                     current_text = previous_text + delta_text
                     fn_name_returned = function_name_returned[i]
+                    output_token_ids = as_list(output.token_ids)
 
-                    if self.reasoning_parser:
-                        _, content = \
-                            reasoning_parser.extract_reasoning_content(
+                    if not reasoning_end_arr[i] and \
+                        res.prompt_token_ids and \
+                        reasoning_parser.is_reasoning_end(res.prompt_token_ids):
+                        reasoning_end_arr[i] = True
+
+                    if self.reasoning_parser and not reasoning_end_arr[i]:
+                        delta_message = (
+                            reasoning_parser.
+                            extract_reasoning_content_streaming(
+                                previous_text,
                                 current_text,
-                                request
-                            )
+                                delta_text,
+                                previous_token_ids,
+                                current_token_ids,
+                                output_token_ids,
+                            ))
+                        if reasoning_parser.is_reasoning_end(
+                                output_token_ids):
+                            reasoning_end_arr[i] = True
+                        if delta_message and delta_message.content:
+                            current_text = delta_message.content
+                            delta_message.content = None
+                        else:
+                            current_text = ""
                     else:
-                        content = current_text
-                        delta_message, function_name_returned[i] = (
-                            self.extract_tool_call_required_streaming(
-                                previous_text=previous_text,
-                                current_text=content,
-                                delta_text=delta_text,
-                                function_name_returned=fn_name_returned,
-                                tool_call_idx=history_tool_call_cnt))
-                        if (delta_message and delta_message.tool_calls and
-                                delta_message.tool_calls[0].id is not None):
-                            history_tool_call_cnt += 1
+                        if self.reasoning_parser:
+                            _, content = \
+                                reasoning_parser.extract_reasoning_content(
+                                    current_text,
+                                    request
+                                )
+                        else:
+                            content = current_text
+
+                        delta_message, function_name_returned[i] = (
+                            self.extract_tool_call_required_streaming(
+                                previous_text=previous_text,
+                                current_text=content,
+                                delta_text=delta_text,
+                                function_name_returned=fn_name_returned,
+                                tool_call_idx=history_tool_call_cnt))
+                        if (delta_message and delta_message.tool_calls
+                                and delta_message.tool_calls[0].id
+                                is not None):
+                            history_tool_call_cnt += 1
 
                     # update the previous values for the next iteration
                     previous_texts[i] = current_text
@@ -888,7 +916,8 @@ async def chat_completion_stream_generator(
                         delta_message = DeltaMessage(content=delta_text)
 
                     # update the previous values for the next iteration
-                    if ((tool_choice_auto or self.reasoning_parser)
+                    if ((tool_choice_auto or self.reasoning_parser
+                            or request.tool_choice == "required")
                             and not self.use_harmony):
                         assert previous_texts is not None
                         assert all_previous_token_ids is not None

From 1a5780e394ca91ea9049bdb267f708d411cef752 Mon Sep 17 00:00:00 2001
From: CNE Pierre FICHEPOIL
Date: Fri, 12 Sep 2025 06:40:47 +0000
Subject: [PATCH 2/7] oops

Signed-off-by: CNE Pierre FICHEPOIL
---
 vllm/entrypoints/openai/serving_chat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 65d9c58d3696..c45b7606f98d 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -681,7 +681,7 @@ async def chat_completion_stream_generator(
 
                 # just update previous_texts and previous_token_ids
                 if (tool_choice_auto or self.reasoning_parser
-                    or request.tool_choice == "required")
+                    or request.tool_choice == "required"):
                     assert previous_texts is not None
                     assert all_previous_token_ids is not None
                     previous_text = previous_texts[i]

From 4d8d81c290a162f82e9d765eff07d5eeb0d0dd1d Mon Sep 17 00:00:00 2001
From: CNE Pierre FICHEPOIL
Date: Fri, 12 Sep 2025 07:34:16 +0000
Subject: [PATCH 3/7] apply forgotten pre-commit formatting

Signed-off-by: CNE Pierre FICHEPOIL
---
 vllm/entrypoints/openai/serving_chat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index c45b7606f98d..bc27b01c7b42 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -681,7 +681,7 @@ async def chat_completion_stream_generator(
 
                 # just update previous_texts and previous_token_ids
                 if (tool_choice_auto or self.reasoning_parser
-                    or request.tool_choice == "required"):
+                        or request.tool_choice == "required"):
                     assert previous_texts is not None
                     assert all_previous_token_ids is not None
                     previous_text = previous_texts[i]

From e89b4846bc0bf07653fdbd8430d2799017055188 Mon Sep 17 00:00:00 2001
From: CNE Pierre FICHEPOIL
Date: Mon, 15 Sep 2025 15:20:41 +0000
Subject: [PATCH 4/7] reverted unnecessary changes

Signed-off-by: CNE Pierre FICHEPOIL
---
 vllm/entrypoints/openai/serving_chat.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index bc27b01c7b42..c651dbdc0b66 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -514,8 +514,7 @@ async def chat_completion_stream_generator(
 
         # Only one of these will be used, thus previous_texts and
         # all_previous_token_ids will not be used twice in the same iteration.
-        if tool_choice_auto or self.reasoning_parser or \
-            request.tool_choice == "required":
+        if tool_choice_auto or self.reasoning_parser:
             # These are only required in "auto" tool choice case
             all_previous_token_ids = [[]] * num_choices
             # For reasoning parser and tool call all enabled
@@ -680,8 +679,7 @@ async def chat_completion_stream_generator(
                 delta_message: Optional[DeltaMessage]
 
                 # just update previous_texts and previous_token_ids
-                if (tool_choice_auto or self.reasoning_parser
-                        or request.tool_choice == "required"):
+                if (tool_choice_auto or self.reasoning_parser):
                     assert previous_texts is not None
                     assert all_previous_token_ids is not None
                     previous_text = previous_texts[i]
@@ -964,8 +962,7 @@ async def chat_completion_stream_generator(
                         delta_message = DeltaMessage(content=delta_text)
 
                     # update the previous values for the next iteration
-                    if ((tool_choice_auto or self.reasoning_parser
-                            or request.tool_choice == "required")
+                    if ((tool_choice_auto or self.reasoning_parser)
                             and not self.use_harmony):
                         assert previous_texts is not None
                         assert all_previous_token_ids is not None

From b62941d15a5c75ad4da3b23707845af64b998990 Mon Sep 17 00:00:00 2001
From: CNE Pierre FICHEPOIL
Date: Mon, 15 Sep 2025 15:51:54 +0000
Subject: [PATCH 5/7] fixed reasoning streaming with tool_choice=required

Signed-off-by: CNE Pierre FICHEPOIL
---
 .../openai/test_completion_with_function_calling.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py
index 4ef5d4e8a699..fdcbc66a8d42 100644
--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@@ -211,11 +211,19 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str,
             })
 
         output = []
+        reasoning = []
         async for chunk in output_stream:
-            if chunk.choices and chunk.choices[0].delta.tool_calls:
-                output.extend(chunk.choices[0].delta.tool_calls)
+            if chunk.choices:
+                if enable_thinking and\
+                        getattr(chunk.choices[0].delta,\
+                        "reasoning_content", None):
+                    reasoning.append(chunk.choices[0].delta.reasoning_content)
+                if chunk.choices[0].delta.tool_calls:
+                    output.extend(chunk.choices[0].delta.tool_calls)
 
         assert len(output) > 0
+        if enable_thinking:
+            assert len(reasoning) > 0
 
 
 @pytest.fixture(scope="module")

From 003fe5932dc9e1b331347894d3cbc5ddfe89e9eb Mon Sep 17 00:00:00 2001
From: CNE Pierre FICHEPOIL
Date: Wed, 17 Sep 2025 11:44:21 +0000
Subject: [PATCH 6/7] fixed streaming tool calls with a non-reasoning model
 (a duplicated delta was added)

Signed-off-by: CNE Pierre FICHEPOIL
---
 vllm/entrypoints/openai/serving_chat.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index c651dbdc0b66..3f44a699e01a 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -522,6 +522,7 @@ async def chat_completion_stream_generator(
             reasoning_end_arr = [False] * num_choices
         else:
             all_previous_token_ids = None
+            reasoning_end_arr = None
 
         try:
             if self.reasoning_parser:
@@ -807,7 +808,8 @@ async def chat_completion_stream_generator(
                     fn_name_returned = function_name_returned[i]
                     output_token_ids = as_list(output.token_ids)
 
-                    if not reasoning_end_arr[i] and \
+                    if self.reasoning_parser is not None and \
+                        not reasoning_end_arr[i] and \
                         res.prompt_token_ids and \
                         reasoning_parser.is_reasoning_end(res.prompt_token_ids):
                         reasoning_end_arr[i] = True
@@ -830,16 +832,11 @@ async def chat_completion_stream_generator(
                             current_text = delta_message.content
                             delta_message.content = None
                         else:
+                            # reasoning ended
                             current_text = ""
                     else:
-                        if self.reasoning_parser:
-                            _, content = \
-                                reasoning_parser.extract_reasoning_content(
-                                    current_text,
-                                    request
-                                )
-                        else:
-                            content = current_text
+                        # either finished reasoning or no reasoning at all
+                        content = current_text
 
                         delta_message, function_name_returned[i] = (
                             self.extract_tool_call_required_streaming(
                                 previous_text=previous_text,
                                 current_text=content,
                                 delta_text=delta_text,
                                 function_name_returned=fn_name_returned,
                                 tool_call_idx=history_tool_call_cnt))
                         if (delta_message and delta_message.tool_calls
                                 and delta_message.tool_calls[0].id
                                 is not None):
                             history_tool_call_cnt += 1
                         tools_streamed[i] = True
 
-                    # update the previous values for the next iteration
-                    previous_texts[i] = current_text
-
                 # handle streaming deltas for tools with "auto" tool choice
                 # and reasoning parser
                 elif tool_choice_auto and self.reasoning_parser:

From 92fda09ed486b1e52bf7f8ac103ca334c8db9429 Mon Sep 17 00:00:00 2001
From: CNE Pierre FICHEPOIL
Date: Thu, 18 Sep 2025 07:11:23 +0000
Subject: [PATCH 7/7] removed leftover reasoning_end_arr from an earlier code
 experiment, and removed the added parentheses

Signed-off-by: CNE Pierre FICHEPOIL
---
 vllm/entrypoints/openai/serving_chat.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index e32362d048f0..da6df49ecf31 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -522,7 +522,6 @@ async def chat_completion_stream_generator(
             reasoning_end_arr = [False] * num_choices
         else:
             all_previous_token_ids = None
-            reasoning_end_arr = None
 
         try:
             if self.reasoning_parser:
@@ -680,7 +679,7 @@ async def chat_completion_stream_generator(
                 delta_message: Optional[DeltaMessage]
 
                 # just update previous_texts and previous_token_ids
-                if (tool_choice_auto or self.reasoning_parser):
+                if tool_choice_auto or self.reasoning_parser:
                     assert previous_texts is not None
                     assert all_previous_token_ids is not None
                     previous_text = previous_texts[i]
@@ -851,7 +850,6 @@ async def chat_completion_stream_generator(
                             history_tool_call_cnt += 1
                         tools_streamed[i] = True
 
-
                 # handle streaming deltas for tools with "auto" tool choice
                 # and reasoning parser
                 elif tool_choice_auto and self.reasoning_parser:
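
Follow-up note (not part of the patches): a minimal end-to-end sketch of the behavior this series fixes. With tool_choice="required" and a reasoning model, the stream should now carry reasoning_content deltas before the tool-call deltas, mirroring the assertion added to test_completion_with_function_calling.py. The base_url, model name, tool definition, and chat_template_kwargs below are illustrative assumptions for a local vLLM deployment, not values taken from the patches.

# sketch_required_reasoning_stream.py
# Assumes a vLLM OpenAI-compatible server on localhost:8000 serving a
# reasoning-capable model; adjust model and URL to your deployment.
import asyncio

import openai


async def main() -> None:
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")
    stream = await client.chat.completions.create(
        model="Qwen/Qwen3-8B",  # illustrative reasoning model
        messages=[{
            "role": "user",
            "content": "What is the weather in Paris today?"
        }],
        tools=[{
            "type": "function",
            "function": {
                "name": "get_current_weather",  # hypothetical tool
                "description": "Get the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "city": {
                            "type": "string"
                        }
                    },
                    "required": ["city"],
                },
            },
        }],
        tool_choice="required",
        stream=True,
        # enable_thinking is how Qwen-style chat templates toggle reasoning
        extra_body={"chat_template_kwargs": {"enable_thinking": True}},
    )

    reasoning, tool_calls = [], []
    async for chunk in stream:
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta
        # reasoning_content is the vLLM extension field on streamed deltas
        if getattr(delta, "reasoning_content", None):
            reasoning.append(delta.reasoning_content)
        if delta.tool_calls:
            tool_calls.extend(delta.tool_calls)

    # Before this series, reasoning stayed empty whenever
    # tool_choice="required"; both lists should now be populated.
    assert reasoning, "expected reasoning_content deltas in the stream"
    assert tool_calls, "expected tool-call deltas in the stream"
    print(len(reasoning), "reasoning deltas,", len(tool_calls), "tool-call deltas")


asyncio.run(main())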