@@ -211,11 +211,19 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str,
             })
 
         output = []
+        reasoning = []
         async for chunk in output_stream:
-            if chunk.choices and chunk.choices[0].delta.tool_calls:
-                output.extend(chunk.choices[0].delta.tool_calls)
+            if chunk.choices:
+                if enable_thinking and \
+                        getattr(chunk.choices[0].delta,
+                                "reasoning_content", None):
+                    reasoning.append(chunk.choices[0].delta.reasoning_content)
+                if chunk.choices[0].delta.tool_calls:
+                    output.extend(chunk.choices[0].delta.tool_calls)
 
         assert len(output) > 0
+        if enable_thinking:
+            assert len(reasoning) > 0
 
 
 @pytest.fixture(scope="module")
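The updated test collects reasoning_content deltas alongside tool-call deltas whenever enable_thinking is set. For reference, a minimal client-side sketch of the same consumption pattern; this assumes a vLLM OpenAI-compatible server on localhost:8000 serving some reasoning-capable model behind a reasoning parser (the model name and tool schema below are illustrative, and reasoning_content is a vLLM extension field, hence the defensive getattr):

import asyncio

import openai


async def main() -> None:
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")
    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",  # hypothetical tool for illustration
            "description": "Get the current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }]
    stream = await client.chat.completions.create(
        model="Qwen/Qwen3-8B",  # assumption: any model with a reasoning parser
        messages=[{"role": "user", "content": "Weather in Paris?"}],
        tools=tools,
        tool_choice="required",
        stream=True,
    )
    async for chunk in stream:
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta
        # vLLM-specific field; absent on vanilla OpenAI deltas.
        if getattr(delta, "reasoning_content", None):
            print("[reasoning]", delta.reasoning_content)
        if delta.tool_calls:
            print("[tool call delta]", delta.tool_calls)


asyncio.run(main())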
vllm/entrypoints/openai/serving_chat.py (58 changes: 40 additions & 18 deletions)
@@ -520,8 +520,6 @@ async def chat_completion_stream_generator(
                 # For reasoning parser and tool call all enabled
                 added_content_delta_arr = [False] * num_choices
                 reasoning_end_arr = [False] * num_choices
-            elif request.tool_choice == "required":
-                all_previous_token_ids = None
             else:
                 all_previous_token_ids = None

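The deleted elif made the "required" path drop per-choice token-ID tracking; reasoning-end detection now needs that state, so "required" falls through to the same initialization as the reasoning case. A rough sketch of the per-choice bookkeeping this implies (names follow the diff; the exact types in serving_chat.py may differ):

num_choices = 2  # illustration only

previous_texts = [""] * num_choices              # visible text accumulated so far
all_previous_token_ids = [[] for _ in range(num_choices)]  # now kept for "required" too
reasoning_end_arr = [False] * num_choices        # True once choice i left its reasoning block
added_content_delta_arr = [False] * num_choices  # first-content-delta bookkeeping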
@@ -807,26 +805,50 @@ async def chat_completion_stream_generator(
                     previous_text = previous_texts[i]
                     current_text = previous_text + delta_text
                     fn_name_returned = function_name_returned[i]
+                    output_token_ids = as_list(output.token_ids)
 
-                    if self.reasoning_parser:
-                        _, content = \
-                            reasoning_parser.extract_reasoning_content(
+                    if self.reasoning_parser is not None and \
+                            not reasoning_end_arr[i] and \
+                            res.prompt_token_ids and \
+                            reasoning_parser.is_reasoning_end(
+                                res.prompt_token_ids):
+                        reasoning_end_arr[i] = True
+
+                    if self.reasoning_parser and not reasoning_end_arr[i]:
+                        delta_message = (
+                            reasoning_parser.
+                            extract_reasoning_content_streaming(
+                                previous_text,
                                 current_text,
-                                request
-                            )
+                                delta_text,
+                                previous_token_ids,
+                                current_token_ids,
+                                output_token_ids,
+                            ))
+                        if reasoning_parser.is_reasoning_end(
+                                output_token_ids):
+                            reasoning_end_arr[i] = True
+                            if delta_message and delta_message.content:
+                                current_text = delta_message.content
+                                delta_message.content = None
+                            else:
+                                # reasoning ended
+                                current_text = ""
                     else:
+                        # either finished reasoning or no reasoning at all
                         content = current_text
+                        delta_message, function_name_returned[i] = (
+                            self.extract_tool_call_required_streaming(
+                                previous_text=previous_text,
+                                current_text=content,
+                                delta_text=delta_text,
+                                function_name_returned=fn_name_returned,
+                                tool_call_idx=history_tool_call_cnt))
+                        if (delta_message and delta_message.tool_calls and
+                                delta_message.tool_calls[0].id is not None):
+                            history_tool_call_cnt += 1
+                            tools_streamed[i] = True
 
-                    delta_message, function_name_returned[i] = (
-                        self.extract_tool_call_required_streaming(
-                            previous_text=previous_text,
-                            current_text=content,
-                            delta_text=delta_text,
-                            function_name_returned=fn_name_returned,
-                            tool_call_idx=history_tool_call_cnt))
-                    if (delta_message and delta_message.tool_calls
-                            and delta_message.tool_calls[0].id
-                            is not None):
-                        history_tool_call_cnt += 1
-                        tools_streamed[i] = True
 
                 # handle streaming deltas for tools with "auto" tool choice
                 # and reasoning parser
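The net effect of this hunk: while a choice is still inside its reasoning block, deltas flow through the reasoning parser; once is_reasoning_end fires (whether already in the prompt tokens or in the generated tokens), all further text is routed to extract_tool_call_required_streaming instead. A self-contained sketch of that gating pattern under toy assumptions (a literal "</think>" marker with a made-up token id; vLLM's real ReasoningParser operates on the text and token-ID arguments shown in the diff):

from dataclasses import dataclass
from typing import Optional

THINK_END_ID = 151668  # assumption: stand-in id for the </think> token


@dataclass
class RoutedDelta:
    reasoning: Optional[str] = None
    tool_text: Optional[str] = None


class RequiredToolGate:
    """Route streamed deltas: reasoning first, tool-call text afterwards."""

    def __init__(self) -> None:
        self.reasoning_end = False

    def route(self, delta_text: str,
              output_token_ids: list[int]) -> RoutedDelta:
        if not self.reasoning_end:
            if THINK_END_ID in output_token_ids:
                # Reasoning ends inside this chunk: text after the marker
                # already belongs to the tool call (mirrors the
                # current_text = delta_message.content stash in the diff).
                self.reasoning_end = True
                head, _, tail = delta_text.partition("</think>")
                return RoutedDelta(reasoning=head or None,
                                   tool_text=tail or None)
            return RoutedDelta(reasoning=delta_text)
        # Either reasoning finished earlier or there was none at all.
        return RoutedDelta(tool_text=delta_text)


gate = RequiredToolGate()
chunks = [("Let me think", [10]),
          ("done</think>{\"city\": ", [THINK_END_ID]),
          ("\"Paris\"}", [11])]
for text, ids in chunks:
    print(gate.route(text, ids))

The per-choice reasoning_end_arr in the real code plays the role of this class's reasoning_end flag, one slot per streamed choice.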