diff --git a/tests/reasoning/test_deepseekr1_reasoning_parser.py b/tests/reasoning/test_deepseekr1_reasoning_parser.py
index 7b6af183a86a..1b669c8fd2fb 100644
--- a/tests/reasoning/test_deepseekr1_reasoning_parser.py
+++ b/tests/reasoning/test_deepseekr1_reasoning_parser.py
@@ -90,6 +90,40 @@ def deepseek_r1_qwen_tokenizer():
"content": "This is the rest",
"is_reasoning_end": True,
}
+THINK_NO_END = {
+ "output": "This is a reasoning section",
+ "reasoning_content": "This is a reasoning section",
+ "content": None,
+ "is_reasoning_end": False,
+}
+EMPTY = {
+ "output": "",
+ "reasoning_content": "",
+ "content": None,
+ "is_reasoning_end": False,
+}
+EMPTY_STREAMING = {
+ "output": "",
+ "reasoning_content": None,
+ "content": None,
+ "is_reasoning_end": False,
+}
+NEW_LINE = {
+ "output": "\nThis is a reasoning section\nThis is the rest",
+ "reasoning_content": "This is a reasoning section",
+ "content": "\nThis is the rest",
+ "is_reasoning_end": True,
+}
+# Streaming cannot handle new lines at the beginning of the output
+# because we need to support <think>...</think> and </think>...
+# We cannot know if the text before <think> is reasoning content
+# or not.
+NEW_LINE_STREAMING = {
+ "output": "\nThis is a reasoning section\nThis is the rest",
+ "reasoning_content": "\nThis is a reasoning section",
+ "content": "\nThis is the rest",
+ "is_reasoning_end": True,
+}
TEST_CASES = [
pytest.param(
@@ -182,6 +216,36 @@ def deepseek_r1_qwen_tokenizer():
SHORTEST_REASONING_WITH_THINK,
id="shortest_with_think_streaming",
),
+ pytest.param(
+ False,
+ THINK_NO_END,
+ id="think_no_end",
+ ),
+ pytest.param(
+ True,
+ THINK_NO_END,
+ id="think_no_end_streaming",
+ ),
+ pytest.param(
+ False,
+ EMPTY,
+ id="empty",
+ ),
+ pytest.param(
+ True,
+ EMPTY_STREAMING,
+ id="empty_streaming",
+ ),
+ pytest.param(
+ False,
+ NEW_LINE,
+ id="new_line",
+ ),
+ pytest.param(
+ True,
+ NEW_LINE_STREAMING,
+ id="new_line_streaming",
+ ),
]
diff --git a/vllm/reasoning/deepseek_r1_reasoning_parser.py b/vllm/reasoning/deepseek_r1_reasoning_parser.py
index 73be6d4d1ab1..1c283c092a28 100644
--- a/vllm/reasoning/deepseek_r1_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_r1_reasoning_parser.py
@@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
-import re
from collections.abc import Sequence
from typing import Optional, Union
@@ -32,9 +31,6 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
def __init__(self, tokenizer: PreTrainedTokenizerBase):
super().__init__(tokenizer)
- self.reasoning_regex = re.compile(
- rf"{self.start_token}(.*?){self.end_token}", re.DOTALL)
-
if not self.model_tokenizer:
raise ValueError(
"The model tokenizer must be passed to the ReasoningParser "
@@ -143,23 +139,34 @@ def extract_reasoning_content_streaming(
def extract_reasoning_content(
self, model_output: str, request: ChatCompletionRequest
) -> tuple[Optional[str], Optional[str]]:
+ """
+ Extract reasoning content from the model output.
+
+ For text <think>abc</think>xyz:
+ - 'abc' goes to reasoning_content
+ - 'xyz' goes to content
+
+ Returns:
+ tuple[Optional[str], Optional[str]]: reasoning content and content
+ """
+
+ # Check if the start token is present in the model output, remove it
+ # if it is present.
+ model_output_parts = model_output.partition(self.start_token)
+ model_output = model_output_parts[2] if model_output_parts[
+ 1] else model_output_parts[0]
+
# DeepSeek R1 doesn't generate <think> now.
# Thus we assume the reasoning content is always at the start.
# Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
if self.end_token not in model_output:
return model_output, None
else:
- # Add a start token if it's missing to keep compatibility.
- if self.start_token not in model_output:
- model_output = f"{self.start_token}{model_output}"
- # Use a regex to find the reasoning content
- reasoning_content = self.reasoning_regex.findall(model_output)[0]
-
- end_index = len(
- f"{self.start_token}{reasoning_content}{self.end_token}")
- final_output = model_output[end_index:]
-
- if len(final_output) == 0:
- return reasoning_content, None
-
- return reasoning_content, final_output
+ reasoning_content, _, content = model_output.partition(
+ self.end_token)
+ # The partition above cannot miss the end token, since we already
+ # checked for its presence, so reasoning_content always holds the
+ # text before it.
+ # If generation stops right after end-of-think, return null content
+ final_content = content or None
+ return reasoning_content, final_content
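
For reference (not part of the diff), a minimal standalone sketch of the new partition-based extraction. The `extract` helper and the literal "<think>"/"</think>" constants are illustrative stand-ins for the parser method and its `self.start_token`/`self.end_token`:

```python
from typing import Optional

# Illustrative stand-ins for self.start_token / self.end_token.
START_TOKEN = "<think>"
END_TOKEN = "</think>"


def extract(model_output: str) -> tuple[Optional[str], Optional[str]]:
    """Split model output into (reasoning_content, content)."""
    # Drop the start token if present; DeepSeek R1 may omit it entirely.
    before, sep, after = model_output.partition(START_TOKEN)
    model_output = after if sep else before
    # No end token: everything generated so far is still reasoning content.
    if END_TOKEN not in model_output:
        return model_output, None
    reasoning_content, _, content = model_output.partition(END_TOKEN)
    # Generation that stops right after end-of-think yields null content.
    return reasoning_content, content or None


# Expected behaviour on inputs shaped like the new test cases:
assert extract("<think>This is a reasoning section") == (
    "This is a reasoning section", None)        # THINK_NO_END
assert extract("") == ("", None)                # EMPTY
assert extract("\n<think>abc</think>\nxyz") == ("abc", "\nxyz")  # NEW_LINE-like
```

The `content or None` fallback is what turns an output that ends exactly at the end-of-think token into `(reasoning, None)` rather than `(reasoning, "")`, matching the expectations encoded in the new test cases.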