13 changes: 5 additions & 8 deletions docs/source/features/reasoning_outputs.md
@@ -21,11 +21,10 @@ vLLM currently supports the following reasoning models:

## Quickstart

To use reasoning models, you need to specify the `--enable-reasoning` and `--reasoning-parser` flags when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output.
To use reasoning models, you need to specify the `--reasoning-parser` flag when starting the server. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output.

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--enable-reasoning --reasoning-parser deepseek_r1
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1
```

Next, make a request to the model that should return the reasoning content in the response.
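For example, a minimal request with the official OpenAI Python client might look like the sketch below. The `reasoning_content` field on the message is where the parser places the extracted reasoning; treat the exact field name and availability as something to verify against your vLLM version.

```python
from openai import OpenAI

# Point the client at the vLLM server started above.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    messages=[{"role": "user", "content": "Which is greater, 9.11 or 9.8?"}],
)

message = response.choices[0].message
# The parser splits the output: reasoning_content holds the chain of
# thought, content holds the final answer. Check for None before use.
print("reasoning:", getattr(message, "reasoning_content", None))
print("answer:", message.content)
```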
@@ -140,8 +139,7 @@ Remember to check whether the `reasoning_content` exists in the response before
The reasoning content is also available in the structured output. A structured output engine like `xgrammar` will use the reasoning content to generate the structured output. This is currently only supported in the v0 engine.

```bash
VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--enable-reasoning --reasoning-parser deepseek_r1
VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1
```

Please note that the `VLLM_USE_V1` environment variable must be set to `0` to use the v0 engine.
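As a minimal sketch of combining the two features (assuming the `guided_json` extra-body parameter of vLLM's OpenAI-compatible server), the model reasons first and the final answer is constrained to the schema:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Schema that constrains only the final answer, not the reasoning.
schema = {
    "type": "object",
    "properties": {"answer": {"type": "string"}},
    "required": ["answer"],
}

response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    messages=[{"role": "user", "content": "Which is greater, 9.11 or 9.8?"}],
    extra_body={"guided_json": schema},
)

message = response.choices[0].message
print("reasoning:", getattr(message, "reasoning_content", None))
print("structured answer:", message.content)  # JSON matching the schema
```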
@@ -316,9 +314,8 @@ class DeepSeekReasoner(Reasoner):

A structured output engine like `xgrammar` will use `end_token_id` to check whether the model is still generating reasoning content, and skip structured output while that is the case.
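As a rough sketch of what that check might look like (the `ExampleReasoner` class and its fields are hypothetical, written only to illustrate the idea; vLLM's actual `Reasoner` interface may differ):

```python
from dataclasses import dataclass

@dataclass
class ExampleReasoner:
    """Hypothetical reasoner illustrating the end-token check."""
    end_token_id: int

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        # Reasoning is finished once the end token (e.g. the id of
        # </think>) has been generated; until then the structured
        # output engine leaves the logits untouched.
        return self.end_token_id in input_ids
```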

Finally, you can enable reasoning for the model by using the `--enable-reasoning` and `--reasoning-parser` flags.
Finally, you can enable reasoning for the model by using the `--reasoning-parser` flag.

```bash
vllm serve <model_tag> \
--enable-reasoning --reasoning-parser example
vllm serve <model_tag> --reasoning-parser example
```
@@ -9,7 +9,7 @@

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--enable-reasoning --reasoning-parser deepseek_r1
--reasoning-parser deepseek_r1
```

This example demonstrates how to generate chat completions from reasoning models
@@ -9,7 +9,7 @@

```bash
vllm serve Qwen/QwQ-32B \
--enable-reasoning --reasoning-parser deepseek_r1 \
--reasoning-parser deepseek_r1 \
--enable-auto-tool-choice --tool-call-parser hermes

```
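A request exercising both tool calling and reasoning against this server might look like the following sketch (the weather tool is hypothetical, defined only for illustration):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",  # hypothetical tool for illustration
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

response = client.chat.completions.create(
    model="Qwen/QwQ-32B",
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=tools,
    tool_choice="auto",
)

message = response.choices[0].message
print("reasoning:", getattr(message, "reasoning_content", None))
print("tool calls:", message.tool_calls)
```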
@@ -8,7 +8,7 @@

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--enable-reasoning --reasoning-parser deepseek_r1
--reasoning-parser deepseek_r1
```

This example demonstrates how to generate chat completions from reasoning models
@@ -8,7 +8,7 @@

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--enable-reasoning --reasoning-parser deepseek_r1
--reasoning-parser deepseek_r1
```

Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the
6 changes: 3 additions & 3 deletions tests/entrypoints/openai/test_chat_with_tool_reasoning.py
@@ -13,9 +13,9 @@
@pytest.fixture(scope="module")
def server(): # noqa: F811
args = [
"--max-model-len", "8192", "--enforce-eager", "--enable-reasoning",
"--reasoning-parser", "deepseek_r1", "--enable-auto-tool-choice",
"--tool-call-parser", "hermes"
"--max-model-len", "8192", "--enforce-eager", "--reasoning-parser",
"deepseek_r1", "--enable-auto-tool-choice", "--tool-call-parser",
"hermes"
]

with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
14 changes: 3 additions & 11 deletions tests/entrypoints/openai/test_cli_args.py
@@ -122,31 +122,23 @@ def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser):
"""Ensure validation fails if reasoning is enabled with auto tool choice"""
args = serve_parser.parse_args(args=[
"--enable-auto-tool-choice",
"--enable-reasoning",
"--reasoning-parser",
"deepseek_r1",
])
with pytest.raises(TypeError):
validate_parsed_serve_args(args)


def test_enable_reasoning_passes_with_reasoning_parser(serve_parser):
def test_passes_with_reasoning_parser(serve_parser):
"""Ensure validation passes if reasoning is enabled
with a reasoning parser"""
args = serve_parser.parse_args(args=[
"--enable-reasoning",
"--reasoning-parser",
"deepseek_r1",
])
validate_parsed_serve_args(args)


def test_enable_reasoning_fails_without_reasoning_parser(serve_parser):
"""Ensure validation fails if reasoning is enabled
without a reasoning parser"""
args = serve_parser.parse_args(args=["--enable-reasoning"])
with pytest.raises(TypeError):
validate_parsed_serve_args(args)


def test_chat_template_validation_for_happy_paths(serve_parser):
"""Ensure validation passes if the chat template exists"""
args = serve_parser.parse_args(
5 changes: 2 additions & 3 deletions vllm/config.py
@@ -3223,10 +3223,9 @@ def guided_decoding_backend(self, value: GuidedDecodingBackend):
in the JSON schema. This is only supported for the `guidance` backend and
is used to better align its behaviour with `outlines` and `xgrammar`."""

reasoning_backend: Optional[str] = None
reasoning_backend: str = ""
"""Select the reasoning parser depending on the model that you're using.
This is used to parse the reasoning content into OpenAI API format.
Required for `--enable-reasoning`."""
This is used to parse the reasoning content into OpenAI API format."""

def compute_hash(self) -> str:
"""
17 changes: 12 additions & 5 deletions vllm/engine/arg_utils.py
@@ -364,8 +364,9 @@ class EngineArgs:
calculate_kv_scales: bool = CacheConfig.calculate_kv_scales

additional_config: Optional[Dict[str, Any]] = None
enable_reasoning: Optional[bool] = None
reasoning_parser: Optional[str] = DecodingConfig.reasoning_backend
enable_reasoning: Optional[bool] = None # DEPRECATED
reasoning_parser: str = DecodingConfig.reasoning_backend

use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load

def __post_init__(self):
@@ -801,8 +802,15 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"--enable-reasoning",
action="store_true",
default=False,
help="Whether to enable reasoning_content for the model. "
"If enabled, the model will be able to generate reasoning content."
help=
"[DEPRECATED] " \
"The --enable-reasoning flag is deprecated as of v0.8.6. "
"Use --reasoning-parser to specify " \
"the reasoning parser backend instead. "
"This flag (--enable-reasoning) will be " \
"removed in v0.10.0. "
"When --reasoning-parser is specified, " \
"reasoning mode is automatically enabled."
)

return parser
Expand Down Expand Up @@ -1091,7 +1099,6 @@ def create_engine_config(
disable_additional_properties=\
self.guided_decoding_disable_additional_properties,
reasoning_backend=self.reasoning_parser
if self.enable_reasoning else None,
)

show_hidden_metrics = False
2 changes: 1 addition & 1 deletion vllm/engine/llm_engine.py
@@ -2096,7 +2096,7 @@ def _build_logits_processors(
guided_decoding.backend = guided_decoding.backend or \
self.decoding_config.backend

if self.decoding_config.reasoning_backend is not None:
if self.decoding_config.reasoning_backend:
logger.debug("Building with reasoning backend %s",
self.decoding_config.reasoning_backend)

3 changes: 1 addition & 2 deletions vllm/entrypoints/openai/api_server.py
@@ -967,7 +967,6 @@ async def init_app_state(
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
enable_auto_tools=args.enable_auto_tool_choice,
tool_parser=args.tool_call_parser,
enable_reasoning=args.enable_reasoning,
reasoning_parser=args.reasoning_parser,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
) if model_config.runner_type == "generate" else None
@@ -1053,7 +1052,7 @@ async def run_server(args, **uvicorn_kwargs) -> None:
f"(chose from {{ {','.join(valid_tool_parses)} }})")

valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys()
if args.enable_reasoning \
if args.reasoning_parser \
and args.reasoning_parser not in valid_reasoning_parses:
raise KeyError(
f"invalid reasoning parser: {args.reasoning_parser} "
5 changes: 0 additions & 5 deletions vllm/entrypoints/openai/cli_args.py
@@ -284,11 +284,6 @@ def validate_parsed_serve_args(args: argparse.Namespace):
raise TypeError("Error: --enable-auto-tool-choice requires "
"--tool-call-parser")

# Enable reasoning needs a reasoning parser to be valid
if args.enable_reasoning and not args.reasoning_parser:
raise TypeError("Error: --enable-reasoning requires "
"--reasoning-parser")


def create_parser_for_docs() -> FlexibleArgumentParser:
parser_for_docs = FlexibleArgumentParser(
59 changes: 14 additions & 45 deletions vllm/entrypoints/openai/serving_chat.py
@@ -58,8 +58,7 @@ def __init__(
chat_template: Optional[str],
chat_template_content_format: ChatTemplateContentFormatOption,
return_tokens_as_token_ids: bool = False,
enable_reasoning: bool = False,
reasoning_parser: Optional[str] = None,
reasoning_parser: str = "",
enable_auto_tools: bool = False,
tool_parser: Optional[str] = None,
enable_prompt_tokens_details: bool = False,
@@ -82,18 +81,17 @@
" the parallel_tool_calls client option is preset for "
"compatibility reasons, it will be ignored.")

self.enable_reasoning: bool = enable_reasoning
self.reasoning_parser: Optional[Callable[[AnyTokenizer],
ReasoningParser]] = None
if self.enable_reasoning:
if reasoning_parser:
try:
self.reasoning_parser = (
ReasoningParserManager.get_reasoning_parser(
reasoning_parser))
assert self.reasoning_parser is not None
except Exception as e:
raise TypeError("Error: --enable-reasoning requires "
f"reasoning_parser:'{reasoning_parser}' "
"which has not been registered") from e
raise TypeError(
f"{reasoning_parser=} has not been registered") from e
self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None
if self.enable_auto_tools:
try:
@@ -423,15 +421,12 @@ async def chat_completion_stream_generator(
not tool_choice_function_name
and self._should_stream_with_auto_tool_parsing(request))

should_stream_with_reasoning_parsing = (
self._should_stream_with_reasoning_parsing(request))

all_previous_token_ids: Optional[list[list[int]]]
function_name_returned: Optional[list[bool]] = None

# Only one of these will be used, thus previous_texts and
# all_previous_token_ids will not be used twice in the same iteration.
if tool_choice_auto or should_stream_with_reasoning_parsing:
if tool_choice_auto or self.reasoning_parser:
# These are only required in "auto" tool choice case
previous_texts = [""] * num_choices
all_previous_token_ids = [[]] * num_choices
@@ -446,20 +441,14 @@
previous_texts, all_previous_token_ids = None, None

try:
# There is no need to check if the reasoning_parser is None
# because the should_stream_with_reasoning_parsing check
# already ensures that the reasoning_parser is not None.
# but the pre-commit hook requires it.
if should_stream_with_reasoning_parsing and \
self.reasoning_parser is not None:
if self.reasoning_parser:
reasoning_parser = self.reasoning_parser(tokenizer)
except RuntimeError as e:
logger.exception("Error in reasoning parser creation.")
data = self.create_streaming_error_response(str(e))
yield f"data: {data}\n\n"
yield "data: [DONE]\n\n"
return

# Prepare the tool parser if it's needed
try:
if tool_choice_auto and self.tool_parser:
@@ -592,7 +581,7 @@ async def chat_completion_stream_generator(
delta_message: Optional[DeltaMessage]

# just update previous_texts and previous_token_ids
if tool_choice_auto or should_stream_with_reasoning_parsing:
if tool_choice_auto or self.reasoning_parser:
assert previous_texts is not None
assert all_previous_token_ids is not None
previous_text = previous_texts[i]
@@ -603,7 +592,7 @@

# handle streaming deltas for tools with named tool_choice
if tool_choice_function_name:
if (self.enable_reasoning
if (self.reasoning_parser
and not reasoning_parser.is_reasoning_end(
previous_token_ids)):
assert reasoning_parser is not None
@@ -630,7 +619,7 @@
current_text = ""
else:
# Just to add remaining `content`
if self.enable_reasoning:
if self.reasoning_parser:
delta_text = previous_text + delta_text
current_text = ""

@@ -660,7 +649,7 @@

# handle streaming deltas for tools with "auto" tool choice
# and reasoning parser
elif tool_choice_auto and self.enable_reasoning:
elif tool_choice_auto and self.reasoning_parser:
assert tool_parser is not None
assert reasoning_parser is not None
assert added_content_delta_arr is not None
@@ -728,8 +717,7 @@
delta_token_ids=output.token_ids,
request=request))
# when only reasoning
elif self.enable_reasoning:
assert reasoning_parser is not None
elif self.reasoning_parser:
delta_message = (reasoning_parser.
extract_reasoning_content_streaming(
previous_text,
@@ -744,7 +732,7 @@
delta_message = DeltaMessage(content=delta_text)

# update the previous values for the next iteration
if tool_choice_auto or should_stream_with_reasoning_parsing:
if tool_choice_auto or self.reasoning_parser:
assert previous_texts is not None
assert all_previous_token_ids is not None
previous_texts[i] = current_text
@@ -931,17 +919,9 @@ async def chat_completion_full_generator(
)
else:
logprobs = None

should_stream_with_reasoning_parsing = (
self._should_stream_with_reasoning_parsing(request))

# In the OpenAI API the finish_reason is "tools_called"
# if the tool choice is auto and the model produced a tool
# call. The same is not true for named function calls
auto_tools_called = False

if should_stream_with_reasoning_parsing and \
self.reasoning_parser is not None:
if self.reasoning_parser:
try:
reasoning_parser = self.reasoning_parser(tokenizer)
except RuntimeError as e:
@@ -1176,17 +1156,6 @@ def _should_stream_with_auto_tool_parsing(self,
return (request.tools and self.tool_parser and self.enable_auto_tools
and request.tool_choice in ['auto', None])

def _should_stream_with_reasoning_parsing(self,
request: ChatCompletionRequest):
"""
Utility function to check if streamed tokens should go through the
reasoning parser that was configured.

We only want to do this IF reasoning is enabled and a reasoning
parser is configured.
"""
return self.enable_reasoning and self.reasoning_parser is not None

def _should_check_for_unstreamed_tool_arg_tokens(
self,
delta_message: Optional[DeltaMessage],
4 changes: 2 additions & 2 deletions vllm/model_executor/guided_decoding/__init__.py
@@ -103,7 +103,7 @@ async def get_guided_decoding_logits_processor(
reasoning_backend: str | None = None) -> LogitsProcessor | None:

reasoner = None
if reasoning_backend is not None:
if reasoning_backend:
reasoner_class = ReasoningParserManager.get_reasoning_parser(
reasoning_backend)
reasoner = reasoner_class(tokenizer)
@@ -146,7 +146,7 @@ def get_local_guided_decoding_logits_processor(
guided_params = maybe_backend_fallback(guided_params)

reasoner = None
if reasoning_backend is not None:
if reasoning_backend:
reasoner_class = ReasoningParserManager.get_reasoning_parser(
reasoning_backend)
reasoner = reasoner_class(tokenizer)
@@ -61,7 +61,7 @@ def __call__(self, input_ids: List[int],
"""Use the FSM to bias the logits before sampling the next token."""

# Skip the structured logits processing if reasoning is not finished.
# reasoner is not None only when `--enable-reasoning` is set.
# reasoner is not None only when `--reasoning-parser` is set.
if self._reasoner is not None:
if not self._reasoner.is_reasoning_end(input_ids):
return scores