diff --git a/docs/source/features/reasoning_outputs.md b/docs/source/features/reasoning_outputs.md
index 323bf849a082..a079eb8b77e7 100644
--- a/docs/source/features/reasoning_outputs.md
+++ b/docs/source/features/reasoning_outputs.md
@@ -21,11 +21,10 @@ vLLM currently supports the following reasoning models:
 
 ## Quickstart
 
-To use reasoning models, you need to specify the `--enable-reasoning` and `--reasoning-parser` flags when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output.
+To use reasoning models, specify the `--reasoning-parser` flag when starting the server. This flag selects the reasoning parser used to extract reasoning content from the model output.
 
 ```bash
-vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1
 ```
 
 Next, make a request to the model that should return the reasoning content in the response.
@@ -140,8 +139,7 @@ Remember to check whether the `reasoning_content` exists in the response before
 The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output. It is only supported in v0 engine now.
 
 ```bash
-VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1
 ```
 
 Please note that the `VLLM_USE_V1` environment variable must be set to `0` to use the v0 engine.
@@ -316,9 +314,8 @@ class DeepSeekReasoner(Reasoner):
 
 The structured output engine like `xgrammar` will use `end_token_id` to check if the reasoning content is present in the model output and skip the structured output if it is the case.
 
-Finally, you can enable reasoning for the model by using the `--enable-reasoning` and `--reasoning-parser` flags.
+Finally, you can enable reasoning for the model by using the `--reasoning-parser` flag.
 
 ```bash
-vllm serve \
-    --enable-reasoning --reasoning-parser example
+vllm serve --reasoning-parser example
 ```
diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
index cb7f30d93255..5da9236c5306 100644
--- a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
@@ -9,7 +9,7 @@
 
 ```bash
 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+    --reasoning-parser deepseek_r1
 ```
 
 This example demonstrates how to generate chat completions from reasoning models
diff --git a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
index 8c6470aa3dd4..9417abd3989a 100644
--- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
@@ -9,7 +9,7 @@
 
 ```bash
 vllm serve Qwen/QwQ-32B \
-    --enable-reasoning --reasoning-parser deepseek_r1 \
+    --reasoning-parser deepseek_r1 \
     --enable-auto-tool-choice --tool-call-parser hermes
 ```
 
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py
index 6f5f7b5fa20b..4bf7731cb41e 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning.py
@@ -8,7 +8,7 @@
 
 ```bash
 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+    --reasoning-parser deepseek_r1
 ```
 
 This example demonstrates how to generate chat completions from reasoning models
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
index 90481cdc0fb7..9cc0a5f2476b 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
@@ -8,7 +8,7 @@
 
 ```bash
 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+    --reasoning-parser deepseek_r1
 ```
 
 Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the
diff --git a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
index 53df1d9241b7..e00f001ef730 100644
--- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
@@ -13,9 +13,9 @@
 @pytest.fixture(scope="module")
 def server():  # noqa: F811
     args = [
-        "--max-model-len", "8192", "--enforce-eager", "--enable-reasoning",
-        "--reasoning-parser", "deepseek_r1", "--enable-auto-tool-choice",
-        "--tool-call-parser", "hermes"
+        "--max-model-len", "8192", "--enforce-eager", "--reasoning-parser",
+        "deepseek_r1", "--enable-auto-tool-choice", "--tool-call-parser",
+        "hermes"
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py
index e0285b5e5566..8d1abe28a027 100644
--- a/tests/entrypoints/openai/test_cli_args.py
+++ b/tests/entrypoints/openai/test_cli_args.py
@@ -122,31 +122,23 @@ def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser):
     """Ensure validation fails if reasoning is enabled with auto tool choice"""
     args = serve_parser.parse_args(args=[
         "--enable-auto-tool-choice",
-        "--enable-reasoning",
+        "--reasoning-parser",
+        "deepseek_r1",
     ])
     with pytest.raises(TypeError):
         validate_parsed_serve_args(args)
 
 
-def test_enable_reasoning_passes_with_reasoning_parser(serve_parser):
+def test_passes_with_reasoning_parser(serve_parser):
     """Ensure validation passes if reasoning is enabled
     with a reasoning parser"""
     args = serve_parser.parse_args(args=[
-        "--enable-reasoning",
         "--reasoning-parser",
         "deepseek_r1",
     ])
     validate_parsed_serve_args(args)
 
 
-def test_enable_reasoning_fails_without_reasoning_parser(serve_parser):
-    """Ensure validation fails if reasoning is enabled
-    without a reasoning parser"""
-    args = serve_parser.parse_args(args=["--enable-reasoning"])
-    with pytest.raises(TypeError):
-        validate_parsed_serve_args(args)
-
-
 def test_chat_template_validation_for_happy_paths(serve_parser):
     """Ensure validation passes if the chat template exists"""
     args = serve_parser.parse_args(
diff --git a/vllm/config.py b/vllm/config.py
index 43038da37302..41261ab578c4 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3223,10 +3223,9 @@ def guided_decoding_backend(self, value: GuidedDecodingBackend):
     in the JSON schema. This is only supported for the `guidance` backend and
     is used to better align its behaviour with `outlines` and `xgrammar`."""
 
-    reasoning_backend: Optional[str] = None
+    reasoning_backend: str = ""
     """Select the reasoning parser depending on the model that you're using.
-    This is used to parse the reasoning content into OpenAI API format.
-    Required for `--enable-reasoning`."""
+    This is used to parse the reasoning content into OpenAI API format."""
 
     def compute_hash(self) -> str:
         """
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index c7a580cf1051..d4902bfc2b7c 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -364,8 +364,9 @@ class EngineArgs:
     calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
     additional_config: Optional[Dict[str, Any]] = None
 
-    enable_reasoning: Optional[bool] = None
-    reasoning_parser: Optional[str] = DecodingConfig.reasoning_backend
+    enable_reasoning: Optional[bool] = None  # DEPRECATED
+    reasoning_parser: str = DecodingConfig.reasoning_backend
+
     use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
 
     def __post_init__(self):
@@ -801,8 +802,15 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             "--enable-reasoning",
             action="store_true",
             default=False,
-            help="Whether to enable reasoning_content for the model. "
-            "If enabled, the model will be able to generate reasoning content."
+            help=
+            "[DEPRECATED] " \
+            "The --enable-reasoning flag is deprecated as of v0.8.6. "
+            "Use --reasoning-parser to specify " \
+            "the reasoning parser backend instead. "
+            "This flag (--enable-reasoning) will be " \
+            "removed in v0.10.0. "
+            "When --reasoning-parser is specified, " \
+            "reasoning mode is automatically enabled."
         )
 
         return parser
 
@@ -1091,7 +1099,6 @@ def create_engine_config(
             disable_additional_properties=\
             self.guided_decoding_disable_additional_properties,
             reasoning_backend=self.reasoning_parser
-            if self.enable_reasoning else None,
         )
 
         show_hidden_metrics = False
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 8481181eb8e8..0930bae02e41 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -2096,7 +2096,7 @@ def _build_logits_processors(
             guided_decoding.backend = guided_decoding.backend or \
                 self.decoding_config.backend
 
-            if self.decoding_config.reasoning_backend is not None:
+            if self.decoding_config.reasoning_backend:
                 logger.debug("Building with reasoning backend %s",
                              self.decoding_config.reasoning_backend)
 
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 136819580897..9746d9697a66 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -967,7 +967,6 @@ async def init_app_state(
         return_tokens_as_token_ids=args.return_tokens_as_token_ids,
         enable_auto_tools=args.enable_auto_tool_choice,
         tool_parser=args.tool_call_parser,
-        enable_reasoning=args.enable_reasoning,
         reasoning_parser=args.reasoning_parser,
         enable_prompt_tokens_details=args.enable_prompt_tokens_details,
     ) if model_config.runner_type == "generate" else None
@@ -1053,7 +1052,7 @@ async def run_server(args, **uvicorn_kwargs) -> None:
             f"(chose from {{ {','.join(valid_tool_parses)} }})")
 
     valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys()
-    if args.enable_reasoning \
+    if args.reasoning_parser \
         and args.reasoning_parser not in valid_reasoning_parses:
         raise KeyError(
             f"invalid reasoning parser: {args.reasoning_parser} "
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index b3824013f055..a2639d374791 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -284,11 +284,6 @@ def validate_parsed_serve_args(args: argparse.Namespace):
         raise TypeError("Error: --enable-auto-tool-choice requires "
                         "--tool-call-parser")
 
-    # Enable reasoning needs a reasoning parser to be valid
-    if args.enable_reasoning and not args.reasoning_parser:
-        raise TypeError("Error: --enable-reasoning requires "
-                        "--reasoning-parser")
-
 
 def create_parser_for_docs() -> FlexibleArgumentParser:
     parser_for_docs = FlexibleArgumentParser(
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index dd0b67df4f15..83a92a98026e 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -58,8 +58,7 @@ def __init__(
         chat_template: Optional[str],
         chat_template_content_format: ChatTemplateContentFormatOption,
         return_tokens_as_token_ids: bool = False,
-        enable_reasoning: bool = False,
-        reasoning_parser: Optional[str] = None,
+        reasoning_parser: str = "",
         enable_auto_tools: bool = False,
         tool_parser: Optional[str] = None,
         enable_prompt_tokens_details: bool = False,
@@ -82,18 +81,17 @@
                     " the parallel_tool_calls client option is preset for "
                     "compatibility reasons, it will be ignored.")
 
-        self.enable_reasoning: bool = enable_reasoning
         self.reasoning_parser: Optional[Callable[[AnyTokenizer],
                                                  ReasoningParser]] = None
-        if self.enable_reasoning:
+        if reasoning_parser:
             try:
                 self.reasoning_parser = (
                     ReasoningParserManager.get_reasoning_parser(
                         reasoning_parser))
+                assert self.reasoning_parser is not None
             except Exception as e:
-                raise TypeError("Error: --enable-reasoning requires "
-                                f"reasoning_parser:'{reasoning_parser}' "
-                                "which has not been registered") from e
+                raise TypeError(
+                    f"{reasoning_parser=} has not been registered") from e
         self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None
         if self.enable_auto_tools:
             try:
@@ -423,15 +421,12 @@ async def chat_completion_stream_generator(
             not tool_choice_function_name
             and self._should_stream_with_auto_tool_parsing(request))
 
-        should_stream_with_reasoning_parsing = (
-            self._should_stream_with_reasoning_parsing(request))
-
         all_previous_token_ids: Optional[list[list[int]]]
         function_name_returned: Optional[list[bool]] = None
 
         # Only one of these will be used, thus previous_texts and
         # all_previous_token_ids will not be used twice in the same iteration.
-        if tool_choice_auto or should_stream_with_reasoning_parsing:
+        if tool_choice_auto or self.reasoning_parser:
             # These are only required in "auto" tool choice case
             previous_texts = [""] * num_choices
             all_previous_token_ids = [[]] * num_choices
@@ -446,12 +441,7 @@ async def chat_completion_stream_generator(
             previous_texts, all_previous_token_ids = None, None
 
         try:
-            # There is no need to check if the reasoning_parser is None
-            # because the should_stream_with_reasoning_parsing check
-            # already ensures that the reasoning_parser is not None.
-            # but the pre-commit hook requires it.
-            if should_stream_with_reasoning_parsing and \
-                self.reasoning_parser is not None:
+            if self.reasoning_parser:
                 reasoning_parser = self.reasoning_parser(tokenizer)
         except RuntimeError as e:
             logger.exception("Error in reasoning parser creation.")
@@ -459,7 +449,6 @@ async def chat_completion_stream_generator(
             yield f"data: {data}\n\n"
             yield "data: [DONE]\n\n"
             return
-
         # Prepare the tool parser if it's needed
         try:
             if tool_choice_auto and self.tool_parser:
@@ -592,7 +581,7 @@ async def chat_completion_stream_generator(
                 delta_message: Optional[DeltaMessage]
 
                 # just update previous_texts and previous_token_ids
-                if tool_choice_auto or should_stream_with_reasoning_parsing:
+                if tool_choice_auto or self.reasoning_parser:
                     assert previous_texts is not None
                     assert all_previous_token_ids is not None
                     previous_text = previous_texts[i]
@@ -603,7 +592,7 @@ async def chat_completion_stream_generator(
 
                 # handle streaming deltas for tools with named tool_choice
                 if tool_choice_function_name:
-                    if (self.enable_reasoning
+                    if (self.reasoning_parser
                             and not reasoning_parser.is_reasoning_end(
                                 previous_token_ids)):
                         assert reasoning_parser is not None
@@ -630,7 +619,7 @@ async def chat_completion_stream_generator(
                         current_text = ""
                     else:
                         # Just to add remaining `content`
-                        if self.enable_reasoning:
+                        if self.reasoning_parser:
                             delta_text = previous_text + delta_text
                             current_text = ""
 
@@ -660,7 +649,7 @@ async def chat_completion_stream_generator(
 
                 # handle streaming deltas for tools with "auto" tool choice
                 # and reasoning parser
-                elif tool_choice_auto and self.enable_reasoning:
+                elif tool_choice_auto and self.reasoning_parser:
                     assert tool_parser is not None
                     assert reasoning_parser is not None
                     assert added_content_delta_arr is not None
@@ -728,8 +717,7 @@ async def chat_completion_stream_generator(
                             delta_token_ids=output.token_ids,
                             request=request))
                 # when only reasoning
-                elif self.enable_reasoning:
-                    assert reasoning_parser is not None
+                elif self.reasoning_parser:
                     delta_message = (reasoning_parser.
                                      extract_reasoning_content_streaming(
                                          previous_text,
@@ -744,7 +732,7 @@ async def chat_completion_stream_generator(
                     delta_message = DeltaMessage(content=delta_text)
 
                 # update the previous values for the next iteration
-                if tool_choice_auto or should_stream_with_reasoning_parsing:
+                if tool_choice_auto or self.reasoning_parser:
                     assert previous_texts is not None
                     assert all_previous_token_ids is not None
                     previous_texts[i] = current_text
@@ -931,17 +919,9 @@ async def chat_completion_full_generator(
                 )
             else:
                 logprobs = None
-
-        should_stream_with_reasoning_parsing = (
-            self._should_stream_with_reasoning_parsing(request))
-
-        # In the OpenAI API the finish_reason is "tools_called"
-        # if the tool choice is auto and the model produced a tool
-        # call. The same is not true for named function calls
         auto_tools_called = False
 
-        if should_stream_with_reasoning_parsing and \
-            self.reasoning_parser is not None:
+        if self.reasoning_parser:
             try:
                 reasoning_parser = self.reasoning_parser(tokenizer)
             except RuntimeError as e:
@@ -1176,17 +1156,6 @@ def _should_stream_with_auto_tool_parsing(self,
         return (request.tools and self.tool_parser and self.enable_auto_tools
                 and request.tool_choice in ['auto', None])
 
-    def _should_stream_with_reasoning_parsing(self,
-                                              request: ChatCompletionRequest):
-        """
-        Utility function to check if streamed tokens should go through the
-        reasoning parser that was configured.
-
-        We only want to do this IF reasoning is enabled and a reasoning
-        parser is configured.
-        """
-        return self.enable_reasoning and self.reasoning_parser is not None
-
     def _should_check_for_unstreamed_tool_arg_tokens(
         self,
         delta_message: Optional[DeltaMessage],
diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py
index 4e4d697f49a9..a2b61a1b19e4 100644
--- a/vllm/model_executor/guided_decoding/__init__.py
+++ b/vllm/model_executor/guided_decoding/__init__.py
@@ -103,7 +103,7 @@ async def get_guided_decoding_logits_processor(
         reasoning_backend: str | None = None) -> LogitsProcessor | None:
 
     reasoner = None
-    if reasoning_backend is not None:
+    if reasoning_backend:
         reasoner_class = ReasoningParserManager.get_reasoning_parser(
             reasoning_backend)
         reasoner = reasoner_class(tokenizer)
@@ -146,7 +146,7 @@ def get_local_guided_decoding_logits_processor(
     guided_params = maybe_backend_fallback(guided_params)
 
     reasoner = None
-    if reasoning_backend is not None:
+    if reasoning_backend:
         reasoner_class = ReasoningParserManager.get_reasoning_parser(
             reasoning_backend)
         reasoner = reasoner_class(tokenizer)
diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
index 31af4593f112..936fd0f06867 100644
--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -61,7 +61,7 @@ def __call__(self, input_ids: List[int],
         """Use the FSM to bias the logits before sampling the next token."""
 
         # Skip the structured logits processing if reasoning is not finished.
-        # reasoner is not None only when `--enable-reasoning` is set.
+        # reasoner is not None only when `--reasoning-parser` is set.
         if self._reasoner is not None:
             if not self._reasoner.is_reasoning_end(input_ids):
                 return scores
diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index eb7b2b74ffbe..ac2d73626d78 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -346,7 +346,7 @@ def __call__(self, input_ids: list[int],
                  scores: torch.Tensor) -> torch.Tensor:
 
         # Skip the structured logits processing if reasoning is not finished.
-        # reasoner is not None only when `--enable-reasoning` is set.
+        # reasoner is not None only when `--reasoning-parser` is set.
         if self.reasoner is not None and \
                 not self.reasoner.is_reasoning_end(
                     input_ids):
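
Usage note (not part of the patch): after this change, reasoning is enabled by `--reasoning-parser` alone. The sketch below shows the client flow from the example scripts touched above; it is a minimal sketch assuming a local server started with `vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1`, and the prompt is illustrative.

```python
from openai import OpenAI

# vLLM's OpenAI-compatible endpoint; the API key is required by the SDK but unused.
client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

# Pick the single model served by this vLLM instance.
model = client.models.list().data[0].id

response = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": "9.11 and 9.8, which is greater?"}],
)

message = response.choices[0].message
# The configured reasoning parser populates `reasoning_content`;
# per the docs, check that it exists before using it.
print("reasoning_content:", getattr(message, "reasoning_content", None))
print("content:", message.content)
```

In streaming mode the same field arrives incrementally on each chunk's delta, which is the path `extract_reasoning_content_streaming` in `serving_chat.py` implements.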