From 649c6efbbd71bc4f53ee16d4a6a303d2077b8b03 Mon Sep 17 00:00:00 2001
From: NekoMimiUnagi
Date: Fri, 6 Jun 2025 15:46:53 -0700
Subject: [PATCH 1/7] Add progress bar for `LLM.beam_search`

Adds an optional token-level progress bar to the `LLM.beam_search()`
method using `tqdm`. This improves visibility for long-running inference
by allowing users to estimate progress and remaining time.

The progress bar is enabled via a new `use_tqdm` boolean argument
(default: False), and it wraps the `range(max_tokens)` loop. Also
includes a logger warning when the bar is enabled to clarify that the
progress shown is a token-level upper bound and may terminate early due
to stopping conditions.

The tqdm bar is labeled "Beam search" with units shown as "tokens".

Example: outputs = llm.beam_search(prompts, sampling_params, use_tqdm=True)

This change improves developer experience and aligns `beam_search`
closer to `generate` and `chat`, which provide better runtime feedback.

Signed-off-by: Ruosen Li
Signed-off-by: Ubuntu
Signed-off-by: Ruosen Li
---
 vllm/entrypoints/llm.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index fd28bf39e2d5..1a594b2b5b1d 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -531,6 +531,7 @@ def beam_search(
         prompts: list[Union[TokensPrompt, TextPrompt]],
         params: BeamSearchParams,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
+        use_tqdm: bool = False,
     ) -> list[BeamSearchOutput]:
         """
         Generate sequences using beam search.
@@ -602,7 +603,19 @@ def create_tokens_prompt_from_beam(
                     **mm_kwargs,
                 ),
             )
-        for _ in range(max_tokens):
+        token_iter = range(max_tokens)
+        if use_tqdm:
+            token_iter = tqdm(range(max_tokens, desc="Beam Searching"))
+            warnings.warn(
+                "The progress bar shows the upper bound on token steps and "
+                "may finish early due to stopping conditions. It does not "
+                "reflect instance-level progress.",
+                stacklevel=2,
+                unit="token",
+                unit_scale=False,
+            )
+
+        for _ in token_iter:
             all_beams: list[BeamSearchSequence] = list(
                 sum((instance.beams for instance in instances), []))
             pos = [0] + list(

From 3c3c4ac1e95f234c47468035193dcdebf1db9c2c Mon Sep 17 00:00:00 2001
From: NekoMimiUnagi
Date: Fri, 6 Jun 2025 15:48:50 -0700
Subject: [PATCH 2/7] Update the comment for `use_tqdm`

Signed-off-by: Ruosen Li
Signed-off-by: Ubuntu
Signed-off-by: Ruosen Li
---
 vllm/entrypoints/llm.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 1a594b2b5b1d..12314af597d0 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -541,6 +541,7 @@ def beam_search(
                 of token IDs.
             params: The beam search parameters.
             lora_request: LoRA request to use for generation, if any.
+            use_tqdm: Whether to use tqdm to display the progress bar.
         """
         # TODO: how does beam search work together with length penalty,
         # frequency, penalty, and stopping criteria, etc.?

From cf908718a2df4ff0f129b55632c66dadb512f2e6 Mon Sep 17 00:00:00 2001
From: NekoMimiUnagi
Date: Fri, 6 Jun 2025 16:32:03 -0700
Subject: [PATCH 3/7] Update llm.py based on Code Review

Signed-off-by: Ruosen Li
Signed-off-by: Ubuntu
Signed-off-by: Ruosen Li
---
 vllm/entrypoints/llm.py | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 12314af597d0..effaf4e49dd8 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -604,18 +604,13 @@ def create_tokens_prompt_from_beam(
                     **mm_kwargs,
                 ),
             )
-        token_iter = range(max_tokens)
-        if use_tqdm:
-            token_iter = tqdm(range(max_tokens, desc="Beam Searching"))
-            warnings.warn(
-                "The progress bar shows the upper bound on token steps and "
-                "may finish early due to stopping conditions. It does not "
-                "reflect instance-level progress.",
-                stacklevel=2,
-                unit="token",
-                unit_scale=False,
-            )
-
+        token_iter = tqdm(range(max_tokens), desc="Beam search", unit="token", unit_scale=False)
+        warnings.warn(
+            "The progress bar shows the upper bound on token steps and "
+            "may finish early due to stopping conditions. It does not "
+            "reflect instance-level progress.",
+            stacklevel=2,
+        )
         for _ in token_iter:
             all_beams: list[BeamSearchSequence] = list(
                 sum((instance.beams for instance in instances), []))

From 99d7ab5e8d5f2d8d498d177cd7b40bbd22458f3e Mon Sep 17 00:00:00 2001
From: Ruosen Li
Date: Fri, 6 Jun 2025 19:04:16 -0500
Subject: [PATCH 4/7] Fix use_tqdm bug and reformat the file

Signed-off-by: Ruosen Li
Signed-off-by: Ubuntu
Signed-off-by: Ruosen Li
---
 vllm/entrypoints/llm.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index effaf4e49dd8..9ee913ab7598 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -604,13 +604,19 @@ def create_tokens_prompt_from_beam(
                     **mm_kwargs,
                 ),
             )
-        token_iter = tqdm(range(max_tokens), desc="Beam search", unit="token", unit_scale=False)
-        warnings.warn(
-            "The progress bar shows the upper bound on token steps and "
-            "may finish early due to stopping conditions. It does not "
-            "reflect instance-level progress.",
-            stacklevel=2,
-        )
+        token_iter = range(max_tokens)
+        if use_tqdm:
+            token_iter = tqdm(range(max_tokens),
+                              desc="Beam search",
+                              unit="token",
+                              unit_scale=False)
+            warnings.warn(
+                "The progress bar shows the upper bound on token steps and "
+                "may finish early due to stopping conditions. It does not "
+                "reflect instance-level progress.",
+                stacklevel=2,
+            )
+
         for _ in token_iter:
             all_beams: list[BeamSearchSequence] = list(
                 sum((instance.beams for instance in instances), []))

From 418f086f7a7f1be966a5afdd3dd4fbcba11e3b51 Mon Sep 17 00:00:00 2001
From: NekoMimiUnagi
Date: Mon, 9 Jun 2025 03:46:17 -0500
Subject: [PATCH 5/7] Update vllm/entrypoints/llm.py

Update codes based on the suggestion from @22quinn.

Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Signed-off-by: Ubuntu
Signed-off-by: Ruosen Li
---
 vllm/entrypoints/llm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 9ee913ab7598..7c45f9c635ba 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -606,7 +606,7 @@ def create_tokens_prompt_from_beam(
 
         token_iter = range(max_tokens)
         if use_tqdm:
-            token_iter = tqdm(range(max_tokens),
+            token_iter = tqdm(token_iter,
                               desc="Beam search",
                               unit="token",
                               unit_scale=False)

From 00f12b4c3f72228812a6e2a18bc1d7d0cd3e8de6 Mon Sep 17 00:00:00 2001
From: Ruosen Li
Date: Tue, 17 Jun 2025 12:00:16 -0500
Subject: [PATCH 6/7] update the log based on the suggestion from @aarnphm

Signed-off-by: Ruosen Li
---
 vllm/entrypoints/llm.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 7c45f9c635ba..c595f778fee4 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -610,11 +610,10 @@ def create_tokens_prompt_from_beam(
                               desc="Beam search",
                               unit="token",
                               unit_scale=False)
-            warnings.warn(
+            logger.warning(
                 "The progress bar shows the upper bound on token steps and "
                 "may finish early due to stopping conditions. It does not "
                 "reflect instance-level progress.",
-                stacklevel=2,
             )
 
         for _ in token_iter:

From 3e27f6424248a136a94f230dc25697e1ff97b988 Mon Sep 17 00:00:00 2001
From: Ruosen Li
Date: Tue, 17 Jun 2025 12:13:42 -0500
Subject: [PATCH 7/7] update code format

Signed-off-by: Ruosen Li
---
 vllm/entrypoints/llm.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index c595f778fee4..081122c8cb34 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -613,8 +613,7 @@ def create_tokens_prompt_from_beam(
             logger.warning(
                 "The progress bar shows the upper bound on token steps and "
                 "may finish early due to stopping conditions. It does not "
-                "reflect instance-level progress.",
-            )
+                "reflect instance-level progress.")
 
         for _ in token_iter:
             all_beams: list[BeamSearchSequence] = list(