From 649c6efbbd71bc4f53ee16d4a6a303d2077b8b03 Mon Sep 17 00:00:00 2001
From: NekoMimiUnagi
Date: Fri, 6 Jun 2025 15:46:53 -0700
Subject: [PATCH 1/7] Add progress bar for `LLM.beam_search`

Adds an optional token-level progress bar to the `LLM.beam_search()`
method using `tqdm`. This improves visibility for long-running inference
by allowing users to estimate progress and remaining time.

The progress bar is enabled via a new `use_tqdm` boolean argument
(default: False), and it wraps the `range(max_tokens)` loop. Also
includes a logger warning when the bar is enabled to clarify that the
progress shown is a token-level upper bound and may terminate early due
to stopping conditions.

The tqdm bar is labeled "Beam search" with units shown as "tokens".

Example: outputs = llm.beam_search(prompts, sampling_params, use_tqdm=True)

This change improves developer experience and aligns `beam_search`
closer to `generate` and `chat`, which provide better runtime feedback.

Signed-off-by: Ruosen Li
Signed-off-by: Ubuntu
Signed-off-by: Ruosen Li
---
 vllm/entrypoints/llm.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index fd28bf39e2d5..1a594b2b5b1d 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -531,6 +531,7 @@ def beam_search(
         prompts: list[Union[TokensPrompt, TextPrompt]],
         params: BeamSearchParams,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
+        use_tqdm: bool = False,
     ) -> list[BeamSearchOutput]:
         """
         Generate sequences using beam search.
@@ -602,7 +603,19 @@ def create_tokens_prompt_from_beam(
                     **mm_kwargs,
                 ),
             )
-        for _ in range(max_tokens):
+        token_iter = range(max_tokens)
+        if use_tqdm:
+            token_iter = tqdm(range(max_tokens, desc="Beam Searching"))
+            warnings.warn(
+                "The progress bar shows the upper bound on token steps and "
+                "may finish early due to stopping conditions. It does not "
+                "reflect instance-level progress.",
+                stacklevel=2,
+                unit="token",
+                unit_scale=False,
+            )
+
+        for _ in token_iter:
             all_beams: list[BeamSearchSequence] = list(
                 sum((instance.beams for instance in instances), []))
             pos = [0] + list(

From 3c3c4ac1e95f234c47468035193dcdebf1db9c2c Mon Sep 17 00:00:00 2001
From: NekoMimiUnagi
Date: Fri, 6 Jun 2025 15:48:50 -0700
Subject: [PATCH 2/7] Update the comment for `use_tqdm`

Signed-off-by: Ruosen Li
Signed-off-by: Ubuntu
Signed-off-by: Ruosen Li
---
 vllm/entrypoints/llm.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 1a594b2b5b1d..12314af597d0 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -541,6 +541,7 @@ def beam_search(
                 of token IDs.
             params: The beam search parameters.
             lora_request: LoRA request to use for generation, if any.
+            use_tqdm: Whether to use tqdm to display the progress bar.
         """
         # TODO: how does beam search work together with length penalty,
         # frequency, penalty, and stopping criteria, etc.?

From cf908718a2df4ff0f129b55632c66dadb512f2e6 Mon Sep 17 00:00:00 2001
From: NekoMimiUnagi
Date: Fri, 6 Jun 2025 16:32:03 -0700
Subject: [PATCH 3/7] Update llm.py based on Code Review

Signed-off-by: Ruosen Li
Signed-off-by: Ubuntu
Signed-off-by: Ruosen Li
---
 vllm/entrypoints/llm.py | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 12314af597d0..effaf4e49dd8 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -604,18 +604,13 @@ def create_tokens_prompt_from_beam(
                     **mm_kwargs,
                 ),
             )
-        token_iter = range(max_tokens)
-        if use_tqdm:
-            token_iter = tqdm(range(max_tokens, desc="Beam Searching"))
-            warnings.warn(
-                "The progress bar shows the upper bound on token steps and "
-                "may finish early due to stopping conditions. It does not "
-                "reflect instance-level progress.",
-                stacklevel=2,
-                unit="token",
-                unit_scale=False,
-            )
-
+        token_iter = tqdm(range(max_tokens), desc="Beam search", unit="token", unit_scale=False)
+        warnings.warn(
+            "The progress bar shows the upper bound on token steps and "
+            "may finish early due to stopping conditions. It does not "
+            "reflect instance-level progress.",
+            stacklevel=2,
+        )
         for _ in token_iter:
             all_beams: list[BeamSearchSequence] = list(
                 sum((instance.beams for instance in instances), []))

From 99d7ab5e8d5f2d8d498d177cd7b40bbd22458f3e Mon Sep 17 00:00:00 2001
From: Ruosen Li
Date: Fri, 6 Jun 2025 19:04:16 -0500
Subject: [PATCH 4/7] Fix use_tqdm bug and reformat the file

Signed-off-by: Ruosen Li
Signed-off-by: Ubuntu
Signed-off-by: Ruosen Li
---
 vllm/entrypoints/llm.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index effaf4e49dd8..9ee913ab7598 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -604,13 +604,19 @@ def create_tokens_prompt_from_beam(
                     **mm_kwargs,
                 ),
             )
-        token_iter = tqdm(range(max_tokens), desc="Beam search", unit="token", unit_scale=False)
-        warnings.warn(
-            "The progress bar shows the upper bound on token steps and "
-            "may finish early due to stopping conditions. It does not "
-            "reflect instance-level progress.",
-            stacklevel=2,
-        )
+        token_iter = range(max_tokens)
+        if use_tqdm:
+            token_iter = tqdm(range(max_tokens),
+                              desc="Beam search",
+                              unit="token",
+                              unit_scale=False)
+            warnings.warn(
+                "The progress bar shows the upper bound on token steps and "
+                "may finish early due to stopping conditions. It does not "
+                "reflect instance-level progress.",
+                stacklevel=2,
+            )
+
         for _ in token_iter:
             all_beams: list[BeamSearchSequence] = list(
                 sum((instance.beams for instance in instances), []))

From 418f086f7a7f1be966a5afdd3dd4fbcba11e3b51 Mon Sep 17 00:00:00 2001
From: NekoMimiUnagi
Date: Mon, 9 Jun 2025 03:46:17 -0500
Subject: [PATCH 5/7] Update vllm/entrypoints/llm.py

Update codes based on the suggestion from @22quinn.

Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Signed-off-by: Ubuntu
Signed-off-by: Ruosen Li
---
 vllm/entrypoints/llm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 9ee913ab7598..7c45f9c635ba 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -606,7 +606,7 @@ def create_tokens_prompt_from_beam(
 
         token_iter = range(max_tokens)
         if use_tqdm:
-            token_iter = tqdm(range(max_tokens),
+            token_iter = tqdm(token_iter,
                               desc="Beam search",
                               unit="token",
                               unit_scale=False)

From 00f12b4c3f72228812a6e2a18bc1d7d0cd3e8de6 Mon Sep 17 00:00:00 2001
From: Ruosen Li
Date: Tue, 17 Jun 2025 12:00:16 -0500
Subject: [PATCH 6/7] update the log based on the suggestion from @aarnphm

Signed-off-by: Ruosen Li
---
 vllm/entrypoints/llm.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 7c45f9c635ba..c595f778fee4 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -610,11 +610,10 @@ def create_tokens_prompt_from_beam(
                               desc="Beam search",
                               unit="token",
                               unit_scale=False)
-            warnings.warn(
+            logger.warning(
                 "The progress bar shows the upper bound on token steps and "
                 "may finish early due to stopping conditions. It does not "
                 "reflect instance-level progress.",
-                stacklevel=2,
             )
 
         for _ in token_iter:

From 3e27f6424248a136a94f230dc25697e1ff97b988 Mon Sep 17 00:00:00 2001
From: Ruosen Li
Date: Tue, 17 Jun 2025 12:13:42 -0500
Subject: [PATCH 7/7] update code format

Signed-off-by: Ruosen Li
---
 vllm/entrypoints/llm.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index c595f778fee4..081122c8cb34 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -613,8 +613,7 @@ def create_tokens_prompt_from_beam(
             logger.warning(
                 "The progress bar shows the upper bound on token steps and "
                 "may finish early due to stopping conditions. It does not "
-                "reflect instance-level progress.",
-            )
+                "reflect instance-level progress.")
 
         for _ in token_iter:
             all_beams: list[BeamSearchSequence] = list(