2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -17,7 +17,7 @@ repos:
- id: ruff
args: [--output-format, github, --fix]
- id: ruff-format
files: ^(.buildkite|benchmarks)/.*
files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
hooks:
150 changes: 80 additions & 70 deletions examples/offline_inference/audio_language.py
@@ -1,11 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference
This example shows how to use vLLM for running offline inference
with the correct prompt format on audio language models.

For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""

import os
from dataclasses import asdict
from typing import NamedTuple, Optional
@@ -22,7 +23,7 @@
question_per_audio_count = {
0: "What is 1+1?",
1: "What is recited in the audio?",
2: "What sport and what nursery rhyme are referenced?"
2: "What sport and what nursery rhyme are referenced?",
}


@@ -72,8 +73,7 @@ def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
# MiniCPM-O
def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
model_name = "openbmb/MiniCPM-o-2_6"
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
@@ -82,19 +82,18 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
limit_mm_per_prompt={"audio": audio_count},
)

stop_tokens = ['<|im_end|>', '<|endoftext|>']
stop_tokens = ["<|im_end|>", "<|endoftext|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

audio_placeholder = "(<audio>./</audio>)" * audio_count
audio_chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}" # noqa: E501
messages = [{
'role': 'user',
'content': f'{audio_placeholder}\n{question}'
}]
prompt = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True,
chat_template=audio_chat_template)
messages = [{"role": "user", "content": f"{audio_placeholder}\n{question}"}]
prompt = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
chat_template=audio_chat_template,
)

return ModelRequestData(
engine_args=engine_args,
@@ -113,7 +112,7 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
speech_lora_path = os.path.join(model_path, "speech-lora")
placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)])
placeholders = "".join([f"<|audio_{i + 1}|>" for i in range(audio_count)])

prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"

Expand Down Expand Up @@ -145,15 +144,19 @@ def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
limit_mm_per_prompt={"audio": audio_count},
)

audio_in_prompt = "".join([
f"Audio {idx+1}: "
f"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)
])
audio_in_prompt = "".join(
[
f"Audio {idx + 1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
for idx in range(audio_count)
]
)

prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\n"
f"{audio_in_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n")
prompt = (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\n"
f"{audio_in_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)

return ModelRequestData(
engine_args=engine_args,
@@ -172,19 +175,22 @@ def run_qwen2_5_omni(question: str, audio_count: int):
limit_mm_per_prompt={"audio": audio_count},
)

audio_in_prompt = "".join([
"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)
])
audio_in_prompt = "".join(
["<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)]
)

default_system = (
"You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
"Group, capable of perceiving auditory and visual inputs, as well as "
"generating text and speech.")
"generating text and speech."
)

prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
"<|im_start|>user\n"
f"{audio_in_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n")
prompt = (
f"<|im_start|>system\n{default_system}<|im_end|>\n"
"<|im_start|>user\n"
f"{audio_in_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
@@ -196,13 +202,10 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"

tokenizer = AutoTokenizer.from_pretrained(model_name)
messages = [{
'role': 'user',
'content': "<|audio|>\n" * audio_count + question
}]
prompt = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
messages = [{"role": "user", "content": "<|audio|>\n" * audio_count + question}]
prompt = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)

engine_args = EngineArgs(
model=model_name,
@@ -220,8 +223,7 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData:

# Whisper
def run_whisper(question: str, audio_count: int) -> ModelRequestData:
assert audio_count == 1, (
"Whisper only support single audio input per prompt")
assert audio_count == 1, "Whisper only support single audio input per prompt"
model_name = "openai/whisper-large-v3-turbo"

prompt = "<|startoftranscript|>"
@@ -252,27 +254,33 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:

def parse_args():
parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with '
'audio language models')
parser.add_argument('--model-type',
'-m',
type=str,
default="ultravox",
choices=model_example_map.keys(),
help='Huggingface "model_type".')
parser.add_argument('--num-prompts',
type=int,
default=1,
help='Number of prompts to run.')
parser.add_argument("--num-audios",
type=int,
default=1,
choices=[0, 1, 2],
help="Number of audio items per prompt.")
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
description="Demo on using vLLM for offline inference with "
"audio language models"
)
parser.add_argument(
"--model-type",
"-m",
type=str,
default="ultravox",
choices=model_example_map.keys(),
help='Huggingface "model_type".',
)
parser.add_argument(
"--num-prompts", type=int, default=1, help="Number of prompts to run."
)
parser.add_argument(
"--num-audios",
type=int,
default=1,
choices=[0, 1, 2],
help="Number of audio items per prompt.",
)
parser.add_argument(
"--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.",
)

return parser.parse_args()

@@ -283,29 +291,30 @@ def main(args):
raise ValueError(f"Model type {model} is not supported.")

audio_count = args.num_audios
req_data = model_example_map[model](question_per_audio_count[audio_count],
audio_count)
req_data = model_example_map[model](
question_per_audio_count[audio_count], audio_count
)

# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {})
req_data.engine_args.limit_mm_per_prompt or {}
)

engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)

# We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference.
sampling_params = SamplingParams(temperature=0.2,
max_tokens=64,
stop_token_ids=req_data.stop_token_ids)
sampling_params = SamplingParams(
temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
)

mm_data = {}
if audio_count > 0:
mm_data = {
"audio": [
asset.audio_and_sample_rate
for asset in audio_assets[:audio_count]
asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
]
}

Expand All @@ -315,8 +324,9 @@ def main(args):
# Batch inference
inputs = [inputs] * args.num_prompts
# Add LoRA request if applicable
lora_request = (req_data.lora_requests *
args.num_prompts if req_data.lora_requests else None)
lora_request = (
req_data.lora_requests * args.num_prompts if req_data.lora_requests else None
)

outputs = llm.generate(
inputs,
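For reference, the flow that `audio_language.py` wraps in `ModelRequestData` can be condensed into a short standalone sketch. It assumes the same `fixie-ai/ultravox-v0_5-llama-3_2-1b` checkpoint and one of vLLM's bundled audio assets; treat the asset name and the generation settings as illustrative rather than prescribed by this PR.

```python
# Minimal sketch of the single-audio Ultravox path from audio_language.py.
from transformers import AutoTokenizer

from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset

model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# One "<|audio|>" placeholder per audio item, as in run_ultravox().
messages = [{"role": "user", "content": "<|audio|>\nWhat is recited in the audio?"}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

llm = LLM(
    model=model_name,
    max_model_len=4096,
    max_num_seqs=5,
    # Audio only, mirroring the example's default_limits merge.
    limit_mm_per_prompt={"audio": 1, "image": 0, "video": 0},
)

# audio_and_sample_rate is a (waveform, sample_rate) tuple.
audio = AudioAsset("mary_had_lamb").audio_and_sample_rate
outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"audio": [audio]}},
    sampling_params=SamplingParams(temperature=0.2, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```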
16 changes: 10 additions & 6 deletions examples/offline_inference/automatic_prefix_caching.py
@@ -16,13 +16,16 @@
Run:
python examples/offline_inference/automatic_prefix_caching.py
"""

import time

from vllm import LLM, SamplingParams

# ruff: noqa: E501
# A prompt containing a large markdown table. The table is randomly generated by GPT-4.
LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
LONG_PROMPT = (
"You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n"
+ """
| ID | Name | Age | Occupation | Country | Email | Phone Number | Address |
|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
| 1 | John Doe | 29 | Engineer | USA | [email protected] | 555-1234 | 123 Elm St, Springfield, IL |
@@ -56,6 +59,7 @@
| 29 | Amy White | 33 | Musician | New Zealand | [email protected] | 555-5658 | 159 Maple St, Wellington, NZ |
| 30 | Ben Black | 38 | Chef | Ireland | [email protected] | 555-7870 | 246 Fir St, Waterford, IE |
"""
)


def get_generation_time(llm, sampling_params, prompts):
@@ -72,25 +76,25 @@ def get_generation_time(llm, sampling_params, prompts):

def main():
# set enable_prefix_caching=True to enable APC
llm = LLM(model='lmsys/longchat-13b-16k', enable_prefix_caching=True)
llm = LLM(model="lmsys/longchat-13b-16k", enable_prefix_caching=True)

sampling_params = SamplingParams(temperature=0, max_tokens=100)

# Querying the age of John Doe
get_generation_time(
llm,
sampling_params,
LONG_PROMPT +
"Question: what is the age of John Doe? Your answer: The age of John Doe is ",
LONG_PROMPT
+ "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
)

# Querying the age of Zack Blue
# This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again.
get_generation_time(
llm,
sampling_params,
LONG_PROMPT +
"Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
LONG_PROMPT
+ "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
)


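The timing comparison in this example hinges on one detail: with `enable_prefix_caching=True`, the second query reuses the KV cache already computed for `LONG_PROMPT` by the first query. A condensed sketch of the same measurement follows, with a small illustrative model and a synthetic shared prefix substituted for the ones used in the example.

```python
# Condensed sketch of the prefix-caching timing pattern in this example.
import time

from vllm import LLM, SamplingParams

# Illustrative small model; the example itself uses lmsys/longchat-13b-16k.
llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)
params = SamplingParams(temperature=0, max_tokens=32)

# Synthetic long prefix shared by every query.
shared_prefix = "Background document shared by every query.\n" * 200


def timed_generate(prompt: str) -> float:
    start = time.time()
    llm.generate(prompt, params)
    return time.time() - start


t1 = timed_generate(shared_prefix + "Question: summarize the text.")
# Faster: the KV cache for shared_prefix is reused instead of recomputed.
t2 = timed_generate(shared_prefix + "Question: how long is the text?")
print(f"first query: {t1:.2f}s, second query: {t2:.2f}s")
```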
18 changes: 4 additions & 14 deletions examples/offline_inference/basic/chat.py
@@ -56,22 +56,12 @@ def print_outputs(outputs):

# In this script, we demonstrate how to pass input to the chat method:
conversation = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "Hello"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{"role": "system", "content": "You are a helpful assistant"},
{"role": "user", "content": "Hello"},
{"role": "assistant", "content": "Hello! How can I assist you today?"},
{
"role": "user",
"content":
"Write an essay about the importance of higher education.",
"content": "Write an essay about the importance of higher education.",
},
]
outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
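The conversation is now written as compact one-line dicts, but the call pattern is unchanged: `LLM.chat` applies the tokenizer's chat template to the messages and generates a reply. A minimal sketch, with an illustrative instruct model standing in for the one `chat.py` is configured with:

```python
# Minimal sketch of the LLM.chat call shown in chat.py.
from vllm import LLM, SamplingParams

# Illustrative model choice; any chat/instruct model works here.
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")
sampling_params = SamplingParams(temperature=0.8, max_tokens=128)

conversation = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Write an essay about the importance of higher education."},
]

# chat() applies the model's chat template before generating.
outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
print(outputs[0].outputs[0].text)
```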
15 changes: 8 additions & 7 deletions examples/offline_inference/basic/classify.py
@@ -10,9 +10,9 @@ def parse_args():
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(model="jason9693/Qwen2.5-1.5B-apeach",
task="classify",
enforce_eager=True)
parser.set_defaults(
model="jason9693/Qwen2.5-1.5B-apeach", task="classify", enforce_eager=True
)
return parser.parse_args()


@@ -36,10 +36,11 @@ def main(args: Namespace):
print("\nGenerated Outputs:\n" + "-" * 60)
for prompt, output in zip(prompts, outputs):
probs = output.outputs.probs
probs_trimmed = ((str(probs[:16])[:-1] +
", ...]") if len(probs) > 16 else probs)
print(f"Prompt: {prompt!r} \n"
f"Class Probabilities: {probs_trimmed} (size={len(probs)})")
probs_trimmed = (str(probs[:16])[:-1] + ", ...]") if len(probs) > 16 else probs
print(
f"Prompt: {prompt!r} \n"
f"Class Probabilities: {probs_trimmed} (size={len(probs)})"
)
print("-" * 60)


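The classification example keeps the same two moving parts after the reformat: constructing the `LLM` with `task="classify"` and reading per-class probabilities from `output.outputs.probs`. A minimal sketch of that flow, reusing the example's default model and treating the prompts as placeholders:

```python
# Minimal sketch of the classification flow in classify.py.
from vllm import LLM

llm = LLM(
    model="jason9693/Qwen2.5-1.5B-apeach",  # the example's default model
    task="classify",
    enforce_eager=True,
)

prompts = ["Hello, my name is", "The capital of France is"]
outputs = llm.classify(prompts)

for prompt, output in zip(prompts, outputs):
    probs = output.outputs.probs  # one probability per class label
    print(f"{prompt!r} -> {len(probs)} class probabilities")
```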