2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -17,7 +17,7 @@ repos:
- id: ruff
args: [--output-format, github, --fix]
- id: ruff-format
files: ^(.buildkite|benchmarks)/.*
files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
hooks:
150 changes: 80 additions & 70 deletions examples/offline_inference/audio_language.py
@@ -1,11 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference
This example shows how to use vLLM for running offline inference
with the correct prompt format on audio language models.

For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""

import os
from dataclasses import asdict
from typing import NamedTuple, Optional
@@ -22,7 +23,7 @@
question_per_audio_count = {
0: "What is 1+1?",
1: "What is recited in the audio?",
2: "What sport and what nursery rhyme are referenced?"
2: "What sport and what nursery rhyme are referenced?",
}


@@ -72,8 +73,7 @@ def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
# MiniCPM-O
def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
model_name = "openbmb/MiniCPM-o-2_6"
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
@@ -82,19 +82,18 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
limit_mm_per_prompt={"audio": audio_count},
)

stop_tokens = ['<|im_end|>', '<|endoftext|>']
stop_tokens = ["<|im_end|>", "<|endoftext|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

audio_placeholder = "(<audio>./</audio>)" * audio_count
audio_chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}" # noqa: E501
messages = [{
'role': 'user',
'content': f'{audio_placeholder}\n{question}'
}]
prompt = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True,
chat_template=audio_chat_template)
messages = [{"role": "user", "content": f"{audio_placeholder}\n{question}"}]
prompt = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
chat_template=audio_chat_template,
)

return ModelRequestData(
engine_args=engine_args,
@@ -113,7 +112,7 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
speech_lora_path = os.path.join(model_path, "speech-lora")
placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)])
placeholders = "".join([f"<|audio_{i + 1}|>" for i in range(audio_count)])

prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"

Expand Down Expand Up @@ -145,15 +144,19 @@ def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
limit_mm_per_prompt={"audio": audio_count},
)

audio_in_prompt = "".join([
f"Audio {idx+1}: "
f"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)
])
audio_in_prompt = "".join(
[
f"Audio {idx + 1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
for idx in range(audio_count)
]
)

prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\n"
f"{audio_in_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n")
prompt = (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\n"
f"{audio_in_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)

return ModelRequestData(
engine_args=engine_args,
@@ -172,19 +175,22 @@ def run_qwen2_5_omni(question: str, audio_count: int):
limit_mm_per_prompt={"audio": audio_count},
)

audio_in_prompt = "".join([
"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)
])
audio_in_prompt = "".join(
["<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)]
)

default_system = (
"You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
"Group, capable of perceiving auditory and visual inputs, as well as "
"generating text and speech.")
"generating text and speech."
)

prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
"<|im_start|>user\n"
f"{audio_in_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n")
prompt = (
f"<|im_start|>system\n{default_system}<|im_end|>\n"
"<|im_start|>user\n"
f"{audio_in_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
@@ -196,13 +202,10 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"

tokenizer = AutoTokenizer.from_pretrained(model_name)
messages = [{
'role': 'user',
'content': "<|audio|>\n" * audio_count + question
}]
prompt = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
messages = [{"role": "user", "content": "<|audio|>\n" * audio_count + question}]
prompt = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)

engine_args = EngineArgs(
model=model_name,
@@ -220,8 +223,7 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData:

# Whisper
def run_whisper(question: str, audio_count: int) -> ModelRequestData:
assert audio_count == 1, (
"Whisper only support single audio input per prompt")
assert audio_count == 1, "Whisper only support single audio input per prompt"
model_name = "openai/whisper-large-v3-turbo"

prompt = "<|startoftranscript|>"
@@ -252,27 +254,33 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:

def parse_args():
parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with '
'audio language models')
parser.add_argument('--model-type',
'-m',
type=str,
default="ultravox",
choices=model_example_map.keys(),
help='Huggingface "model_type".')
parser.add_argument('--num-prompts',
type=int,
default=1,
help='Number of prompts to run.')
parser.add_argument("--num-audios",
type=int,
default=1,
choices=[0, 1, 2],
help="Number of audio items per prompt.")
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
description="Demo on using vLLM for offline inference with "
"audio language models"
)
parser.add_argument(
"--model-type",
"-m",
type=str,
default="ultravox",
choices=model_example_map.keys(),
help='Huggingface "model_type".',
)
parser.add_argument(
"--num-prompts", type=int, default=1, help="Number of prompts to run."
)
parser.add_argument(
"--num-audios",
type=int,
default=1,
choices=[0, 1, 2],
help="Number of audio items per prompt.",
)
parser.add_argument(
"--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.",
)

return parser.parse_args()

@@ -283,29 +291,30 @@ def main(args):
raise ValueError(f"Model type {model} is not supported.")

audio_count = args.num_audios
req_data = model_example_map[model](question_per_audio_count[audio_count],
audio_count)
req_data = model_example_map[model](
question_per_audio_count[audio_count], audio_count
)

# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {})
req_data.engine_args.limit_mm_per_prompt or {}
)

engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)

# We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference.
sampling_params = SamplingParams(temperature=0.2,
max_tokens=64,
stop_token_ids=req_data.stop_token_ids)
sampling_params = SamplingParams(
temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
)

mm_data = {}
if audio_count > 0:
mm_data = {
"audio": [
asset.audio_and_sample_rate
for asset in audio_assets[:audio_count]
asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
]
}

Expand All @@ -315,8 +324,9 @@ def main(args):
# Batch inference
inputs = [inputs] * args.num_prompts
# Add LoRA request if applicable
lora_request = (req_data.lora_requests *
args.num_prompts if req_data.lora_requests else None)
lora_request = (
req_data.lora_requests * args.num_prompts if req_data.lora_requests else None
)

outputs = llm.generate(
inputs,
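For reference, the flow that `audio_language.py` wraps in `ModelRequestData` can be condensed into a short standalone sketch. It assumes the same `fixie-ai/ultravox-v0_5-llama-3_2-1b` checkpoint and one of vLLM's bundled audio assets; treat the asset name and the generation settings as illustrative rather than prescribed by this PR.

```python
# Minimal sketch of the single-audio Ultravox path from audio_language.py.
from transformers import AutoTokenizer

from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset

model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# One "<|audio|>" placeholder per audio item, as in run_ultravox().
messages = [{"role": "user", "content": "<|audio|>\nWhat is recited in the audio?"}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

llm = LLM(
    model=model_name,
    max_model_len=4096,
    max_num_seqs=5,
    # Audio only, mirroring the example's default_limits merge.
    limit_mm_per_prompt={"audio": 1, "image": 0, "video": 0},
)

# audio_and_sample_rate is a (waveform, sample_rate) tuple.
audio = AudioAsset("mary_had_lamb").audio_and_sample_rate
outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"audio": [audio]}},
    sampling_params=SamplingParams(temperature=0.2, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```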
16 changes: 10 additions & 6 deletions examples/offline_inference/automatic_prefix_caching.py
@@ -16,13 +16,16 @@
Run:
python examples/offline_inference/automatic_prefix_caching.py
"""

import time

from vllm import LLM, SamplingParams

# ruff: noqa: E501
# A prompt containing a large markdown table. The table is randomly generated by GPT-4.
LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
LONG_PROMPT = (
"You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n"
+ """
| ID | Name | Age | Occupation | Country | Email | Phone Number | Address |
|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
| 1 | John Doe | 29 | Engineer | USA | [email protected] | 555-1234 | 123 Elm St, Springfield, IL |
@@ -56,6 +59,7 @@
| 29 | Amy White | 33 | Musician | New Zealand | [email protected] | 555-5658 | 159 Maple St, Wellington, NZ |
| 30 | Ben Black | 38 | Chef | Ireland | [email protected] | 555-7870 | 246 Fir St, Waterford, IE |
"""
)


def get_generation_time(llm, sampling_params, prompts):
@@ -72,25 +76,25 @@ def get_generation_time(llm, sampling_params, prompts):

def main():
# set enable_prefix_caching=True to enable APC
llm = LLM(model='lmsys/longchat-13b-16k', enable_prefix_caching=True)
llm = LLM(model="lmsys/longchat-13b-16k", enable_prefix_caching=True)

sampling_params = SamplingParams(temperature=0, max_tokens=100)

# Querying the age of John Doe
get_generation_time(
llm,
sampling_params,
LONG_PROMPT +
"Question: what is the age of John Doe? Your answer: The age of John Doe is ",
LONG_PROMPT
+ "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
)

# Querying the age of Zack Blue
# This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again.
get_generation_time(
llm,
sampling_params,
LONG_PROMPT +
"Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
LONG_PROMPT
+ "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
)


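The timing comparison in this example hinges on one detail: with `enable_prefix_caching=True`, the second query reuses the KV cache already computed for `LONG_PROMPT` by the first query. A condensed sketch of the same measurement follows, with a small illustrative model and a synthetic shared prefix substituted for the ones used in the example.

```python
# Condensed sketch of the prefix-caching timing pattern in this example.
import time

from vllm import LLM, SamplingParams

# Illustrative small model; the example itself uses lmsys/longchat-13b-16k.
llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)
params = SamplingParams(temperature=0, max_tokens=32)

# Synthetic long prefix shared by every query.
shared_prefix = "Background document shared by every query.\n" * 200


def timed_generate(prompt: str) -> float:
    start = time.time()
    llm.generate(prompt, params)
    return time.time() - start


t1 = timed_generate(shared_prefix + "Question: summarize the text.")
# Faster: the KV cache for shared_prefix is reused instead of recomputed.
t2 = timed_generate(shared_prefix + "Question: how long is the text?")
print(f"first query: {t1:.2f}s, second query: {t2:.2f}s")
```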
18 changes: 4 additions & 14 deletions examples/offline_inference/basic/chat.py
@@ -56,22 +56,12 @@ def print_outputs(outputs):

# In this script, we demonstrate how to pass input to the chat method:
conversation = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "Hello"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{"role": "system", "content": "You are a helpful assistant"},
{"role": "user", "content": "Hello"},
{"role": "assistant", "content": "Hello! How can I assist you today?"},
{
"role": "user",
"content":
"Write an essay about the importance of higher education.",
"content": "Write an essay about the importance of higher education.",
},
]
outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
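The conversation is now written as compact one-line dicts, but the call pattern is unchanged: `LLM.chat` applies the tokenizer's chat template to the messages and generates a reply. A minimal sketch, with an illustrative instruct model standing in for the one `chat.py` is configured with:

```python
# Minimal sketch of the LLM.chat call shown in chat.py.
from vllm import LLM, SamplingParams

# Illustrative model choice; any chat/instruct model works here.
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")
sampling_params = SamplingParams(temperature=0.8, max_tokens=128)

conversation = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Write an essay about the importance of higher education."},
]

# chat() applies the model's chat template before generating.
outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
print(outputs[0].outputs[0].text)
```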
15 changes: 8 additions & 7 deletions examples/offline_inference/basic/classify.py
@@ -10,9 +10,9 @@ def parse_args():
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(model="jason9693/Qwen2.5-1.5B-apeach",
task="classify",
enforce_eager=True)
parser.set_defaults(
model="jason9693/Qwen2.5-1.5B-apeach", task="classify", enforce_eager=True
)
return parser.parse_args()


@@ -36,10 +36,11 @@ def main(args: Namespace):
print("\nGenerated Outputs:\n" + "-" * 60)
for prompt, output in zip(prompts, outputs):
probs = output.outputs.probs
probs_trimmed = ((str(probs[:16])[:-1] +
", ...]") if len(probs) > 16 else probs)
print(f"Prompt: {prompt!r} \n"
f"Class Probabilities: {probs_trimmed} (size={len(probs)})")
probs_trimmed = (str(probs[:16])[:-1] + ", ...]") if len(probs) > 16 else probs
print(
f"Prompt: {prompt!r} \n"
f"Class Probabilities: {probs_trimmed} (size={len(probs)})"
)
print("-" * 60)


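The classification example keeps the same two moving parts after the reformat: constructing the `LLM` with `task="classify"` and reading per-class probabilities from `output.outputs.probs`. A minimal sketch of that flow, reusing the example's default model and treating the prompts as placeholders:

```python
# Minimal sketch of the classification flow in classify.py.
from vllm import LLM

llm = LLM(
    model="jason9693/Qwen2.5-1.5B-apeach",  # the example's default model
    task="classify",
    enforce_eager=True,
)

prompts = ["Hello, my name is", "The capital of France is"]
outputs = llm.classify(prompts)

for prompt, output in zip(prompts, outputs):
    probs = output.outputs.probs  # one probability per class label
    print(f"{prompt!r} -> {len(probs)} class probabilities")
```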