104 changes: 2 additions & 102 deletions docs/source/features/prompt_embeds.md
@@ -20,59 +20,7 @@ To input prompt embeddings, follow this schema in {class}`vllm.inputs.EmbedsPrompt`:

You can generate prompt embeddings with a Hugging Face Transformers model and pass them to vLLM through the `'prompt_embeds'` field of the prompt dictionary, as shown in the following examples:

```python
from vllm import LLM
import transformers

model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Transformers
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)

llm = LLM(model=model_name, enable_prompt_embeds=True)

# Refer to the HuggingFace repo for the correct format to use
chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt')

embedding_layer = transformers_model.get_input_embeddings()
prompt_embeds = embedding_layer(token_ids).squeeze(0)

# Single prompt inference
outputs = llm.generate({
"prompt_embeds": prompt_embeds,
})

for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)

# Batch inference

chats = [
[{"role": "user", "content": "Please tell me about the capital of France."}],
[{"role": "user", "content": "When is the day longest during the year?"}],
[{"role": "user", "content": "Where is bigger, the moon or the sun?"}]
]

token_ids_list = [
tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt') for chat in chats
]
prompt_embeds_list = [embedding_layer(token_ids).squeeze(0) for token_ids in token_ids_list]

outputs = llm.generate(
[
{
"prompt_embeds": prompt_embeds,
} for prompt_embeds in prompt_embeds_list
]
)

for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
<gh-file:examples/offline_inference/prompt_embed_inference.py>
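
Sampling options work the same way as with text prompts. The sketch below is not part of the original example; it reuses the `llm` and `prompt_embeds_list` objects from the snippet above and assumes vLLM's standard `SamplingParams` interface:

```python
from vllm import SamplingParams

# A minimal sketch (not from the original docs): reuse `llm` and
# `prompt_embeds_list` from the example above and control decoding with
# the usual SamplingParams object.
sampling_params = SamplingParams(temperature=0.0, max_tokens=64)

outputs = llm.generate(
    [{"prompt_embeds": prompt_embeds} for prompt_embeds in prompt_embeds_list],
    sampling_params=sampling_params,
)

for o in outputs:
    print(o.outputs[0].text)
```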

## Online Serving

@@ -93,52 +41,4 @@ vllm serve meta-llama/Llama-3.2-1B-Instruct --task generate \

Then, you can use the OpenAI client as follows:

```python
from openai import OpenAI
import transformers
import torch
import base64
import io

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)

model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Transformers
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)


# Refer to the HuggingFace repo for the correct format to use
chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt')

embedding_layer = transformers_model.get_input_embeddings()
prompt_embeds = embedding_layer(token_ids).squeeze(0)

# Prompt embeddings
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
buffer.seek(0)
binary_data = buffer.read()
encoded_embeds = base64.b64encode(binary_data).decode('utf-8')


completion = client.completions.create(
model=model_name,
# NOTE: The OpenAI client does not allow `None` as an input to
# `prompt`. Use an empty string if you have no text prompts.
prompt="",
max_tokens=5,
temperature=0.0,
# NOTE: The OpenAI client allows passing in extra JSON body via the
# `extra_body` argument.
extra_body={"prompt_embeds": encoded_embeds}
)

print(completion.choices[0].text)
```
<gh-file:examples/online_serving/prompt_embed_inference_with_openai_client.py>
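
To sanity-check the encoding locally before sending the request, you can decode the base64 payload and compare it with the original tensor. This is a quick client-side check, not part of the original example; it assumes the `prompt_embeds` and `encoded_embeds` variables from the snippet above:

```python
import base64
import io

import torch

# Decode the base64 payload back into a tensor and verify the round trip.
decoded = torch.load(io.BytesIO(base64.b64decode(encoded_embeds)), weights_only=True)
assert torch.equal(decoded, prompt_embeds)
print(decoded.shape)  # (num_prompt_tokens, hidden_size)
```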
103 changes: 103 additions & 0 deletions examples/offline_inference/prompt_embed_inference.py
@@ -0,0 +1,103 @@
# SPDX-License-Identifier: Apache-2.0
"""
Demonstrates how to generate prompt embeddings using
Hugging Face Transformers and use them as input to vLLM
for both single and batch inference.

Model: meta-llama/Llama-3.2-1B-Instruct
Note: This model is gated on Hugging Face Hub.
You must request access to use it:
https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct

Requirements:
- vLLM
- transformers

Run:
python examples/offline_inference/prompt_embed_inference.py
"""

import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizer)

from vllm import LLM


def init_tokenizer_and_llm(model_name: str):
tokenizer = AutoTokenizer.from_pretrained(model_name)
transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
embedding_layer = transformers_model.get_input_embeddings()
llm = LLM(model=model_name, enable_prompt_embeds=True)
return tokenizer, embedding_layer, llm


def get_prompt_embeds(chat: list[dict[str,
str]], tokenizer: PreTrainedTokenizer,
embedding_layer: torch.nn.Module):
token_ids = tokenizer.apply_chat_template(chat,
add_generation_prompt=True,
return_tensors='pt')
prompt_embeds = embedding_layer(token_ids).squeeze(0)
return prompt_embeds


def single_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer,
embedding_layer: torch.nn.Module):
chat = [{
"role": "user",
"content": "Please tell me about the capital of France."
}]
prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)

outputs = llm.generate({
"prompt_embeds": prompt_embeds,
})

print("\n[Single Inference Output]")
print("-" * 30)
for o in outputs:
print(o.outputs[0].text)
print("-" * 30)


def batch_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer,
embedding_layer: torch.nn.Module):
chats = [[{
"role": "user",
"content": "Please tell me about the capital of France."
}],
[{
"role": "user",
"content": "When is the day longest during the year?"
}],
[{
"role": "user",
"content": "Where is bigger, the moon or the sun?"
}]]

prompt_embeds_list = [
get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats
]

outputs = llm.generate([{
"prompt_embeds": embeds
} for embeds in prompt_embeds_list])

print("\n[Batch Inference Outputs]")
print("-" * 30)
for i, o in enumerate(outputs):
print(f"Q{i+1}: {chats[i][0]['content']}")
print(f"A{i+1}: {o.outputs[0].text}\n")
print("-" * 30)


def main():
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer, embedding_layer, llm = init_tokenizer_and_llm(model_name)
single_prompt_inference(llm, tokenizer, embedding_layer)
batch_prompt_inference(llm, tokenizer, embedding_layer)


if __name__ == "__main__":
main()
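
One detail the example leaves at its defaults: the Hugging Face model above loads in float32 on CPU, so the resulting embeddings are float32. If you want the embeddings produced in a lower precision, you can pass `torch_dtype` when loading the model. This is a hypothetical variant, not part of the PR, and whether vLLM requires the embedding dtype to match its own model dtype is an assumption to verify:

```python
import torch
from transformers import AutoModelForCausalLM

# Hypothetical variant of the model loading above: produce bfloat16
# embeddings by loading the Hugging Face model in bfloat16.
transformers_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct", torch_dtype=torch.bfloat16)
```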
86 changes: 86 additions & 0 deletions examples/online_serving/prompt_embed_inference_with_openai_client.py
@@ -0,0 +1,86 @@
# SPDX-License-Identifier: Apache-2.0
"""
vLLM OpenAI-Compatible Client with Prompt Embeddings

This script demonstrates how to:
1. Generate prompt embeddings using Hugging Face Transformers
2. Encode them in base64 format
3. Send them to a vLLM server via the OpenAI-compatible Completions API

Run the vLLM server first:
vllm serve meta-llama/Llama-3.2-1B-Instruct \
--task generate \
--max-model-len 4096 \
--enable-prompt-embeds

Run the client:
python examples/online_serving/prompt_embed_inference_with_openai_client.py

Model: meta-llama/Llama-3.2-1B-Instruct
Note: This model is gated on Hugging Face Hub.
You must request access to use it:
https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct

Dependencies:
- transformers
- torch
- openai
"""
import base64
import io

import torch
import transformers
from openai import OpenAI


def main():
client = OpenAI(
api_key="EMPTY",
base_url="http://localhost:8000/v1",
)

model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Transformers
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
transformers_model = transformers.AutoModelForCausalLM.from_pretrained(
model_name)

# Refer to the HuggingFace repo for the correct format to use
chat = [{
"role": "user",
"content": "Please tell me about the capital of France."
}]
token_ids = tokenizer.apply_chat_template(chat,
add_generation_prompt=True,
return_tensors='pt')

embedding_layer = transformers_model.get_input_embeddings()
prompt_embeds = embedding_layer(token_ids).squeeze(0)

# Prompt embeddings
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
buffer.seek(0)
binary_data = buffer.read()
encoded_embeds = base64.b64encode(binary_data).decode('utf-8')

completion = client.completions.create(
model=model_name,
# NOTE: The OpenAI client does not allow `None` as an input to
# `prompt`. Use an empty string if you have no text prompts.
prompt="",
max_tokens=5,
temperature=0.0,
# NOTE: The OpenAI client allows passing in extra JSON body via the
# `extra_body` argument.
extra_body={"prompt_embeds": encoded_embeds})

print("-" * 30)
print(completion.choices[0].text)
print("-" * 30)


if __name__ == "__main__":
main()
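
For reference, the same request can be made without the OpenAI client. The sketch below is not part of the example; it assumes the `extra_body` fields are merged into the top-level JSON payload (which is how the OpenAI client sends them) and reuses the `encoded_embeds` string built above:

```python
import requests

# Raw HTTP equivalent of the OpenAI-client call above (a sketch): send the
# base64-encoded embeddings directly to the /v1/completions endpoint.
response = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "meta-llama/Llama-3.2-1B-Instruct",
        "prompt": "",
        "max_tokens": 5,
        "temperature": 0.0,
        "prompt_embeds": encoded_embeds,  # built in the example above
    },
)
print(response.json()["choices"][0]["text"])
```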