
Multimodal Support (Llava 1.5) #821


Merged: 29 commits, merged Nov 8, 2023

Commits
4ec3539
llava v1.5 integration
damian0815 Oct 15, 2023
48f4228
Point llama.cpp to fork
abetlen Nov 6, 2023
61a1e5c
Add llava shared library target
abetlen Nov 6, 2023
46ce323
Fix type
abetlen Nov 6, 2023
0d8a91b
Update llama.cpp
abetlen Nov 6, 2023
0c95066
Add llava api
abetlen Nov 6, 2023
7b98141
Merge branch 'llava-1.5' into damian0815-feat_llava_integration
abetlen Nov 6, 2023
9406d63
Revert changes to llama and llama_cpp
abetlen Nov 6, 2023
6878def
Merge branch 'main' into feat_llava_integration
abetlen Nov 6, 2023
82007d0
Update llava example
abetlen Nov 6, 2023
625f852
Merge branch 'feat_llava_integration' of github.com:damian0815/llama-…
abetlen Nov 6, 2023
f6fe6b0
Add types for new gpt-4-vision-preview api
abetlen Nov 6, 2023
39e2be1
Fix typo
abetlen Nov 6, 2023
7c3009e
Update llama.cpp
abetlen Nov 7, 2023
1f1abfd
Update llama_types to match OpenAI v1 API
abetlen Nov 7, 2023
2a369f4
Update ChatCompletionFunction type
abetlen Nov 7, 2023
2ea2adf
Reorder request parameters
abetlen Nov 7, 2023
87fc84b
More API type fixes
abetlen Nov 7, 2023
5091b9c
Even More Type Updates
abetlen Nov 7, 2023
22a776d
Add parameter for custom chat_handler to Llama class
abetlen Nov 7, 2023
5ac8115
Fix circular import
abetlen Nov 7, 2023
cb749f2
Convert to absolute imports
abetlen Nov 7, 2023
d2d2a2d
Fix
abetlen Nov 7, 2023
177114c
Fix pydantic Jsontype bug
abetlen Nov 8, 2023
21165e7
Accept list of prompt tokens in create_completion
abetlen Nov 8, 2023
74c414c
Add llava1.5 chat handler
abetlen Nov 8, 2023
34aa858
Add Multimodal notebook
abetlen Nov 8, 2023
66dda36
Clean up examples
abetlen Nov 8, 2023
71adef4
Add server docs
abetlen Nov 8, 2023
19 changes: 19 additions & 0 deletions CMakeLists.txt
@@ -41,4 +41,23 @@ if (LLAMA_BUILD)
FILES $<TARGET_RUNTIME_DLLS:llama>
DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
)
add_subdirectory(vendor/llama.cpp/examples/llava)
set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
install(
TARGETS llava_shared
LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
)
# Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374
install(
TARGETS llava_shared
LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
)
endif()
77 changes: 77 additions & 0 deletions docs/server.md
@@ -0,0 +1,77 @@
# OpenAI Compatible Server

`llama-cpp-python` offers an OpenAI API compatible web server.

This web server can be used to serve local models and easily connect them to existing clients.

## Setup

### Installation

The server can be installed by running the following command:

```bash
pip install llama-cpp-python[server]
```

### Running the server

The server can then be started by running the following command:

```bash
python3 -m llama_cpp.server --model <model_path>
```

### Server options

For a full list of options, run:

```bash
python3 -m llama_cpp.server --help
```

NOTE: All server options are also available as environment variables. For example, `--model` can be set by setting the `MODEL` environment variable.

## Guides

### Multi-modal Models

`llama-cpp-python` supports the llava1.5 family of multi-modal models, which allow the language model to read information from both text and images.

You'll first need to download one of the available multi-modal models in GGUF format:

- [llava1.5 7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
- [llava1.5 13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
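
For example, both the language model and its CLIP projector can be fetched with `huggingface_hub`. This is only a sketch: the file names below are assumptions, so check the model repository for the exact quantization you want.

```python
from huggingface_hub import hf_hub_download

# Assumed file names; browse the repo listing to pick a different quantization.
model_path = hf_hub_download(
    repo_id="mys/ggml_llava-v1.5-7b",
    filename="ggml-model-q4_k.gguf",
)
clip_model_path = hf_hub_download(
    repo_id="mys/ggml_llava-v1.5-7b",
    filename="mmproj-model-f16.gguf",
)
print(model_path, clip_model_path)
```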

Then, when you run the server, you'll also need to specify the path to the CLIP model used for image embedding:

```bash
python3 -m llama_cpp.server --model <model_path> --clip-model-path <clip_model_path>
```

You can then use the OpenAI API as normal:

```python3
from openai import OpenAI

client = OpenAI(base_url="http://<host>:<port>/v1", api_key="sk-xxx")
response = client.chat.completions.create(
model="gpt-4-vision-preview",
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "<image_url>"
},
},
{"type": "text", "text": "What does the image say"},
],
}
],
)
print(response)
```
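
The image can also be passed inline as a base64 data URL, which is convenient for local files. A small helper sketch along the lines of the notebook bundled with this PR:

```python
import base64

def image_to_data_url(path: str, mime: str = "image/png") -> str:
    # Encode a local image so it can be used as the "url" value above.
    with open(path, "rb") as f:
        return f"data:{mime};base64," + base64.b64encode(f.read()).decode("utf-8")
```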
84 changes: 84 additions & 0 deletions examples/notebooks/Multimodal.ipynb
@@ -0,0 +1,84 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ChatCompletion(id='chatcmpl-65a710ba-41d1-4d0a-a124-a44b2b4a0189', choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content=' The image reads \"LlamaC++.\"', role='assistant', function_call=None, tool_calls=None))], created=1699413274, model='gpt-4-vision-preview', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=10, prompt_tokens=624, total_tokens=634))\n"
]
}
],
"source": [
"from openai import OpenAI\n",
"\n",
"import urllib.request\n",
"import base64\n",
"\n",
"def get_data_url(url):\n",
" return \"data:image/png;base64,\" + base64.b64encode(urllib.request.urlopen(url).read()).decode(\"utf-8\")\n",
"\n",
"client = OpenAI(base_url=\"http://100.64.159.73:8000/v1\", api_key=\"sk-1234\")\n",
"response = client.chat.completions.create(\n",
" model=\"gpt-4-vision-preview\",\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"image_url\",\n",
" \"image_url\": {\n",
" \"url\": get_data_url(\"https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png\"),\n",
" # \"url\": \"https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png\",\n",
" },\n",
" },\n",
" {\"type\": \"text\", \"text\": \"What does the image say\"},\n",
" ],\n",
" }\n",
" ],\n",
")\n",
"print(response)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5+"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
43 changes: 29 additions & 14 deletions llama_cpp/llama.py
@@ -21,9 +21,9 @@
import diskcache
import ctypes

from . import llama_cpp
from .llama_types import *
from .llama_grammar import LlamaGrammar
import llama_cpp.llama_cpp as llama_cpp
import llama_cpp.llama_chat_format as llama_chat_format

import numpy as np
@@ -752,6 +752,7 @@ def __init__(
numa: bool = False,
# Chat Format Params
chat_format: str = "llama-2",
chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
# Misc
verbose: bool = True,
# Extra Params
@@ -784,6 +785,7 @@ def __init__(
lora_path: Path to a LoRA file to apply to the model.
numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
chat_format: String specifying the chat format to use when calling create_chat_completion.
chat_handler: Optional chat handler to use when calling create_chat_completion.
verbose: Print verbose output to stderr.

Raises:
@@ -910,6 +912,7 @@ def __init__(
print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr)

self.chat_format = chat_format
self.chat_handler = chat_handler

self._n_vocab = self.n_vocab()
self._n_ctx = self.n_ctx()
@@ -1231,7 +1234,7 @@ def create_embedding(
else:
inputs = input

data: List[EmbeddingData] = []
data: List[Embedding] = []
total_tokens = 0
for index, input in enumerate(inputs):
tokens = self.tokenize(input.encode("utf-8"), special=True)
@@ -1276,7 +1279,7 @@ def embed(self, input: str) -> List[float]:

def _create_completion(
self,
prompt: str,
prompt: Union[str, List[int]],
suffix: Optional[str] = None,
max_tokens: int = 16,
temperature: float = 0.8,
@@ -1297,7 +1300,9 @@ def _create_completion(
stopping_criteria: Optional[StoppingCriteriaList] = None,
logits_processor: Optional[LogitsProcessorList] = None,
grammar: Optional[LlamaGrammar] = None,
) -> Union[Iterator[Completion], Iterator[CompletionChunk]]:
) -> Union[
Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse]
]:
assert self._ctx is not None
assert suffix is None or suffix.__class__ is str

@@ -1309,7 +1314,7 @@ def _create_completion(
self.tokenize(prompt.encode("utf-8"), special=True)
if prompt != ""
else [self.token_bos()]
)
) if isinstance(prompt, str) else prompt
text: bytes = b""
returned_tokens: int = 0
stop = (
@@ -1322,7 +1327,7 @@ def _create_completion(

if len(prompt_tokens) >= self._n_ctx:
raise ValueError(
f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self._ctx)}"
f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
)

if max_tokens <= 0:
@@ -1732,7 +1737,7 @@ def _create_completion(

def create_completion(
self,
prompt: str,
prompt: Union[str, List[int]],
suffix: Optional[str] = None,
max_tokens: int = 128,
temperature: float = 0.8,
@@ -1753,7 +1758,7 @@ def create_completion(
stopping_criteria: Optional[StoppingCriteriaList] = None,
logits_processor: Optional[LogitsProcessorList] = None,
grammar: Optional[LlamaGrammar] = None,
) -> Union[Completion, Iterator[CompletionChunk]]:
) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
"""Generate text from a prompt.

Args:
@@ -1800,7 +1805,7 @@ def create_completion(
grammar=grammar,
)
if stream:
chunks: Iterator[CompletionChunk] = completion_or_chunks
chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks
return chunks
completion: Completion = next(completion_or_chunks) # type: ignore
return completion
@@ -1828,7 +1833,7 @@ def __call__(
stopping_criteria: Optional[StoppingCriteriaList] = None,
logits_processor: Optional[LogitsProcessorList] = None,
grammar: Optional[LlamaGrammar] = None,
) -> Union[Completion, Iterator[CompletionChunk]]:
) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
"""Generate text from a prompt.

Args:
@@ -1879,7 +1884,9 @@ def create_chat_completion(
self,
messages: List[ChatCompletionRequestMessage],
functions: Optional[List[ChatCompletionFunction]] = None,
function_call: Optional[Union[str, ChatCompletionFunctionCall]] = None,
function_call: Optional[ChatCompletionRequestFunctionCall] = None,
tools: Optional[List[ChatCompletionTool]] = None,
tool_choice: Optional[ChatCompletionToolChoiceOption] = None,
temperature: float = 0.2,
top_p: float = 0.95,
top_k: int = 40,
@@ -1896,7 +1903,9 @@ def create_chat_completion(
model: Optional[str] = None,
logits_processor: Optional[LogitsProcessorList] = None,
grammar: Optional[LlamaGrammar] = None,
) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
) -> Union[
CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse]
]:
"""Generate a chat completion from a list of messages.

Args:
@@ -1912,12 +1921,16 @@
Returns:
Generated chat completion or a stream of chat completion chunks.
"""
handler = llama_chat_format.get_chat_completion_handler(self.chat_format)
handler = self.chat_handler or llama_chat_format.get_chat_completion_handler(
self.chat_format
)
return handler(
self,
llama=self,
messages=messages,
functions=functions,
function_call=function_call,
tools=tools,
tool_choice=tool_choice,
temperature=temperature,
top_p=top_p,
top_k=top_k,
@@ -1974,6 +1987,7 @@ def __getstate__(self):
numa=self.numa,
# Chat Format Params
chat_format=self.chat_format,
chat_handler=self.chat_handler,
# Misc
verbose=self.verbose,
)
@@ -2015,6 +2029,7 @@ def __setstate__(self, state):
numa=state["numa"],
# Chat Format Params
chat_format=state["chat_format"],
chat_handler=state["chat_handler"],
# Misc
verbose=state["verbose"],
)
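
The `chat_handler` parameter added above is how the Llava 1.5 support is plugged into the `Llama` class. A minimal usage sketch, assuming the handler added in this PR is exposed as `Llava15ChatHandler` in `llama_cpp.llama_chat_format` and takes a `clip_model_path` argument (both inferred from the commit messages, not confirmed by the diff shown here):

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler  # assumed class name

# Assumed local paths for the GGUF language model and CLIP projector.
chat_handler = Llava15ChatHandler(clip_model_path="mmproj-model-f16.gguf")
llm = Llama(
    model_path="ggml-model-q4_k.gguf",
    chat_handler=chat_handler,  # new parameter introduced by this PR
    n_ctx=2048,
)
response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "<image_url>"}},
                {"type": "text", "text": "What does the image say"},
            ],
        }
    ],
)
print(response["choices"][0]["message"]["content"])
```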