From 5717f723e6cfbda6ef6f4c8d1cc80ab16221ea30 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Fri, 4 Oct 2024 17:29:36 +0200
Subject: [PATCH 1/6] feat(vllm): add support for image-to-text

Related to https://github.com/mudler/LocalAI/issues/3670

Signed-off-by: Ettore Di Giacinto
---
 backend/python/vllm/backend.py | 47 ++++++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index 2cf15c1ca83e..31a327ecd8a7 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -5,6 +5,8 @@
 import signal
 import sys
 import os
+from typing import List
+from PIL import Image
 
 import backend_pb2
 import backend_pb2_grpc
@@ -15,6 +17,7 @@
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid
 from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.multimodal.utils import fetch_image
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 
@@ -105,6 +108,7 @@ async def LoadModel(self, request, context):
         try:
             self.llm = AsyncLLMEngine.from_engine_args(engine_args)
         except Exception as err:
+            print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr)
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
 
         try:
@@ -117,7 +121,7 @@ async def LoadModel(self, request, context):
             )
         except Exception as err:
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-
+        print("Model loaded successfully", file=sys.stderr)
         return backend_pb2.Result(message="Model loaded successfully", success=True)
 
     async def Predict(self, request, context):
@@ -196,15 +200,25 @@ async def _predict(self, request, context, streaming=False):
         if request.Seed != 0:
             sampling_params.seed = request.Seed
 
+        # Extract image paths and process images
         prompt = request.Prompt
-
-        # If tokenizer template is enabled and messages are provided instead of prompt apply the tokenizer template
+        image_paths = request.Images
+        image_data = [self.load_image(img_path) for img_path in image_paths]
+
+        # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
         if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
             prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
 
-        # Generate text
+        # Generate text using the LLM engine
         request_id = random_uuid()
-        outputs = self.llm.generate(prompt, sampling_params, request_id)
+        outputs = self.llm.generate(
+            {
+                "prompt": prompt,
+                "multi_modal_data": {"image": image_data} if image_data else None,
+            },
+            sampling_params=sampling_params,
+            request_id=request_id,
+        )
 
         # Stream the results
         generated_text = ""
@@ -227,9 +241,32 @@ async def _predict(self, request, context, streaming=False):
         if streaming:
             return
 
+        # Remove the image files from /tmp folder
+        for img_path in image_paths:
+            try:
+                os.remove(img_path)
+            except Exception as e:
+                print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
+
         # Sending the final generated text
         yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
 
+    def load_image(self, image_path: str) -> Image:
+        """
+        Load an image from the given file path.
+
+        Args:
+            image_path (str): The path to the image file.
+
+        Returns:
+            Image: The loaded image.
+        """
+        try:
+            return Image.open(image_path)
+        except Exception as e:
+            print(f"Error loading image {image_path}: {e}", file=sys.stderr)
+            return None
+
 async def serve(address):
     # Start asyncio gRPC server
     server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))

From f3f9d1df81e43de032f28e968b8f95e62137949b Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Fri, 4 Oct 2024 19:27:55 +0200
Subject: [PATCH 2/6] feat(vllm): add support for video-to-text

Closes: https://github.com/mudler/LocalAI/issues/2318

Signed-off-by: Ettore Di Giacinto
---
 backend/python/vllm/backend.py | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index 31a327ecd8a7..b1ce42b95d94 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -18,6 +18,7 @@
 from vllm.utils import random_uuid
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.multimodal.utils import fetch_image
+from vllm.assets.video import VideoAsset
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 
@@ -202,19 +203,27 @@ async def _predict(self, request, context, streaming=False):
 
         # Extract image paths and process images
         prompt = request.Prompt
+
         image_paths = request.Images
         image_data = [self.load_image(img_path) for img_path in image_paths]
 
+        videos_path = request.Videos
+        video_data = [self.load_video(video_path) for video_path in videos_path]
+
         # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
         if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
             prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
 
         # Generate text using the LLM engine
         request_id = random_uuid()
+        print(f"Generating text with request_id: {request_id}", file=sys.stderr)
         outputs = self.llm.generate(
             {
                 "prompt": prompt,
-                "multi_modal_data": {"image": image_data} if image_data else None,
+                "multi_modal_data": {
+                    "image": image_data if image_data else None,
+                    "video": video_data if video_data else None,
+                } if image_data or video_data else None,
             },
             sampling_params=sampling_params,
             request_id=request_id,
@@ -251,7 +260,7 @@ async def _predict(self, request, context, streaming=False):
         # Sending the final generated text
         yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
 
-    def load_image(self, image_path: str) -> Image:
+    def load_image(self, image_path: str):
         """
         Load an image from the given file path.
 
@@ -265,6 +274,23 @@
             return Image.open(image_path)
         except Exception as e:
             print(f"Error loading image {image_path}: {e}", file=sys.stderr)
+            return self.load_video(image_path)
+
+    def load_video(self, video_path: str):
+        """
+        Load a video from the given file path.
+
+        Args:
+            video_path (str): The path to the video file.
+
+        Returns:
+            Image: The loaded image.
+        """
+        try:
+            video = VideoAsset(name=video_path).np_ndarrays
+            return video
+        except Exception as e:
+            print(f"Error loading video {video_path}: {e}", file=sys.stderr)
             return None
 
 async def serve(address):

From 288adf66e8ed8d340256f39a848b3cd188c732a7 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Fri, 4 Oct 2024 19:28:48 +0200
Subject: [PATCH 3/6] feat(vllm): support CPU installations

Signed-off-by: Ettore Di Giacinto
---
 backend/python/vllm/install.sh | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh
index 78a3d5ba2601..5144e725f4d6 100755
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -13,4 +13,18 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
     EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
 
-installRequirements
+if [ "x${BUILD_TYPE}" == "x" ]; then
+    ensureVenv
+    export VLLM_TARGET_DEVICE=cpu
+    if [ ! -d vllm ]; then
+        git clone https://github.com/vllm-project/vllm
+    fi
+    pushd vllm
+    uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.66.2 protobuf bitsandbytes
+    uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+    VLLM_TARGET_DEVICE=cpu python setup.py install
+    popd
+    rm -rf vllm
+else
+    installRequirements
+fi

From 471c761e87cf4f0a7de4bb062909851405afec03 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Fri, 4 Oct 2024 19:29:04 +0200
Subject: [PATCH 4/6] feat(vllm): add bnb

Signed-off-by: Ettore Di Giacinto
---
 backend/python/vllm/requirements-cublas11.txt | 3 ++-
 backend/python/vllm/requirements-cublas12.txt | 3 ++-
 backend/python/vllm/requirements-hipblas.txt  | 3 ++-
 backend/python/vllm/requirements-intel.txt    | 3 ++-
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/backend/python/vllm/requirements-cublas11.txt b/backend/python/vllm/requirements-cublas11.txt
index 4381772756dd..c448a91db592 100644
--- a/backend/python/vllm/requirements-cublas11.txt
+++ b/backend/python/vllm/requirements-cublas11.txt
@@ -1,4 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 accelerate
 torch
-transformers
\ No newline at end of file
+transformers
+bitsandbytes
\ No newline at end of file
diff --git a/backend/python/vllm/requirements-cublas12.txt b/backend/python/vllm/requirements-cublas12.txt
index 765a1ef558e6..e007f0946daa 100644
--- a/backend/python/vllm/requirements-cublas12.txt
+++ b/backend/python/vllm/requirements-cublas12.txt
@@ -1,3 +1,4 @@
 accelerate
 torch
-transformers
\ No newline at end of file
+transformers
+bitsandbytes
\ No newline at end of file
diff --git a/backend/python/vllm/requirements-hipblas.txt b/backend/python/vllm/requirements-hipblas.txt
index c73d8141d3a5..9dff852d5705 100644
--- a/backend/python/vllm/requirements-hipblas.txt
+++ b/backend/python/vllm/requirements-hipblas.txt
@@ -1,4 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 accelerate
 torch
-transformers
\ No newline at end of file
+transformers
+bitsandbytes
\ No newline at end of file
diff --git a/backend/python/vllm/requirements-intel.txt b/backend/python/vllm/requirements-intel.txt
index 1f82c46e0f83..9544336884ea 100644
--- a/backend/python/vllm/requirements-intel.txt
+++ b/backend/python/vllm/requirements-intel.txt
@@ -4,4 +4,5 @@ accelerate
 torch
 transformers
 optimum[openvino]
-setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
\ No newline at end of file
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
+bitsandbytes
\ No newline at end of file

From 3d737d1e3a48fac418f7d2219d64c5fff88e2cc7 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Fri, 4 Oct 2024 19:31:52 +0200
Subject: [PATCH 5/6] chore: add docs reference

Signed-off-by: Ettore Di Giacinto
---
 backend/python/vllm/install.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh
index 5144e725f4d6..8a9e70a83c8f 100755
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -15,6 +15,7 @@ fi
 
 if [ "x${BUILD_TYPE}" == "x" ]; then
     ensureVenv
+    # https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html
     export VLLM_TARGET_DEVICE=cpu
     if [ ! -d vllm ]; then
         git clone https://github.com/vllm-project/vllm

From d68e7ae928c79b49ce06a85e5504d7cf6882a1a9 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Fri, 4 Oct 2024 19:35:09 +0200
Subject: [PATCH 6/6] Apply suggestions from code review

Signed-off-by: Ettore Di Giacinto
---
 backend/python/vllm/backend.py | 2 +-
 backend/python/vllm/install.sh | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index b1ce42b95d94..dfbb1503338a 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -284,7 +284,7 @@ def load_video(self, video_path: str):
             video_path (str): The path to the video file.
 
         Returns:
-            Image: The loaded image.
+            Video: The loaded video.
         """
         try:
             video = VideoAsset(name=video_path).np_ndarrays
diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh
index 8a9e70a83c8f..022cf8bf191e 100755
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -16,7 +16,6 @@ fi
 if [ "x${BUILD_TYPE}" == "x" ]; then
     ensureVenv
     # https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html
-    export VLLM_TARGET_DEVICE=cpu
     if [ ! -d vllm ]; then
         git clone https://github.com/vllm-project/vllm
     fi
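
Usage sketch (illustrative only, not part of the patch series): the patches build a {"prompt", "multi_modal_data"} request and pass it to the engine; the same shape can be exercised standalone with vLLM's synchronous LLM API. The model name, image path, and prompt template below are assumptions for a vision-capable model; the backend itself uses AsyncLLMEngine with per-request streaming instead.

    # sketch.py - assumed vLLM multimodal offline API; adjust model and prompt template as needed
    from vllm import LLM, SamplingParams
    from PIL import Image

    llm = LLM(model="llava-hf/llava-1.5-7b-hf")   # assumed vision model
    image = Image.open("example.jpg")             # assumed local test image

    # Same dict shape the backend builds: prompt text plus multi_modal_data.
    outputs = llm.generate(
        {
            "prompt": "USER: <image>\nDescribe the picture. ASSISTANT:",
            "multi_modal_data": {"image": image},
        },
        SamplingParams(temperature=0.2, max_tokens=64),
    )
    print(outputs[0].outputs[0].text)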