From 5717f723e6cfbda6ef6f4c8d1cc80ab16221ea30 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Fri, 4 Oct 2024 17:29:36 +0200
Subject: [PATCH 1/6] feat(vllm): add support for image-to-text

Related to https://github.com/mudler/LocalAI/issues/3670

Signed-off-by: Ettore Di Giacinto
---
 backend/python/vllm/backend.py | 47 ++++++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index 2cf15c1ca83e..31a327ecd8a7 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -5,6 +5,8 @@
 import signal
 import sys
 import os
+from typing import List
+from PIL import Image
 
 import backend_pb2
 import backend_pb2_grpc
@@ -15,6 +17,7 @@
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid
 from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.multimodal.utils import fetch_image
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 
@@ -105,6 +108,7 @@ async def LoadModel(self, request, context):
         try:
             self.llm = AsyncLLMEngine.from_engine_args(engine_args)
         except Exception as err:
+            print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr)
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
 
         try:
@@ -117,7 +121,7 @@ async def LoadModel(self, request, context):
             )
         except Exception as err:
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-
+        print("Model loaded successfully", file=sys.stderr)
         return backend_pb2.Result(message="Model loaded successfully", success=True)
 
     async def Predict(self, request, context):
@@ -196,15 +200,25 @@ async def _predict(self, request, context, streaming=False):
         if request.Seed != 0:
             sampling_params.seed = request.Seed
 
+        # Extract image paths and process images
         prompt = request.Prompt
-
-        # If tokenizer template is enabled and messages are provided instead of prompt apply the tokenizer template
+        image_paths = request.Images
+        image_data = [self.load_image(img_path) for img_path in image_paths]
+
+        # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
         if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
             prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
 
-        # Generate text
+        # Generate text using the LLM engine
         request_id = random_uuid()
-        outputs = self.llm.generate(prompt, sampling_params, request_id)
+        outputs = self.llm.generate(
+            {
+                "prompt": prompt,
+                "multi_modal_data": {"image": image_data} if image_data else None,
+            },
+            sampling_params=sampling_params,
+            request_id=request_id,
+        )
 
         # Stream the results
         generated_text = ""
@@ -227,9 +241,32 @@ async def _predict(self, request, context, streaming=False):
         if streaming:
             return
 
+        # Remove the image files from /tmp folder
+        for img_path in image_paths:
+            try:
+                os.remove(img_path)
+            except Exception as e:
+                print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
+
         # Sending the final generated text
         yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
 
+    def load_image(self, image_path: str) -> Image:
+        """
+        Load an image from the given file path.
+
+        Args:
+            image_path (str): The path to the image file.
+
+        Returns:
+            Image: The loaded image.
+        """
+        try:
+            return Image.open(image_path)
+        except Exception as e:
+            print(f"Error loading image {image_path}: {e}", file=sys.stderr)
+            return None
+
 async def serve(address):
     # Start asyncio gRPC server
     server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))

From f3f9d1df81e43de032f28e968b8f95e62137949b Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Fri, 4 Oct 2024 19:27:55 +0200
Subject: [PATCH 2/6] feat(vllm): add support for video-to-text

Closes: https://github.com/mudler/LocalAI/issues/2318

Signed-off-by: Ettore Di Giacinto
---
 backend/python/vllm/backend.py | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index 31a327ecd8a7..b1ce42b95d94 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -18,6 +18,7 @@
 from vllm.utils import random_uuid
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.multimodal.utils import fetch_image
+from vllm.assets.video import VideoAsset
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 
@@ -202,19 +203,27 @@ async def _predict(self, request, context, streaming=False):
 
         # Extract image paths and process images
         prompt = request.Prompt
+
         image_paths = request.Images
         image_data = [self.load_image(img_path) for img_path in image_paths]
 
+        videos_path = request.Videos
+        video_data = [self.load_video(video_path) for video_path in videos_path]
+
         # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
         if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
             prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
 
         # Generate text using the LLM engine
         request_id = random_uuid()
+        print(f"Generating text with request_id: {request_id}", file=sys.stderr)
         outputs = self.llm.generate(
             {
                 "prompt": prompt,
-                "multi_modal_data": {"image": image_data} if image_data else None,
+                "multi_modal_data": {
+                    "image": image_data if image_data else None,
+                    "video": video_data if video_data else None,
+                } if image_data or video_data else None,
             },
             sampling_params=sampling_params,
             request_id=request_id,
@@ -251,7 +260,7 @@ async def _predict(self, request, context, streaming=False):
         # Sending the final generated text
         yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
 
-    def load_image(self, image_path: str) -> Image:
+    def load_image(self, image_path: str):
         """
         Load an image from the given file path.
 
@@ -265,6 +274,23 @@
             return Image.open(image_path)
         except Exception as e:
             print(f"Error loading image {image_path}: {e}", file=sys.stderr)
+            return self.load_video(image_path)
+
+    def load_video(self, video_path: str):
+        """
+        Load a video from the given file path.
+
+        Args:
+            video_path (str): The path to the video file.
+
+        Returns:
+            Image: The loaded image.
+        """
+        try:
+            video = VideoAsset(name=video_path).np_ndarrays
+            return video
+        except Exception as e:
+            print(f"Error loading video {video_path}: {e}", file=sys.stderr)
             return None
 
 async def serve(address):

From 288adf66e8ed8d340256f39a848b3cd188c732a7 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Fri, 4 Oct 2024 19:28:48 +0200
Subject: [PATCH 3/6] feat(vllm): support CPU installations

Signed-off-by: Ettore Di Giacinto
---
 backend/python/vllm/install.sh | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh
index 78a3d5ba2601..5144e725f4d6 100755
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -13,4 +13,18 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
     EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
 
-installRequirements
+if [ "x${BUILD_TYPE}" == "x" ]; then
+    ensureVenv
+    export VLLM_TARGET_DEVICE=cpu
+    if [ ! -d vllm ]; then
+        git clone https://github.com/vllm-project/vllm
+    fi
+    pushd vllm
+    uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.66.2 protobuf bitsandbytes
+    uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+    VLLM_TARGET_DEVICE=cpu python setup.py install
+    popd
+    rm -rf vllm
+else
+    installRequirements
+fi

From 471c761e87cf4f0a7de4bb062909851405afec03 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Fri, 4 Oct 2024 19:29:04 +0200
Subject: [PATCH 4/6] feat(vllm): add bnb

Signed-off-by: Ettore Di Giacinto
---
 backend/python/vllm/requirements-cublas11.txt | 3 ++-
 backend/python/vllm/requirements-cublas12.txt | 3 ++-
 backend/python/vllm/requirements-hipblas.txt  | 3 ++-
 backend/python/vllm/requirements-intel.txt    | 3 ++-
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/backend/python/vllm/requirements-cublas11.txt b/backend/python/vllm/requirements-cublas11.txt
index 4381772756dd..c448a91db592 100644
--- a/backend/python/vllm/requirements-cublas11.txt
+++ b/backend/python/vllm/requirements-cublas11.txt
@@ -1,4 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 accelerate
 torch
-transformers
\ No newline at end of file
+transformers
+bitsandbytes
\ No newline at end of file
diff --git a/backend/python/vllm/requirements-cublas12.txt b/backend/python/vllm/requirements-cublas12.txt
index 765a1ef558e6..e007f0946daa 100644
--- a/backend/python/vllm/requirements-cublas12.txt
+++ b/backend/python/vllm/requirements-cublas12.txt
@@ -1,3 +1,4 @@
 accelerate
 torch
-transformers
\ No newline at end of file
+transformers
+bitsandbytes
\ No newline at end of file
diff --git a/backend/python/vllm/requirements-hipblas.txt b/backend/python/vllm/requirements-hipblas.txt
index c73d8141d3a5..9dff852d5705 100644
--- a/backend/python/vllm/requirements-hipblas.txt
+++ b/backend/python/vllm/requirements-hipblas.txt
@@ -1,4 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 accelerate
 torch
-transformers
\ No newline at end of file
+transformers
+bitsandbytes
\ No newline at end of file
diff --git a/backend/python/vllm/requirements-intel.txt b/backend/python/vllm/requirements-intel.txt
index 1f82c46e0f83..9544336884ea 100644
--- a/backend/python/vllm/requirements-intel.txt
+++ b/backend/python/vllm/requirements-intel.txt
@@ -4,4 +4,5 @@ accelerate
 torch
 transformers
 optimum[openvino]
-setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
\ No newline at end of file
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
+bitsandbytes
\ No newline at end of file

From 3d737d1e3a48fac418f7d2219d64c5fff88e2cc7 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Fri, 4 Oct 2024 19:31:52 +0200
Subject: [PATCH 5/6] chore: add docs reference

Signed-off-by: Ettore Di Giacinto
---
 backend/python/vllm/install.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh
index 5144e725f4d6..8a9e70a83c8f 100755
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -15,6 +15,7 @@ fi
 
 if [ "x${BUILD_TYPE}" == "x" ]; then
     ensureVenv
+    # https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html
     export VLLM_TARGET_DEVICE=cpu
     if [ ! -d vllm ]; then
         git clone https://github.com/vllm-project/vllm

From d68e7ae928c79b49ce06a85e5504d7cf6882a1a9 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Fri, 4 Oct 2024 19:35:09 +0200
Subject: [PATCH 6/6] Apply suggestions from code review

Signed-off-by: Ettore Di Giacinto
---
 backend/python/vllm/backend.py | 2 +-
 backend/python/vllm/install.sh | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index b1ce42b95d94..dfbb1503338a 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -284,7 +284,7 @@ def load_video(self, video_path: str):
             video_path (str): The path to the video file.
 
         Returns:
-            Image: The loaded image.
+            Video: The loaded video.
         """
         try:
             video = VideoAsset(name=video_path).np_ndarrays
diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh
index 8a9e70a83c8f..022cf8bf191e 100755
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -16,7 +16,6 @@ fi
 if [ "x${BUILD_TYPE}" == "x" ]; then
     ensureVenv
     # https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html
-    export VLLM_TARGET_DEVICE=cpu
     if [ ! -d vllm ]; then
         git clone https://github.com/vllm-project/vllm
     fi
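
Usage sketch (illustrative only, not part of the patch series): the patches build a {"prompt", "multi_modal_data"} request and pass it to the engine; the same shape can be exercised standalone with vLLM's synchronous LLM API. The model name, image path, and prompt template below are assumptions for a vision-capable model; the backend itself uses AsyncLLMEngine with per-request streaming instead.

    # sketch.py - assumed vLLM multimodal offline API; adjust model and prompt template as needed
    from vllm import LLM, SamplingParams
    from PIL import Image

    llm = LLM(model="llava-hf/llava-1.5-7b-hf")   # assumed vision model
    image = Image.open("example.jpg")             # assumed local test image

    # Same dict shape the backend builds: prompt text plus multi_modal_data.
    outputs = llm.generate(
        {
            "prompt": "USER: <image>\nDescribe the picture. ASSISTANT:",
            "multi_modal_data": {"image": image},
        },
        SamplingParams(temperature=0.2, max_tokens=64),
    )
    print(outputs[0].outputs[0].text)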