Skip to content

Commit 33af25c

Browse files
authored
Merge pull request #152 from michaelhhogue/ollama-llava
Add support for LLaVA through Ollama
2 parents 10bb8bf + ce2d42e commit 33af25c

File tree

3 files changed

+164
-7
lines changed

3 files changed

+164
-7
lines changed

README.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,31 @@ Start `operate` with the SoM model
108108
operate -m gpt-4-with-som
109109
```
110110

111+
### Locally Hosted LLaVA Through Ollama
112+
If you wish to experiment with the Self-Operating Computer Framework using LLaVA on your own machine, you can do so with Ollama!
113+
*Note: Ollama currently only supports macOS and Linux.*
114+
115+
First, install Ollama on your machine from https://ollama.ai/download.
116+
117+
Once Ollama is installed, pull the LLaVA model:
118+
```
119+
ollama pull llava
120+
```
121+
This will download the model on your machine which takes approximately 5 GB of storage.
122+
123+
When Ollama has finished pulling LLaVA, start the server:
124+
```
125+
ollama serve
126+
```
127+
128+
That's it! Now start `operate` and select the LLaVA model:
129+
```
130+
operate -m llava
131+
```
132+
**Important:** Error rates when using LLaVA are very high. This is simply intended to be a base to build off of as local multimodal models improve over time.
133+
134+
Learn more about Ollama at its [GitHub repository](https://github.com/ollama/ollama).
135+
111136
### Voice Mode `--voice`
112137
The framework supports voice inputs for the objective. Try voice by following the instructions below.
113138
**Clone the repo** to a directory on your computer:

operate/models/apis.py

Lines changed: 84 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import traceback
66
import io
77
import easyocr
8-
8+
import ollama
99

1010
from PIL import Image
1111
from ultralytics import YOLO
@@ -53,6 +53,9 @@ async def get_next_action(model, messages, objective, session_id):
5353
return "coming soon"
5454
elif model == "gemini-pro-vision":
5555
return call_gemini_pro_vision(messages, objective), None
56+
elif model == "llava":
57+
operation = call_ollama_llava(messages), None
58+
return operation
5659

5760
raise ModelNotRecognizedException(model)
5861

@@ -464,6 +467,86 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
464467
return call_gpt_4_vision_preview(messages)
465468

466469

470+
def call_ollama_llava(messages):
    """Ask a locally hosted LLaVA model (via Ollama) for the next operation.

    Captures a screenshot, attaches it to a new user message, sends the
    conversation to the local Ollama server, and returns the model's reply
    parsed as JSON. On a generic failure the call retries itself.

    Args:
        messages: Conversation history; mutated in place (the new vision
            message and the assistant reply are appended).

    Returns:
        The assistant reply parsed via ``json.loads``, or ``None`` when the
        Ollama server could not be reached (``ollama.ResponseError``).
    """
    if VERBOSE:
        print("[call_ollama_llava]")
    time.sleep(1)
    # Initialize up front: the generic error handler below prints `content`,
    # and without this a failure before the model responds (e.g. inside
    # ollama.chat) would raise UnboundLocalError and mask the real error.
    content = None
    try:
        screenshots_dir = "screenshots"
        if not os.path.exists(screenshots_dir):
            os.makedirs(screenshots_dir)

        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
        # Call the function to capture the screen with the cursor
        capture_screen_with_cursor(screenshot_filename)

        # First turn gets the objective-setting prompt; later turns the
        # follow-up prompt.
        if len(messages) == 1:
            user_prompt = get_user_first_message_prompt()
        else:
            user_prompt = get_user_prompt()

        if VERBOSE:
            print(
                "[call_ollama_llava] user_prompt",
                user_prompt,
            )

        vision_message = {
            "role": "user",
            "content": user_prompt,
            "images": [screenshot_filename],
        }
        messages.append(vision_message)

        response = ollama.chat(
            model="llava",
            messages=messages,
        )

        # Important: Remove the image path from the message history.
        # Ollama will attempt to load each image reference and will
        # eventually timeout.
        messages[-1]["images"] = None

        content = response["message"]["content"].strip()

        # Strip an optional markdown code fence wrapping the JSON payload.
        if content.startswith("```json"):
            content = content[len("```json") :]  # Remove starting ```json
        if content.endswith("```"):
            content = content[: -len("```")]  # Remove ending ```

        assistant_message = {"role": "assistant", "content": content}
        if VERBOSE:
            print(
                "[call_ollama_llava] content",
                content,
            )
        content = json.loads(content)

        messages.append(assistant_message)

        return content

    except ollama.ResponseError as e:
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Operate] Couldn't connect to Ollama. With Ollama installed, run `ollama pull llava` then `ollama serve`{ANSI_RESET}",
            e,
        )

    except Exception as e:
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying again {ANSI_RESET}",
            e,
        )
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}",
            content,
        )
        if VERBOSE:
            traceback.print_exc()
        # NOTE(review): retry is unbounded recursion — a persistently failing
        # model loops until RecursionError; consider a retry cap.
        return call_ollama_llava(messages)
548+
549+
467550
def get_last_assistant_message(messages):
468551
"""
469552
Retrieve the last message from the assistant in the messages array.

requirements.txt

Lines changed: 55 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,102 @@
1+
aiohttp==3.9.1
2+
aiosignal==1.3.1
13
annotated-types==0.6.0
24
anyio==3.7.1
5+
attrs==23.2.0
6+
cachetools==5.3.2
37
certifi==2023.7.22
48
charset-normalizer==3.3.2
59
colorama==0.4.6
610
contourpy==1.2.0
711
cycler==0.12.1
812
distro==1.8.0
13+
easyocr==1.7.1
914
EasyProcess==1.1
1015
entrypoint2==1.1
1116
exceptiongroup==1.1.3
17+
filelock==3.13.1
1218
fonttools==4.44.0
19+
frozenlist==1.4.1
20+
fsspec==2024.2.0
21+
google-ai-generativelanguage==0.4.0
22+
google-api-core==2.16.2
23+
google-auth==2.27.0
24+
google-generativeai==0.3.0
25+
googleapis-common-protos==1.62.0
26+
grpcio==1.60.1
27+
grpcio-status==1.60.1
1328
h11==0.14.0
1429
httpcore==1.0.2
15-
httpx==0.25.1
30+
httpx==0.25.2
1631
idna==3.4
32+
imageio==2.33.1
1733
importlib-resources==6.1.1
34+
Jinja2==3.1.3
1835
kiwisolver==1.4.5
36+
lazy_loader==0.3
37+
MarkupSafe==2.1.5
1938
matplotlib==3.8.1
2039
MouseInfo==0.1.3
40+
mpmath==1.3.0
2141
mss==9.0.1
42+
multidict==6.0.5
43+
networkx==3.2.1
44+
ninja==1.11.1.1
2245
numpy==1.26.1
46+
ollama==0.1.6
2347
openai==1.2.3
48+
opencv-python==4.9.0.80
49+
opencv-python-headless==4.9.0.80
2450
packaging==23.2
51+
pandas==2.2.0
2552
Pillow==10.1.0
2653
prompt-toolkit==3.0.39
54+
proto-plus==1.23.0
55+
protobuf==4.25.2
56+
psutil==5.9.8
57+
py-cpuinfo==9.0.0
58+
pyasn1==0.5.1
59+
pyasn1-modules==0.3.0
2760
PyAutoGUI==0.9.54
61+
pyclipper==1.3.0.post5
2862
pydantic==2.4.2
2963
pydantic_core==2.10.1
3064
PyGetWindow==0.0.9
3165
PyMsgBox==1.0.9
66+
pyobjc-core==10.1
67+
pyobjc-framework-Cocoa==10.1
68+
pyobjc-framework-Quartz==10.1
3269
pyparsing==3.1.1
3370
pyperclip==1.8.2
3471
PyRect==0.2.0
3572
pyscreenshot==3.1
3673
PyScreeze==0.1.29
37-
python3-xlib==0.15
74+
python-bidi==0.4.2
3875
python-dateutil==2.8.2
3976
python-dotenv==1.0.0
77+
python3-xlib==0.15
4078
pytweening==1.0.7
79+
pytz==2024.1
80+
PyYAML==6.0.1
4181
requests==2.31.0
82+
rsa==4.9
4283
rubicon-objc==0.4.7
84+
scikit-image==0.22.0
85+
scipy==1.12.0
86+
seaborn==0.13.2
87+
shapely==2.0.2
4388
six==1.16.0
4489
sniffio==1.3.0
90+
sympy==1.12
91+
thop==0.1.1.post2209072238
92+
tifffile==2024.1.30
93+
torch==2.2.0
94+
torchvision==0.17.0
4595
tqdm==4.66.1
4696
typing_extensions==4.8.0
97+
tzdata==2023.4
98+
ultralytics==8.0.227
4799
urllib3==2.0.7
48100
wcwidth==0.2.9
101+
yarl==1.9.4
49102
zipp==3.17.0
50-
google-generativeai==0.3.0
51-
aiohttp==3.9.1
52-
ultralytics==8.0.227
53-
easyocr==1.7.1

0 commit comments

Comments
 (0)