Skip to content

Commit 33af25c

Browse files
authored
Merge pull request #152 from michaelhhogue/ollama-llava
Add support for LLaVA through Ollama
2 parents 10bb8bf + ce2d42e commit 33af25c

File tree

3 files changed

+164
-7
lines changed

3 files changed

+164
-7
lines changed

README.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,31 @@ Start `operate` with the SoM model
108108
operate -m gpt-4-with-som
109109
```
110110

111+
### Locally Hosted LLaVA Through Ollama
112+
If you wish to experiment with the Self-Operating Computer Framework using LLaVA on your own machine, you can do so with Ollama!
113+
*Note: Ollama currently only supports macOS and Linux.*
114+
115+
First, install Ollama on your machine from https://ollama.ai/download.
116+
117+
Once Ollama is installed, pull the LLaVA model:
118+
```
119+
ollama pull llava
120+
```
121+
This will download the model on your machine which takes approximately 5 GB of storage.
122+
123+
When Ollama has finished pulling LLaVA, start the server:
124+
```
125+
ollama serve
126+
```
127+
128+
That's it! Now start `operate` and select the LLaVA model:
129+
```
130+
operate -m llava
131+
```
132+
**Important:** Error rates when using LLaVA are very high. This is simply intended to be a base to build off of as local multimodal models improve over time.
133+
134+
Learn more about Ollama at its [GitHub repository](https://github.com/ollama/ollama).
135+
111136
### Voice Mode `--voice`
112137
The framework supports voice inputs for the objective. Try voice by following the instructions below.
113138
**Clone the repo** to a directory on your computer:

operate/models/apis.py

Lines changed: 84 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import traceback
66
import io
77
import easyocr
8-
8+
import ollama
99

1010
from PIL import Image
1111
from ultralytics import YOLO
@@ -53,6 +53,9 @@ async def get_next_action(model, messages, objective, session_id):
5353
return "coming soon"
5454
elif model == "gemini-pro-vision":
5555
return call_gemini_pro_vision(messages, objective), None
56+
elif model == "llava":
57+
operation = call_ollama_llava(messages), None
58+
return operation
5659

5760
raise ModelNotRecognizedException(model)
5861

@@ -464,6 +467,86 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
464467
return call_gpt_4_vision_preview(messages)
465468

466469

470+
def call_ollama_llava(messages):
    """Ask a locally hosted LLaVA model (via Ollama) for the next operation.

    Captures a screenshot, attaches it to a new user message, sends the
    conversation to the local Ollama server, and returns the model's reply
    parsed as JSON. On a generic failure the call retries itself.

    Args:
        messages: Conversation history; mutated in place (the new vision
            message and the assistant reply are appended).

    Returns:
        The assistant reply parsed via ``json.loads``, or ``None`` when the
        Ollama server could not be reached (``ollama.ResponseError``).
    """
    if VERBOSE:
        print("[call_ollama_llava]")
    time.sleep(1)
    # Initialize up front: the generic error handler below prints `content`,
    # and without this a failure before the model responds (e.g. inside
    # ollama.chat) would raise UnboundLocalError and mask the real error.
    content = None
    try:
        screenshots_dir = "screenshots"
        if not os.path.exists(screenshots_dir):
            os.makedirs(screenshots_dir)

        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
        # Call the function to capture the screen with the cursor
        capture_screen_with_cursor(screenshot_filename)

        # First turn gets the objective-setting prompt; later turns the
        # follow-up prompt.
        if len(messages) == 1:
            user_prompt = get_user_first_message_prompt()
        else:
            user_prompt = get_user_prompt()

        if VERBOSE:
            print(
                "[call_ollama_llava] user_prompt",
                user_prompt,
            )

        vision_message = {
            "role": "user",
            "content": user_prompt,
            "images": [screenshot_filename],
        }
        messages.append(vision_message)

        response = ollama.chat(
            model="llava",
            messages=messages,
        )

        # Important: Remove the image path from the message history.
        # Ollama will attempt to load each image reference and will
        # eventually timeout.
        messages[-1]["images"] = None

        content = response["message"]["content"].strip()

        # Strip an optional markdown code fence wrapping the JSON payload.
        if content.startswith("```json"):
            content = content[len("```json") :]  # Remove starting ```json
        if content.endswith("```"):
            content = content[: -len("```")]  # Remove ending ```

        assistant_message = {"role": "assistant", "content": content}
        if VERBOSE:
            print(
                "[call_ollama_llava] content",
                content,
            )
        content = json.loads(content)

        messages.append(assistant_message)

        return content

    except ollama.ResponseError as e:
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Operate] Couldn't connect to Ollama. With Ollama installed, run `ollama pull llava` then `ollama serve`{ANSI_RESET}",
            e,
        )

    except Exception as e:
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying again {ANSI_RESET}",
            e,
        )
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}",
            content,
        )
        if VERBOSE:
            traceback.print_exc()
        # NOTE(review): retry is unbounded recursion — a persistently failing
        # model loops until RecursionError; consider a retry cap.
        return call_ollama_llava(messages)
548+
549+
467550
def get_last_assistant_message(messages):
468551
"""
469552
Retrieve the last message from the assistant in the messages array.

requirements.txt

Lines changed: 55 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,102 @@
1+
aiohttp==3.9.1
2+
aiosignal==1.3.1
13
annotated-types==0.6.0
24
anyio==3.7.1
5+
attrs==23.2.0
6+
cachetools==5.3.2
37
certifi==2023.7.22
48
charset-normalizer==3.3.2
59
colorama==0.4.6
610
contourpy==1.2.0
711
cycler==0.12.1
812
distro==1.8.0
13+
easyocr==1.7.1
914
EasyProcess==1.1
1015
entrypoint2==1.1
1116
exceptiongroup==1.1.3
17+
filelock==3.13.1
1218
fonttools==4.44.0
19+
frozenlist==1.4.1
20+
fsspec==2024.2.0
21+
google-ai-generativelanguage==0.4.0
22+
google-api-core==2.16.2
23+
google-auth==2.27.0
24+
google-generativeai==0.3.0
25+
googleapis-common-protos==1.62.0
26+
grpcio==1.60.1
27+
grpcio-status==1.60.1
1328
h11==0.14.0
1429
httpcore==1.0.2
15-
httpx==0.25.1
30+
httpx==0.25.2
1631
idna==3.4
32+
imageio==2.33.1
1733
importlib-resources==6.1.1
34+
Jinja2==3.1.3
1835
kiwisolver==1.4.5
36+
lazy_loader==0.3
37+
MarkupSafe==2.1.5
1938
matplotlib==3.8.1
2039
MouseInfo==0.1.3
40+
mpmath==1.3.0
2141
mss==9.0.1
42+
multidict==6.0.5
43+
networkx==3.2.1
44+
ninja==1.11.1.1
2245
numpy==1.26.1
46+
ollama==0.1.6
2347
openai==1.2.3
48+
opencv-python==4.9.0.80
49+
opencv-python-headless==4.9.0.80
2450
packaging==23.2
51+
pandas==2.2.0
2552
Pillow==10.1.0
2653
prompt-toolkit==3.0.39
54+
proto-plus==1.23.0
55+
protobuf==4.25.2
56+
psutil==5.9.8
57+
py-cpuinfo==9.0.0
58+
pyasn1==0.5.1
59+
pyasn1-modules==0.3.0
2760
PyAutoGUI==0.9.54
61+
pyclipper==1.3.0.post5
2862
pydantic==2.4.2
2963
pydantic_core==2.10.1
3064
PyGetWindow==0.0.9
3165
PyMsgBox==1.0.9
66+
pyobjc-core==10.1
67+
pyobjc-framework-Cocoa==10.1
68+
pyobjc-framework-Quartz==10.1
3269
pyparsing==3.1.1
3370
pyperclip==1.8.2
3471
PyRect==0.2.0
3572
pyscreenshot==3.1
3673
PyScreeze==0.1.29
37-
python3-xlib==0.15
74+
python-bidi==0.4.2
3875
python-dateutil==2.8.2
3976
python-dotenv==1.0.0
77+
python3-xlib==0.15
4078
pytweening==1.0.7
79+
pytz==2024.1
80+
PyYAML==6.0.1
4181
requests==2.31.0
82+
rsa==4.9
4283
rubicon-objc==0.4.7
84+
scikit-image==0.22.0
85+
scipy==1.12.0
86+
seaborn==0.13.2
87+
shapely==2.0.2
4388
six==1.16.0
4489
sniffio==1.3.0
90+
sympy==1.12
91+
thop==0.1.1.post2209072238
92+
tifffile==2024.1.30
93+
torch==2.2.0
94+
torchvision==0.17.0
4595
tqdm==4.66.1
4696
typing_extensions==4.8.0
97+
tzdata==2023.4
98+
ultralytics==8.0.227
4799
urllib3==2.0.7
48100
wcwidth==0.2.9
101+
yarl==1.9.4
49102
zipp==3.17.0
50-
google-generativeai==0.3.0
51-
aiohttp==3.9.1
52-
ultralytics==8.0.227
53-
easyocr==1.7.1

0 commit comments

Comments
 (0)