@@ -5,7 +5,7 @@
 import traceback
 import io
 import easyocr
-
+import ollama
 
 from PIL import Image
 from ultralytics import YOLO
@@ -53,6 +53,9 @@ async def get_next_action(model, messages, objective, session_id):
         return "coming soon"
     elif model == "gemini-pro-vision":
         return call_gemini_pro_vision(messages, objective), None
+    elif model == "llava":
+        operation = call_ollama_llava(messages)
+        return operation, None
 
     raise ModelNotRecognizedException(model)
 
@@ -464,6 +467,90 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
         return call_gpt_4_vision_preview(messages)
 
 
+def call_ollama_llava(messages):
+    if VERBOSE:
+        print("[call_ollama_llava]")
+    time.sleep(1)
+    # Initialized here so the generic error handler below can still print
+    # the response even when the failure happens before one is assigned.
+    content = ""
+    try:
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        # Call the function to capture the screen with the cursor
+        capture_screen_with_cursor(screenshot_filename)
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        if VERBOSE:
+            print(
+                "[call_ollama_llava] user_prompt",
+                user_prompt,
+            )
+
+        vision_message = {
+            "role": "user",
+            "content": user_prompt,
+            "images": [screenshot_filename],
+        }
+        messages.append(vision_message)
+
+        response = ollama.chat(
+            model="llava",
+            messages=messages,
+        )
+
+        # Important: Remove the image path from the message history.
+        # Ollama will attempt to load each image reference and will
+        # eventually time out.
+        messages[-1]["images"] = None
+
+        content = response["message"]["content"].strip()
+
+        # The model may wrap its JSON in a Markdown code fence; strip it.
+        if content.startswith("```json"):
+            content = content[len("```json") :]  # Remove starting ```json
+        if content.endswith("```"):
+            content = content[: -len("```")]  # Remove ending ```
+
+        assistant_message = {"role": "assistant", "content": content}
+        if VERBOSE:
+            print(
+                "[call_ollama_llava] content",
+                content,
+            )
+        content = json.loads(content)
+
+        messages.append(assistant_message)
+
+        return content
+
+    except ollama.ResponseError as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Operate] Couldn't connect to Ollama. With Ollama installed, run `ollama pull llava` then `ollama serve`{ANSI_RESET}",
+            e,
+        )
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying again {ANSI_RESET}",
+            e,
+        )
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}",
+            content,
+        )
+        if VERBOSE:
+            traceback.print_exc()
+        return call_ollama_llava(messages)
+
+
 def get_last_assistant_message(messages):
     """
     Retrieve the last message from the assistant in the messages array.
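
For reviewers who want to try the new path, here is a minimal sketch of driving the `llava` branch through `get_next_action`, assuming a local Ollama server is running (`ollama serve`) and the model has been pulled (`ollama pull llava`); the import path and the seed message are illustrative placeholders rather than the repository's actual bootstrap:

```python
# Minimal sketch: exercise the new "llava" branch end to end.
# Assumes "ollama serve" is running and "ollama pull llava" has completed.
import asyncio

# Hypothetical import path for illustration; use the module where
# get_next_action actually lives in this repository.
from operate.models.apis import get_next_action


async def main():
    messages = [{"role": "system", "content": "You are operating a computer."}]
    # The llava branch returns (parsed_operation_json, None).
    operation, session_id = await get_next_action(
        model="llava",
        messages=messages,
        objective="Open a web browser",
        session_id=None,
    )
    print(operation)


asyncio.run(main())
```

One design note: on a generic failure (including invalid JSON from the model), `call_ollama_llava` retries by calling itself recursively with no retry cap, so a model that never produces parseable JSON will recurse until Python's recursion limit is hit; a bounded retry count may be worth considering.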
|