
Commit f929ff8

Simplify AI Chat Response Streaming (#1167)
Reason
---
- Simplify the code and logic for streaming chat responses by relying solely on the asyncio event loop.
- Reduce the overhead of managing threads to increase efficiency and throughput (where possible).

Details
---
- Use async/await with no threading when generating chat responses via the OpenAI, Gemini and Anthropic AI model APIs.
- Use threading for the offline chat model, as llama-cpp doesn't support async streaming yet.
2 parents 973aded + a4b5842 commit f929ff8
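The core pattern behind this change: an `async def` function that `yield`s chunks is an async generator, so the event loop interleaves producer and consumer directly, with no worker thread in between. A minimal, runnable sketch (names are illustrative, not from the Khoj codebase):

```python
import asyncio
from typing import AsyncGenerator


async def stream_reply(prompt: str) -> AsyncGenerator[str, None]:
    # Stand-in for an AI model API's async streaming call.
    for token in ["Hello", ", ", "world", "!"]:
        await asyncio.sleep(0)  # hand control back to the event loop
        yield token


async def main():
    # The caller consumes chunks as they arrive; no worker thread or
    # queue-backed generator is needed to bridge producer and consumer.
    async for chunk in stream_reply("hi"):
        print(chunk, end="", flush=True)


asyncio.run(main())
```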

File tree

12 files changed: +362 −366 lines

src/khoj/database/adapters/__init__.py

+24 −22
@@ -763,9 +763,9 @@ async def ais_agent_accessible(agent: Agent, user: KhojUser) -> bool:
         return False
 
     @staticmethod
-    def get_conversation_agent_by_id(agent_id: int):
-        agent = Agent.objects.filter(id=agent_id).first()
-        if agent == AgentAdapters.get_default_agent():
+    async def aget_conversation_agent_by_id(agent_id: int):
+        agent = await Agent.objects.filter(id=agent_id).afirst()
+        if agent == await AgentAdapters.aget_default_agent():
             # If the agent is set to the default agent, then return None and let the default application code be used
             return None
         return agent
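This conversion pattern recurs throughout the adapters: each blocking ORM call is swapped for its `a`-prefixed async counterpart (available since Django 4.1), and the method is renamed with an `a` prefix. A hedged before/after sketch, assuming a configured Django project with an illustrative `Agent` model:

```python
# Before: blocks the calling thread; unsafe to call from async code.
def get_agent_by_id(agent_id: int):
    return Agent.objects.filter(id=agent_id).first()


# After: .afirst() awaits the query on the event loop (Django 4.1+).
async def aget_agent_by_id(agent_id: int):
    return await Agent.objects.filter(id=agent_id).afirst()
```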
@@ -1109,14 +1109,6 @@ def get_all_chat_models():
     async def aget_all_chat_models():
         return await sync_to_async(list)(ChatModel.objects.prefetch_related("ai_model_api").all())
 
-    @staticmethod
-    def get_vision_enabled_config():
-        chat_models = ConversationAdapters.get_all_chat_models()
-        for config in chat_models:
-            if config.vision_enabled:
-                return config
-        return None
-
     @staticmethod
     async def aget_vision_enabled_config():
         chat_models = await ConversationAdapters.aget_all_chat_models()
@@ -1171,7 +1163,11 @@ def get_chat_model(user: KhojUser):
     @staticmethod
     async def aget_chat_model(user: KhojUser):
         subscribed = await ais_user_subscribed(user)
-        config = await UserConversationConfig.objects.filter(user=user).prefetch_related("setting").afirst()
+        config = (
+            await UserConversationConfig.objects.filter(user=user)
+            .prefetch_related("setting", "setting__ai_model_api")
+            .afirst()
+        )
         if subscribed:
             # Subscribed users can use any available chat model
             if config:
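Prefetching `setting__ai_model_api` alongside `setting` matters in async code: touching a relation that was not eagerly loaded triggers a lazy synchronous query, which Django rejects with `SynchronousOnlyOperation` inside the event loop. A sketch of the idea, mirroring the models named in the diff (the helper name is hypothetical):

```python
# Sketch (Django 4.1+): assumes a configured project and the models
# named in the diff. Eagerly load the nested relation so no lazy
# (synchronous) query fires when related objects are touched later.
async def aget_chat_settings(user):
    config = (
        await UserConversationConfig.objects.filter(user=user)
        .prefetch_related("setting", "setting__ai_model_api")
        .afirst()
    )
    # Safe in async context: both relation levels were prefetched.
    return config.setting.ai_model_api if config else None
```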
@@ -1387,7 +1383,7 @@ def create_conversation_from_public_conversation(
 
     @staticmethod
     @require_valid_user
-    def save_conversation(
+    async def save_conversation(
         user: KhojUser,
         conversation_log: dict,
         client_application: ClientApplication = None,
@@ -1396,19 +1392,21 @@ def save_conversation(
     ):
         slug = user_message.strip()[:200] if user_message else None
         if conversation_id:
-            conversation = Conversation.objects.filter(user=user, client=client_application, id=conversation_id).first()
+            conversation = await Conversation.objects.filter(
+                user=user, client=client_application, id=conversation_id
+            ).afirst()
         else:
             conversation = (
-                Conversation.objects.filter(user=user, client=client_application).order_by("-updated_at").first()
+                await Conversation.objects.filter(user=user, client=client_application).order_by("-updated_at").afirst()
             )
 
         if conversation:
             conversation.conversation_log = conversation_log
             conversation.slug = slug
             conversation.updated_at = datetime.now(tz=timezone.utc)
-            conversation.save()
+            await conversation.asave()
         else:
-            Conversation.objects.create(
+            await Conversation.objects.acreate(
                 user=user, conversation_log=conversation_log, client=client_application, slug=slug
             )

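`acreate()` (Django 4.1+) and the instance-level `asave()` (Django 4.2+) are the async counterparts of `create()` and `save()`. A short sketch of the update-or-create flow above, with an illustrative `Note` model standing in for `Conversation`:

```python
from datetime import datetime, timezone


# Sketch of the async upsert flow; `Note` is an illustrative model in
# an assumed, configured Django project.
async def touch_latest_note(user):
    note = await Note.objects.filter(owner=user).order_by("-updated_at").afirst()
    if note:
        note.updated_at = datetime.now(tz=timezone.utc)
        await note.asave()  # async counterpart of save()
    else:
        await Note.objects.acreate(owner=user)  # async counterpart of create()
```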
@@ -1455,17 +1453,21 @@ async def aget_conversation_starters(user: KhojUser, max_results=3):
         return random.sample(all_questions, max_results)
 
     @staticmethod
-    def get_valid_chat_model(user: KhojUser, conversation: Conversation, is_subscribed: bool):
+    async def aget_valid_chat_model(user: KhojUser, conversation: Conversation, is_subscribed: bool):
         agent: Agent = (
-            conversation.agent if is_subscribed and AgentAdapters.get_default_agent() != conversation.agent else None
+            conversation.agent
+            if is_subscribed and await AgentAdapters.aget_default_agent() != conversation.agent
+            else None
         )
         if agent and agent.chat_model:
-            chat_model = conversation.agent.chat_model
+            chat_model = await ChatModel.objects.select_related("ai_model_api").aget(
+                pk=conversation.agent.chat_model.pk
+            )
         else:
-            chat_model = ConversationAdapters.get_chat_model(user)
+            chat_model = await ConversationAdapters.aget_chat_model(user)
 
         if chat_model is None:
-            chat_model = ConversationAdapters.get_default_chat_model()
+            chat_model = await ConversationAdapters.aget_default_chat_model()
 
         if chat_model.model_type == ChatModel.ModelType.OFFLINE:
             if state.offline_chat_processor_config is None or state.offline_chat_processor_config.loaded_model is None:
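Re-fetching the agent's chat model with `select_related("ai_model_api")` folds the foreign key into a single joined query, so the later attribute access needs no lazy follow-up query (which would be synchronous and disallowed in async code). A sketch mirroring the diff (the wrapper function name is hypothetical):

```python
# Sketch: one query fetches the ChatModel row joined with its
# ai_model_api foreign key.
async def aload_agent_chat_model(conversation):
    chat_model = await ChatModel.objects.select_related("ai_model_api").aget(
        pk=conversation.agent.chat_model.pk
    )
    return chat_model.ai_model_api  # already loaded by the join; no extra query
```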

src/khoj/processor/conversation/anthropic/anthropic_chat.py

+22 −12
@@ -1,6 +1,6 @@
 import logging
 from datetime import datetime, timedelta
-from typing import Dict, List, Optional
+from typing import AsyncGenerator, Dict, List, Optional
 
 import pyjson5
 from langchain.schema import ChatMessage
@@ -137,7 +137,7 @@ def anthropic_send_message_to_model(
     )
 
 
-def converse_anthropic(
+async def converse_anthropic(
     references,
     user_query,
     online_results: Optional[Dict[str, Dict]] = None,
@@ -161,7 +161,7 @@ def converse_anthropic(
     generated_asset_results: Dict[str, Dict] = {},
     deepthought: Optional[bool] = False,
     tracer: dict = {},
-):
+) -> AsyncGenerator[str, None]:
     """
     Converse with user using Anthropic's Claude
     """
@@ -191,11 +191,17 @@
 
     # Get Conversation Primer appropriate to Conversation Type
     if conversation_commands == [ConversationCommand.Notes] and is_none_or_empty(references):
-        completion_func(chat_response=prompts.no_notes_found.format())
-        return iter([prompts.no_notes_found.format()])
+        response = prompts.no_notes_found.format()
+        if completion_func:
+            await completion_func(chat_response=response)
+        yield response
+        return
     elif conversation_commands == [ConversationCommand.Online] and is_none_or_empty(online_results):
-        completion_func(chat_response=prompts.no_online_results_found.format())
-        return iter([prompts.no_online_results_found.format()])
+        response = prompts.no_online_results_found.format()
+        if completion_func:
+            await completion_func(chat_response=response)
+        yield response
+        return
 
     context_message = ""
     if not is_none_or_empty(references):
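Note the bare `return` after `yield` in the early-exit branches: an async generator may stop early with a plain `return`, but `return value` is a `SyntaxError` inside an async generator, hence the yield-then-return shape. An illustrative, runnable sketch:

```python
import asyncio


async def reply(notes: list[str]):
    if not notes:
        yield "No notes found."
        return  # bare return ends the async generator early
    for note in notes:
        yield note


async def main():
    async for chunk in reply([]):
        print(chunk)


asyncio.run(main())
```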
@@ -228,17 +234,21 @@
     logger.debug(f"Conversation Context for Claude: {messages_to_print(messages)}")
 
     # Get Response from Claude
-    return anthropic_chat_completion_with_backoff(
+    full_response = ""
+    async for chunk in anthropic_chat_completion_with_backoff(
         messages=messages,
-        compiled_references=references,
-        online_results=online_results,
         model_name=model,
         temperature=0.2,
         api_key=api_key,
         api_base_url=api_base_url,
         system_prompt=system_prompt,
-        completion_func=completion_func,
         max_prompt_size=max_prompt_size,
         deepthought=deepthought,
         tracer=tracer,
-    )
+    ):
+        full_response += chunk
+        yield chunk
+
+    # Call completion_func once streaming finishes and we have the full response
+    if completion_func:
+        await completion_func(chat_response=full_response)
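The rewritten function is a delegation pattern: re-yield every chunk from the inner async generator while accumulating the full response, then fire the optional async completion callback exactly once. A self-contained sketch with illustrative names:

```python
import asyncio
from typing import AsyncGenerator, Awaitable, Callable, Optional


async def inner() -> AsyncGenerator[str, None]:
    for chunk in ["stream", "ed ", "reply"]:
        yield chunk


async def outer(
    completion_func: Optional[Callable[..., Awaitable[None]]] = None,
) -> AsyncGenerator[str, None]:
    full_response = ""
    async for chunk in inner():
        full_response += chunk
        yield chunk
    # Fire the completion callback exactly once, after the stream ends.
    if completion_func:
        await completion_func(chat_response=full_response)


async def main():
    async def on_done(chat_response: str) -> None:
        print(f"\nsaved: {chat_response!r}")

    async for chunk in outer(on_done):
        print(chunk, end="")


asyncio.run(main())
```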

src/khoj/processor/conversation/anthropic/utils.py

+21 −64
@@ -1,5 +1,5 @@
 import logging
-from threading import Thread
+from time import perf_counter
 from typing import Dict, List
 
 import anthropic
@@ -13,13 +13,13 @@
 )
 
 from khoj.processor.conversation.utils import (
-    ThreadedGenerator,
     commit_conversation_trace,
     get_image_from_base64,
     get_image_from_url,
 )
 from khoj.utils.helpers import (
-    get_ai_api_info,
+    get_anthropic_async_client,
+    get_anthropic_client,
     get_chat_usage_metrics,
     is_none_or_empty,
     is_promptrace_enabled,
@@ -28,24 +28,12 @@
 logger = logging.getLogger(__name__)
 
 anthropic_clients: Dict[str, anthropic.Anthropic | anthropic.AnthropicVertex] = {}
+anthropic_async_clients: Dict[str, anthropic.AsyncAnthropic | anthropic.AsyncAnthropicVertex] = {}
 
 DEFAULT_MAX_TOKENS_ANTHROPIC = 8000
 MAX_REASONING_TOKENS_ANTHROPIC = 12000
 
 
-def get_anthropic_client(api_key, api_base_url=None) -> anthropic.Anthropic | anthropic.AnthropicVertex:
-    api_info = get_ai_api_info(api_key, api_base_url)
-    if api_info.api_key:
-        client = anthropic.Anthropic(api_key=api_info.api_key)
-    else:
-        client = anthropic.AnthropicVertex(
-            region=api_info.region,
-            project_id=api_info.project,
-            credentials=api_info.credentials,
-        )
-    return client
-
-
 @retry(
     wait=wait_random_exponential(min=1, max=10),
     stop=stop_after_attempt(2),
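The new `get_anthropic_async_client` helper now lives in `khoj.utils.helpers` and its body is not part of this diff; a plausible sketch, inferred from the removed synchronous `get_anthropic_client` above (treat the details as an assumption):

```python
import anthropic

from khoj.utils.helpers import get_ai_api_info


def get_anthropic_async_client(
    api_key, api_base_url=None
) -> anthropic.AsyncAnthropic | anthropic.AsyncAnthropicVertex:
    # Mirror of the deleted sync factory, swapping in the async clients.
    api_info = get_ai_api_info(api_key, api_base_url)
    if api_info.api_key:
        return anthropic.AsyncAnthropic(api_key=api_info.api_key)
    return anthropic.AsyncAnthropicVertex(
        region=api_info.region,
        project_id=api_info.project,
        credentials=api_info.credentials,
    )
```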
@@ -126,60 +114,23 @@ def anthropic_completion_with_backoff(
     before_sleep=before_sleep_log(logger, logging.DEBUG),
     reraise=True,
 )
-def anthropic_chat_completion_with_backoff(
+async def anthropic_chat_completion_with_backoff(
     messages: list[ChatMessage],
-    compiled_references,
-    online_results,
     model_name,
     temperature,
     api_key,
     api_base_url,
     system_prompt: str,
     max_prompt_size=None,
-    completion_func=None,
-    deepthought=False,
-    model_kwargs=None,
-    tracer={},
-):
-    g = ThreadedGenerator(compiled_references, online_results, completion_func=completion_func)
-    t = Thread(
-        target=anthropic_llm_thread,
-        args=(
-            g,
-            messages,
-            system_prompt,
-            model_name,
-            temperature,
-            api_key,
-            api_base_url,
-            max_prompt_size,
-            deepthought,
-            model_kwargs,
-            tracer,
-        ),
-    )
-    t.start()
-    return g
-
-
-def anthropic_llm_thread(
-    g,
-    messages: list[ChatMessage],
-    system_prompt: str,
-    model_name: str,
-    temperature,
-    api_key,
-    api_base_url=None,
-    max_prompt_size=None,
     deepthought=False,
     model_kwargs=None,
     tracer={},
 ):
     try:
-        client = anthropic_clients.get(api_key)
+        client = anthropic_async_clients.get(api_key)
         if not client:
-            client = get_anthropic_client(api_key, api_base_url)
-            anthropic_clients[api_key] = client
+            client = get_anthropic_async_client(api_key, api_base_url)
+            anthropic_async_clients[api_key] = client
 
         model_kwargs = model_kwargs or dict()
         max_tokens = DEFAULT_MAX_TOKENS_ANTHROPIC
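For contrast, the machinery this hunk deletes: `anthropic_chat_completion_with_backoff` used to hand a queue-backed `ThreadedGenerator` to the caller and feed it from a worker thread running `anthropic_llm_thread`. A simplified, runnable reconstruction of that pattern (not the exact Khoj implementation):

```python
import queue
import threading


class ThreadedGenerator:
    """Queue-backed generator bridging a producer thread and a sync consumer."""

    def __init__(self):
        self.queue: "queue.Queue[str | None]" = queue.Queue()

    def __iter__(self):
        return self

    def __next__(self):
        item = self.queue.get()
        if item is None:  # sentinel: the producer thread finished
            raise StopIteration
        return item

    def send(self, chunk: str):
        self.queue.put(chunk)

    def close(self):
        self.queue.put(None)


def produce(g: ThreadedGenerator):
    try:
        for chunk in ["threaded ", "streaming"]:
            g.send(chunk)
    finally:
        g.close()  # always unblock the consumer


g = ThreadedGenerator()
threading.Thread(target=produce, args=(g,)).start()
for chunk in g:
    print(chunk, end="")
```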
@@ -193,7 +144,8 @@ def anthropic_llm_thread(
 
         aggregated_response = ""
         final_message = None
-        with client.messages.stream(
+        start_time = perf_counter()
+        async with client.messages.stream(
             messages=formatted_messages,
             model=model_name,  # type: ignore
             temperature=temperature,
@@ -202,10 +154,17 @@ def anthropic_llm_thread(
             max_tokens=max_tokens,
             **model_kwargs,
         ) as stream:
-            for text in stream.text_stream:
+            async for text in stream.text_stream:
+                # Log the time taken to start response
+                if aggregated_response == "":
+                    logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
+                # Handle streamed response chunk
                 aggregated_response += text
-                g.send(text)
-        final_message = stream.get_final_message()
+                yield text
+            final_message = await stream.get_final_message()
+
+        # Log the time taken to stream the entire response
+        logger.info(f"Chat streaming took: {perf_counter() - start_time:.3f} seconds")
 
         # Calculate cost of chat
         input_tokens = final_message.usage.input_tokens
@@ -222,9 +181,7 @@
         if is_promptrace_enabled():
             commit_conversation_trace(messages, aggregated_response, tracer)
     except Exception as e:
-        logger.error(f"Error in anthropic_llm_thread: {e}", exc_info=True)
-    finally:
-        g.close()
+        logger.error(f"Error in anthropic_chat_completion_with_backoff stream: {e}", exc_info=True)
 
 
 def format_messages_for_anthropic(messages: list[ChatMessage], system_prompt: str = None):
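The added `perf_counter` instrumentation in isolation: record one start time, log time-to-first-chunk on the first iteration, and log total stream time at the end. A runnable sketch with a fake stream standing in for the SDK call:

```python
import asyncio
from time import perf_counter


async def fake_text_stream():
    # Stand-in for the SDK's async text_stream.
    for chunk in ["a", "b", "c"]:
        await asyncio.sleep(0.1)
        yield chunk


async def main():
    aggregated = ""
    start_time = perf_counter()
    async for text in fake_text_stream():
        # First chunk marks time-to-first-token.
        if aggregated == "":
            print(f"First response took: {perf_counter() - start_time:.3f} seconds")
        aggregated += text
    print(f"Chat streaming took: {perf_counter() - start_time:.3f} seconds")


asyncio.run(main())
```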
