From c035edc9d2a2db14417ef64e45f22c5b6fe4fe7c Mon Sep 17 00:00:00 2001
From: miro
Date: Fri, 18 Jul 2025 01:05:35 +0100
Subject: [PATCH 1/2] feat: RAG solver

---
 README.md                              | 101 ++++-
 ovos_solver_openai_persona/__init__.py |   9 +-
 ovos_solver_openai_persona/engines.py  |   5 +-
 ovos_solver_openai_persona/rag.py      | 500 +++++++++++++++++++++++++
 setup.py                               |   4 +-
 5 files changed, 596 insertions(+), 23 deletions(-)
 create mode 100644 ovos_solver_openai_persona/rag.py

diff --git a/README.md b/README.md
index ac34a64..659de82 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,26 @@
 # OVOS OpenAI Plugin
 
-Leverages [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) to provide the following ovos plugins:
-- `ovos-solver-openai-plugin` for usage with [ovos-persona](https://github.com/OpenVoiceOS/ovos-persona) (and in older ovos releases with [ovos-skill-fallback-chatgpt]())
-- `ovos-dialog-transformer-openai-plugin` to rewrite OVOS dialogs just before TTS executes in [ovos-audio](https://github.com/OpenVoiceOS/ovos-audio)
-- `ovos-summarizer-openai-plugin` to summarize text, not used directly but provided for consumption by other plugins/skills
+This plugin leverages the **OpenAI API** to provide a set of OVOS plugins for the OpenVoiceOS ecosystem. It is also compatible with **self-hosted OpenAI-compatible alternatives**, such as the [OVOS Persona Server](https://github.com/OpenVoiceOS/ovos-persona-server), or any other project that implements the full suite of OpenAI API endpoints (Chat Completions, Embeddings, Files, and Vector Stores), so you can choose between cloud-based OpenAI services and a local, private setup.
+
+Specifically, this plugin provides:
+
+ - `ovos-solver-openai-plugin` for general chat completions, primarily for usage with [ovos-persona](https://github.com/OpenVoiceOS/ovos-persona) (and in older ovos releases with `ovos-skill-fallback-chatgpt`)
+ - `ovos-solver-openai-rag-plugin` for Retrieval Augmented Generation using a compatible backend (like `ovos-persona-server`) as a knowledge source.
+ - `ovos-dialog-transformer-openai-plugin` to rewrite OVOS dialogs just before TTS executes in [ovos-audio](https://github.com/OpenVoiceOS/ovos-audio)
+ - `ovos-summarizer-openai-plugin` to summarize text, not used directly but provided for consumption by other plugins/skills
+
+-----
 
 ## Install
 
 `pip install ovos-openai-plugin`
 
+-----
+
 ## Persona Usage
 
-To create your own persona using a OpenAI compatible server create a .json in `~/.config/ovos_persona/llm.json`:
+To create your own persona using an OpenAI-compatible server, create a .json in `~/.config/ovos_persona/llm.json`:
+
 ```json
 {
   "name": "My Local LLM",
@@ -30,21 +39,76 @@ Then say "Chat with {name_from_json}" to enable it, more details can be found in
 This plugins also provides a default "Remote LLama" demo persona, it points to a public server hosted by @goldyfruit.
 
+-----
+
+## RAG Solver Usage
+
+The `ovos-solver-openai-rag-plugin` enables **Retrieval Augmented Generation (RAG)**. This means your OVOS assistant can answer questions by first searching for relevant information in a configured knowledge base (a "vector store" hosted by a compatible backend like `ovos-persona-server`), and then using an LLM to generate a coherent answer based on that retrieved context.
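+
+The snippet below is a minimal sketch of the retrieval call the RAG solver performs under the hood; it doubles as a sanity check that your backend's vector store is reachable before you configure a persona. It assumes a backend at the example address used elsewhere in this README, and the `vs_...` ID and query string are placeholders you should replace:
+
+```python
+import requests
+
+# Placeholders - point these at your own OpenAI-compatible backend and vector store
+BASE_URL = "http://localhost:8337/v1"
+VECTOR_STORE_ID = "vs_your_vector_store_id_here"
+
+# Same endpoint and payload the RAG solver uses for retrieval
+resp = requests.post(
+    f"{BASE_URL}/vector_stores/{VECTOR_STORE_ID}/search",
+    json={"query": "what is this project about?", "max_num_results": 3},
+)
+resp.raise_for_status()
+for item in resp.json().get("data", []):
+    # each hit is a text chunk the solver would inject into the LLM prompt
+    print(item.get("content"))
+```
+
+If this returns the chunks you expect, the same `api_url` and `vector_store_id` values can go straight into the solver configuration shown below.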
+
+RAG is particularly useful for:
+
+  * Answering questions about specific documentation, personal notes, or proprietary data.
+  * Reducing LLM hallucinations by grounding responses in factual, provided information.
+
+### How it Works
+
+1. **Search**: When a user asks a question, the RAG solver first sends the query to the configured backend's vector store search endpoint.
+2. **Retrieve**: The backend returns relevant text chunks (documents or passages) from your indexed data.
+3. **Augment**: These retrieved chunks are then injected into the LLM's prompt, along with the user's original query and conversation history.
+4. **Generate**: The LLM processes this augmented prompt and generates an answer, prioritizing the provided context.
+
+### Configuration
+
+To use the RAG solver, configure it in your `~/.config/ovos_persona/llm.json` file. You will need:
+
+1. A running **OpenAI-compatible backend** (e.g., [ovos-persona-server](https://github.com/OpenVoiceOS/ovos-persona-server)) with a populated vector store.
+2. The `vector_store_id` of the vector store you created on that backend.
+3. The `llm_model` (and, if required, an API `key`) that the backend will use for chat completions.
+
+Here's an example `llm.json` configuration for a RAG persona:
+
+```json
+{
+  "name": "My RAG Assistant",
+  "solvers": [
+    "ovos-solver-openai-rag-plugin"
+  ],
+  "ovos-solver-openai-rag-plugin": {
+    "api_url": "http://localhost:8337/v1",
+    "vector_store_id": "vs_your_vector_store_id_here",
+    "max_num_results": 5,
+    "max_context_tokens": 2000,
+    "system_prompt_template": "You are a helpful assistant. Use the following context to answer the user's question. If the answer is not in the context, state that you don't know.\n\nContext:\n{context}\n\nQuestion:\n{question}",
+    "llm_model": "llama3.1:8b",
+    "key": "sk-xxxx",
+    "llm_temperature": 0.7,
+    "llm_top_p": 1.0,
+    "llm_max_tokens": 500,
+    "enable_memory": true,
+    "memory_size": 3
+  }
+}
+```
+
+> 💡 `api_url` points at your OpenAI-compatible backend and `vector_store_id` must be replaced with the ID of your own vector store. `max_num_results` caps how many text chunks are retrieved, `max_context_tokens` caps how much retrieved context is passed to the LLM, and `enable_memory`/`memory_size` control how many Q&A pairs of conversation history are kept. For local setups (e.g., Ollama) the `key` can be any placeholder string.
+
+-----
+
 ## Dialog Transformer
 
-you can rewrite text dynamically based on specific personas, such as simplifying explanations or mimicking a specific tone.
+You can rewrite text dynamically based on specific personas, such as simplifying explanations or mimicking a specific tone.
#### Example Usage: -- **`rewrite_prompt`:** `"rewrite the text as if you were explaining it to a 5-year-old"` -- **Input:** `"Quantum mechanics is a branch of physics that describes the behavior of particles at the smallest scales."` -- **Output:** `"Quantum mechanics is like a special kind of science that helps us understand really tiny things."` + + - **`rewrite_prompt`:** `"rewrite the text as if you were explaining it to a 5-year-old"` + - **Input:** `"Quantum mechanics is a branch of physics that describes the behavior of particles at the smallest scales."` + - **Output:** `"Quantum mechanics is like a special kind of science that helps us understand really tiny things."` Examples of `rewrite_prompt` Values: -- `"rewrite the text as if it was an angry old man speaking"` -- `"Add more 'dude'ness to it"` -- `"Explain it like you're teaching a child"` -To enable this plugin, add the following to your `mycroft.conf`: + - `"rewrite the text as if it was an angry old man speaking"` + - `"Add more 'dude'ness to it"` + - `"Explain it like you're teaching a child"` + +To enable this plugin, add the following to your `mycroft.conf`: ```json "dialog_transformers": { @@ -57,6 +121,8 @@ To enable this plugin, add the following to your `mycroft.conf`: > 💡 the user utterance will be appended after `rewrite_prompt` for the actual query +----- + ## Direct Usage ```python @@ -71,11 +137,10 @@ print(bot.spoken_answer("Quem encontrou o caminho maritimo para o Brazil", lang= ``` -## Remote Persona / Proxies - -You can run any persona behind a OpenAI compatible server via [ovos-persona-server](https://github.com/OpenVoiceOS/ovos-persona-server). +----- -This allows you to offload the workload to a standalone server, either for performance reasons or to keep api keys in a single safe place. +## Remote Persona / Proxies -Then just configure this plugin to point to your persona server like it was OpenAI +You can run any persona behind an **OpenAI-compatible server** (such as [ovos-persona-server](https://github.com/OpenVoiceOS/ovos-persona-server)). +This allows you to offload the workload to a standalone server, either for performance reasons or to keep API keys in a single safe place. Then, you just configure this plugin to point to your self-hosted server as if it were the official OpenAI API. diff --git a/ovos_solver_openai_persona/__init__.py b/ovos_solver_openai_persona/__init__.py index 8fc85e1..7d4f396 100644 --- a/ovos_solver_openai_persona/__init__.py +++ b/ovos_solver_openai_persona/__init__.py @@ -38,4 +38,11 @@ def __init__(self, *args, **kwargs): # Quantum mechanics is a branch of physics that deals with the behavior of particles on a very small scale, such as atoms and subatomic particles. It explores the idea that particles can exist in multiple states at once and that their behavior is not predictable in the traditional sense. print(bot.spoken_answer("what is the definition of computer", lang="en-US")) - # O português Pedro Álvares Cabral encontrou o caminho marítimo para o Brasil em 1500. Ele foi o responsável por descobrir o litoral brasileiro, embora Cristóvão Colombo tenha chegado à América do Sul em 1498, cinco anos antes. Cabral desembarcou na atual costa de Alagoas, no Nordeste do Brasil. + # Okay, let's break down the definition of a computer. Here's a comprehensive explanation, covering different aspects: + # + # At its core, a computer is an electronic device that can: + # + # Receive Input: Take in data and instructions. 
+ # Process Data: Perform calculations and manipulate data based on those instructions. + # Store Data: Save data and instructions for later use. + # Produce Output: Present the results of processing in a \ No newline at end of file diff --git a/ovos_solver_openai_persona/engines.py b/ovos_solver_openai_persona/engines.py index dd25ad6..9c8877f 100644 --- a/ovos_solver_openai_persona/engines.py +++ b/ovos_solver_openai_persona/engines.py @@ -126,7 +126,7 @@ def __init__(self, config=None, self.system_prompt = config.get("system_prompt") or config.get("initial_prompt") if not self.system_prompt: self.system_prompt = "You are a helpful assistant." - LOG.error(f"system prompt not set in config! defaulting to '{self.system_prompt}'") + LOG.debug(f"system prompt not set in config! defaulting to '{self.system_prompt}'") # OpenAI API integration def _do_api_request(self, messages): @@ -256,7 +256,7 @@ def get_messages(self, utt, system_prompt=None) -> MessageList: messages.append({"role": "user", "content": utt}) return messages - # abstract Solver methods + ## chat completions api - message list as input def continue_chat(self, messages: MessageList, lang: Optional[str], units: Optional[str] = None) -> Optional[str]: @@ -317,6 +317,7 @@ def stream_chat_utterances(self, messages: MessageList, yield post_process_sentence(answer) answer = "" + ## completions api - single text as input def stream_utterances(self, query: str, lang: Optional[str] = None, units: Optional[str] = None) -> Iterable[str]: diff --git a/ovos_solver_openai_persona/rag.py b/ovos_solver_openai_persona/rag.py new file mode 100644 index 0000000..ae7861b --- /dev/null +++ b/ovos_solver_openai_persona/rag.py @@ -0,0 +1,500 @@ +import json +import requests +from ovos_plugin_manager.solvers import ChatMessageSolver +from ovos_plugin_manager.templates.language import LanguageTranslator, LanguageDetector +from ovos_utils.log import LOG +from typing import Optional, List, Iterable, Dict, Any + + +class RequestException(Exception): + """Custom exception for API request errors.""" + pass + + +class OpenAIRAGSolver(ChatMessageSolver): + """ + An OVOS Solver plugin that implements Retrieval Augmented Generation (RAG) + by interacting with OpenAI or Persona Server backend for vector store search + and then directly calling the Persona Server's chat completions endpoint + with the augmented context. + """ + + def __init__(self, config: Optional[Dict[str, Any]] = None, + translator: Optional[LanguageTranslator] = None, + detector: Optional[LanguageDetector] = None, + priority: int = 50, + enable_tx: bool = False, + enable_cache: bool = False, + internal_lang: Optional[str] = None): + """ + Initializes the PersonaServerRAGSolver. + + Args: + config (dict): Configuration dictionary for the solver. Expected keys: + - "api_url" (str): Base URL of the ovos-persona-server (e.g., "http://localhost:8337/v1"). + - "vector_store_id" (str): The ID of the vector store to query for RAG. + - "max_num_results" (int, optional): Max number of chunks to retrieve from search. Defaults to 5. + - "max_context_tokens" (int, optional): Max tokens for retrieved context in the LLM prompt. Defaults to 2000. + - "system_prompt_template" (str, optional): Template for the RAG system prompt. + Must contain "{context}" and "{question}" placeholders. + - "llm_model" (str, optional): The model name to use for chat completions on the Persona Server. + - "key" (str, optional): API key for the Persona Server's chat completions endpoint. 
+ - "llm_temperature" (float, optional): Sampling temperature for LLM. Defaults to 0.7. + - "llm_top_p" (float, optional): Top-p sampling for LLM. Defaults to 1.0. + - "llm_max_tokens" (int, optional): Max tokens for LLM generation. Defaults to 500. + translator (LanguageTranslator, optional): Language translator instance. + detector (LanguageDetector, optional): Language detector instance. + priority (int): Solver priority. + enable_tx (bool): Enable translation. + enable_cache (bool): Enable caching. + internal_lang (str, optional): Internal language code. + + Raises: + ValueError: If required configuration parameters are missing or invalid. + """ + super().__init__(config=config, translator=translator, + detector=detector, priority=priority, + enable_tx=enable_tx, enable_cache=enable_cache, + internal_lang=internal_lang) + + # Persona Server RAG Configuration + self.api_url = self.config.get("api_url") + self.vector_store_id = self.config.get("vector_store_id") + self.max_num_results = self.config.get("max_num_results", 5) + self.max_context_tokens = self.config.get("max_context_tokens", 2000) + + if not self.api_url: + raise ValueError("api_url must be set in config for PersonaServerRAGSolver") + if not self.vector_store_id: + raise ValueError("vector_store_id must be set in config for PersonaServerRAGSolver") + + # RAG System Prompt Template + self.system_prompt_template = self.config.get("system_prompt_template") + if not self.system_prompt_template: + self.system_prompt_template = ( + "You are a helpful assistant. Use the following context to answer the user's question. " + "If the answer is not in the context, state that you don't know.\n\n" + "Context:\n{context}\n\nQuestion:\n{question}" + ) + LOG.debug(f"system_prompt_template not set, defaulting to: '{self.system_prompt_template}'") + elif "{context}" not in self.system_prompt_template or "{question}" not in self.system_prompt_template: + raise ValueError("system_prompt_template must contain '{context}' and '{question}' placeholders.") + + # LLM Parameters for direct call to Persona Server's chat completions + self.llm_model = self.config.get("llm_model") + self.key = self.config.get("key") # This is the key for the Persona Server's chat endpoint + self.llm_temperature = self.config.get("llm_temperature", 0.7) + self.llm_top_p = self.config.get("llm_top_p", 1.0) + self.llm_max_tokens = self.config.get("llm_max_tokens", 500) + + if not self.llm_model: + LOG.warning("llm_model not set. This is fine for Persona Server, but ensure your LLM provider allows it") + # key can be optional for local Ollama setups, but good practice to include check + if not self.key: + LOG.warning("key not set. This might be fine for local Ollama, but ensure your Persona Server allows unauthenticated access or provide a key.") + + # Memory for this RAG solver + self.memory = config.get("enable_memory", True) + self.max_utts = config.get("memory_size", 3) + self.qa_pairs = [] # Stores (user_query, final_rag_answer) for history + + def _search_vector_store(self, query: str) -> List[str]: + """ + Performs a search against the ovos-persona-server's vector store. + + Args: + query (str): The user's query string. + + Returns: + List[str]: A list of relevant text chunks (content) retrieved from the vector store. + + Raises: + RequestException: If the search API call fails or returns an error. 
+ """ + search_url = f"{self.api_url}/vector_stores/{self.vector_store_id}/search" + headers = {"Content-Type": "application/json"} + payload = { + "query": query, + "max_num_results": self.max_num_results + } + LOG.debug(f"Sending RAG search request to {search_url} with query: {query}") + + try: + response = requests.post(search_url, headers=headers, data=json.dumps(payload)) + response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx) + data = response.json() + + if "data" not in data or not isinstance(data["data"], list): + raise RequestException(f"Unexpected response format from RAG search: {data}") + + # Extract content from search results + retrieved_chunks = [item["content"] for item in data["data"] if "content" in item] + LOG.debug(f"Retrieved {len(retrieved_chunks)} chunks from vector store.") + return retrieved_chunks + except requests.exceptions.RequestException as e: + LOG.error(f"Error during RAG search request to {search_url}: {e}") + raise RequestException(f"Failed to retrieve context from vector store: {e}") + except json.JSONDecodeError as e: + LOG.error(f"Failed to parse JSON response from RAG search: {e}") + raise RequestException(f"Invalid JSON response from vector store: {e}") + + def _build_llm_messages(self, user_query: str, retrieved_context_chunks: List[str], chat_history: List[Dict[str, str]]) -> List[Dict[str, str]]: + """ + Constructs the complete message list for the LLM, including RAG context and chat history. + + Args: + user_query (str): The current user's utterance. + retrieved_context_chunks (List[str]): List of text chunks retrieved from the vector store. + chat_history (List[Dict[str, str]]): The conversation history from `self.qa_pairs`. + + Returns: + List[Dict[str, str]]: A new list of messages, augmented with the RAG context and history. + """ + context_str = "" + current_context_tokens = 0 + + # Build context string, respecting max_context_tokens + for chunk in retrieved_context_chunks: + # Estimate tokens for the chunk if added to a system prompt + chunk_tokens = len(chunk.split()) # Simple word count as token estimate + if current_context_tokens + chunk_tokens <= self.max_context_tokens: + context_str += chunk + "\n\n" + current_context_tokens += chunk_tokens + else: + LOG.debug(f"Truncating RAG context due to max_context_tokens limit. Added {current_context_tokens} tokens.") + break + + # Construct the RAG-augmented system prompt + rag_system_prompt = self.system_prompt_template.format( + context=context_str.strip(), + question=user_query + ) + + # Start with the RAG-augmented system prompt + messages: List[Dict[str, str]] = [{"role": "system", "content": rag_system_prompt}] + + # Append prior conversation history + for q, a in self.qa_pairs[-1 * self.max_utts:]: # Use RAG solver's memory + messages.append({"role": "user", "content": q}) + messages.append({"role": "assistant", "content": a}) + + # Append the current user query + messages.append({"role": "user", "content": user_query}) + + LOG.debug(f"Constructed LLM prompt messages: {messages}") + return messages + + def get_chat_history(self) -> List[Dict[str, str]]: + """ + Returns the chat history managed by this RAG solver. + This method is called by the base ChatMessageSolver. + """ + # The base class expects a list of messages (role, content). + # We store (query, answer) tuples. 
+ history_messages = [] + for q, a in self.qa_pairs[-1 * self.max_utts:]: + history_messages.append({"role": "user", "content": q}) + history_messages.append({"role": "assistant", "content": a}) + return history_messages + + ## chat completions api - message list as input + def continue_chat(self, messages: List[Dict[str, str]], + lang: Optional[str], + units: Optional[str] = None) -> Optional[str]: + """ + Generates a chat response using RAG by directly calling the Persona Server's + chat completions endpoint. + + Args: + messages: List of chat messages with 'role' and 'content' keys. + The last user message is used for RAG retrieval and as the current query. + lang: Optional language code for the response. + units: Optional unit system for numerical values. + + Returns: + The generated response as a string, or None if no valid response is produced. + """ + user_query = messages[-1]["content"] # Get the current user query + + # 1. Search vector store for context + try: + retrieved_chunks = self._search_vector_store(user_query) + except RequestException: + LOG.warning("RAG search failed, proceeding with LLM without augmented context.") + retrieved_chunks = [] + + # 2. Build augmented messages for the LLM, including RAG solver's history + augmented_messages = self._build_llm_messages(user_query, retrieved_chunks, messages[:-1]) # Pass existing history + + # 3. Call Persona Server's chat completions endpoint + chat_completions_url = f"{self.api_url}/chat/completions" + headers = {"Content-Type": "application/json"} + if self.key: + headers["Authorization"] = f"Bearer {self.key}" + + payload = { + "model": self.llm_model, + "messages": augmented_messages, + "max_tokens": self.llm_max_tokens, + "temperature": self.llm_temperature, + "top_p": self.llm_top_p, + "stream": False # Non-streaming call + } + LOG.debug(f"Sending LLM request to {chat_completions_url} with payload: {payload}") + + try: + response = requests.post(chat_completions_url, headers=headers, data=json.dumps(payload)) + response.raise_for_status() + data = response.json() + + if "choices" in data and len(data["choices"]) > 0 and "message" in data["choices"][0]: + answer = data["choices"][0]["message"]["content"] + if self.memory and answer: + self.qa_pairs.append((user_query, answer)) # Store for future turns + return answer + else: + raise RequestException(f"Unexpected response format from LLM: {data}") + except requests.exceptions.RequestException as e: + LOG.error(f"Error during LLM chat completions request to {chat_completions_url}: {e}") + raise RequestException(f"Failed to get LLM response: {e}") + except json.JSONDecodeError as e: + LOG.error(f"Failed to parse JSON response from LLM: {e}") + raise RequestException(f"Invalid JSON response from LLM: {e}") + + def stream_chat_utterances(self, messages: List[Dict[str, str]], + lang: Optional[str] = None, + units: Optional[str] = None) -> Iterable[str]: # Yields raw data: lines + """ + Stream utterances for the given chat history using RAG by directly calling the Persona Server's + chat completions endpoint in streaming mode. + + Args: + messages: The chat messages. The last user message is used for RAG retrieval and as the current query. + lang (Optional[str]): Optional language code. Defaults to None. + units (Optional[str]): Optional units for the query. Defaults to None. + + Returns: + Iterable[str]: An iterable of raw data: [JSON] strings from the streaming API. + """ + user_query = messages[-1]["content"] # Get the current user query + + # 1. 
Search vector store for context + try: + retrieved_chunks = self._search_vector_store(user_query) + except RequestException: + LOG.warning("RAG search failed, proceeding with LLM without augmented context.") + retrieved_chunks = [] + + # 2. Build augmented messages for the LLM, including RAG solver's history + augmented_messages = self._build_llm_messages(user_query, retrieved_chunks, messages[:-1]) # Pass existing history + + # 3. Call Persona Server's chat completions endpoint in streaming mode + chat_completions_url = f"{self.api_url}/chat/completions" + headers = {"Content-Type": "application/json"} + if self.key: + headers["Authorization"] = f"Bearer {self.key}" + + payload = { + "model": self.llm_model, + "messages": augmented_messages, + "max_tokens": self.llm_max_tokens, + "temperature": self.llm_temperature, + "top_p": self.llm_top_p, + "stream": True # Streaming call + } + LOG.debug(f"Sending streaming LLM request to {chat_completions_url} with payload: {payload}") + + full_answer = "" # To reconstruct the full answer for memory + try: + with requests.post(chat_completions_url, headers=headers, data=json.dumps(payload), stream=True) as response: + response.raise_for_status() + for line in response.iter_lines(): + if line: + decoded_line = line.decode('utf-8') + # The Persona Server already sends "data: " prefix and "[DONE]" + # So we can yield the line directly + if decoded_line.startswith("data: "): + json_part = decoded_line[len("data: "):].strip() + if json_part == "[DONE]": + break + try: + chunk_dict = json.loads(json_part) + content = chunk_dict.get("choices", [{}])[0].get("delta", {}).get("content") + if content: + full_answer += content + except json.JSONDecodeError: + pass # Ignore non-JSON lines + yield decoded_line # Yield the raw data: line + except requests.exceptions.RequestException as e: + LOG.error(f"Error during streaming LLM chat completions request to {chat_completions_url}: {e}") + # Yield an error chunk in the stream + yield f"data: {json.dumps({'error': str(e), 'done': True})}\n\n" + + if self.memory and full_answer: + self.qa_pairs.append((user_query, full_answer)) # Store for future turns + + + ## completions api - single text as input (delegates to chat) + def stream_utterances(self, query: str, + lang: Optional[str] = None, + units: Optional[str] = None) -> Iterable[str]: + """ + Stream utterances for the given query using RAG. + + Args: + query (str): The query text. + lang (Optional[str]): Optional language code. Defaults to None. + units (Optional[str]): Optional units for the query. Defaults to None. + + Returns: + Iterable[str]: An iterable of raw data: [JSON] strings from the streaming API. + """ + # For stream_utterances, we directly build a single-turn message list + # We need to include existing chat history here as well for proper context + messages: List[Dict[str, str]] = self.get_chat_history() + messages.append({"role": "user", "content": query}) + yield from self.stream_chat_utterances(messages, lang, units) + + def get_spoken_answer(self, query: str, + lang: Optional[str] = None, + units: Optional[str] = None) -> Optional[str]: + """ + Obtain the spoken answer for a given query using RAG. + + Args: + query (str): The query text. + lang (Optional[str]): Optional language code. Defaults to None. + units (Optional[str]): Optional units for the query. Defaults to None. + + Returns: + str: The spoken answer as a text response. 
+ """ + # For get_spoken_answer, we need to include existing chat history + messages: List[Dict[str, str]] = self.get_chat_history() + messages.append({"role": "user", "content": query}) + return self.continue_chat(messages=messages, lang=lang, units=units) + + def get_messages(self, utt: str, system_prompt: Optional[str] = None) -> List[Dict[str, str]]: + """ + Builds a message list including the RAG solver's chat history and the current user utterance. + The system prompt for the LLM is constructed dynamically in _build_llm_messages. + """ + messages = self.get_chat_history() + messages.append({"role": "user", "content": utt}) + return messages + + +if __name__ == "__main__": + # --- Live Test Example for PersonaServerRAGSolver --- + # This example assumes: + # 1. ovos-persona-server is running at http://localhost:8337 + # 2. You have a vector store created and populated with documents + # (e.g., using the curl examples from the README) + + # --- IMPORTANT: Replace these with your actual values --- + PERSONA_SERVER_URL = "http://0.0.0.0:8337/v1" + VECTOR_STORE_ID = "vs_YgqHwhmyJ48kkI6jU6G3ApSq" # <<< REPLACE THIS with your vector store ID + LLM_API_KEY = "sk-xxxx" # Can be any non-empty string for local setups like Ollama + LLM_MODEL = "qwen2.5:7b" # Or "gpt-4o-mini", etc. + # ---------------------------------------------------- + + rag_solver_config = { + "api_url": PERSONA_SERVER_URL, + "key": LLM_API_KEY, + "llm_model": LLM_MODEL, + "vector_store_id": VECTOR_STORE_ID, + "max_num_results": 3, # Max chunks to retrieve + "max_context_tokens": 1000, # Max tokens for the context in the LLM prompt + "system_prompt_template": ( + "You are a helpful assistant. Use the following context to answer the user's question. " + "You can give personal opinions and deviate from the goal, but keep things factual." 
+ "If the answer is not in the context and can not be inferred from the conversation history, state that you don't know.\n\n" + "Context:\n{context}\n\nQuestion:\n{question}" + ), + "llm_temperature": 0.7, + "llm_top_p": 1.0, + "llm_max_tokens": 500, + "enable_memory": True, # Enable memory for this solver + "memory_size": 3 # Store 3 Q&A pairs + } + + print("--- Initializing PersonaServerRAGSolver ---") + try: + rag_solver = OpenAIRAGSolver(config=rag_solver_config) + print("Solver initialized successfully.") + except (ValueError, RuntimeError, TypeError) as e: + print(f"Error initializing RAG solver: {e}") + print("Please ensure your configuration (api_url, vector_store_id, llm_model, key) is correct.") + exit(1) + + print("\n--- Testing get_spoken_answer (non-streaming) ---") + test_query_non_streaming = "What is Nedzo" + print(f"Query: {test_query_non_streaming}") + try: + answer = rag_solver.get_spoken_answer(query=test_query_non_streaming, lang="en") + print(f"Answer: {answer}") + except RequestException as e: + print(f"Error during non-streaming RAG query: {e}") + print("Please ensure ovos-persona-server is running and your vector_store_id is valid.") + except Exception as e: + print(f"An unexpected error occurred: {e}") + + print("\n--- Testing stream_utterances (streaming) ---") + test_query_streaming = "what is the purpose and goal" + print(f"Query: {test_query_streaming}") + print("Streaming Answer: ", end="") + try: + for chunk_line in rag_solver.stream_utterances(query=test_query_streaming, lang="en"): + # The Persona Server sends "data: {json_chunk}\n\n" or "data: [DONE]\n\n" + # We need to parse the JSON and extract content + if chunk_line.startswith("data: "): + json_part = chunk_line[len("data: "):].strip() + if json_part == "[DONE]": + break + try: + chunk_dict = json.loads(json_part) + content = chunk_dict.get("choices", [{}])[0].get("delta", {}).get("content") + if content: + print(content, end="", flush=True) + except json.JSONDecodeError: + # Handle cases where the line is not valid JSON (e.g., just "data: ") + pass + print() # Newline after streaming finishes + except RequestException as e: + print(f"\nError during streaming RAG query: {e}") + print("Please ensure ovos-persona-server is running and your vector_store_id is valid.") + except Exception as e: + print(f"\nAn unexpected error occurred: {e}") + + print("\n--- Testing with a query that might not find context ---") + test_query_no_context = "What color is the sky on Mars?" 
+ print(f"Query: {test_query_no_context}") + try: + answer_no_context = rag_solver.get_spoken_answer(query=test_query_no_context, lang="en") + print(f"Answer: {answer_no_context}") + except RequestException as e: + print(f"Error during RAG query (no context): {e}") + except Exception as e: + print(f"An unexpected error occurred: {e}") + + print("\n--- Testing multi-turn conversation with memory ---") + print("Query 1:What is the role of agents?") + try: + answer1 = rag_solver.get_spoken_answer(query="What is the role of agents", lang="en") + print(f"Answer 1: {answer1}") + except Exception as e: + print(f"Error in multi-turn query 1: {e}") + + print("\nQuery 2: how many of them must be supported at once?") + try: + answer2 = rag_solver.get_spoken_answer(query="how many of them must be supported at once?", lang="en") + print(f"Answer 2: {answer2}") + except Exception as e: + print(f"Error in multi-turn query 2: {e}") + + print("\nQuery 3: summarize our conversation so far") + try: + answer3 = rag_solver.get_spoken_answer(query="summarize our conversation so far", lang="en") + print(f"Answer 3: {answer3}") + except Exception as e: + print(f"Error in multi-turn query 3: {e}") diff --git a/setup.py b/setup.py index 31ceec0..c2bbfd9 100755 --- a/setup.py +++ b/setup.py @@ -24,7 +24,6 @@ def required(requirements_file): def get_version(): """ Find the version of the package""" - version = None version_file = os.path.join(BASEDIR, 'ovos_solver_openai_persona', 'version.py') major, minor, build, alpha = (None, None, None, None) with open(version_file) as f: @@ -49,6 +48,7 @@ def get_version(): PERSONA_ENTRY_POINT = 'Remote Llama=ovos_solver_openai_persona:LLAMA_DEMO' PLUGIN_ENTRY_POINT = 'ovos-solver-openai-plugin=ovos_solver_openai_persona.engines:OpenAIChatCompletionsSolver' +RAG_ENTRY_POINT = 'ovos-solver-openai-rag-plugin=ovos_solver_openai_persona.rag:OpenAIRAGSolver' DIALOG_PLUGIN_ENTRY_POINT = 'ovos-dialog-transformer-openai-plugin=ovos_solver_openai_persona.dialog_transformers:OpenAIDialogTransformer' SUMMARIZER_ENTRY_POINT = 'ovos-summarizer-openai-plugin=ovos_solver_openai_persona.summarizer:OpenAISummarizer' @@ -65,7 +65,7 @@ def get_version(): zip_safe=True, keywords='ovos plugin utterance fallback query', entry_points={ - 'neon.plugin.solver': PLUGIN_ENTRY_POINT, + 'opm.solver.chat': [PLUGIN_ENTRY_POINT, RAG_ENTRY_POINT], "opm.transformer.dialog": DIALOG_PLUGIN_ENTRY_POINT, 'opm.solver.summarization': SUMMARIZER_ENTRY_POINT, "opm.plugin.persona": PERSONA_ENTRY_POINT From a06bc25487033ea44db2a1d3731c2371914f7cc0 Mon Sep 17 00:00:00 2001 From: miro Date: Fri, 18 Jul 2025 01:29:23 +0100 Subject: [PATCH 2/2] docstrs --- ovos_solver_openai_persona/engines.py | 50 +++++++------- ovos_solver_openai_persona/rag.py | 95 +++++++++++++++------------ 2 files changed, 78 insertions(+), 67 deletions(-) diff --git a/ovos_solver_openai_persona/engines.py b/ovos_solver_openai_persona/engines.py index 9c8877f..0ac4ebc 100644 --- a/ovos_solver_openai_persona/engines.py +++ b/ovos_solver_openai_persona/engines.py @@ -101,10 +101,10 @@ def __init__(self, config=None, enable_cache: bool = False, internal_lang: Optional[str] = None): """ - Initializes the OpenAIChatCompletionsSolver with API configuration, memory settings, and system prompt. - + Initialize an OpenAIChatCompletionsSolver instance with API configuration, conversation memory settings, and system prompt. + Raises: - ValueError: If the API key is not provided in the configuration. + ValueError: If the API key is missing from the configuration. 
""" super().__init__(config=config, translator=translator, detector=detector, priority=priority, @@ -131,16 +131,16 @@ def __init__(self, config=None, # OpenAI API integration def _do_api_request(self, messages): """ - Sends a chat completion request to the OpenAI API and returns the assistant's reply. + Send a chat completion request to the OpenAI API using the provided conversation history and return the assistant's reply. - Args: - messages: A list of message dictionaries representing the conversation history. + Parameters: + messages (list): Conversation history as a list of message dictionaries. Returns: - The content of the assistant's reply as a string. + str: The assistant's reply content. Raises: - RequestException: If the OpenAI API returns an error in the response. + RequestException: If the OpenAI API response contains an error. """ s = requests.Session() headers = { @@ -243,14 +243,14 @@ def get_chat_history(self, system_prompt=None): def get_messages(self, utt, system_prompt=None) -> MessageList: """ - Builds a list of chat messages including the system prompt, recent conversation history, and the current user utterance. + Constructs a list of chat messages for the API, including the system prompt, recent conversation history, and the current user utterance. - Args: - utt: The current user input to be appended as the latest message. + Parameters: + utt: The current user input to be added as the latest message. system_prompt: Optional system prompt to use as the initial message. Returns: - A list of message dictionaries representing the chat context for the API. + A list of message dictionaries representing the chat context. """ messages = self.get_chat_history(system_prompt) messages.append({"role": "user", "content": utt}) @@ -261,17 +261,17 @@ def continue_chat(self, messages: MessageList, lang: Optional[str], units: Optional[str] = None) -> Optional[str]: """ - Generates a chat response using the provided message history and updates memory if enabled. + Generate a chat response based on the provided message history and update conversation memory if enabled. - If the first message is not a system prompt, prepends the system prompt. Processes the API response and returns a cleaned answer, or None if the answer is empty or only punctuation/underscores. Updates internal memory with the latest question and answer if memory is enabled. + If the first message is not a system prompt, prepends the system prompt. Returns a cleaned response string, or None if the response is empty or contains only punctuation or underscores. Updates internal memory with the latest user message and answer when memory is enabled. - Args: - messages: List of chat messages with 'role' and 'content' keys. - lang: Optional language code for the response. - units: Optional unit system for numerical values. + Parameters: + messages (MessageList): List of chat messages, each with 'role' and 'content' keys. + lang (Optional[str]): Language code for the response. + units (Optional[str]): Unit system for numerical values. Returns: - The generated response as a string, or None if no valid response is produced. + Optional[str]: The generated response string, or None if no valid response is produced. 
""" if messages[0]["role"] != "system": messages = [{"role": "system", "content": self.system_prompt }] + messages @@ -322,15 +322,15 @@ def stream_utterances(self, query: str, lang: Optional[str] = None, units: Optional[str] = None) -> Iterable[str]: """ - Stream utterances for the given query as they become available. + Yields partial responses for a query as they are generated by the chat completions API. - Args: - query (str): The query text. - lang (Optional[str]): Optional language code. Defaults to None. - units (Optional[str]): Optional units for the query. Defaults to None. + Parameters: + query (str): The user query to send to the chat model. + lang (Optional[str]): Language code for the response, if applicable. + units (Optional[str]): Units relevant to the query, if applicable. Returns: - Iterable[str]: An iterable of utterances. + Iterable[str]: An iterator yielding segments of the model's response as they become available. """ messages = self.get_messages(query) yield from self.stream_chat_utterances(messages, lang, units) diff --git a/ovos_solver_openai_persona/rag.py b/ovos_solver_openai_persona/rag.py index ae7861b..0d23ab8 100644 --- a/ovos_solver_openai_persona/rag.py +++ b/ovos_solver_openai_persona/rag.py @@ -100,16 +100,16 @@ def __init__(self, config: Optional[Dict[str, Any]] = None, def _search_vector_store(self, query: str) -> List[str]: """ - Performs a search against the ovos-persona-server's vector store. + Searches the configured vector store for relevant text chunks matching the user query. - Args: - query (str): The user's query string. + Parameters: + query (str): The user's query string to search for relevant context. Returns: - List[str]: A list of relevant text chunks (content) retrieved from the vector store. + List[str]: A list of text chunks retrieved from the vector store that are relevant to the query. Raises: - RequestException: If the search API call fails or returns an error. + RequestException: If the search request fails or the response format is invalid. """ search_url = f"{self.api_url}/vector_stores/{self.vector_store_id}/search" headers = {"Content-Type": "application/json"} @@ -140,15 +140,17 @@ def _search_vector_store(self, query: str) -> List[str]: def _build_llm_messages(self, user_query: str, retrieved_context_chunks: List[str], chat_history: List[Dict[str, str]]) -> List[Dict[str, str]]: """ - Constructs the complete message list for the LLM, including RAG context and chat history. + Constructs the message list for the LLM by combining retrieved context, recent chat history, and the current user query. - Args: - user_query (str): The current user's utterance. - retrieved_context_chunks (List[str]): List of text chunks retrieved from the vector store. - chat_history (List[Dict[str, str]]): The conversation history from `self.qa_pairs`. + The method concatenates relevant context chunks (up to a token limit), formats the system prompt with this context and the user's question, appends recent Q&A pairs from memory, and adds the current user query as the final message. + + Parameters: + user_query (str): The user's current question or utterance. + retrieved_context_chunks (List[str]): Relevant text segments retrieved from the vector store. + chat_history (List[Dict[str, str]]): Previous conversation history. Returns: - List[Dict[str, str]]: A new list of messages, augmented with the RAG context and history. + List[Dict[str, str]]: The complete list of messages to send to the LLM, including system prompt, chat history, and user query. 
""" context_str = "" current_context_tokens = 0 @@ -186,8 +188,10 @@ def _build_llm_messages(self, user_query: str, retrieved_context_chunks: List[st def get_chat_history(self) -> List[Dict[str, str]]: """ - Returns the chat history managed by this RAG solver. - This method is called by the base ChatMessageSolver. + Return the recent chat history as a list of user and assistant messages. + + Returns: + List of message dictionaries representing the most recent question-answer pairs, formatted with roles 'user' and 'assistant'. """ # The base class expects a list of messages (role, content). # We store (query, answer) tuples. @@ -202,17 +206,18 @@ def continue_chat(self, messages: List[Dict[str, str]], lang: Optional[str], units: Optional[str] = None) -> Optional[str]: """ - Generates a chat response using RAG by directly calling the Persona Server's - chat completions endpoint. + Generate a chat response by augmenting the user query with retrieved context from a vector store and sending the constructed prompt to the Persona Server's chat completions endpoint. - Args: - messages: List of chat messages with 'role' and 'content' keys. - The last user message is used for RAG retrieval and as the current query. - lang: Optional language code for the response. - units: Optional unit system for numerical values. + Parameters: + messages (List[Dict[str, str]]): List of chat messages, where the last message is treated as the current user query. + lang (Optional[str]): Optional language code for the response. + units (Optional[str]): Optional unit system for numerical values. Returns: - The generated response as a string, or None if no valid response is produced. + Optional[str]: The generated response as a string, or None if no valid response is produced. + + Raises: + RequestException: If the Persona Server's chat completions endpoint returns an error or an invalid response. """ user_query = messages[-1]["content"] # Get the current user query @@ -265,16 +270,17 @@ def stream_chat_utterances(self, messages: List[Dict[str, str]], lang: Optional[str] = None, units: Optional[str] = None) -> Iterable[str]: # Yields raw data: lines """ - Stream utterances for the given chat history using RAG by directly calling the Persona Server's - chat completions endpoint in streaming mode. + Streams chat completion responses from the Persona Server using Retrieval Augmented Generation (RAG), yielding each line of streamed data as it arrives. - Args: - messages: The chat messages. The last user message is used for RAG retrieval and as the current query. - lang (Optional[str]): Optional language code. Defaults to None. - units (Optional[str]): Optional units for the query. Defaults to None. + The method retrieves relevant context from the vector store based on the latest user query, augments the chat history, and streams the LLM's response line by line. If enabled, it stores the full answer in memory for multi-turn conversations. + + Parameters: + messages (List[Dict[str, str]]): The chat history, with the last message as the current user query. + lang (Optional[str]): Optional language code for the query. + units (Optional[str]): Optional units for the query. Returns: - Iterable[str]: An iterable of raw data: [JSON] strings from the streaming API. + Iterable[str]: Yields each raw data line (as a string) from the streaming API response. 
""" user_query = messages[-1]["content"] # Get the current user query @@ -339,15 +345,15 @@ def stream_utterances(self, query: str, lang: Optional[str] = None, units: Optional[str] = None) -> Iterable[str]: """ - Stream utterances for the given query using RAG. + Streams the assistant's response for a given user query, incorporating current chat history and Retrieval Augmented Generation context. - Args: - query (str): The query text. - lang (Optional[str]): Optional language code. Defaults to None. - units (Optional[str]): Optional units for the query. Defaults to None. + Parameters: + query (str): The user's input query. + lang (Optional[str]): Language code for the response, if applicable. + units (Optional[str]): Units relevant to the query, if applicable. Returns: - Iterable[str]: An iterable of raw data: [JSON] strings from the streaming API. + Iterable[str]: Yields raw data chunks from the streaming chat completions API. """ # For stream_utterances, we directly build a single-turn message list # We need to include existing chat history here as well for proper context @@ -359,15 +365,15 @@ def get_spoken_answer(self, query: str, lang: Optional[str] = None, units: Optional[str] = None) -> Optional[str]: """ - Obtain the spoken answer for a given query using RAG. + Return the assistant's spoken answer to a user query, incorporating recent chat history for context. - Args: - query (str): The query text. - lang (Optional[str]): Optional language code. Defaults to None. - units (Optional[str]): Optional units for the query. Defaults to None. + Parameters: + query (str): The user's input question. + lang (Optional[str]): Language code for the response, if specified. + units (Optional[str]): Units relevant to the query, if specified. Returns: - str: The spoken answer as a text response. + Optional[str]: The assistant's text response, or None if no answer is generated. """ # For get_spoken_answer, we need to include existing chat history messages: List[Dict[str, str]] = self.get_chat_history() @@ -376,8 +382,13 @@ def get_spoken_answer(self, query: str, def get_messages(self, utt: str, system_prompt: Optional[str] = None) -> List[Dict[str, str]]: """ - Builds a message list including the RAG solver's chat history and the current user utterance. - The system prompt for the LLM is constructed dynamically in _build_llm_messages. + Return the current chat history messages with the latest user utterance appended. + + Parameters: + utt (str): The current user utterance to add to the message list. + + Returns: + List of message dictionaries representing the conversation history plus the new user message. """ messages = self.get_chat_history() messages.append({"role": "user", "content": utt})