
Commit a386b20

Merge pull request #114 from TNG/llama-cpp-squashed
**Description:** This adds a new Python backend which uses llama-cpp-python as an inference backend for the Answer section. This allows users to run text generation using single-file GGUF models.

**Changes Made:**
* add llama.cpp backend
* add installation management for llama.cpp
* adjust build scripts
* adjust add model dialog

**Testing Done:** Tested locally on BMG.

**Checklist:**
- [x] I have tested the changes locally.
- [x] I have self-reviewed the code changes.
2 parents 2557c88 + ca365d1 commit a386b20
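For orientation, the backend added here wraps llama-cpp-python's `Llama` class. Below is a minimal sketch of the streaming chat-completion pattern it relies on, assuming a GGUF file already on disk; the model path and file name are placeholders, not something introduced by this PR.

```python
# Minimal sketch of the llama-cpp-python streaming pattern the new backend wraps.
# The model path below is a placeholder; any single-file GGUF model works.
from llama_cpp import Llama

llm = Llama(
    model_path="./models/llm/ggufLLM/example-model-q4_k_m.gguf",  # placeholder path
    n_gpu_layers=-1,   # offload all layers to the GPU where possible
    n_ctx=16000,       # context window, mirroring the backend's default
)

stream = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Hello!"}],
    stream=True,
)
for chunk in stream:
    # Each chunk follows the OpenAI-style streaming delta format.
    print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)
```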

41 files changed (+915, -70 lines)

.gitignore

Lines changed: 2 additions & 0 deletions
```diff
@@ -10,6 +10,8 @@ WebUI/external/service/
 *.7z
 *.whl
 ComfyUI/
+env
+llama-cpp-env/
 *env_tmp/
 *service_tmp/
 *-env/
```

LlamaCPP/.gitignore

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
```
.vscode/
__pycache__/
models/llm/
temp/
test/
dist/
build/
cache/
test/
env/

!tools/*.exe
```

LlamaCPP/llama_adapter.py

Lines changed: 187 additions & 0 deletions
@@ -0,0 +1,187 @@
```python
import threading
from queue import Empty, Queue
import json
import traceback
from typing import Dict, List, Callable
# from model_downloader import NotEnoughDiskSpaceException, DownloadException
# from psutil._common import bytes2human
from llama_interface import LLMInterface
from llama_params import LLMParams


RAG_PROMPT_FORMAT = "Answer the questions based on the information below. \n{context}\n\nQuestion: {prompt}"


class LLM_SSE_Adapter:
    msg_queue: Queue
    finish: bool
    signal: threading.Event
    llm_interface: LLMInterface
    should_stop: bool

    def __init__(self, llm_interface: LLMInterface):
        self.msg_queue = Queue(-1)
        self.finish = False
        self.signal = threading.Event()
        self.llm_interface = llm_interface
        self.should_stop = False

    def put_msg(self, data):
        self.msg_queue.put_nowait(data)
        self.signal.set()

    def load_model_callback(self, event: str):
        data = {"type": "load_model", "event": event}
        self.put_msg(data)

    def text_in_callback(self, msg: str):
        data = {"type": "text_in", "value": msg}
        self.put_msg(data)

    def text_out_callback(self, msg: str, type=1):
        data = {"type": "text_out", "value": msg, "dtype": type}
        self.put_msg(data)

    def first_latency_callback(self, first_latency: str):
        data = {"type": "first_token_latency", "value": first_latency}
        self.put_msg(data)

    def after_latency_callback(self, after_latency: str):
        data = {"type": "after_token_latency", "value": after_latency}
        self.put_msg(data)

    def sr_latency_callback(self, sr_latency: str):
        data = {"type": "sr_latency", "value": sr_latency}
        self.put_msg(data)

    def error_callback(self, ex: Exception):
        if (
            isinstance(ex, NotImplementedError)
            and ex.__str__() == "Access to repositories lists is not implemented."
        ):
            self.put_msg(
                {
                    "type": "error",
                    "err_type": "repositories_not_found",
                }
            )
        # elif isinstance(ex, NotEnoughDiskSpaceException):
        #     self.put_msg(
        #         {
        #             "type": "error",
        #             "err_type": "not_enough_disk_space",
        #             "need": bytes2human(ex.requires_space),
        #             "free": bytes2human(ex.free_space),
        #         }
        #     )
        # elif isinstance(ex, DownloadException):
        #     self.put_msg({"type": "error", "err_type": "download_exception"})
        # # elif isinstance(ex, llm_biz.StopGenerateException):
        # #     pass
        elif isinstance(ex, RuntimeError):
            self.put_msg({"type": "error", "err_type": "runtime_error"})
        else:
            self.put_msg({"type": "error", "err_type": "unknow_exception"})
        print(f"exception:{str(ex)}")

    def text_conversation(self, params: LLMParams):
        thread = threading.Thread(
            target=self.text_conversation_run,
            args=[params],
        )
        thread.start()
        return self.generator()

    def stream_function(self, stream):
        for output in stream:
            if self.llm_interface.stop_generate:
                self.llm_interface.stop_generate = False
                break

            if self.llm_interface.get_backend_type() == "ipex_llm":
                # transformers style: the stream yields plain text chunks
                self.text_out_callback(output)
            else:
                # OpenAI style: the stream yields chat-completion chunk dicts
                self.text_out_callback(output["choices"][0]["delta"].get("content", ""))
        self.put_msg({"type": "finish"})

    def text_conversation_run(
        self,
        params: LLMParams,
    ):
        try:
            if not self.llm_interface._model:
                self.load_model_callback("start")
                self.llm_interface.load_model(params)
                self.load_model_callback("finish")

            prompt = params.prompt
            if params.enable_rag:
                last_prompt = prompt[-1]
                last_prompt["question"] = process_rag(
                    last_prompt.get("question"), params.device
                )

            full_prompt = convert_prompt(prompt)
            stream = self.llm_interface.create_chat_completion(full_prompt)
            self.stream_function(stream)

        except Exception as ex:
            traceback.print_exc()
            self.error_callback(ex)
        finally:
            self.finish = True
            self.signal.set()

    def generator(self):
        # Drain queued messages as SSE-style "data:<json>" frames (NUL-terminated),
        # blocking until new data arrives or generation finishes.
        while True:
            while not self.msg_queue.empty():
                try:
                    data = self.msg_queue.get_nowait()
                    msg = f"data:{json.dumps(data)}\0"
                    print(msg)
                    yield msg
                except Empty:
                    break
            if not self.finish:
                self.signal.clear()
                self.signal.wait()
            else:
                break


_default_prompt = {
    "role": "system",
    "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user. Please keep the output text language the same as the user input.",
}


def convert_prompt(prompt: List[Dict[str, str]]):
    chat_history = [_default_prompt]
    prompt_len = len(prompt)
    i = 0
    while i < prompt_len:
        chat_history.append({"role": "user", "content": prompt[i].get("question")})
        if i < prompt_len - 1:
            chat_history.append(
                {"role": "assistant", "content": prompt[i].get("answer")}
            )
        i = i + 1
    return chat_history


def process_rag(
    prompt: str,
    device: str,
    text_out_callback: Callable[[str, int], None] = None,
):
    import rag

    rag.to(device)
    query_success, context, rag_source = rag.query(prompt)
    if query_success:
        print("rag query input\r\n{}output:\r\n{}".format(prompt, context))
        prompt = RAG_PROMPT_FORMAT.format(prompt=prompt, context=context)
        if text_out_callback is not None:
            text_out_callback(rag_source, 2)
    return prompt
```
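The diff does not show how this adapter is exposed over HTTP, but `llama_cpp_test.py` below posts to `/api/llm/chat`, so the service presumably streams the generator back to the client. The following is a purely hypothetical sketch of that wiring, assuming a Flask-style app; the framework, route function, and port handling here are illustrative assumptions, not part of this commit.

```python
# Hypothetical wiring sketch (not part of this diff): serving the adapter's
# generator as a streaming HTTP response with Flask. The endpoint path matches
# the one exercised by llama_cpp_test.py.
from flask import Flask, Response, request

from llama_adapter import LLM_SSE_Adapter
from llama_cpp_backend import LlamaCpp
from llama_params import LLMParams

app = Flask(__name__)
llm_backend = LlamaCpp()


@app.route("/api/llm/chat", methods=["POST"])
def chat():
    body = request.get_json()
    params = LLMParams(
        prompt=body["prompt"],
        device=body["device"],
        enable_rag=body["enable_rag"],
        model_repo_id=body["model_repo_id"],
    )
    adapter = LLM_SSE_Adapter(llm_backend)
    # The adapter starts generation on a worker thread and returns a generator.
    return Response(adapter.text_conversation(params), mimetype="text/event-stream")


if __name__ == "__main__":
    app.run(port=59003)
```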

LlamaCPP/llama_cpp_backend.py

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
```python
from typing import Dict, List
from os import path
from llama_interface import LLMInterface
from llama_cpp import CreateChatCompletionStreamResponse, Iterator, Llama
from llama_params import LLMParams
import model_config
import gc


class LlamaCpp(LLMInterface):
    def __init__(self):
        self._model = None
        self.stop_generate = False
        self._last_repo_id = None

    def load_model(self, params: LLMParams, n_gpu_layers: int = -1, context_length: int = 16000):
        model_repo_id = params.model_repo_id
        if self._model is None or self._last_repo_id != model_repo_id:
            self.unload_model()

            model_base_path = model_config.llamaCppConfig.get("ggufLLM")
            namespace, repo, *model = model_repo_id.split("/")
            model_path = path.abspath(
                path.join(model_base_path, "---".join([namespace, repo]), "---".join(model))
            )

            self._model = Llama(
                model_path=model_path,
                n_gpu_layers=n_gpu_layers,
                n_ctx=context_length,
            )

            self._last_repo_id = model_repo_id

    def create_chat_completion(self, messages: List[Dict[str, str]]):
        completion: Iterator[CreateChatCompletionStreamResponse] = self._model.create_chat_completion(
            messages=messages,
            stream=True,
        )
        return completion

    def unload_model(self):
        if self._model is not None:
            self._model.close()
            del self._model
            gc.collect()
        self._model = None

    def get_backend_type(self):
        return "llama_cpp"
```

LlamaCPP/llama_cpp_test.py

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
```python
import requests


url = "http://127.0.0.1:59003/api/llm/chat"
params = {
    "prompt": [{"question": "Who is the president of the United States in 5 years?"}],
    "device": "",
    "enable_rag": False,
    "model_repo_id": "meta-llama-3.1-8b-instruct-q5_k_m.gguf",
}
response = requests.post(url, json=params, stream=True)
# Check if the response status code is 200 (OK)
response.raise_for_status()
e = 1
# Iterate over the response lines
for line in response.iter_lines():
    e += 1
    if line:
        # Decode the line (assuming UTF-8 encoding)
        decoded_line = line.decode('utf-8')

        # SSE events typically start with "data: "
        if decoded_line.startswith("data:"):
            # Extract the data part
            data = decoded_line[len("data:"):]
            print(data)  # Process the data as needed
```

LlamaCPP/llama_interface.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
```python
from abc import ABC, abstractmethod
from typing import Dict, List, Optional
from llama_params import LLMParams


class LLMInterface(ABC):
    stop_generate: bool
    _model: Optional[object]

    @abstractmethod
    def load_model(self, params: LLMParams, **kwargs):
        pass

    @abstractmethod
    def unload_model(self):
        pass

    @abstractmethod
    def create_chat_completion(self, messages: List[Dict[str, str]]):
        pass

    @abstractmethod
    def get_backend_type(self):
        pass
```
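Any other backend only has to satisfy these four methods plus the `stop_generate` and `_model` attributes. As a hypothetical sketch (not part of this commit), a trivial echo backend that streams OpenAI-style chunks compatible with `LLM_SSE_Adapter.stream_function` could look like this:

```python
# Hypothetical sketch: a trivial backend implementing LLMInterface, useful for
# exercising LLM_SSE_Adapter without loading any model. Not part of this diff.
from typing import Dict, List

from llama_interface import LLMInterface
from llama_params import LLMParams


class EchoBackend(LLMInterface):
    def __init__(self):
        self._model = None
        self.stop_generate = False

    def load_model(self, params: LLMParams, **kwargs):
        self._model = object()  # pretend something was loaded

    def unload_model(self):
        self._model = None

    def create_chat_completion(self, messages: List[Dict[str, str]]):
        # Yield OpenAI-style streaming chunks echoing the last user message,
        # which is the shape stream_function expects for non-ipex_llm backends.
        text = messages[-1]["content"] or ""
        for token in text.split():
            yield {"choices": [{"delta": {"content": token + " "}}]}

    def get_backend_type(self):
        return "echo"
```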

LlamaCPP/llama_params.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
```python
from typing import Dict, List


class LLMParams:
    prompt: List[Dict[str, str]]
    device: int
    enable_rag: bool
    model_repo_id: str

    def __init__(
        self, prompt: list, device: int, enable_rag: bool, model_repo_id: str
    ) -> None:
        self.prompt = prompt
        self.device = device
        self.enable_rag = enable_rag
        self.model_repo_id = model_repo_id
```
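For reference, these fields mirror the JSON body that `llama_cpp_test.py` sends; an illustrative construction with the values copied from that test script:

```python
# Illustrative only: LLMParams built from the same payload llama_cpp_test.py
# posts to /api/llm/chat.
from llama_params import LLMParams

params = LLMParams(
    prompt=[{"question": "Who is the president of the United States in 5 years?"}],
    device="",            # note: annotated as int above, but the test sends a string
    enable_rag=False,
    model_repo_id="meta-llama-3.1-8b-instruct-q5_k_m.gguf",
)
```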
