
Support Denvr endpoints with LiteLLM. #2085


Status: Open. Wants to merge 17 commits into base `main`.
36 changes: 17 additions & 19 deletions ChatQnA/chatqna.py
@@ -175,25 +175,23 @@ def align_generator(self, gen, **kwargs):
         # b'data:{"id":"","object":"text_completion","created":1725530204,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.1-native","choices":[{"index":0,"delta":{"role":"assistant","content":"?"},"logprobs":null,"finish_reason":null}]}\n\n'
         for line in gen:
             line = line.decode("utf-8")
-            start = line.find("{")
-            end = line.rfind("}") + 1
-
-            json_str = line[start:end]
-            try:
-                # sometimes yield empty chunk, do a fallback here
-                json_data = json.loads(json_str)
-                if "ops" in json_data and "op" in json_data["ops"][0]:
-                    if "value" in json_data["ops"][0] and isinstance(json_data["ops"][0]["value"], str):
-                        yield f"data: {repr(json_data['ops'][0]['value'].encode('utf-8'))}\n\n"
-                    else:
-                        pass
-                elif (
-                    json_data["choices"][0]["finish_reason"] != "eos_token"
-                    and "content" in json_data["choices"][0]["delta"]
-                ):
-                    yield f"data: {repr(json_data['choices'][0]['delta']['content'].encode('utf-8'))}\n\n"
-            except Exception as e:
-                yield f"data: {repr(json_str.encode('utf-8'))}\n\n"
+            chunks = [chunk.strip() for chunk in line.split("\n\n") if chunk.strip()]
+            for line in chunks:
+                start = line.find("{")
+                end = line.rfind("}") + 1
+                json_str = line[start:end]
+                try:
+                    # sometimes yield empty chunk, do a fallback here
+                    json_data = json.loads(json_str)
+                    if "ops" in json_data and "op" in json_data["ops"][0]:
+                        if "value" in json_data["ops"][0] and isinstance(json_data["ops"][0]["value"], str):
+                            yield f"data: {repr(json_data['ops'][0]['value'].encode('utf-8'))}\n\n"
+                        else:
+                            pass
+                    elif "content" in json_data["choices"][0]["delta"]:
+                        yield f"data: {repr(json_data['choices'][0]['delta']['content'].encode('utf-8'))}\n\n"
+                except Exception as e:
+                    yield f"data: {repr(json_str.encode('utf-8'))}\n\n"
         yield "data: [DONE]\n\n"


12 changes: 12 additions & 0 deletions ChatQnA/docker_compose/intel/cpu/xeon/README.md
@@ -73,6 +73,17 @@ CPU example with Open Telemetry feature:
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```

To deploy the ChatQnA services with remote inference endpoints, set the required environment variables below and run the `compose_remote.yaml` file.

**Note**: Set the `REMOTE_ENDPOINT` variable to "https://api.inference.denvrdata.com" when the remote endpoint to access is "https://api.inference.denvrdata.com/v1/chat/completions".

```bash
export REMOTE_ENDPOINT=<endpoint-url>
export LLM_MODEL_ID=<model-id>
export OPENAI_API_KEY=<API-KEY>

docker compose -f compose_remote.yaml up -d
```

Review comments (Collaborator):

- On `export LLM_MODEL_ID=<model-id>`: good to put a notice on why we need to set LLM_MODEL_ID again; users need to pick one supported by the remote endpoint.
- On `export OPENAI_API_KEY=<API-KEY>`: good to change it from OPENAI_API_KEY to API_KEY since it is not for OpenAI.
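
On the reviewer's point about choosing a supported `LLM_MODEL_ID`: most OpenAI-compatible endpoints can be queried for their model list before deployment. The snippet below is an illustrative sketch only, not part of this PR; it assumes the remote endpoint exposes the standard `/v1/models` route, so verify against the provider's documentation:

```python
# Hypothetical helper (illustration only): list the model IDs served by an
# OpenAI-compatible remote endpoint so a valid LLM_MODEL_ID can be chosen.
import json
import os
import urllib.request

endpoint = os.environ["REMOTE_ENDPOINT"]   # e.g. https://api.inference.denvrdata.com
api_key = os.environ.get("OPENAI_API_KEY", "")

req = urllib.request.Request(
    f"{endpoint}/v1/models",
    headers={"Authorization": f"Bearer {api_key}"},
)
with urllib.request.urlopen(req, timeout=30) as resp:
    for model in json.load(resp)["data"]:
        print(model["id"])
```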

**Note**: developers should build docker image from source when:

- Developing off the git main branch (as the container's ports in the repo may be different from the published docker image).
@@ -147,6 +158,7 @@ In the context of deploying a ChatQnA pipeline on an Intel® Xeon® platform, we
| File | Description |
| ------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [compose.yaml](./compose.yaml) | Default compose file using vllm as serving framework and redis as vector database |
| [compose_remote.yaml](./compose_remote.yaml) | Default compose file using remote inference endpoints and redis as vector database |
| [compose_milvus.yaml](./compose_milvus.yaml) | Uses Milvus as the vector database. All other configurations remain the same as the default |
| [compose_pinecone.yaml](./compose_pinecone.yaml) | Uses Pinecone as the vector database. All other configurations remain the same as the default. For more details, refer to [README_pinecone.md](./README_pinecone.md). |
| [compose_qdrant.yaml](./compose_qdrant.yaml) | Uses Qdrant as the vector database. All other configurations remain the same as the default. For more details, refer to [README_qdrant.md](./README_qdrant.md). |
@@ -102,7 +102,7 @@ services:
       - RERANK_SERVER_HOST_IP=tei-reranking-service
       - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80}
       - LLM_SERVER_HOST_IP=${REMOTE_ENDPOINT}
-      - OPENAI_API_KEY= ${OPENAI_API_KEY}
+      - OPENAI_API_KEY=${OPENAI_API_KEY}
       - LLM_SERVER_PORT=80
       - LLM_MODEL=${LLM_MODEL_ID}
       - LOGFLAG=${LOGFLAG}
5 changes: 0 additions & 5 deletions CodeGen/codegen.py
@@ -181,7 +181,6 @@ async def handle_request(self, request: Request):

# Handle the chat messages to generate the prompt
prompt = handle_message(chat_request.messages)

# Get the agents flag from the request data, default to False if not provided
agents_flag = data.get("agents_flag", False)

@@ -200,7 +199,6 @@ async def handle_request(self, request: Request):

# Initialize the initial inputs with the generated prompt
initial_inputs = {"query": prompt}

# Check if the key index name is provided in the parameters
if parameters.index_name:
if agents_flag:
@@ -268,7 +266,6 @@ async def handle_request(self, request: Request):
result_dict, runtime_graph = await megaservice.schedule(
initial_inputs=initial_inputs, llm_parameters=parameters
)

for node, response in result_dict.items():
# Check if the last microservice in the megaservice is LLM
if (
@@ -277,7 +274,6 @@ async def handle_request(self, request: Request):
and megaservice.services[node].service_type == ServiceType.LLM
):
return response

# Get the response from the last node in the runtime graph
last_node = runtime_graph.all_leaves()[-1]

@@ -288,7 +284,6 @@ async def handle_request(self, request: Request):
response = result_dict[last_node]["text"]
except (KeyError, TypeError):
response = "Response Error"

choices = []
usage = UsageInfo()
choices.append(
16 changes: 16 additions & 0 deletions CodeGen/docker_compose/intel/cpu/xeon/README.md
@@ -91,11 +91,27 @@ Different Docker Compose files are available to select the LLM serving backend.
- **Description:** Uses Hugging Face Text Generation Inference (TGI) optimized for Intel CPUs as the LLM serving engine.
- **Services Deployed:** `codegen-tgi-server`, `codegen-llm-server`, `codegen-tei-embedding-server`, `codegen-retriever-server`, `redis-vector-db`, `codegen-dataprep-server`, `codegen-backend-server`, `codegen-gradio-ui-server`.
- **To Run:**

```bash
# Ensure environment variables (HOST_IP, HF_TOKEN) are set
docker compose -f compose_tgi.yaml up -d
```

#### Deployment with remote endpoints (`compose_remote.yaml`)

- **Compose File:** `compose_remote.yaml`
- **Description:** Uses remote endpoints to access the served LLMs. This is the default configuration except for the LLM serving engine.
- **Services Deployed:** `codegen-tei-embedding-server`, `codegen-retriever-server`, `redis-vector-db`, `codegen-dataprep-server`, `codegen-backend-server`, `codegen-gradio-ui-server`.
- **To Run:**

```bash
export OPENAI_API_KEY=<api-key>
export REMOTE_ENDPOINT=<remote-endpoint> # do not include /v1
export LLM_MODEL_ID=<model-id>
docker compose -f compose_remote.yaml up -d
```

Review comment (Collaborator) on `export OPENAI_API_KEY=<api-key>`: the OPENAI name here is confusing; good to remove the OPENAI term here.
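
Before bringing the stack up, it can help to confirm that the exported values actually work against the remote endpoint. The following is a hedged sketch, not part of this PR; it assumes the endpoint is OpenAI-compatible and that the chosen model is served there:

```python
# Hypothetical sanity check (illustration only): verify REMOTE_ENDPOINT,
# OPENAI_API_KEY, and LLM_MODEL_ID against an OpenAI-compatible
# /v1/chat/completions route before deploying the stack.
import json
import os
import urllib.request

req = urllib.request.Request(
    f"{os.environ['REMOTE_ENDPOINT']}/v1/chat/completions",
    data=json.dumps(
        {
            "model": os.environ["LLM_MODEL_ID"],
            "messages": [{"role": "user", "content": "Write a hello world in Python."}],
            "max_tokens": 64,
        }
    ).encode("utf-8"),
    headers={
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}",
    },
)
with urllib.request.urlopen(req, timeout=60) as resp:
    print(json.load(resp)["choices"][0]["message"]["content"])
```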
### Configuration Parameters
#### Environment Variables
11 changes: 10 additions & 1 deletion CodeGen/docker_compose/intel/cpu/xeon/compose_remote.yaml
@@ -6,6 +6,9 @@ services:
codegen-xeon-backend-server:
image: ${REGISTRY:-opea}/codegen:${TAG:-latest}
container_name: codegen-xeon-backend-server
depends_on:
dataprep-redis-server:
condition: service_healthy
ports:
- "7778:7778"
environment:
@@ -14,7 +17,8 @@ services:
       - http_proxy=${http_proxy}
       - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
       - LLM_SERVICE_HOST_IP=${REMOTE_ENDPOINT}
-      - OPENAI_API_KEY= ${OPENAI_API_KEY}
+      - LLM_MODEL_ID=${LLM_MODEL_ID}
+      - OPENAI_API_KEY=${OPENAI_API_KEY}
       - RETRIEVAL_SERVICE_HOST_IP=${RETRIEVAL_SERVICE_HOST_IP}
       - REDIS_RETRIEVER_PORT=${REDIS_RETRIEVER_PORT}
       - TEI_EMBEDDING_HOST_IP=${TEI_EMBEDDING_HOST_IP}
@@ -61,6 +65,11 @@ services:
INDEX_NAME: ${INDEX_NAME}
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
LOGFLAG: true
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"]
interval: 10s
timeout: 5s
retries: 10
restart: unless-stopped
tei-embedding-serving:
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
21 changes: 17 additions & 4 deletions DocSum/docker_compose/intel/cpu/xeon/README.md
@@ -52,6 +52,18 @@ cd intel/cpu/xeon/
docker compose up -d
```

To deploy the DocSum services with remote inference endpoints, set the required environment variables below and run the `compose_remote.yaml` file.

**Note**: Set the `LLM_ENDPOINT` variable to "https://api.inference.denvrdata.com" when the remote endpoint to access is "https://api.inference.denvrdata.com/v1/chat/completions".

```bash
export LLM_ENDPOINT=<endpoint-url>
export LLM_MODEL_ID=<model-id>
export OPENAI_API_KEY=<API-KEY>

docker compose -f compose_remote.yaml up -d
```

Review comment (Collaborator) on `export LLM_MODEL_ID=<model-id>`: better to mention how users can set LLM_MODEL_ID correctly by getting the supported model list.

**Note**: developers should build docker image from source when:

- Developing off the git main branch (as the container's ports in the repo may be different from the published docker image).
@@ -113,10 +125,11 @@ All the DocSum containers will be stopped and then removed on completion of the

In the context of deploying a DocSum pipeline on an Intel® Xeon® platform, we can pick and choose different large language model serving frameworks. The table below outlines the various configurations that are available as part of the application.

-| File                                    | Description                                                                                 |
-| --------------------------------------- | ------------------------------------------------------------------------------------------- |
-| [compose.yaml](./compose.yaml)          | Default compose file using vllm as serving framework                                         |
-| [compose_tgi.yaml](./compose_tgi.yaml)  | The LLM serving framework is TGI. All other configurations remain the same as the default    |
+| File                                          | Description                                                                              |
+| --------------------------------------------- | ---------------------------------------------------------------------------------------- |
+| [compose.yaml](./compose.yaml)                | Default compose file using vllm as serving framework                                      |
+| [compose_tgi.yaml](./compose_tgi.yaml)        | The LLM serving framework is TGI. All other configurations remain the same as default     |
+| [compose_remote.yaml](./compose_remote.yaml)  | Uses remote inference endpoints for LLMs. All other configurations are same as default    |

## DocSum Detailed Usage

73 changes: 73 additions & 0 deletions DocSum/docker_compose/intel/cpu/xeon/compose_remote.yaml
@@ -0,0 +1,73 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
> **Review comment (letonghan, Jun 30, 2025):** I suggest to provide a corresponding test script like `test_compose_remote_on_xeon.sh` to make sure this yaml works well.

llm-docsum-vllm:
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
container_name: docsum-xeon-llm-server
ports:
- ${LLM_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLM_ENDPOINT: ${LLM_ENDPOINT}
> **Review comment (Collaborator):** should this be `LLM_ENDPOINT: ${REMOTE_ENDPOINT}`?

LLM_MODEL_ID: ${LLM_MODEL_ID}
OPENAI_API_KEY: ${OPENAI_API_KEY}
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
HF_TOKEN: ${HF_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped

whisper:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: docsum-xeon-whisper-server
ports:
- "7066:7066"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped

docsum-xeon-backend-server:
image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
container_name: docsum-xeon-backend-server
depends_on:
- llm-docsum-vllm
ports:
- "${BACKEND_SERVICE_PORT:-8888}:8888"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
- LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
- ASR_SERVICE_HOST_IP=${ASR_SERVICE_HOST_IP}
ipc: host
restart: always

docsum-gradio-ui:
image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
container_name: docsum-xeon-ui-server
depends_on:
- docsum-xeon-backend-server
ports:
- "${FRONTEND_SERVICE_PORT:-5173}:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- BACKEND_SERVICE_ENDPOINT=${BACKEND_SERVICE_ENDPOINT}
- DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
ipc: host
restart: always

networks:
default:
driver: bridge
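
Following up on the review suggestion above about a `test_compose_remote_on_xeon.sh` script: most of such a test is waiting for the published services to come up before running the documented curl checks. A minimal readiness sketch (hypothetical, not part of the PR; the ports are the defaults published by this compose file):

```python
# Hypothetical readiness check (illustration only) for the services published by
# this compose file: poll each host port until it accepts a TCP connection.
import socket
import time

SERVICES = {
    "docsum-xeon-llm-server": 9000,      # ${LLM_PORT:-9000}
    "docsum-xeon-whisper-server": 7066,
    "docsum-xeon-backend-server": 8888,  # ${BACKEND_SERVICE_PORT:-8888}
    "docsum-xeon-ui-server": 5173,       # ${FRONTEND_SERVICE_PORT:-5173}
}


def wait_for(port: int, host: str = "localhost", timeout: float = 300.0) -> None:
    """Block until host:port accepts a TCP connection or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with socket.create_connection((host, port), timeout=2):
                return
        except OSError:
            time.sleep(5)
    raise TimeoutError(f"{host}:{port} not reachable after {timeout}s")


for name, port in SERVICES.items():
    wait_for(port)
    print(f"{name} is listening on :{port}")
```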