38 changes: 8 additions & 30 deletions AgentQnA/kubernetes/helm/gaudi-values.yaml
@@ -4,35 +4,13 @@
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values

tgi:
vllm:
  enabled: true
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.0.6"
  resources:
    limits:
      habana.ai/gaudi: 4
  MAX_INPUT_LENGTH: "4096"
  MAX_TOTAL_TOKENS: "8192"
  CUDA_GRAPHS: ""
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
  ENABLE_HPU_GRAPH: "true"
  LIMIT_HPU_GRAPH: "true"
  USE_FLASH_ATTENTION: "true"
  FLASH_ATTENTION_RECOMPUTE: "true"
  extraCmdArgs: ["--sharded","true","--num-shard","4"]
  livenessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
    repository: opea/vllm-gaudi
supervisor:
  llm_endpoint_url: http://{{ .Release.Name }}-vllm
ragagent:
  llm_endpoint_url: http://{{ .Release.Name }}-vllm
sqlagent:
  llm_endpoint_url: http://{{ .Release.Name }}-vllm
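
The llm_endpoint_url values contain Helm template expressions, which only expand if the chart passes them through tpl; assuming it does, {{ .Release.Name }} becomes the release name chosen at install time. A purely hypothetical sketch of how these overrides would resolve for a release installed under the name "agentqna":

# Hypothetical rendered values for a release named "agentqna" (illustration only)
supervisor:
  llm_endpoint_url: http://agentqna-vllm
ragagent:
  llm_endpoint_url: http://agentqna-vllm
sqlagent:
  llm_endpoint_url: http://agentqna-vllm
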
6 changes: 5 additions & 1 deletion AudioQnA/kubernetes/helm/gaudi-values.yaml
@@ -5,7 +5,7 @@ tgi:
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.0.6"
    tag: "2.3.1"
  resources:
    limits:
      habana.ai/gaudi: 1
@@ -33,11 +33,15 @@ tgi:
    failureThreshold: 120

whisper:
  image:
    repository: opea/whisper-gaudi
  resources:
    limits:
      habana.ai/gaudi: 1

speecht5:
  image:
    repository: opea/speecht5-gaudi
  resources:
    limits:
      habana.ai/gaudi: 1
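
With these overrides each service requests a full accelerator (habana.ai/gaudi: 1 for tgi, whisper, and speecht5), so a single-replica AudioQnA deployment needs at least 1 + 1 + 1 = 3 Gaudi devices available on the target node(s).
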
112 changes: 112 additions & 0 deletions ChatQnA/kubernetes/helm/cpu-tgi-values.yaml
@@ -0,0 +1,112 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Override CPU resource request and probe timing values in specific subcharts
#
# RESOURCES
#
# Resource request matching actual resource usage (with enough slack)
# is important when service is scaled up, so that right amount of pods
# get scheduled to right nodes.
#
# Because resource usage depends on the used devices, model, data type
# and SW versions, and this top-level chart has overrides for them,
# resource requests need to be specified here too.
#
# To test service without resource request, use "resources: {}".
#
# PROBES
#
# Inferencing pods startup / warmup takes *much* longer on CPUs than
# with acceleration devices, and their responses are also slower,
# especially when node is running several instances of these services.
#
# Kubernetes restarting pod before its startup finishes, or not
# sending it queries because it's not in ready state due to slow
# readiness responses, does really NOT help in getting faster responses.
#
# => probe timings need to be increased when running on CPU.

vllm:
  enabled: false
tgi:
  enabled: true
  # TODO: add Helm value also for TGI data type option:
  # https://github.com/opea-project/GenAIExamples/issues/330
  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct

  # Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit:
  #resources:
  #  limits:
  #    cpu: 8
  #    memory: 70Gi
  #  requests:
  #    cpu: 6
  #    memory: 65Gi

  livenessProbe:
    initialDelaySeconds: 8
    periodSeconds: 8
    failureThreshold: 24
    timeoutSeconds: 4
  readinessProbe:
    initialDelaySeconds: 16
    periodSeconds: 8
    timeoutSeconds: 4
  startupProbe:
    initialDelaySeconds: 10
    periodSeconds: 5
    failureThreshold: 180
    timeoutSeconds: 2

teirerank:
  RERANK_MODEL_ID: "BAAI/bge-reranker-base"

  # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
  resources:
    limits:
      cpu: 4
      memory: 30Gi
    requests:
      cpu: 2
      memory: 25Gi

  livenessProbe:
    initialDelaySeconds: 8
    periodSeconds: 8
    failureThreshold: 24
    timeoutSeconds: 4
  readinessProbe:
    initialDelaySeconds: 8
    periodSeconds: 8
    timeoutSeconds: 4
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    failureThreshold: 120

tei:
  EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"

  # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
  resources:
    limits:
      cpu: 4
      memory: 4Gi
    requests:
      cpu: 2
      memory: 3Gi

  livenessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    failureThreshold: 24
    timeoutSeconds: 2
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 2
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    failureThreshold: 120
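
If the commented-out TGI resource figures are wanted, they can be supplied from an extra values file layered on top of this one at install time. A minimal sketch, assuming a hypothetical override file named my-tgi-resources.yaml, and noting that the figures above were estimated for Intel/neural-chat-7b-v3-3 rather than the Meta-Llama-3-8B-Instruct model configured here:

# my-tgi-resources.yaml (hypothetical override; numbers taken from the commented block above)
tgi:
  resources:
    limits:
      cpu: 8
      memory: 70Gi
    requests:
      cpu: 6
      memory: 65Gi
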
110 changes: 3 additions & 107 deletions ChatQnA/kubernetes/helm/cpu-values.yaml
@@ -1,109 +1,5 @@
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Override CPU resource request and probe timing values in specific subcharts
#
# RESOURCES
#
# Resource request matching actual resource usage (with enough slack)
# is important when service is scaled up, so that right amount of pods
# get scheduled to right nodes.
#
# Because resource usage depends on the used devices, model, data type
# and SW versions, and this top-level chart has overrides for them,
# resource requests need to be specified here too.
#
# To test service without resource request, use "resources: {}".
#
# PROBES
#
# Inferencing pods startup / warmup takes *much* longer on CPUs than
# with acceleration devices, and their responses are also slower,
# especially when node is running several instances of these services.
#
# Kubernetes restarting pod before its startup finishes, or not
# sending it queries because it's not in ready state due to slow
# readiness responses, does really NOT help in getting faster responses.
#
# => probe timings need to be increased when running on CPU.

tgi:
  # TODO: add Helm value also for TGI data type option:
  # https://github.com/opea-project/GenAIExamples/issues/330
  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3

  # Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit:
  resources:
    limits:
      cpu: 8
      memory: 70Gi
    requests:
      cpu: 6
      memory: 65Gi

  livenessProbe:
    initialDelaySeconds: 8
    periodSeconds: 8
    failureThreshold: 24
    timeoutSeconds: 4
  readinessProbe:
    initialDelaySeconds: 16
    periodSeconds: 8
    timeoutSeconds: 4
  startupProbe:
    initialDelaySeconds: 10
    periodSeconds: 5
    failureThreshold: 180
    timeoutSeconds: 2

teirerank:
  RERANK_MODEL_ID: "BAAI/bge-reranker-base"

  # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
  resources:
    limits:
      cpu: 4
      memory: 30Gi
    requests:
      cpu: 2
      memory: 25Gi

  livenessProbe:
    initialDelaySeconds: 8
    periodSeconds: 8
    failureThreshold: 24
    timeoutSeconds: 4
  readinessProbe:
    initialDelaySeconds: 8
    periodSeconds: 8
    timeoutSeconds: 4
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    failureThreshold: 120

tei:
  EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"

  # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
  resources:
    limits:
      cpu: 4
      memory: 4Gi
    requests:
      cpu: 2
      memory: 3Gi

  livenessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    failureThreshold: 24
    timeoutSeconds: 2
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 2
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    failureThreshold: 120
image:
  repository: opea/chatqna
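
None of the CPU tuning removed above is lost: the same comment block, resource figures, and probe settings reappear in the new cpu-tgi-values.yaml added earlier in this change, now guarded by explicit vllm/tgi enable flags, while cpu-values.yaml itself shrinks to little more than the updated copyright year and this top-level image override.
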
@@ -4,12 +4,15 @@
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values

vllm:
  enabled: false
# TGI: largest bottleneck for ChatQnA
tgi:
  enabled: true
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.0.6"
    tag: "2.3.1"
  resources:
    limits:
      habana.ai/gaudi: 1
5 changes: 2 additions & 3 deletions ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml
@@ -6,9 +6,9 @@

tgi:
  enabled: false

vllm:
  enabled: true
  shmSize: 1Gi
  accelDevice: "gaudi"
  image:
    repository: opea/vllm-gaudi
@@ -19,7 +19,7 @@ vllm:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
    failureThreshold: 180
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
@@ -39,7 +39,6 @@ vllm:
"--max-seq_len-to-capture", "2048"
]


# Reranking: second largest bottleneck when reranking is in use
# (i.e. query context docs have been uploaded with data-prep)
#
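
With periodSeconds: 5, raising failureThreshold from 120 to 180 stretches the probe's budget from roughly 120 × 5 s = 10 minutes to 180 × 5 s = 15 minutes of failed checks. In the other values files in this change the 120 threshold sits on the startup probe, so the extra headroom presumably covers vLLM's longer model warmup on Gaudi.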