38 changes: 8 additions & 30 deletions AgentQnA/kubernetes/helm/gaudi-values.yaml
@@ -4,35 +4,13 @@
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values

tgi:
vllm:
  enabled: true
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.0.6"
  resources:
    limits:
      habana.ai/gaudi: 4
  MAX_INPUT_LENGTH: "4096"
  MAX_TOTAL_TOKENS: "8192"
  CUDA_GRAPHS: ""
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
  ENABLE_HPU_GRAPH: "true"
  LIMIT_HPU_GRAPH: "true"
  USE_FLASH_ATTENTION: "true"
  FLASH_ATTENTION_RECOMPUTE: "true"
  extraCmdArgs: ["--sharded","true","--num-shard","4"]
  livenessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
    repository: opea/vllm-gaudi
supervisor:
  llm_endpoint_url: http://{{ .Release.Name }}-vllm
ragagent:
  llm_endpoint_url: http://{{ .Release.Name }}-vllm
sqlagent:
  llm_endpoint_url: http://{{ .Release.Name }}-vllm
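
The llm_endpoint_url values contain Helm template expressions, which only expand if the chart passes them through tpl; assuming it does, {{ .Release.Name }} becomes the release name chosen at install time. A purely hypothetical sketch of how these overrides would resolve for a release installed under the name "agentqna":

# Hypothetical rendered values for a release named "agentqna" (illustration only)
supervisor:
  llm_endpoint_url: http://agentqna-vllm
ragagent:
  llm_endpoint_url: http://agentqna-vllm
sqlagent:
  llm_endpoint_url: http://agentqna-vllm
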
6 changes: 5 additions & 1 deletion AudioQnA/kubernetes/helm/gaudi-values.yaml
@@ -5,7 +5,7 @@ tgi:
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.0.6"
    tag: "2.3.1"
  resources:
    limits:
      habana.ai/gaudi: 1
@@ -33,11 +33,15 @@ tgi:
    failureThreshold: 120

whisper:
  image:
    repository: opea/whisper-gaudi
  resources:
    limits:
      habana.ai/gaudi: 1

speecht5:
  image:
    repository: opea/speecht5-gaudi
  resources:
    limits:
      habana.ai/gaudi: 1
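
With these overrides each service requests a full accelerator (habana.ai/gaudi: 1 for tgi, whisper, and speecht5), so a single-replica AudioQnA deployment needs at least 1 + 1 + 1 = 3 Gaudi devices available on the target node(s).
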
112 changes: 112 additions & 0 deletions ChatQnA/kubernetes/helm/cpu-tgi-values.yaml
@@ -0,0 +1,112 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Override CPU resource request and probe timing values in specific subcharts
#
# RESOURCES
#
# Resource request matching actual resource usage (with enough slack)
# is important when service is scaled up, so that right amount of pods
# get scheduled to right nodes.
#
# Because resource usage depends on the used devices, model, data type
# and SW versions, and this top-level chart has overrides for them,
# resource requests need to be specified here too.
#
# To test service without resource request, use "resources: {}".
#
# PROBES
#
# Inferencing pods startup / warmup takes *much* longer on CPUs than
# with acceleration devices, and their responses are also slower,
# especially when node is running several instances of these services.
#
# Kubernetes restarting pod before its startup finishes, or not
# sending it queries because it's not in ready state due to slow
# readiness responses, does really NOT help in getting faster responses.
#
# => probe timings need to be increased when running on CPU.

vllm:
  enabled: false
tgi:
  enabled: true
  # TODO: add Helm value also for TGI data type option:
  # https://github.com/opea-project/GenAIExamples/issues/330
  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct

  # Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit:
  #resources:
  #  limits:
  #    cpu: 8
  #    memory: 70Gi
  #  requests:
  #    cpu: 6
  #    memory: 65Gi

  livenessProbe:
    initialDelaySeconds: 8
    periodSeconds: 8
    failureThreshold: 24
    timeoutSeconds: 4
  readinessProbe:
    initialDelaySeconds: 16
    periodSeconds: 8
    timeoutSeconds: 4
  startupProbe:
    initialDelaySeconds: 10
    periodSeconds: 5
    failureThreshold: 180
    timeoutSeconds: 2

teirerank:
  RERANK_MODEL_ID: "BAAI/bge-reranker-base"

  # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
  resources:
    limits:
      cpu: 4
      memory: 30Gi
    requests:
      cpu: 2
      memory: 25Gi

  livenessProbe:
    initialDelaySeconds: 8
    periodSeconds: 8
    failureThreshold: 24
    timeoutSeconds: 4
  readinessProbe:
    initialDelaySeconds: 8
    periodSeconds: 8
    timeoutSeconds: 4
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    failureThreshold: 120

tei:
  EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"

  # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
  resources:
    limits:
      cpu: 4
      memory: 4Gi
    requests:
      cpu: 2
      memory: 3Gi

  livenessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    failureThreshold: 24
    timeoutSeconds: 2
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 2
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    failureThreshold: 120
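
If the commented-out TGI resource figures are wanted, they can be supplied from an extra values file layered on top of this one at install time. A minimal sketch, assuming a hypothetical override file named my-tgi-resources.yaml, and noting that the figures above were estimated for Intel/neural-chat-7b-v3-3 rather than the Meta-Llama-3-8B-Instruct model configured here:

# my-tgi-resources.yaml (hypothetical override; numbers taken from the commented block above)
tgi:
  resources:
    limits:
      cpu: 8
      memory: 70Gi
    requests:
      cpu: 6
      memory: 65Gi
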
110 changes: 3 additions & 107 deletions ChatQnA/kubernetes/helm/cpu-values.yaml
@@ -1,109 +1,5 @@
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Override CPU resource request and probe timing values in specific subcharts
#
# RESOURCES
#
# Resource request matching actual resource usage (with enough slack)
# is important when service is scaled up, so that right amount of pods
# get scheduled to right nodes.
#
# Because resource usage depends on the used devices, model, data type
# and SW versions, and this top-level chart has overrides for them,
# resource requests need to be specified here too.
#
# To test service without resource request, use "resources: {}".
#
# PROBES
#
# Inferencing pods startup / warmup takes *much* longer on CPUs than
# with acceleration devices, and their responses are also slower,
# especially when node is running several instances of these services.
#
# Kubernetes restarting pod before its startup finishes, or not
# sending it queries because it's not in ready state due to slow
# readiness responses, does really NOT help in getting faster responses.
#
# => probe timings need to be increased when running on CPU.

tgi:
  # TODO: add Helm value also for TGI data type option:
  # https://github.com/opea-project/GenAIExamples/issues/330
  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3

  # Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit:
  resources:
    limits:
      cpu: 8
      memory: 70Gi
    requests:
      cpu: 6
      memory: 65Gi

  livenessProbe:
    initialDelaySeconds: 8
    periodSeconds: 8
    failureThreshold: 24
    timeoutSeconds: 4
  readinessProbe:
    initialDelaySeconds: 16
    periodSeconds: 8
    timeoutSeconds: 4
  startupProbe:
    initialDelaySeconds: 10
    periodSeconds: 5
    failureThreshold: 180
    timeoutSeconds: 2

teirerank:
  RERANK_MODEL_ID: "BAAI/bge-reranker-base"

  # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
  resources:
    limits:
      cpu: 4
      memory: 30Gi
    requests:
      cpu: 2
      memory: 25Gi

  livenessProbe:
    initialDelaySeconds: 8
    periodSeconds: 8
    failureThreshold: 24
    timeoutSeconds: 4
  readinessProbe:
    initialDelaySeconds: 8
    periodSeconds: 8
    timeoutSeconds: 4
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    failureThreshold: 120

tei:
  EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"

  # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
  resources:
    limits:
      cpu: 4
      memory: 4Gi
    requests:
      cpu: 2
      memory: 3Gi

  livenessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    failureThreshold: 24
    timeoutSeconds: 2
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 2
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    failureThreshold: 120
image:
  repository: opea/chatqna
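
None of the CPU tuning removed above is lost: the same comment block, resource figures, and probe settings reappear in the new cpu-tgi-values.yaml added earlier in this change, now guarded by explicit vllm/tgi enable flags, while cpu-values.yaml itself shrinks to little more than the updated copyright year and this top-level image override.
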
@@ -4,12 +4,15 @@
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values

vllm:
  enabled: false
# TGI: largest bottleneck for ChatQnA
tgi:
  enabled: true
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.0.6"
    tag: "2.3.1"
  resources:
    limits:
      habana.ai/gaudi: 1
5 changes: 2 additions & 3 deletions ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml
@@ -6,9 +6,9 @@

tgi:
  enabled: false

vllm:
  enabled: true
  shmSize: 1Gi
  accelDevice: "gaudi"
  image:
    repository: opea/vllm-gaudi
@@ -19,7 +19,7 @@ vllm:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
    failureThreshold: 180
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
@@ -39,7 +39,6 @@ vllm:
"--max-seq_len-to-capture", "2048"
]


# Reranking: second largest bottleneck when reranking is in use
# (i.e. query context docs have been uploaded with data-prep)
#
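
With periodSeconds: 5, raising failureThreshold from 120 to 180 stretches the probe's budget from roughly 120 × 5 s = 10 minutes to 180 × 5 s = 15 minutes of failed checks. In the other values files in this change the 120 threshold sits on the startup probe, so the extra headroom presumably covers vLLM's longer model warmup on Gaudi.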