
Commit ee0e5cc

Sync value files from GenAIInfra (#1428)
All Gaudi values files were updated with extra flags. Added Helm support for two new examples, Text2Image and SearchQnA. Minor fix for llm-uservice.

Signed-off-by: Dolpher Du <[email protected]>
1 parent 5c36443 commit ee0e5cc

34 files changed: +331 −1475 lines
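A recurring pattern across the synced files is a top-level override that selects the serving backend per example by toggling the tgi and vllm subcharts. A minimal sketch of such an override, modeled on the diffs below (the keys and image name are taken from this commit, but treat it as illustrative rather than a complete values file):

    # Switch the LLM backend from TGI to vLLM on Gaudi
    tgi:
      enabled: false
    vllm:
      enabled: true
      image:
        repository: opea/vllm-gaudi

Applied with Helm's standard -f/--values flag at install or upgrade time, such a file overrides the subchart defaults.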

AgentQnA/kubernetes/helm/gaudi-values.yaml

Lines changed: 8 additions & 30 deletions
@@ -4,35 +4,13 @@
 # Accelerate inferencing in heaviest components to improve performance
 # by overriding their subchart values
 
-tgi:
+vllm:
   enabled: true
-  accelDevice: "gaudi"
   image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.0.6"
-  resources:
-    limits:
-      habana.ai/gaudi: 4
-  MAX_INPUT_LENGTH: "4096"
-  MAX_TOTAL_TOKENS: "8192"
-  CUDA_GRAPHS: ""
-  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
-  ENABLE_HPU_GRAPH: "true"
-  LIMIT_HPU_GRAPH: "true"
-  USE_FLASH_ATTENTION: "true"
-  FLASH_ATTENTION_RECOMPUTE: "true"
-  extraCmdArgs: ["--sharded","true","--num-shard","4"]
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+    repository: opea/vllm-gaudi
+supervisor:
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
+ragagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
+sqlagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
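Note that these endpoint URLs contain Helm template syntax inside a values file. Helm does not template values files on its own, so the chart presumably expands them via the tpl function. Assuming a release named agentqna, each agent's endpoint would render as:

    supervisor:
      llm_endpoint_url: http://agentqna-vllm

i.e. all three agents talk to the vLLM service created by the same release.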

AudioQnA/kubernetes/helm/gaudi-values.yaml

Lines changed: 5 additions & 1 deletion
@@ -5,7 +5,7 @@ tgi:
   accelDevice: "gaudi"
   image:
     repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.0.6"
+    tag: "2.3.1"
   resources:
     limits:
       habana.ai/gaudi: 1
@@ -33,11 +33,15 @@ tgi:
     failureThreshold: 120
 
 whisper:
+  image:
+    repository: opea/whisper-gaudi
   resources:
     limits:
       habana.ai/gaudi: 1
 
 speecht5:
+  image:
+    repository: opea/speecht5-gaudi
   resources:
     limits:
       habana.ai/gaudi: 1
Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Override CPU resource request and probe timing values in specific subcharts
+#
+# RESOURCES
+#
+# Resource request matching actual resource usage (with enough slack)
+# is important when service is scaled up, so that right amount of pods
+# get scheduled to right nodes.
+#
+# Because resource usage depends on the used devices, model, data type
+# and SW versions, and this top-level chart has overrides for them,
+# resource requests need to be specified here too.
+#
+# To test service without resource request, use "resources: {}".
+#
+# PROBES
+#
+# Inferencing pods startup / warmup takes *much* longer on CPUs than
+# with acceleration devices, and their responses are also slower,
+# especially when node is running several instances of these services.
+#
+# Kubernetes restarting pod before its startup finishes, or not
+# sending it queries because it's not in ready state due to slow
+# readiness responses, does really NOT help in getting faster responses.
+#
+# => probe timings need to be increased when running on CPU.
+
+vllm:
+  enabled: false
+tgi:
+  enabled: true
+  # TODO: add Helm value also for TGI data type option:
+  # https://github.com/opea-project/GenAIExamples/issues/330
+  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
+
+  # Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit:
+  #resources:
+  #  limits:
+  #    cpu: 8
+  #    memory: 70Gi
+  #  requests:
+  #    cpu: 6
+  #    memory: 65Gi
+
+  livenessProbe:
+    initialDelaySeconds: 8
+    periodSeconds: 8
+    failureThreshold: 24
+    timeoutSeconds: 4
+  readinessProbe:
+    initialDelaySeconds: 16
+    periodSeconds: 8
+    timeoutSeconds: 4
+  startupProbe:
+    initialDelaySeconds: 10
+    periodSeconds: 5
+    failureThreshold: 180
+    timeoutSeconds: 2
+
+teirerank:
+  RERANK_MODEL_ID: "BAAI/bge-reranker-base"
+
+  # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
+  resources:
+    limits:
+      cpu: 4
+      memory: 30Gi
+    requests:
+      cpu: 2
+      memory: 25Gi
+
+  livenessProbe:
+    initialDelaySeconds: 8
+    periodSeconds: 8
+    failureThreshold: 24
+    timeoutSeconds: 4
+  readinessProbe:
+    initialDelaySeconds: 8
+    periodSeconds: 8
+    timeoutSeconds: 4
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    failureThreshold: 120
+
+tei:
+  EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"
+
+  # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
+  resources:
+    limits:
+      cpu: 4
+      memory: 4Gi
+    requests:
+      cpu: 2
+      memory: 3Gi
+
+  livenessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    failureThreshold: 24
+    timeoutSeconds: 2
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 2
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    failureThreshold: 120
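As a worked example of why the probe timings above are raised for CPU (standard Kubernetes probe semantics, not part of the chart): a pod gets roughly initialDelaySeconds + periodSeconds × failureThreshold of startup time before it is restarted.

    # Startup budget implied by the TGI settings above:
    #   10 + 5 * 180 = 910s (~15 minutes)
    # versus 5 + 5 * 120 = 605s (~10 minutes) for the accelerated
    # startup probes used elsewhere in this commit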
Lines changed: 3 additions & 107 deletions
@@ -1,109 +1,5 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-# Override CPU resource request and probe timing values in specific subcharts
-#
-# RESOURCES
-#
-# Resource request matching actual resource usage (with enough slack)
-# is important when service is scaled up, so that right amount of pods
-# get scheduled to right nodes.
-#
-# Because resource usage depends on the used devices, model, data type
-# and SW versions, and this top-level chart has overrides for them,
-# resource requests need to be specified here too.
-#
-# To test service without resource request, use "resources: {}".
-#
-# PROBES
-#
-# Inferencing pods startup / warmup takes *much* longer on CPUs than
-# with acceleration devices, and their responses are also slower,
-# especially when node is running several instances of these services.
-#
-# Kubernetes restarting pod before its startup finishes, or not
-# sending it queries because it's not in ready state due to slow
-# readiness responses, does really NOT help in getting faster responses.
-#
-# => probe timings need to be increased when running on CPU.
-
-tgi:
-  # TODO: add Helm value also for TGI data type option:
-  # https://github.com/opea-project/GenAIExamples/issues/330
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-
-  # Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit:
-  resources:
-    limits:
-      cpu: 8
-      memory: 70Gi
-    requests:
-      cpu: 6
-      memory: 65Gi
-
-  livenessProbe:
-    initialDelaySeconds: 8
-    periodSeconds: 8
-    failureThreshold: 24
-    timeoutSeconds: 4
-  readinessProbe:
-    initialDelaySeconds: 16
-    periodSeconds: 8
-    timeoutSeconds: 4
-  startupProbe:
-    initialDelaySeconds: 10
-    periodSeconds: 5
-    failureThreshold: 180
-    timeoutSeconds: 2
-
-teirerank:
-  RERANK_MODEL_ID: "BAAI/bge-reranker-base"
-
-  # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
-  resources:
-    limits:
-      cpu: 4
-      memory: 30Gi
-    requests:
-      cpu: 2
-      memory: 25Gi
-
-  livenessProbe:
-    initialDelaySeconds: 8
-    periodSeconds: 8
-    failureThreshold: 24
-    timeoutSeconds: 4
-  readinessProbe:
-    initialDelaySeconds: 8
-    periodSeconds: 8
-    timeoutSeconds: 4
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    failureThreshold: 120
-
-tei:
-  EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"
-
-  # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
-  resources:
-    limits:
-      cpu: 4
-      memory: 4Gi
-    requests:
-      cpu: 2
-      memory: 3Gi
-
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    failureThreshold: 24
-    timeoutSeconds: 2
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 2
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    failureThreshold: 120
+image:
+  repository: opea/chatqna

ChatQnA/kubernetes/helm/gaudi-values.yaml renamed to ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml

Lines changed: 4 additions & 1 deletion
@@ -4,12 +4,15 @@
 # Accelerate inferencing in heaviest components to improve performance
 # by overriding their subchart values
 
+vllm:
+  enabled: false
 # TGI: largest bottleneck for ChatQnA
 tgi:
+  enabled: true
   accelDevice: "gaudi"
   image:
     repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.0.6"
+    tag: "2.3.1"
   resources:
     limits:
       habana.ai/gaudi: 1

ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml

Lines changed: 2 additions & 3 deletions
@@ -6,9 +6,9 @@
 
 tgi:
   enabled: false
-
 vllm:
   enabled: true
+  shmSize: 1Gi
   accelDevice: "gaudi"
   image:
     repository: opea/vllm-gaudi
@@ -19,7 +19,7 @@ vllm:
     initialDelaySeconds: 5
     periodSeconds: 5
     timeoutSeconds: 1
-    failureThreshold: 120
+    failureThreshold: 180
   readinessProbe:
     initialDelaySeconds: 5
     periodSeconds: 5
@@ -39,7 +39,6 @@ vllm:
     "--max-seq_len-to-capture", "2048"
   ]
 
-
 # Reranking: second largest bottleneck when reranking is in use
 # (i.e. query context docs have been uploaded with data-prep)
 #
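The new shmSize knob is worth a note: vLLM uses shared memory (/dev/shm) for inter-worker communication, and the 64MB container default is easily too small. A hedged sketch of what such a value typically renders to in the pod spec; this is the common Kubernetes idiom, not necessarily this chart's exact template:

    # Assumed rendering of shmSize into the vLLM deployment:
    volumes:
      - name: shm
        emptyDir:
          medium: Memory     # tmpfs-backed
          sizeLimit: 1Gi     # from shmSize
    ...
    volumeMounts:
      - name: shm
        mountPath: /dev/shm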
