Commit bd35a62

decrease kvcache fraction to avoid OOM
Signed-off-by: Jiagan Cheng <[email protected]>
1 parent cc3beac

File tree

1 file changed: +2 -0 lines changed

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 2 additions & 0 deletions
@@ -2015,11 +2015,13 @@ def test_bf16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
 
         with LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B",
                  tensor_parallel_size=tp_size,
                  pipeline_parallel_size=pp_size,
                  moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
                  **pytorch_config,
                  enable_attention_dp=attention_dp) as llm:
             task = CnnDailymail(self.MODEL_NAME)
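For context, a minimal sketch of how the knob being changed is used with TensorRT-LLM's LLM API outside the test harness. Only KvCacheConfig and free_gpu_memory_fraction=0.8 come from the diff above; the model name and prompt are placeholders. Lowering the fraction below the library default caps the KV-cache pool and leaves more free GPU memory for activations and CUDA graphs, which is what avoids the OOM the commit message mentions.

# Sketch under the assumptions above; model path and prompt are illustrative.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Let the KV-cache pool claim at most 80% of the free GPU memory,
# keeping headroom for other allocations made during engine setup.
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)

with LLM(model="Qwen/Qwen3-8B",  # placeholder checkpoint
         kv_cache_config=kv_cache_config) as llm:
    output = llm.generate("Hello, my name is")
    print(output.outputs[0].text)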
