Commit e658165

feat: Add max_free_gpu_memory_size support for KV cache configuration
- Introduced max_free_gpu_memory_size to manage GPU memory allocation for the KV cache.
- Updated KvCacheConfig and related methods to handle the new parameter.
- Modified estimation logic in KvCacheCreator to utilize max_free_gpu_memory_size for VSWA cases.
- Adjusted resource management to ensure compatibility with the new memory allocation strategy.

Signed-off-by: qixiang-99 <[email protected]>
1 parent: c191b38
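For orientation before the per-file diffs, here is a minimal usage sketch of the new knob through the Python LLM API. The `LLM` entry point and the model path are illustrative assumptions; only the `KvCacheConfig` fields come from this commit.

```python
from tensorrt_llm import LLM  # assumed entry point; adjust to your install
from tensorrt_llm.llmapi import KvCacheConfig

# Cap the KV cache at 8 GiB for a VSWA (variable sliding window attention) model.
# If free_gpu_memory_fraction is also set, the smaller resulting budget wins.
kv_cache_config = KvCacheConfig(
    free_gpu_memory_fraction=0.9,
    max_free_gpu_memory_size=8 * 1024**3,  # bytes
)

llm = LLM(model="<path-to-vswa-model>", kv_cache_config=kv_cache_config)
```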

File tree: 7 files changed, +71 −16 lines

cpp/include/tensorrt_llm/executor/executor.h

Lines changed: 10 additions & 1 deletion
```diff
@@ -1001,7 +1001,8 @@ class KvCacheConfig
         std::optional<FloatType> const& crossKvCacheFraction = std::nullopt,
         std::optional<RetentionPriority> secondaryOffloadMinPriority = std::nullopt, size_t eventBufferMaxSize = 0,
         bool enablePartialReuse = true, bool copyOnPartialReuse = true, bool useUvm = false,
-        std::optional<tensorrt_llm::runtime::RuntimeDefaults> const& runtimeDefaults = std::nullopt);
+        std::optional<tensorrt_llm::runtime::RuntimeDefaults> const& runtimeDefaults = std::nullopt,
+        std::optional<uint64_t> const& maxFreeGpuMemorySize = std::nullopt);
 
     [[nodiscard]] bool getEnableBlockReuse() const;
     [[nodiscard]] bool getEnablePartialReuse() const;
@@ -1016,6 +1017,7 @@ class KvCacheConfig
     [[nodiscard]] std::optional<RetentionPriority> getSecondaryOffloadMinPriority() const;
     [[nodiscard]] size_t getEventBufferMaxSize() const;
     [[nodiscard]] bool getUseUvm() const;
+    [[nodiscard]] std::optional<uint64_t> getMaxFreeGpuMemorySize() const;
 
     void setEnableBlockReuse(bool enableBlockReuse);
     void setEnablePartialReuse(bool enablePartialReuse);
@@ -1030,6 +1032,7 @@ class KvCacheConfig
     void setSecondaryOffloadMinPriority(std::optional<RetentionPriority> secondaryOffloadMinPriority);
     void setEventBufferMaxSize(size_t eventBufferMaxSize);
     void setUseUvm(bool useUvm);
+    void setMaxFreeGpuMemorySize(uint64_t maxFreeGpuMemorySize);
 
     void fillEmptyFieldsFromRuntimeDefaults(tensorrt_llm::runtime::RuntimeDefaults const& runtimeDefaults);
 
@@ -1085,6 +1088,12 @@ class KvCacheConfig
 
     /// @brief Whether to use UVM for the KV cache.
     bool mUseUvm;
+
+    /// @brief The maximum size in bytes of GPU memory that can be allocated for the KV cache.
+    /// This is only used for the VSWA case for now, as an alternative to mMaxTokens.
+    /// If both mMaxFreeGpuMemorySize and mFreeGpuMemoryFraction are specified, memory corresponding to the minimum
+    /// will be allocated.
+    std::optional<uint64_t> mMaxFreeGpuMemorySize;
 };
 
 /// @brief Configuration class for the runtime perf knobs
```
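The new doc comment fixes the composition rule: when both `mMaxFreeGpuMemorySize` and `mFreeGpuMemoryFraction` are set, the smaller resulting allocation wins. A toy sketch of that rule in Python (illustrative arithmetic only, not the executor's actual code path):

```python
from typing import Optional

def effective_kv_cache_bytes(free_mem: int,
                             free_gpu_memory_fraction: Optional[float],
                             max_free_gpu_memory_size: Optional[int]) -> int:
    """Return the KV cache budget under the documented minimum-of-both rule."""
    candidates = [free_mem]
    if free_gpu_memory_fraction is not None:
        candidates.append(int(free_mem * free_gpu_memory_fraction))
    if max_free_gpu_memory_size is not None:
        candidates.append(max_free_gpu_memory_size)
    return min(candidates)

# 40 GiB free: fraction 0.9 would allow 36 GiB, but the 8 GiB cap is smaller.
assert effective_kv_cache_bytes(40 * 1024**3, 0.9, 8 * 1024**3) == 8 * 1024**3
```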

cpp/tensorrt_llm/executor/kvCacheConfig.cpp

Lines changed: 16 additions & 1 deletion
```diff
@@ -27,7 +27,8 @@ KvCacheConfig::KvCacheConfig(bool enableBlockReuse, std::optional<SizeType32> co
     std::optional<size_t> const& hostCacheSize, bool onboardBlocks,
     std::optional<FloatType> const& crossKvCacheFraction, std::optional<RetentionPriority> secondaryOffloadMinPriority,
     size_t eventBufferMaxSize, bool enablePartialReuse, bool copyOnPartialReuse, bool useUvm,
-    std::optional<tensorrt_llm::runtime::RuntimeDefaults> const& runtimeDefaults)
+    std::optional<tensorrt_llm::runtime::RuntimeDefaults> const& runtimeDefaults,
+    std::optional<uint64_t> const& maxFreeGpuMemorySize)
     : mEnableBlockReuse(enableBlockReuse)
     , mHostCacheSize(hostCacheSize)
     , mOnboardBlocks(onboardBlocks)
@@ -61,6 +62,10 @@ KvCacheConfig::KvCacheConfig(bool enableBlockReuse, std::optional<SizeType32> co
     {
         fillEmptyFieldsFromRuntimeDefaults(runtimeDefaults.value());
     }
+    if (maxFreeGpuMemorySize)
+    {
+        setMaxFreeGpuMemorySize(maxFreeGpuMemorySize.value());
+    }
 }
 
 bool KvCacheConfig::getEnableBlockReuse() const
@@ -128,6 +133,11 @@ bool KvCacheConfig::getUseUvm() const
     return mUseUvm;
 }
 
+std::optional<uint64_t> KvCacheConfig::getMaxFreeGpuMemorySize() const
+{
+    return mMaxFreeGpuMemorySize;
+}
+
 void KvCacheConfig::setEnableBlockReuse(bool enableBlockReuse)
 {
     mEnableBlockReuse = enableBlockReuse;
@@ -207,6 +217,11 @@ void KvCacheConfig::setUseUvm(bool useUvm)
     mUseUvm = useUvm;
 }
 
+void KvCacheConfig::setMaxFreeGpuMemorySize(uint64_t maxFreeGpuMemorySize)
+{
+    mMaxFreeGpuMemorySize = maxFreeGpuMemorySize;
+}
+
 void KvCacheConfig::fillEmptyFieldsFromRuntimeDefaults(tensorrt_llm::runtime::RuntimeDefaults const& runtimeDefaults)
 {
     if (!mMaxAttentionWindowVec && runtimeDefaults.maxAttentionWindowVec)
```

cpp/tensorrt_llm/pybind/executor/executorConfig.cpp

Lines changed: 4 additions & 2 deletions
```diff
@@ -121,14 +121,14 @@ void initConfigBindings(pybind11::module_& m)
         .def(py::init<bool, std::optional<SizeType32> const&, std::optional<std::vector<SizeType32>> const&,
                  std::optional<SizeType32> const&, std::optional<float> const&, std::optional<size_t> const&, bool,
                  std::optional<float> const&, std::optional<tle::RetentionPriority>, size_t const&, bool, bool, bool,
-                 std::optional<RuntimeDefaults> const&>(),
+                 std::optional<RuntimeDefaults> const&, std::optional<uint64_t> const&>(),
             py::arg("enable_block_reuse") = true, py::arg("max_tokens") = py::none(),
             py::arg("max_attention_window") = py::none(), py::arg("sink_token_length") = py::none(),
             py::arg("free_gpu_memory_fraction") = py::none(), py::arg("host_cache_size") = py::none(),
             py::arg("onboard_blocks") = true, py::arg("cross_kv_cache_fraction") = py::none(),
             py::arg("secondary_offload_min_priority") = py::none(), py::arg("event_buffer_max_size") = 0, py::kw_only(),
             py::arg("enable_partial_reuse") = true, py::arg("copy_on_partial_reuse") = true, py::arg("use_uvm") = false,
-            py::arg("runtime_defaults") = py::none())
+            py::arg("runtime_defaults") = py::none(), py::arg("max_free_gpu_memory_size") = py::none())
         .def_property(
             "enable_block_reuse", &tle::KvCacheConfig::getEnableBlockReuse, &tle::KvCacheConfig::setEnableBlockReuse)
         .def_property("max_tokens", &tle::KvCacheConfig::getMaxTokens, &tle::KvCacheConfig::setMaxTokens)
@@ -138,6 +138,8 @@ void initConfigBindings(pybind11::module_& m)
             "sink_token_length", &tle::KvCacheConfig::getSinkTokenLength, &tle::KvCacheConfig::setSinkTokenLength)
         .def_property("free_gpu_memory_fraction", &tle::KvCacheConfig::getFreeGpuMemoryFraction,
             &tle::KvCacheConfig::setFreeGpuMemoryFraction)
+        .def_property("max_free_gpu_memory_size", &tle::KvCacheConfig::getMaxFreeGpuMemorySize,
+            &tle::KvCacheConfig::setMaxFreeGpuMemorySize)
         .def_property("host_cache_size", &tle::KvCacheConfig::getHostCacheSize, &tle::KvCacheConfig::setHostCacheSize)
         .def_property("onboard_blocks", &tle::KvCacheConfig::getOnboardBlocks, &tle::KvCacheConfig::setOnboardBlocks)
         .def_property("cross_kv_cache_fraction", &tle::KvCacheConfig::getCrossKvCacheFraction,
```

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 26 additions & 7 deletions
```diff
@@ -94,8 +94,13 @@ def _get_free_gpu_memory_fraction(self) -> float:
             fraction = 0.9
         return fraction
 
-    def _cal_max_tokens(self, peak_memory, total_gpu_memory, fraction,
-                        alloc_kv_tokens: int) -> int:
+    def _cal_max_tokens_and_memory(self, peak_memory, total_gpu_memory,
+                                   fraction,
+                                   alloc_kv_tokens: int) -> tuple[int, int]:
+        """
+        Calculate the max KV cache capacity as both max_tokens and max_free_gpu_memory_size.
+        For the VSWA case, we use max_free_gpu_memory_size instead of max_tokens.
+        """
         model_config = self._model_engine.model.model_config
         mapping = self._mapping
         kv_size_per_token = self._get_cache_size_per_token(
@@ -115,7 +120,7 @@ def _cal_max_tokens(self, peak_memory, total_gpu_memory, fraction,
         )
         max_tokens = int((available_kv_mem) // kv_size_per_token)
         max_tokens = max(max_tokens, 0)
-        return max_tokens
+        return max_tokens, int(available_kv_mem)
 
     def _create_dummy_context_requests(
             self, input_seq_len: int) -> List[trtllm.Request]:
@@ -185,8 +190,9 @@ def try_prepare_estimation(self) -> bool:
         )
         return estimating_kv_cache
 
-    def estimate_max_tokens(self, py_executor: PyExecutor) -> None:
+    def estimate_max_tokens_or_memory(self, py_executor: PyExecutor) -> None:
         """Perform KV cache capacity estimation.
+        NOTE: for the VSWA case, we calculate and set KV cache memory instead of max_tokens in kv_cache_config.
 
         This updates `kv_cache_config`.
         """
@@ -255,16 +261,29 @@ def estimate_max_tokens(self, py_executor: PyExecutor) -> None:
         kv_stats = py_executor.resource_manager.resource_managers.get(
             ResourceManagerType.KV_CACHE_MANAGER).get_kv_cache_stats()
 
-        kv_cache_max_tokens = self._cal_max_tokens(
+        kv_cache_max_tokens, kv_cache_max_memory = self._cal_max_tokens_and_memory(
             peak_memory, total_gpu_memory, fraction,
             kv_stats.max_num_blocks * kv_stats.tokens_per_block)
 
         if self._max_kv_tokens_in is not None:
             kv_cache_max_tokens = min(kv_cache_max_tokens,
                                       self._max_kv_tokens_in)
 
-        logger.info(f"Estimated max tokens in KV cache : {kv_cache_max_tokens}")
-        executor_config.kv_cache_config.max_tokens = kv_cache_max_tokens
+        if executor_config.kv_cache_config.max_attention_window is not None:
+            # NOTE: for the VSWA case, calculate and set KV cache memory instead of max_tokens in kv_cache_config.
+            assert kv_cache_max_memory is not None, "kv_cache_max_memory should not be None for VSWA case"
+            executor_config.kv_cache_config.max_free_gpu_memory_size = int(
+                kv_cache_max_memory)
+            logger.debug(
+                "For the VSWA case, max_free_gpu_memory_size is set instead of max_tokens in kv_cache_config."
+            )
+            logger.info(
+                f"Estimated max memory in KV cache : {kv_cache_max_memory / (GB):.2f} GiB"
+            )
+        else:
+            logger.info(
+                f"Estimated max tokens in KV cache : {kv_cache_max_tokens}")
+            executor_config.kv_cache_config.max_tokens = kv_cache_max_tokens
 
     def _create_kv_cache_manager(
             self, model_engine: PyTorchModelEngine) -> KVCacheManager:
```
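The estimator now returns the same budget in two units, tokens and bytes, and the final step picks one per model type. A condensed, standalone sketch of that decision (stand-in values; the real method derives the budget from profiling):

```python
GB = 1024**3  # assumed to match the module's GB constant

def apply_estimate(kv_cache_config, available_kv_mem: int,
                   kv_size_per_token: int) -> None:
    """The token and byte budgets are two views of available_kv_mem; VSWA
    models (those with max_attention_window set) receive the byte form."""
    if kv_cache_config.max_attention_window is not None:
        kv_cache_config.max_free_gpu_memory_size = int(available_kv_mem)
    else:
        kv_cache_config.max_tokens = max(
            int(available_kv_mem // kv_size_per_token), 0)

# Worked example: an 8 GiB budget at 128 KiB per token caps a non-VSWA model
# at 8 * GB // (128 * 1024) == 65536 tokens.
```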

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -393,7 +393,7 @@ def create_py_executor(
         assert kv_cache_creator is not None
         with mem_monitor.observe_creation_stage(
                 _ExecutorCreationStage.MODEL_EXTRA):
-            kv_cache_creator.estimate_max_tokens(py_executor)
+            kv_cache_creator.estimate_max_tokens_or_memory(py_executor)
         kv_cache_creator.teardown_managers(resources)
         del py_executor  # free before constructing new
```

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 7 additions & 3 deletions
```diff
@@ -207,8 +207,6 @@ def __init__(
             kv_cache_config, KvCacheConfigCpp
         ), "calculate_max_num_blocks_from_cpp only accepts KvCacheConfigCpp"
 
-        # overwrite max_tokens in VSWA case
-        kv_cache_config.max_tokens = None
         blocks_per_window = self.calculate_max_num_blocks_from_cpp(
             kv_cache_config=kv_cache_config,
             model_config=model_config,
@@ -636,7 +634,13 @@ def calculate_max_num_blocks_from_cpp(
         logger.debug(f"window_size_to_layers: {window_size_to_layers}")
 
         free_mem, total_mem = torch.cuda.mem_get_info()
-        primary_pool_memory_bytes = int(free_mem * 0.9)
+        primary_pool_memory_bytes = int(free_mem)
+        if kv_cache_config.max_free_gpu_memory_size is not None:
+            # overwrite max_tokens in VSWA case, use max_free_gpu_memory_size instead
+            kv_cache_config.max_tokens = None
+            primary_pool_memory_bytes = min(
+                kv_cache_config.max_free_gpu_memory_size,
+                primary_pool_memory_bytes)
         secondary_pool_memory_bytes = 0
         logger.debug(
             f"primary_pool_memory_bytes is set to {primary_pool_memory_bytes/1024**3}GB, \nsecondary_pool_memory_bytes is set to {secondary_pool_memory_bytes/1024**3}GB"
```

tensorrt_llm/llmapi/llm_args.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -796,6 +796,11 @@ class KvCacheConfig(BaseModel, PybindMirror):
     )
     use_uvm: bool = Field(default=False,
                           description="Whether to use UVM for the KV cache.")
+    max_free_gpu_memory_size: Optional[int] = Field(
+        default=None,
+        description=
+        "The maximum size in bytes of GPU memory that can be allocated for the KV cache. This is only used for the VSWA case for now, as an alternative to `max_tokens`. If both `max_free_gpu_memory_size` and `free_gpu_memory_fraction` are specified, memory corresponding to the minimum will be allocated."
+    )
 
     def _to_pybind(self):
         return _KvCacheConfig(
@@ -811,7 +816,8 @@ def _to_pybind(self):
             event_buffer_max_size=self.event_buffer_max_size,
             enable_partial_reuse=self.enable_partial_reuse,
             copy_on_partial_reuse=self.copy_on_partial_reuse,
-            use_uvm=self.use_uvm)
+            use_uvm=self.use_uvm,
+            max_free_gpu_memory_size=self.max_free_gpu_memory_size)
 
 
 @PybindMirror.mirror_pybind_fields(_ExtendedRuntimePerfKnobConfig)
```
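The pydantic field mirrors straight onto the pybind config via a one-line pass-through. A quick round-trip check, assuming `KvCacheConfig` is exported from `tensorrt_llm.llmapi` (`_to_pybind` is internal, used here only to show the mirroring):

```python
from tensorrt_llm.llmapi import KvCacheConfig  # assumed export location

cfg = KvCacheConfig(max_free_gpu_memory_size=8 * 1024**3)
pybind_cfg = cfg._to_pybind()  # internal: copies each field onto the C++ config
assert pybind_cfg.max_free_gpu_memory_size == cfg.max_free_gpu_memory_size
```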
