@@ -8,7 +8,7 @@

 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import sha256
+from vllm.utils import GiB_bytes, sha256
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec, KVCacheSpec,
                                         KVCacheTensor, SlidingWindowSpec)
@@ -459,6 +459,56 @@ def hash_request_tokens(hash_function: Any, block_size: int,
     return ret


+def estimate_max_model_len(vllm_config: VllmConfig,
+                           kv_cache_spec: dict[str, KVCacheSpec],
+                           available_memory: int) -> int:
+    """
+    Estimates the maximum model length that can fit in the available memory
+    using binary search.
+
+    Args:
+        vllm_config: The global VllmConfig
+        kv_cache_spec: The kv cache spec of each attention layer in the model
+        available_memory: Memory available for KV cache in bytes.
+
+    Returns:
+        The estimated maximum model length that can fit in the available
+        memory.
+    """
+
+    # Check whether a given model length fits in the available memory.
+    def fits_in_memory(model_len: int) -> bool:
+        # Temporarily override max_model_len for this calculation.
+        vllm_config.model_config.max_model_len = model_len
+        # Total KV cache memory needed across all attention layers.
+        memory_needed = sum(
+            layer_spec.max_memory_usage_bytes(vllm_config)
+            for layer_spec in kv_cache_spec.values())
+        return memory_needed <= available_memory
+
+    # Save the configured value; fits_in_memory mutates max_model_len.
+    current_max = vllm_config.model_config.max_model_len
+    left, right = 1, current_max
+
+    # If even the smallest model length doesn't fit, return 0.
+    if not fits_in_memory(left):
+        vllm_config.model_config.max_model_len = current_max
+        return 0
+
+    # Binary search for the maximum model length that fits.
+    result = 1
+    while left <= right:
+        mid = (left + right) // 2
+        if fits_in_memory(mid):
+            result = mid
+            left = mid + 1
+        else:
+            right = mid - 1
+    # Restore the original max_model_len before returning.
+    vllm_config.model_config.max_model_len = current_max
+    return result
+
+
 def check_enough_kv_cache_memory(vllm_config: VllmConfig,
                                  kv_cache_spec: dict[str, KVCacheSpec],
                                  available_memory: int):
@@ -486,12 +536,22 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
         needed_memory += layer_spec.max_memory_usage_bytes(vllm_config)

     if needed_memory > available_memory:
+        # Estimate the max model length that fits and surface it as a hint.
+        estimated_max_len = estimate_max_model_len(vllm_config, kv_cache_spec,
+                                                   available_memory)
+        estimated_msg = ""
+        if estimated_max_len > 0:
+            estimated_msg = (
+                " Based on the available memory, the estimated maximum "
+                f"model length is {estimated_max_len}.")
+
         raise ValueError(
-            f"To serve at least one request with the models's max seq len "
-            f"({max_model_len}), ({needed_memory/1024/1024/1024:.2f} GiB KV "
+            f"To serve at least one request with the model's max seq len "
+            f"({max_model_len}), {needed_memory/GiB_bytes:.2f} GiB KV "
             f"cache is needed, which is larger than the available KV cache "
-            f"memory ({available_memory/1024/1024/1024:.2f} GiB). Try "
-            f"increasing `gpu_memory_utilization` or decreasing "
+            f"memory ({available_memory/GiB_bytes:.2f} GiB)."
+            f"{estimated_msg} Try increasing `gpu_memory_utilization` or "
+            f"decreasing "
             f"`max_model_len` when initializing the engine.")


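For intuition, here is a self-contained sketch of the same binary-search pattern the new helper uses. The per-token cost model and all numbers below are made up for illustration and stand in for `KVCacheSpec.max_memory_usage_bytes`; none of these names are vLLM APIs.

```python
# Toy version of the search in estimate_max_model_len: KV cache cost is
# assumed linear in model length, which mirrors full attention layers.
GiB = 1024**3


def estimate_max_len(bytes_per_token: int, available: int,
                     configured_max: int) -> int:
    """Largest model_len whose KV cache fits in `available` bytes."""

    def fits(model_len: int) -> bool:
        return model_len * bytes_per_token <= available

    if not fits(1):
        return 0  # even a single token does not fit
    left, right, result = 1, configured_max, 1
    while left <= right:
        mid = (left + right) // 2
        if fits(mid):
            result = mid
            left = mid + 1
        else:
            right = mid - 1
    return result


# 32 layers x 128 KiB of KV cache per token = 4 MiB/token; 8 GiB free.
print(estimate_max_len(32 * 128 * 1024, 8 * GiB, 131072))  # -> 2048
```

Because `fits` is monotone in `model_len`, the search converges in O(log(configured_max)) probes, each of which re-evaluates the memory model once.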
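One subtlety the parenthesized `estimated_msg` assignment above guards against: Python's implicit string-literal concatenation only applies within a single expression. Split across two statements, the second f-string is evaluated and discarded, so the hint would silently vanish from the error message. A minimal illustration (`n` is a placeholder value):

```python
n = 2048

# Buggy: the second line is a standalone expression, not part of `msg`.
msg = " Based on the available memory,"
f" the estimated maximum model length is {n}."  # evaluated, then discarded

# Fixed: parentheses make both literals one concatenated expression.
msg = (" Based on the available memory,"
       f" the estimated maximum model length is {n}.")
```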