diff --git a/vllm/envs.py b/vllm/envs.py
index 16f635b3ac4..ca45d69eec1 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -140,6 +140,7 @@
     VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
     VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
     VLLM_USE_CUDNN_PREFILL: bool = False
+    VLLM_ENABLE_CUDAGRAPH_GC: bool = False
     VLLM_LOOPBACK_IP: str = ""
@@ -968,6 +969,12 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_USE_TRTLLM_DECODE_ATTENTION":
     lambda: os.getenv("VLLM_USE_TRTLLM_DECODE_ATTENTION", None),

+    # Controls garbage collection during CUDA graph capture.
+    # If set to 0 (default), enables GC freezing to speed up capture.
+    # If set to 1, allows GC to run during capture.
+    "VLLM_ENABLE_CUDAGRAPH_GC":
+    lambda: bool(int(os.getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0"))),
+
     # Used to force set up loopback IP
     "VLLM_LOOPBACK_IP":
     lambda: os.getenv("VLLM_LOOPBACK_IP", ""),
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 4c14ac3be3c..5fbc1d7d38f 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1996,7 +1996,7 @@ def maybe_randomize_inputs(self, input_ids: torch.Tensor):
         Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set.
         This is to help balance expert-selection
         - during profile_run
-        - during DP rank dummy run
+        - during DP rank dummy run
         """
         dp_size = self.vllm_config.parallel_config.data_parallel_size
         randomize_inputs = envs.VLLM_RANDOMIZE_DP_DUMMY_INPUTS and dp_size > 1
@@ -2364,10 +2364,25 @@ def capture_model(self) -> None:
         start_time = time.perf_counter()
         start_free_gpu_memory = torch.cuda.mem_get_info()[0]

+        @contextmanager
+        def freeze_gc():
+            # Optimize garbage collection during CUDA graph capture.
+            # Clean up, then freeze all remaining objects so they are
+            # excluded from future collections.
+            gc.collect()
+            should_freeze = not envs.VLLM_ENABLE_CUDAGRAPH_GC
+            if should_freeze:
+                gc.freeze()
+            try:
+                yield
+            finally:
+                if should_freeze:
+                    gc.unfreeze()
+
         # Trigger CUDA graph capture for specific shapes.
         # Capture the large shapes first so that the smaller shapes
         # can reuse the memory pool allocated for the large shapes.
-        with graph_capture(device=self.device):
+        with freeze_gc(), graph_capture(device=self.device):
            full_cg = self.full_cuda_graph
            # Only rank 0 should print progress bar during capture
            compilation_cases = reversed(self.cudagraph_batch_sizes)
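
The change gates CPython's cyclic collector during capture: `gc.freeze()` moves every currently tracked object into a permanent generation that the collector never scans, so the flood of temporaries created while building CUDA graphs no longer triggers repeated full-heap collections, and `gc.unfreeze()` restores normal behavior afterwards. Below is a minimal standalone sketch of the same pattern using only the standard library; `heavy_capture_work` is a hypothetical stand-in for the actual capture loop, not part of the PR.

```python
import gc
import os
import time
from contextlib import contextmanager


@contextmanager
def freeze_gc(enable_gc: bool = False):
    # Collect once, then (unless GC is explicitly enabled) freeze the
    # survivors so the cyclic collector skips them inside the block.
    gc.collect()
    should_freeze = not enable_gc
    if should_freeze:
        gc.freeze()
    try:
        yield
    finally:
        if should_freeze:
            gc.unfreeze()


def heavy_capture_work():
    # Hypothetical stand-in for graph capture: churns through many
    # short-lived objects, the allocation pattern that would otherwise
    # trigger generation-0 collections mid-capture.
    for _ in range(1_000):
        _ = [object() for _ in range(1_000)]


if __name__ == "__main__":
    enable_gc = bool(int(os.getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0")))
    start = time.perf_counter()
    with freeze_gc(enable_gc):
        heavy_capture_work()
    elapsed = time.perf_counter() - start
    print(f"done in {elapsed:.3f}s (gc {'enabled' if enable_gc else 'frozen'})")
```

Note that `gc.freeze()` is a process-wide switch on the tracked heap, not a scoped disable: objects frozen here stay exempt until `gc.unfreeze()` runs, which is why the `finally` block matters even if capture raises.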