From 44a6d6deaabd1638770a3f118abf253a21424c70 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 21 Jan 2025 13:46:59 +0800
Subject: [PATCH 1/8] log transformed bytecode

Signed-off-by: youkaichao
---
 vllm/compilation/backends.py | 20 +++++++++++++-------
 vllm/compilation/wrapper.py  | 20 ++++++++++++++++++++
 vllm/config.py               |  1 +
 3 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 955c25f30051..9f5b37788fff 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -524,6 +524,7 @@ def configure_post_pass(self):
 
     def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
 
+        vllm_config = self.vllm_config
         if not self.compilation_config.cache_dir:
             # no provided cache dir, generate one based on the known factors
             # that affects the compilation. if none of the factors change,
@@ -532,7 +533,6 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
 
         # 1. factors come from the vllm_config (it mainly summarizes how the
         #    model is created)
-        vllm_config = self.vllm_config
         config_hash = vllm_config.compute_hash()
 
         # 2. factors come from the code files that are traced by Dynamo (
@@ -556,20 +556,26 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
             hash_key = hashlib.md5(
                 f"{config_hash}_{code_hash}".encode()).hexdigest()[:10]
             cache_dir = os.path.join(
-                envs.VLLM_CACHE_ROOT, "torch_compile_cache", hash_key,
-                f"rank_{vllm_config.parallel_config.rank}")
-        else:
-            cache_dir = self.compilation_config.cache_dir
+                envs.VLLM_CACHE_ROOT,
+                "torch_compile_cache",
+                hash_key,
+            )
+            self.compilation_config.cache_dir = cache_dir
+
+        cache_dir = self.compilation_config.cache_dir
         os.makedirs(cache_dir, exist_ok=True)
+        local_cache_dir = os.path.join(
+            cache_dir, f"rank_{vllm_config.parallel_config.rank}")
+        self.compilation_config.local_cache_dir = local_cache_dir
 
         disabled = envs.VLLM_DISABLE_COMPILE_CACHE
         self.inductor_hash_cache: InductorHashCache = InductorHashCache(
-            cache_dir, disabled=disabled)
+            local_cache_dir, disabled=disabled)
         if disabled:
             logger.info("vLLM's torch.compile cache is disabled.")
         else:
             logger.info("Using cache directory: %s for vLLM's torch.compile",
-                        cache_dir)
+                        local_cache_dir)
 
         # when dynamo calls the backend, it means the bytecode
         # transform and analysis are done
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index e3260a10c02a..7dfa1b67c966 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -9,6 +9,9 @@
 
 import vllm.envs as envs
 from vllm.config import CompilationLevel, get_current_vllm_config
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
 
 
 class TorchCompileWrapperWithCustomDispatcher:
@@ -82,6 +85,23 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType):
             return
 
         self.compiled_codes.append(new_code)
+        decompiled_file = os.path.join(self.compilation_config.local_cache_dir,
+                                       "transformed_code.py")
+        if not os.path.exists(decompiled_file):
+            try:
+                # usually the decompilation will succeed for most models, as
+                # we guarantee a full-graph compilation in Dynamo.
+                # but there's no 100% guarantee, since decompilation is not a
+                # reversible process.
+                import depyf
+                src = depyf.decompile(new_code)
+                with open(decompiled_file, "w") as f:
+                    f.write(src)
+            except Exception:
+                pass
+
+        if os.path.exists(decompiled_file):
+            logger.info("Dynamo transformed code saved to %s", decompiled_file)
 
         if self.vllm_config.compilation_config.use_cudagraph and \
                 "update" in new_code.co_names:
diff --git a/vllm/config.py b/vllm/config.py
index b0a92b2e2134..5e65842b1d17 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2785,6 +2785,7 @@ def model_post_init(self, __context: Any) -> None:
     compile_sizes: List[int] = PrivateAttr
     capture_sizes: List[int] = PrivateAttr
     max_capture_size: int = PrivateAttr
+    local_cache_dir: str = PrivateAttr  # local cache dir for one rank
     # optimization:
     # Intuitively, bs_to_padded_graph_size should be Dict[int, int].
     # since we know all keys are in a range [0, max_capture_size],

From 9e77f982bceb653709ef73bf07bd64d6393eb40b Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 21 Jan 2025 13:55:22 +0800
Subject: [PATCH 2/8] log computation graph

Signed-off-by: youkaichao
---
 vllm/compilation/backends.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 9f5b37788fff..4cebcadc3d84 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -615,6 +615,16 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
                                     self.vllm_config, self.graph_pool,
                                     self).run(*example_inputs)
 
+        graph_path = os.path.join(local_cache_dir, "computation_graph.py")
+        if not os.path.exists(graph_path):
+            # code adapted from https://github.com/thuml/depyf/blob/dab831108a752d1facc00acdd6d4243891845c37/depyf/explain/patched_lazy_format_graph_code.py#L30 # noqa
+            # use `print_readable` because it can include submodules
+            src = "from __future__ import annotations\nimport torch\n" + \
+                self.split_gm.print_readable(print_output=False)
+            src = src.replace("<lambda>", "GraphModule")
+            with open(graph_path, "w") as f:
+                f.write(src)
+
         self._called = True
 
         if not self.compilation_config.use_cudagraph or \

From 12dc138af9089cb83dc11e39a2ea7ce3e716af4e Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 21 Jan 2025 13:58:29 +0800
Subject: [PATCH 3/8] reduce logging

Signed-off-by: youkaichao
---
 vllm/compilation/backends.py |  2 ++
 vllm/compilation/wrapper.py  | 10 +++++-----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 4cebcadc3d84..317b0488c643 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -625,6 +625,8 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
             with open(graph_path, "w") as f:
                 f.write(src)
 
+        logger.info("Computation graph saved to %s", graph_path)
+
         self._called = True
 
         if not self.compilation_config.use_cudagraph or \
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index 7dfa1b67c966..a3ab16735391 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -85,8 +85,8 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType):
             return
 
         self.compiled_codes.append(new_code)
-        decompiled_file = os.path.join(self.compilation_config.local_cache_dir,
-                                       "transformed_code.py")
+        local_cache_dir = self.vllm_config.compilation_config.local_cache_dir
+        decompiled_file = os.path.join(local_cache_dir, "transformed_code.py")
         if not os.path.exists(decompiled_file):
             try:
                 # usually the decompilation will succeed for most models, as
@@ -97,12 +97,12 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType):
                 src = depyf.decompile(new_code)
                 with open(decompiled_file, "w") as f:
                     f.write(src)
+
+                logger.info("Dynamo transformed code saved to %s",
+                            decompiled_file)
             except Exception:
                 pass
 
-        if os.path.exists(decompiled_file):
-            logger.info("Dynamo transformed code saved to %s", decompiled_file)
-
         if self.vllm_config.compilation_config.use_cudagraph and \
                 "update" in new_code.co_names:
             import depyf

From 95893aaaca6d4613e26f133aa6b8592f9201717d Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 21 Jan 2025 14:06:33 +0800
Subject: [PATCH 4/8] add start log

Signed-off-by: youkaichao
---
 vllm/compilation/decorators.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 38f284794b8d..8c8e99f59568 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -198,6 +198,8 @@ def __call__(self, *args, **kwargs):
                         f" {dims} for argument {k} with type {type(arg)}.")
             # here, it is the starting point of the `torch.compile` process
             start_monitoring_torch_compile(self.vllm_config)
+            logger.info("Start compiling function %s",
+                        self.original_code_object)
 
             # if we don't use custom dispatcher, we can directly call the
             # compiled function and let torch.compile handle the dispatching,

From c25cd1d9f75dc6b9e8558e14bf261308d602a867 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 21 Jan 2025 14:17:07 +0800
Subject: [PATCH 5/8] polish comments

Signed-off-by: youkaichao
---
 vllm/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/config.py b/vllm/config.py
index 5e65842b1d17..b8628db4d2b8 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2785,7 +2785,7 @@ def model_post_init(self, __context: Any) -> None:
     compile_sizes: List[int] = PrivateAttr
     capture_sizes: List[int] = PrivateAttr
     max_capture_size: int = PrivateAttr
-    local_cache_dir: str = PrivateAttr  # local cache dir for one rank
+    local_cache_dir: str = PrivateAttr  # local cache dir for each rank
     # optimization:
     # Intuitively, bs_to_padded_graph_size should be Dict[int, int].
     # since we know all keys are in a range [0, max_capture_size],

From ed404d9ae99cd6615e21e36c805e722a838c096b Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 21 Jan 2025 16:20:28 +0800
Subject: [PATCH 6/8] fix no cache dir case

Signed-off-by: youkaichao
---
 vllm/compilation/wrapper.py | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index a3ab16735391..dc1464a14413 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -86,22 +86,24 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType):
 
         self.compiled_codes.append(new_code)
         local_cache_dir = self.vllm_config.compilation_config.local_cache_dir
-        decompiled_file = os.path.join(local_cache_dir, "transformed_code.py")
-        if not os.path.exists(decompiled_file):
-            try:
-                # usually the decompilation will succeed for most models, as
-                # we guarantee a full-graph compilation in Dynamo.
-                # but there's no 100% guarantee, since decompilation is not a
-                # reversible process.
-                import depyf
-                src = depyf.decompile(new_code)
-                with open(decompiled_file, "w") as f:
-                    f.write(src)
-
-                logger.info("Dynamo transformed code saved to %s",
-                            decompiled_file)
-            except Exception:
-                pass
+        if isinstance(local_cache_dir, str):
+            decompiled_file = os.path.join(local_cache_dir,
+                                           "transformed_code.py")
+            if not os.path.exists(decompiled_file):
+                try:
+                    # usually the decompilation will succeed for most models,
+                    # as we guarantee a full-graph compilation in Dynamo.
+                    # but there's no 100% guarantee, since decompilation is
+                    # not a reversible process.
+                    import depyf
+                    src = depyf.decompile(new_code)
+                    with open(decompiled_file, "w") as f:
+                        f.write(src)
+
+                    logger.info("Dynamo transformed code saved to %s",
+                                decompiled_file)
+                except Exception:
+                    pass
 
         if self.vllm_config.compilation_config.use_cudagraph and \
                 "update" in new_code.co_names:

From f4c434df1fad280c3d180f732a3bff5ed850426c Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 21 Jan 2025 16:25:29 +0800
Subject: [PATCH 7/8] use debug

Signed-off-by: youkaichao
---
 vllm/compilation/backends.py | 2 +-
 vllm/compilation/wrapper.py  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 317b0488c643..b9f96c00284b 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -625,7 +625,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
             with open(graph_path, "w") as f:
                 f.write(src)
 
-        logger.info("Computation graph saved to %s", graph_path)
+        logger.debug("Computation graph saved to %s", graph_path)
 
         self._called = True
 
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index dc1464a14413..58a8fa76f6ce 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -100,8 +100,8 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType):
                     with open(decompiled_file, "w") as f:
                         f.write(src)
 
-                    logger.info("Dynamo transformed code saved to %s",
-                                decompiled_file)
+                    logger.debug("Dynamo transformed code saved to %s",
+                                 decompiled_file)
                 except Exception:
                     pass
 

From fd753d11e584245300577c23c3c1a958d7cecda3 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 21 Jan 2025 16:29:17 +0800
Subject: [PATCH 8/8] use debug

Signed-off-by: youkaichao
---
 vllm/compilation/decorators.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 8c8e99f59568..17eb0592ced6 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -198,8 +198,8 @@ def __call__(self, *args, **kwargs):
                         f" {dims} for argument {k} with type {type(arg)}.")
             # here, it is the starting point of the `torch.compile` process
             start_monitoring_torch_compile(self.vllm_config)
-            logger.info("Start compiling function %s",
-                        self.original_code_object)
+            logger.debug("Start compiling function %s",
+                         self.original_code_object)
 
             # if we don't use custom dispatcher, we can directly call the
             # compiled function and let torch.compile handle the dispatching,
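
Note on the transformed_code.py dump (PATCH 1/8 and 6/8): it relies on
depyf's best-effort decompiler. A minimal standalone sketch of that
mechanism, outside vLLM; it assumes `depyf` is installed, and `demo` is a
hypothetical function, not part of the patches:

    import depyf

    def demo(x, y):
        # stand-in for Dynamo's transformed bytecode; bytecode_hook above
        # receives a types.CodeType object from Dynamo instead
        return x * y + 1

    # depyf.decompile accepts a code object, as in bytecode_hook; the
    # result is a best-effort source reconstruction and may fail, which
    # is why the patches wrap the call in try/except
    src = depyf.decompile(demo.__code__)
    print(src)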
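Note on the computation_graph.py dump (PATCH 2/8): the source string comes
from torch.fx's GraphModule.print_readable. A sketch under the same
caveats; `M` is a hypothetical module, not part of the patches:

    import torch
    from torch import fx

    class M(torch.nn.Module):
        def forward(self, x):
            return x.relu() + 1

    # symbolic_trace yields an fx.GraphModule, the same type as
    # self.split_gm in the patched backend
    gm = fx.symbolic_trace(M())
    # print_output=False returns the rendered source as a string
    src = "from __future__ import annotations\nimport torch\n" + \
        gm.print_readable(print_output=False)
    print(src)

With the series applied, both dumps would live under each rank's local
cache directory, $VLLM_CACHE_ROOT/torch_compile_cache/<hash_key>/rank_<N>/,
following the cache-dir logic in PATCH 1/8.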