From 44a6d6deaabd1638770a3f118abf253a21424c70 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 21 Jan 2025 13:46:59 +0800
Subject: [PATCH 1/8] log transformed bytecode

Signed-off-by: youkaichao
---
 vllm/compilation/backends.py | 20 +++++++++++++-------
 vllm/compilation/wrapper.py  | 20 ++++++++++++++++++++
 vllm/config.py               |  1 +
 3 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 955c25f30051..9f5b37788fff 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -524,6 +524,7 @@ def configure_post_pass(self):
 
     def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
 
+        vllm_config = self.vllm_config
         if not self.compilation_config.cache_dir:
             # no provided cache dir, generate one based on the known factors
             # that affects the compilation. if none of the factors change,
@@ -532,7 +533,6 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
 
         # 1. factors come from the vllm_config (it mainly summarizes how the
         #    model is created)
-        vllm_config = self.vllm_config
         config_hash = vllm_config.compute_hash()
 
         # 2. factors come from the code files that are traced by Dynamo (
@@ -556,20 +556,26 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
             hash_key = hashlib.md5(
                 f"{config_hash}_{code_hash}".encode()).hexdigest()[:10]
             cache_dir = os.path.join(
-                envs.VLLM_CACHE_ROOT, "torch_compile_cache", hash_key,
-                f"rank_{vllm_config.parallel_config.rank}")
-        else:
-            cache_dir = self.compilation_config.cache_dir
+                envs.VLLM_CACHE_ROOT,
+                "torch_compile_cache",
+                hash_key,
+            )
+            self.compilation_config.cache_dir = cache_dir
+
+        cache_dir = self.compilation_config.cache_dir
         os.makedirs(cache_dir, exist_ok=True)
+        local_cache_dir = os.path.join(
+            cache_dir, f"rank_{vllm_config.parallel_config.rank}")
+        self.compilation_config.local_cache_dir = local_cache_dir
 
         disabled = envs.VLLM_DISABLE_COMPILE_CACHE
         self.inductor_hash_cache: InductorHashCache = InductorHashCache(
-            cache_dir, disabled=disabled)
+            local_cache_dir, disabled=disabled)
         if disabled:
             logger.info("vLLM's torch.compile cache is disabled.")
         else:
             logger.info("Using cache directory: %s for vLLM's torch.compile",
-                        cache_dir)
+                        local_cache_dir)
 
         # when dynamo calls the backend, it means the bytecode
         # transform and analysis are done
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index e3260a10c02a..7dfa1b67c966 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -9,6 +9,9 @@
 
 import vllm.envs as envs
 from vllm.config import CompilationLevel, get_current_vllm_config
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
 
 
 class TorchCompileWrapperWithCustomDispatcher:
@@ -82,6 +85,23 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType):
             return
 
         self.compiled_codes.append(new_code)
+        decompiled_file = os.path.join(self.compilation_config.local_cache_dir,
+                                       "transformed_code.py")
+        if not os.path.exists(decompiled_file):
+            try:
+                # usually the decompilation will succeed for most models, as
+                # we guarantee a full-graph compilation in Dynamo.
+                # but there's no 100% guarantee, since decompilation is not a
+                # reversible process.
+                import depyf
+                src = depyf.decompile(new_code)
+                with open(decompiled_file, "w") as f:
+                    f.write(src)
+            except Exception:
+                pass
+
+        if os.path.exists(decompiled_file):
+            logger.info("Dynamo transformed code saved to %s", decompiled_file)
 
         if self.vllm_config.compilation_config.use_cudagraph and \
                 "update" in new_code.co_names:
diff --git a/vllm/config.py b/vllm/config.py
index b0a92b2e2134..5e65842b1d17 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2785,6 +2785,7 @@ def model_post_init(self, __context: Any) -> None:
     compile_sizes: List[int] = PrivateAttr
     capture_sizes: List[int] = PrivateAttr
     max_capture_size: int = PrivateAttr
+    local_cache_dir: str = PrivateAttr  # local cache dir for one rank
     # optimization:
     # Intuitively, bs_to_padded_graph_size should be Dict[int, int].
     # since we know all keys are in a range [0, max_capture_size],

From 9e77f982bceb653709ef73bf07bd64d6393eb40b Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 21 Jan 2025 13:55:22 +0800
Subject: [PATCH 2/8] log computation graph

Signed-off-by: youkaichao
---
 vllm/compilation/backends.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 9f5b37788fff..4cebcadc3d84 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -615,6 +615,16 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
                                     self.vllm_config, self.graph_pool,
                                     self).run(*example_inputs)
 
+        graph_path = os.path.join(local_cache_dir, "computation_graph.py")
+        if not os.path.exists(graph_path):
+            # code adapted from https://github.com/thuml/depyf/blob/dab831108a752d1facc00acdd6d4243891845c37/depyf/explain/patched_lazy_format_graph_code.py#L30 # noqa
+            # use `print_readable` because it can include submodules
+            src = "from __future__ import annotations\nimport torch\n" + \
+                self.split_gm.print_readable(print_output=False)
+            src = src.replace("<lambda>", "GraphModule")
+            with open(graph_path, "w") as f:
+                f.write(src)
+
         self._called = True
 
         if not self.compilation_config.use_cudagraph or \

From 12dc138af9089cb83dc11e39a2ea7ce3e716af4e Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 21 Jan 2025 13:58:29 +0800
Subject: [PATCH 3/8] reduce logging

Signed-off-by: youkaichao
---
 vllm/compilation/backends.py |  2 ++
 vllm/compilation/wrapper.py  | 10 +++++-----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 4cebcadc3d84..317b0488c643 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -625,6 +625,8 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
             with open(graph_path, "w") as f:
                 f.write(src)
 
+        logger.info("Computation graph saved to %s", graph_path)
+
         self._called = True
 
         if not self.compilation_config.use_cudagraph or \
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index 7dfa1b67c966..a3ab16735391 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -85,8 +85,8 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType):
             return
 
         self.compiled_codes.append(new_code)
-        decompiled_file = os.path.join(self.compilation_config.local_cache_dir,
-                                       "transformed_code.py")
+        local_cache_dir = self.vllm_config.compilation_config.local_cache_dir
+        decompiled_file = os.path.join(local_cache_dir, "transformed_code.py")
         if not os.path.exists(decompiled_file):
             try:
                 # usually the decompilation will succeed for most models, as
@@ -97,12 +97,12 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType):
                 src = depyf.decompile(new_code)
                 with open(decompiled_file, "w") as f:
                     f.write(src)
+
+                logger.info("Dynamo transformed code saved to %s",
+                            decompiled_file)
             except Exception:
                 pass
 
-        if os.path.exists(decompiled_file):
-            logger.info("Dynamo transformed code saved to %s", decompiled_file)
-
         if self.vllm_config.compilation_config.use_cudagraph and \
                 "update" in new_code.co_names:
             import depyf

From 95893aaaca6d4613e26f133aa6b8592f9201717d Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 21 Jan 2025 14:06:33 +0800
Subject: [PATCH 4/8] add start log

Signed-off-by: youkaichao
---
 vllm/compilation/decorators.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 38f284794b8d..8c8e99f59568 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -198,6 +198,8 @@ def __call__(self, *args, **kwargs):
                         f" {dims} for argument {k} with type {type(arg)}.")
             # here, it is the starting point of the `torch.compile` process
             start_monitoring_torch_compile(self.vllm_config)
+            logger.info("Start compiling function %s",
+                        self.original_code_object)
 
             # if we don't use custom dispatcher, we can directly call the
             # compiled function and let torch.compile handle the dispatching,

From c25cd1d9f75dc6b9e8558e14bf261308d602a867 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 21 Jan 2025 14:17:07 +0800
Subject: [PATCH 5/8] polish comments

Signed-off-by: youkaichao
---
 vllm/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/config.py b/vllm/config.py
index 5e65842b1d17..b8628db4d2b8 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2785,7 +2785,7 @@ def model_post_init(self, __context: Any) -> None:
     compile_sizes: List[int] = PrivateAttr
     capture_sizes: List[int] = PrivateAttr
     max_capture_size: int = PrivateAttr
-    local_cache_dir: str = PrivateAttr  # local cache dir for one rank
+    local_cache_dir: str = PrivateAttr  # local cache dir for each rank
     # optimization:
     # Intuitively, bs_to_padded_graph_size should be Dict[int, int].
     # since we know all keys are in a range [0, max_capture_size],

From ed404d9ae99cd6615e21e36c805e722a838c096b Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 21 Jan 2025 16:20:28 +0800
Subject: [PATCH 6/8] fix no cache dir case

Signed-off-by: youkaichao
---
 vllm/compilation/wrapper.py | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index a3ab16735391..dc1464a14413 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -86,22 +86,24 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType):
 
         self.compiled_codes.append(new_code)
         local_cache_dir = self.vllm_config.compilation_config.local_cache_dir
-        decompiled_file = os.path.join(local_cache_dir, "transformed_code.py")
-        if not os.path.exists(decompiled_file):
-            try:
-                # usually the decompilation will succeed for most models, as
-                # we guarantee a full-graph compilation in Dynamo.
-                # but there's no 100% guarantee, since decompilation is not a
-                # reversible process.
-                import depyf
-                src = depyf.decompile(new_code)
-                with open(decompiled_file, "w") as f:
-                    f.write(src)
-
-                logger.info("Dynamo transformed code saved to %s",
-                            decompiled_file)
-            except Exception:
-                pass
+        if isinstance(local_cache_dir, str):
+            decompiled_file = os.path.join(local_cache_dir,
+                                           "transformed_code.py")
+            if not os.path.exists(decompiled_file):
+                try:
+                    # usually the decompilation will succeed for most models,
+                    # as we guarantee a full-graph compilation in Dynamo.
+                    # but there's no 100% guarantee, since decompilation is
+                    # not a reversible process.
+                    import depyf
+                    src = depyf.decompile(new_code)
+                    with open(decompiled_file, "w") as f:
+                        f.write(src)
+
+                    logger.info("Dynamo transformed code saved to %s",
+                                decompiled_file)
+                except Exception:
+                    pass
 
         if self.vllm_config.compilation_config.use_cudagraph and \
                 "update" in new_code.co_names:

From f4c434df1fad280c3d180f732a3bff5ed850426c Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 21 Jan 2025 16:25:29 +0800
Subject: [PATCH 7/8] use debug

Signed-off-by: youkaichao
---
 vllm/compilation/backends.py | 2 +-
 vllm/compilation/wrapper.py  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 317b0488c643..b9f96c00284b 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -625,7 +625,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
             with open(graph_path, "w") as f:
                 f.write(src)
 
-        logger.info("Computation graph saved to %s", graph_path)
+        logger.debug("Computation graph saved to %s", graph_path)
 
         self._called = True
 
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index dc1464a14413..58a8fa76f6ce 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -100,8 +100,8 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType):
                     with open(decompiled_file, "w") as f:
                         f.write(src)
 
-                    logger.info("Dynamo transformed code saved to %s",
-                                decompiled_file)
+                    logger.debug("Dynamo transformed code saved to %s",
+                                 decompiled_file)
                 except Exception:
                     pass
 

From fd753d11e584245300577c23c3c1a958d7cecda3 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 21 Jan 2025 16:29:17 +0800
Subject: [PATCH 8/8] use debug

Signed-off-by: youkaichao
---
 vllm/compilation/decorators.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 8c8e99f59568..17eb0592ced6 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -198,8 +198,8 @@ def __call__(self, *args, **kwargs):
                         f" {dims} for argument {k} with type {type(arg)}.")
             # here, it is the starting point of the `torch.compile` process
             start_monitoring_torch_compile(self.vllm_config)
-            logger.info("Start compiling function %s",
-                        self.original_code_object)
+            logger.debug("Start compiling function %s",
+                         self.original_code_object)
 
             # if we don't use custom dispatcher, we can directly call the
             # compiled function and let torch.compile handle the dispatching,
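
Note on the transformed_code.py dump (PATCH 1/8 and 6/8): it relies on
depyf's best-effort decompiler. A minimal standalone sketch of that
mechanism, outside vLLM; it assumes `depyf` is installed, and `demo` is a
hypothetical function, not part of the patches:

    import depyf

    def demo(x, y):
        # stand-in for Dynamo's transformed bytecode; bytecode_hook above
        # receives a types.CodeType object from Dynamo instead
        return x * y + 1

    # depyf.decompile accepts a code object, as in bytecode_hook; the
    # result is a best-effort source reconstruction and may fail, which
    # is why the patches wrap the call in try/except
    src = depyf.decompile(demo.__code__)
    print(src)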
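Note on the computation_graph.py dump (PATCH 2/8): the source string comes
from torch.fx's GraphModule.print_readable. A sketch under the same
caveats; `M` is a hypothetical module, not part of the patches:

    import torch
    from torch import fx

    class M(torch.nn.Module):
        def forward(self, x):
            return x.relu() + 1

    # symbolic_trace yields an fx.GraphModule, the same type as
    # self.split_gm in the patched backend
    gm = fx.symbolic_trace(M())
    # print_output=False returns the rendered source as a string
    src = "from __future__ import annotations\nimport torch\n" + \
        gm.print_readable(print_output=False)
    print(src)

With the series applied, both dumps would live under each rank's local
cache directory, $VLLM_CACHE_ROOT/torch_compile_cache/<hash_key>/rank_<N>/,
following the cache-dir logic in PATCH 1/8.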