
Commit a15af87

[None][refactor] Refactor Torch Compile Backend, MoeLoadBalancer and warmup Logic (#6615)
Signed-off-by: yizhang-nv <[email protected]>
Signed-off-by: Yi Zhang <[email protected]>
1 parent 71e28ea commit a15af87


14 files changed: +271 −185 lines changed


tensorrt_llm/_torch/compilation/backend.py

Lines changed: 4 additions & 6 deletions
@@ -37,7 +37,7 @@ def __init__(
         enable_inductor=True,
         enable_userbuffers=False,
         enable_piecewise_cuda_graph: bool = False,
-        cuda_graph_batch_sizes: Optional[List[int]] = None,
+        capture_num_tokens: Optional[List[int]] = None,
         max_num_streams: int = 1,
     ) -> None:
         super().__init__()
@@ -48,14 +48,12 @@ def __init__(
         self.custom_passes = Backend.get_custom_pass(enable_userbuffers)
         self.rank = tensorrt_llm.mpi_rank()
         self.enable_inductor = enable_inductor
-        self.cuda_graph_batch_sizes = (cuda_graph_batch_sizes
-                                       if cuda_graph_batch_sizes is not None
-                                       else [])
+        self.capture_num_tokens = capture_num_tokens or []
         self.piecewise_cuda_graph = enable_piecewise_cuda_graph
         self.no_optimization = False
         # We only need to create aux streams.
         self.aux_streams = Backend.Streams(
-            [torch.cuda.Stream() for i in range(max_num_streams - 1)])
+            [torch.cuda.Stream() for _ in range(max_num_streams - 1)])
         self.events = Backend.Events()
         inductor_config.enable_auto_functionalized_v2 = False
@@ -125,7 +123,7 @@ def optimize(
             example_inputs,
             self.enable_inductor,
             self.input_num_tokens,
-            self.cuda_graph_batch_sizes,
+            self.capture_num_tokens,
             self._graph_pool_handle,
             len(self.aux_streams) + 1,
         )

tensorrt_llm/_torch/compilation/piecewise_optimizer.py

Lines changed: 11 additions & 12 deletions
@@ -14,8 +14,7 @@
 from ..utils import (get_model_extra_attrs, get_piecewise_cuda_graph_flag,
                      make_weak_ref)
 from .multi_stream.auto_multi_stream import multi_stream_schedule
-from .utils import (get_enable_piecewise_cuda_graph_capture_flag,
-                    is_call_function)
+from .utils import get_capture_piecewise_cuda_graph_flag, is_call_function


 class PiecewiseInterpreter(Interpreter):
@@ -25,7 +24,7 @@ def __init__(
         module: GraphModule,
         enable_inductor: bool,
         compile_time_num_tokens: Union[int | torch.SymInt],
-        cuda_graph_batch_sizes: list[int],
+        capture_num_tokens: list[int],
         exclude_modules_id: list[int],
         graph_pool_handle: tuple[int, int],
         garbage_collect_values: bool = True,
@@ -37,7 +36,7 @@ def __init__(
         self.fake_mode = detect_fake_mode()

         self.compile_time_num_tokens = compile_time_num_tokens
-        self.cuda_graph_batch_sizes = cuda_graph_batch_sizes
+        self.capture_num_tokens = capture_num_tokens
         self.exclude_modules = [f"submod_{i}" for i in exclude_modules_id]
         self.graph_pool_handle = graph_pool_handle
         self.enable_inductor = enable_inductor
@@ -86,7 +85,7 @@ def call_module(self, target, args, kwargs):
                 target,
                 self.compile_time_num_tokens,
                 runtime_num_tokens_idx,
-                self.cuda_graph_batch_sizes,
+                self.capture_num_tokens,
                 self.graph_pool_handle,
                 compile_fx(submod, args) if self.enable_inductor else submod,
                 self.enable_inductor,
@@ -120,7 +119,7 @@ def __init__(
         name: str,
         compile_time_num_tokens: Union[int | torch.SymInt],
         runtime_num_tokens_idx: tuple[int],
-        cuda_graph_batch_sizes: List[int],
+        capture_num_tokens: List[int],
         graph_pool_handle,
         default_callable: Callable,
         enable_inductor: bool,
@@ -139,9 +138,9 @@ def __init__(

         self.entries: dict[int, Entry] = {}

-        for bs in cuda_graph_batch_sizes:
-            self.entries[bs] = Entry(
-                bs,
+        for num_tokens in capture_num_tokens:
+            self.entries[num_tokens] = Entry(
+                num_tokens,
                 enable_inductor=self.enable_inductor,
                 callable=default_callable,
             )
@@ -167,7 +166,7 @@ def __call__(self, *args):

         if entry.cuda_graph is None:

-            if not get_enable_piecewise_cuda_graph_capture_flag():
+            if not get_capture_piecewise_cuda_graph_flag():
                 return entry.callable(*args)

             if entry.warmup_count < 3:
@@ -228,7 +227,7 @@ def piecewise_optimizer(
     example_inputs: List[torch.Tensor],
     enable_inductor: bool,
     input_num_tokens: Union[int | torch.SymInt],
-    cuda_graph_batch_sizes: Sequence[int],
+    capture_num_tokens: Sequence[int],
     graph_pool_handle: tuple[int, int],
     max_num_streams: int = 1,
 ) -> tuple[GraphModule, int]:
@@ -269,7 +268,7 @@ def piecewise_optimizer(
         gm,
         enable_inductor,
         input_num_tokens,
-        cuda_graph_batch_sizes,
+        capture_num_tokens,
         exclude_modules_id,
         graph_pool_handle,
         max_num_streams=max_num_streams,

tensorrt_llm/_torch/compilation/utils.py

Lines changed: 13 additions & 2 deletions
@@ -1,3 +1,4 @@
+import contextlib
 from typing import Callable, List, Union

 import torch
@@ -33,16 +34,26 @@ def is_call_function(node: Node, target: Union[List[Callable], Callable]):
 _enable_piecewise_cuda_graph_capture = False


-def set_enable_piecewise_cuda_graph_capture_flag(enable: bool):
+def set_capture_piecewise_cuda_graph_flag(enable: bool):
     global _enable_piecewise_cuda_graph_capture
     _enable_piecewise_cuda_graph_capture = enable


-def get_enable_piecewise_cuda_graph_capture_flag() -> bool:
+def get_capture_piecewise_cuda_graph_flag() -> bool:
     global _enable_piecewise_cuda_graph_capture
     return _enable_piecewise_cuda_graph_capture


+@contextlib.contextmanager
+def capture_piecewise_cuda_graph(enable: bool):
+    prev_enable = get_capture_piecewise_cuda_graph_flag()
+    set_capture_piecewise_cuda_graph_flag(enable)
+    try:
+        yield
+    finally:
+        set_capture_piecewise_cuda_graph_flag(prev_enable)
+
+
 def inplace_info():
     inplace_map = {
         torch.ops.trtllm.flashinfer_fused_add_rmsnorm.default: {
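Note: the new capture_piecewise_cuda_graph context manager replaces manual set/get flag pairs around warmup. A minimal usage sketch, assuming the import path of the file above; run_model is a hypothetical stand-in for the warmed-up forward call:

from tensorrt_llm._torch.compilation.utils import capture_piecewise_cuda_graph

def warmup_with_capture(run_model, inputs):
    # Enable piecewise CUDA-graph capture only for this call; the previous
    # flag value is restored on exit, even if run_model raises.
    with capture_piecewise_cuda_graph(True):
        return run_model(inputs)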

tensorrt_llm/_torch/custom_ops/torch_custom_ops.py

Lines changed: 9 additions & 0 deletions
@@ -8,6 +8,7 @@

 from ..autotuner import (AutoTuner, ConstraintSpec, DynamicTensorSpec,
                          OptimizationProfile, TunableRunner, TuningConfig)
+from ..modules.multi_stream_utils import do_multi_stream
 from ..utils import (fp4_scale_infer_shape,
                      get_last_power_of_2_num_tokens_buckets,
                      last_positive_power_of_2)
@@ -925,25 +926,33 @@ def get_stream(stream_id: int):

 @torch.library.custom_op("trtllm::set_stream", mutates_args=())
 def set_stream(stream_id: int) -> None:
+    if not do_multi_stream():
+        return
     stream = get_stream(stream_id)
     assert stream is not None
     torch.cuda.set_stream(stream)


 @torch.library.custom_op("trtllm::record_event", mutates_args=())
 def record_event(event_idx: int) -> None:
+    if not do_multi_stream():
+        return
     event = get_event(event_idx)
     event.record()


 @torch.library.custom_op("trtllm::wait_event", mutates_args=())
 def wait_event(event_idx: int) -> None:
+    if not do_multi_stream():
+        return
     event = get_event(event_idx)
     event.wait()


 @torch.library.custom_op("trtllm::record_stream", mutates_args=())
 def record_stream(tensor: torch.Tensor, stream_id: int) -> None:
+    if not do_multi_stream():
+        return
     stream = get_stream(stream_id)
     assert stream is not None
     tensor.record_stream(stream)
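Note: with these guards the trtllm stream/event ops become no-ops whenever the thread-local multi-stream flag is off, so single-stream warmup and fallback paths can reuse the same traced graph. A hedged sketch of the effect at a call site, assuming with_multi_stream from multi_stream_utils and an event already registered for index 0 elsewhere:

import torch
from tensorrt_llm._torch.modules.multi_stream_utils import with_multi_stream

with with_multi_stream(False):
    torch.ops.trtllm.record_event(0)  # returns immediately; nothing is recorded

with with_multi_stream(True):
    torch.ops.trtllm.record_event(0)  # looks up event 0 and records it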

tensorrt_llm/_torch/modules/fused_moe/moe_load_balancer.py

Lines changed: 14 additions & 14 deletions
@@ -9,12 +9,12 @@

 import tensorrt_llm
 import tensorrt_llm.bindings.internal.runtime as _tbr
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import is_graph_capturing
 from tensorrt_llm.logger import logger
 from tensorrt_llm.mapping import Mapping

 from ...distributed import AllReduce
 from ...utils import EventType
+from ..multi_stream_utils import do_multi_stream


 def _tensor_to_weight(t: torch.Tensor) -> _tbr.MoeWeight:
@@ -472,7 +472,7 @@ def start_wait_gpu_stage(self):
         assert self.func_called_count["start_wait_gpu_stage"] == 0
         self.func_called_count["start_wait_gpu_stage"] += 1
         if self.updates_enabled:
-            if is_graph_capturing():
+            if do_multi_stream():
                 self.event_dict[EventType.Main].record()
                 with torch.cuda.stream(self.aux_stream):
                     self.event_dict[EventType.Main].wait()
@@ -491,7 +491,7 @@ def done_wait_gpu_stage(self):
         assert self.func_called_count["done_wait_gpu_stage"] == 0
         self.func_called_count["done_wait_gpu_stage"] += 1
         if self.updates_enabled:
-            if is_graph_capturing():
+            if do_multi_stream():
                 self.event_dict[EventType.MoeBalancer].wait()

     def start_set_cpu_stage(self):
@@ -502,7 +502,7 @@ def start_set_cpu_stage(self):
         assert self.func_called_count["start_set_cpu_stage"] == 0
         self.func_called_count["start_set_cpu_stage"] += 1
         if self.updates_enabled:
-            if is_graph_capturing():
+            if do_multi_stream():
                 self.event_dict[EventType.Main].record()
                 with torch.cuda.stream(self.aux_stream):
                     self.event_dict[EventType.Main].wait()
@@ -522,7 +522,7 @@ def done_set_cpu_stage(self):
             self.func_called_count[name] = 0
         self.statistic_flag_tensor = None
         if self.updates_enabled:
-            if is_graph_capturing():
+            if do_multi_stream():
                 self.event_dict[EventType.MoeBalancer].wait()

     def update_local_statistic(self, local_raw_expert_ids: torch.Tensor,
@@ -544,7 +544,7 @@ def update_local_statistic(self, local_raw_expert_ids: torch.Tensor,
                 (self.expert_count, ),
                 dtype=torch.int32,
                 device=torch.device('cuda'))
-            if is_graph_capturing():
+            if do_multi_stream():
                 self.event_dict[EventType.Main].record()
                 with torch.cuda.stream(self.aux_stream):
                     self.event_dict[EventType.Main].wait()
@@ -569,7 +569,7 @@ def get_local_statistic_tensor(self) -> Optional[torch.Tensor]:
         assert self.func_called_count["update_local_statistic"] > 0
         self.func_called_count["get_local_statistic_tensor"] += 1
         if self.updates_enabled:
-            if is_graph_capturing():
+            if do_multi_stream():
                 with torch.cuda.stream(self.aux_stream):
                     self.event_dict[EventType.MoeBalancer].record()
                 self.event_dict[EventType.MoeBalancer].wait()
@@ -598,7 +598,7 @@ def _update_statistic():
                 self.single_layer_load_balancer_ptr)

         if self.updates_enabled:
-            if is_graph_capturing():
+            if do_multi_stream():
                 self.event_dict[EventType.Main].record()
                 with torch.cuda.stream(self.aux_stream):
                     self.event_dict[EventType.Main].wait()
@@ -636,7 +636,7 @@ def _update_statistic():
         if self.updates_enabled:
             self.update_local_statistic(local_raw_expert_ids, is_first_stage,
                                         is_last_stage)
-            if is_graph_capturing():
+            if do_multi_stream():
                 with torch.cuda.stream(self.aux_stream):
                     _update_statistic()
             else:
@@ -660,7 +660,7 @@ def update_statistic_with_global_ids(self,
         assert self.func_called_count["update_statistic_with_local_ids"] == 0
         self.func_called_count["update_statistic_with_global_ids"] += 1
         if self.updates_enabled:
-            if is_graph_capturing():
+            if do_multi_stream():
                 self.event_dict[EventType.Main].record()
                 with torch.cuda.stream(self.aux_stream):
                     self.event_dict[EventType.Main].wait()
@@ -851,8 +851,8 @@ def set_warm_up_iter_count(self, iter_count: int):
         """
         self.load_balancer_impl.set_warm_up_iter_count(iter_count)

-    def set_next_iter_info(self, enable_statistic: Optional[bool],
-                           enable_update_weights: Optional[bool]):
+    def set_iter_info(self, enable_statistic: Optional[bool],
+                      enable_update_weights: Optional[bool]):
         if enable_statistic is not None:
             self.enable_statistic = enable_statistic
         if enable_update_weights is not None:
@@ -998,8 +998,8 @@ def __enter__(self):
         """
         if self.moe_load_balancer is not None and not self.moe_load_balancer.is_static_routing(
         ):
-            self.moe_load_balancer.set_next_iter_info(self.enable_statistic,
-                                                      self.enable_updates)
+            self.moe_load_balancer.set_iter_info(self.enable_statistic,
+                                                 self.enable_updates)
             self.moe_load_balancer.start_iter()
         return self
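Note: the do_multi_stream() branches above all gate the same record-on-main, wait-on-aux handoff. A self-contained sketch of that generic PyTorch two-stream pattern, with a placeholder tensor and kernel rather than the balancer's actual statistics update:

import torch

aux_stream = torch.cuda.Stream()
main_to_aux = torch.cuda.Event()

x = torch.ones(1024, device="cuda")
main_to_aux.record()                  # mark progress on the current (main) stream
with torch.cuda.stream(aux_stream):
    main_to_aux.wait()                # aux stream waits for the main stream's work
    y = x * 2                         # then runs concurrently with later main-stream work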

tensorrt_llm/_torch/modules/multi_stream_utils.py

Lines changed: 30 additions & 3 deletions
@@ -1,8 +1,35 @@
+import threading
+from contextlib import contextmanager
 from typing import Any, Callable, Optional

 import torch

-from ..pyexecutor.cuda_graph_runner import is_graph_capturing
+
+class do_multi_stream_local(threading.local):
+
+    def __init__(self):
+        self.do_multi_stream = False
+
+
+_local = do_multi_stream_local()
+
+
+def set_do_multi_stream(enable: bool):
+    _local.do_multi_stream = enable
+
+
+def do_multi_stream() -> bool:
+    return _local.do_multi_stream
+
+
+@contextmanager
+def with_multi_stream(enable: bool):
+    prev_do_multi_stream = _local.do_multi_stream
+    set_do_multi_stream(enable)
+    try:
+        yield
+    finally:
+        set_do_multi_stream(prev_do_multi_stream)


 def maybe_execute_in_parallel(
@@ -30,9 +57,9 @@ def maybe_execute_in_parallel(
         tuple[Any, Any]: the return values of fn0() and fn1()
     """

-    do_multi_stream = is_graph_capturing() and aux_stream is not None
+    multi_stream = do_multi_stream() and aux_stream is not None

-    if do_multi_stream:
+    if multi_stream:
         event0.record()
         result0 = fn0()

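Note: maybe_execute_in_parallel now keys off an explicit thread-local flag instead of CUDA-graph capture state, so callers (for example, the warmup path) decide when overlapping on the aux stream is allowed. A minimal sketch of the flag's scoping behavior:

from tensorrt_llm._torch.modules.multi_stream_utils import (do_multi_stream,
                                                            with_multi_stream)

assert do_multi_stream() is False      # thread-local default
with with_multi_stream(True):
    assert do_multi_stream() is True   # enabled only inside the context
assert do_multi_stream() is False      # previous value restored on exit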

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 1 addition & 1 deletion
@@ -242,8 +242,8 @@ def estimate_max_tokens(self, py_executor: PyExecutor) -> None:
             torch_used_bytes = torch.cuda.memory_stats(
             )["allocated_bytes.all.current"]
         finally:
-            py_executor.shutdown()
             py_executor.is_warmup = False
+            py_executor.shutdown()
             py_executor.enable_iter_perf_stats = origin_iter_stats
             py_executor.set_gather_responses(False)

tensorrt_llm/_torch/pyexecutor/config.py

Lines changed: 1 addition & 0 deletions
@@ -79,6 +79,7 @@ class PyTorchConfig:
     torch_compile_fullgraph: bool = True
     torch_compile_inductor_enabled: bool = False
     torch_compile_piecewise_cuda_graph: bool = False
+    torch_compile_piecewise_cuda_graph_num_tokens: Optional[List[int]] = None
     # When torch compile is enabled, userbuffers is enabled by default
     torch_compile_enable_userbuffers: bool = True
     torch_compile_max_num_streams: int = 1
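Note: the new field lets users pick the token counts at which piecewise CUDA graphs are captured. A hedged configuration sketch; the values below are illustrative and all other PyTorchConfig fields keep the defaults shown above:

from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig

config = PyTorchConfig(
    torch_compile_piecewise_cuda_graph=True,
    # Capture one piecewise CUDA graph per listed token count (illustrative values).
    torch_compile_piecewise_cuda_graph_num_tokens=[64, 128, 256],
)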

0 commit comments
