import os
from dataclasses import dataclass
from typing import Optional

import torch

from tensorrt_llm._utils import local_mpi_rank

from ..pyexecutor.llm_request import LlmRequest
from ..pyexecutor.resource_manager import ResourceManager
from ..pyexecutor.scheduler import ScheduledRequests
from .drafter import Drafter
from .eagle3 import Eagle3ResourceManager, Eagle3SpecMetadata


@dataclass
class SaveHiddenStatesSpecMetadata(Eagle3SpecMetadata):
    save_last_layer_post_norm: bool = False

    def is_final_output_capture(self):
        return self.save_last_layer_post_norm

    def maybe_capture_final_hidden_states(
            self, hidden_states: torch.Tensor) -> None:
        if self.save_last_layer_post_norm:
            # Assume no chunking, BS=1
            eagle3_hidden_states = self.eagle3_resource_manager.last_hidden_states
            eagle3_hidden_states.copy_(hidden_states)


class SaveHiddenStatesResourceManager(Eagle3ResourceManager):

    def __init__(self, config: "SaveHiddenStatesDecodingConfig",
                 dtype: torch.dtype, hidden_size: int, max_num_requests: int,
                 max_seq_len: int, max_num_tokens: int):
        super().__init__(config, dtype, hidden_size, max_num_requests,
                         max_seq_len, max_num_tokens)
        self.last_hidden_states = None
        if config.save_last_layer_post_norm:
            self.last_hidden_states = torch.empty(
                (max_num_tokens, self.hidden_size),
                dtype=self.dtype,
                device='cuda')


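# How the pieces fit together (summary of the code above and below):
# `SaveHiddenStatesSpecMetadata.maybe_capture_final_hidden_states` copies the
# final post-norm hidden states into the resource manager's
# `last_hidden_states` buffer, and the drafter below reads that buffer (or the
# Eagle3-captured `hidden_states`) for each request and periodically dumps
# the collected records to disk.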
class SaveHiddenStatesDrafter(Drafter):

    def __init__(
        self,
        spec_config: "SaveHiddenStatesDecodingConfig",
    ):
        super().__init__(spec_config.max_concurrency)
        self.spec_config = spec_config
        self.max_draft_len = spec_config.max_draft_len
        self._iter = 0
        self._output_directory = spec_config.output_directory
        self._file_prefix = spec_config.file_prefix
        self._write_interval = spec_config.write_interval
        self._saved_state = []

    def _process_request(
            self, request: LlmRequest,
            resource_manager: SaveHiddenStatesResourceManager) -> None:
        out_dict = {}
        # Only rank 0 collects the captured states; other ranks append an
        # empty placeholder.
        if local_mpi_rank() == 0:
            input_ids = torch.tensor(list(request.get_tokens(0)),
                                     dtype=torch.long,
                                     device='cpu')
            hidden_size = resource_manager.hidden_size
            if self.spec_config.save_last_layer_post_norm:
                hidden_states = resource_manager.last_hidden_states.cpu().clone()
            else:
                hidden_states = resource_manager.hidden_states[:, -hidden_size:].cpu(
                ).clone()

            out_dict = {
                "id": self._iter,
                "input_ids": input_ids,
                "hidden_state_features": resource_manager.hidden_states.cpu().clone(),
                "hidden_state": hidden_states,
            }

        self._saved_state.append(out_dict)

    def _write_to_file(self) -> None:
        # Only rank 0 holds populated records; skip the dump if nothing has
        # been collected yet.
        if local_mpi_rank() == 0 and self._saved_state:
            output_path = os.path.join(self._output_directory,
                                       f"{self._file_prefix}_{self._iter}.pt")
            torch.save(self._saved_state, output_path)
        self._saved_state = []

    def prepare_draft_tokens(
        self,
        scheduled_requests: ScheduledRequests,
        resource_manager: Optional[ResourceManager] = None,
    ) -> None:
        for request in sorted(
                scheduled_requests.context_requests,
                key=lambda r:
            (r.py_batch_idx is None, r.py_batch_idx or r.request_id),
        ):
            request.py_max_new_tokens = 1
            self._process_request(request, resource_manager)
            if self._iter % self._write_interval == 0:
                self._write_to_file()
            self._iter += 1
            # This drafter only records hidden states and produces no draft
            # tokens of its own; attach an empty draft, padded with zeros to
            # `self.max_draft_len` if any tokens were present.
            draft_tokens = []
            # Pad length to `self.max_draft_len`
            if len(draft_tokens) > 0:
                pad_length = self.max_draft_len - len(draft_tokens)
                draft_tokens.extend([0] * pad_length)
            request.py_draft_tokens = draft_tokens
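
# Illustrative sketch, not part of the module above: each dump written by
# `_write_to_file` is a torch-saved list of per-request dicts with the keys
# "id", "input_ids", "hidden_state_features", and "hidden_state". Assuming a
# file prefix of "hidden_states" and a dump at iteration 64, it could be
# inspected offline roughly like this:
#
#     import torch
#     records = torch.load("hidden_states_64.pt")
#     for record in records:
#         print(record["id"], record["input_ids"].shape,
#               record["hidden_state"].shape)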