
Commit cbe146d

sxu authored and facebook-github-bot committed
Pass ForwardOptions from the top-level module and also return any relevant state as output
Summary: Pass a `ForwardOptions` argument (introduced by #8128) from the top-level transformer, consolidate some existing inputs into it, and return any optional updates from the attention implementation.

Differential Revision: D69080123
1 parent d0b8fe3 commit cbe146d
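
The summary references the `ForwardOptions` type from #8128, whose definition is not part of this diff. Judging only from how the new code uses it (`.get()`, `.update()`, and `**` unpacking), a minimal dict-like sketch might look like the following; the three field names are the ones this diff actually touches, and anything beyond that is an assumption.

```python
# Illustrative sketch only: the real ForwardOptions is defined in
# examples/models/llama/attention.py (introduced by #8128) and may carry
# more fields than the three this diff touches.
from typing import Optional, TypedDict

import torch


class ForwardOptions(TypedDict, total=False):
    # Scalar tensor indicating the size of the cache window (previously a
    # standalone input_pos argument on Transformer.forward).
    input_pos: Optional[torch.LongTensor]
    # Rotary embedding tables for the current positions; the top-level
    # Transformer computes these once and writes them back into the dict.
    freqs_cos: Optional[torch.Tensor]
    freqs_sin: Optional[torch.Tensor]
```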

File tree: 2 files changed, +22 −18 lines


examples/models/llama/attention.py

Lines changed: 1 addition & 1 deletion
@@ -252,4 +252,4 @@ def forward(
 
         output = self.wo(output)
 
-        return output
+        return output, None
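
With this change the attention forward returns a pair rather than a bare tensor: the second element is an optional state update, and the default path simply reports `None`. The toy module below is not the ExecuTorch attention implementation; it only sketches the `(output, update)` contract that callers now have to unpack.

```python
# Toy illustration of the new return contract: attention-style modules return
# (output, optional_state_update). This is a sketch, not ExecuTorch code.
from typing import Any, Dict, Optional, Tuple

import torch
import torch.nn as nn


class ToyAttention(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.wo = nn.Linear(dim, dim)

    def forward(
        self, x: torch.Tensor, **kwargs: Any
    ) -> Tuple[torch.Tensor, Optional[Dict[str, Any]]]:
        output = self.wo(x)
        # An implementation with state to propagate (e.g. an updated cache
        # position) would return it in the second slot; this one has none.
        return output, None
```

A variant that does track state could return a dict in the second slot instead of `None`, which the transformer folds back into its options, as the next file shows.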

examples/models/llama/llama_transformer.py

Lines changed: 21 additions & 17 deletions
@@ -12,7 +12,10 @@
 import torch
 import torch.nn.functional as F
 
-from executorch.examples.models.llama.attention import ATTENTION_REGISTRY
+from executorch.examples.models.llama.attention import (
+    ATTENTION_REGISTRY,
+    ForwardOptions,
+)
 
 from executorch.examples.models.llama.model_args import ModelArgs
 
@@ -148,17 +151,17 @@ def __init__(self, layer_id: int, args: ModelArgs, rope: Rope):
         self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
         self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
 
-    def forward(self, x, freqs_cos, freqs_sin, input_pos=None): # x: 1xN
-        h = self.attention.forward(
-            self.attention_norm(x), freqs_cos, freqs_sin, input_pos=input_pos
+    def forward(self, x, freqs_cos, freqs_sin, attn_options: ForwardOptions): # x: 1xN
+        h, attn_options_update = self.attention.forward(
+            self.attention_norm(x), freqs_cos, freqs_sin, **attn_options
         )
 
         h = x + h
         if hasattr(self, "block_sparse_moe"):
             out = h + self.block_sparse_moe(self.ffn_norm(h))
         else:
             out = h + self.feed_forward(self.ffn_norm(h))
-        return out
+        return out, attn_options_update
 
 
 class Transformer(nn.Module):
@@ -185,27 +188,28 @@ def __init__(self, params: ModelArgs):
     def forward(
         self,
         tokens: Optional[torch.LongTensor] = None, # tokens
-        input_pos: Optional[
-            torch.LongTensor
-        ] = None, # Scalar tensor indicating size of window of the caches
         h: Optional[torch.FloatTensor] = None, # embeddings
+        attn_options: Optional[ForwardOptions] = None,
     ) -> torch.Tensor:
         if (tokens is None) ^ (h is not None):
             raise ValueError(
                 "You cannot specify both tokens and h at the same time, and must specify either one"
             )
         if tokens is not None and h is None:
             h = self.tok_embeddings(tokens)
-        seqlen = h.shape[1]
-        freqs_cos, freqs_sin = self.rope.get_freqs(input_pos, seqlen)
+        if (
+            attn_options.get("freqs_cos") is None
+            and attn_options.get("freqs_sin") is None
+            and (input_pos := attn_options.get("input_pos")) is not None
+        ):
+            seqlen = h.shape[1]
+            freqs_cos, freqs_sin = self.rope.get_freqs(input_pos, seqlen)
+            attn_options.update({"freqs_cos": freqs_cos, "freqs_sin": freqs_sin})
 
         for layer in self.layers:
-            h = layer(
-                h,
-                freqs_cos,
-                freqs_sin,
-                input_pos,
-            )
+            h, attn_options_update = layer(h, **attn_options)
+            if attn_options_update is not None:
+                attn_options.update(**attn_options_update)
 
         if not self.generate_full_logits:
             # Only the last logit is used for the new generated token
@@ -237,4 +241,4 @@ def forward(
             expanded_logits[:, list(self.output_prune_map.values())] = logits
             logits = expanded_logits
 
-        return logits
+        return logits, attn_options_update
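
Putting the two files together, the top-level forward now consumes one options dict, folds any per-layer update back into it before the next layer runs, and finally returns the last update alongside the logits. The standalone sketch below mimics that threading loop with hypothetical `layer_a`/`layer_b` functions, since constructing the real `Transformer` needs a `ModelArgs` configuration that is outside this diff.

```python
# Sketch of the options-threading pattern adopted by Transformer.forward:
# each layer receives the current options as kwargs and may hand back an
# update that is folded into the options for subsequent layers.
# layer_a and layer_b are hypothetical stand-ins for the real TransformerBlock.
from typing import Any, Dict, Optional, Tuple

import torch


def layer_a(h: torch.Tensor, **opts: Any) -> Tuple[torch.Tensor, Optional[Dict[str, Any]]]:
    # Pretend this layer advances a cached position it wants to publish.
    pos = opts.get("input_pos", torch.tensor(0))
    return h + 1, {"input_pos": pos + h.shape[1]}


def layer_b(h: torch.Tensor, **opts: Any) -> Tuple[torch.Tensor, Optional[Dict[str, Any]]]:
    # This layer has no state to report back.
    return h * 2, None


attn_options: Dict[str, Any] = {"input_pos": torch.tensor(0)}
h = torch.zeros(1, 4, 8)  # (batch, seq, dim)

for layer in (layer_a, layer_b):
    h, attn_options_update = layer(h, **attn_options)
    if attn_options_update is not None:
        attn_options.update(**attn_options_update)

print(attn_options["input_pos"])  # tensor(4): layer_a's update was threaded through
```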
