@@ -158,7 +158,6 @@ def __init__(
         self.seed = seed
 
         self._inverse_sqrt_key_dim = 1.0 / math.sqrt(float(self._key_dim))
-        self._return_attention_scores = False
 
         # Check for flash attention constraints
         if self._flash_attention and self._dropout > 0.0:
@@ -419,6 +418,7 @@ def _compute_attention(
         value,
         attention_mask=None,
         training=None,
+        return_attention_scores=False,
     ):
         """Applies Dot-product attention with query, key, value tensors.
 
@@ -442,7 +442,7 @@ def _compute_attention(
             attention_scores: Multi-headed attention weights.
         """
         # Check for flash attention constraints
-        if self._flash_attention and self._return_attention_scores:
+        if self._flash_attention and return_attention_scores:
             raise ValueError(
                 "Returning attention scores is not supported when flash "
                 "attention is enabled. Please disable flash attention to access"
@@ -452,7 +452,7 @@ def _compute_attention(
         # Determine whether to use dot-product attention
         use_dot_product_attention = not (
             self._dropout > 0.0
-            or self._return_attention_scores
+            or return_attention_scores
             or (len(query.shape) != 4)
         )
 
@@ -525,7 +525,6 @@ def call(
         training=None,
         use_causal_mask=False,
     ):
-        self._return_attention_scores = return_attention_scores
         if key is None:
             key = value
 
@@ -562,6 +561,7 @@ def call(
             value,
             attention_mask,
             training,
+            return_attention_scores,
         )
         attention_output = self._output_dense(attention_output)
 
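For context, a minimal usage sketch follows (assuming the standard Keras 3 MultiHeadAttention API; the tensor shapes are illustrative only, not taken from this PR). The public call signature is unchanged by the diff: return_attention_scores is now threaded through _compute_attention as an argument instead of being stored on the layer as self._return_attention_scores, and requesting scores while flash attention is enabled still raises the ValueError shown above.

# Minimal sketch, assuming the Keras 3 MultiHeadAttention API.
# Shapes are made up for illustration.
import numpy as np
import keras

layer = keras.layers.MultiHeadAttention(num_heads=2, key_dim=16)
query = np.random.normal(size=(1, 8, 32)).astype("float32")
value = np.random.normal(size=(1, 8, 32)).astype("float32")

# Requesting the scores returns an (output, scores) tuple; the flag is
# forwarded to _compute_attention per call rather than kept as layer state.
output, scores = layer(query, value, return_attention_scores=True)
print(output.shape)  # (1, 8, 32)
print(scores.shape)  # (batch, num_heads, query_len, key_len) -> (1, 2, 8, 8)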