keras-team
diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml
Lines changed: 1 addition & 1 deletion
diff --git a/conftest.py b/conftest.py
Lines changed: 1 addition & 1 deletion
diff --git a/keras/api/_tf_keras/keras/ops/__init__.py b/keras/api/_tf_keras/keras/ops/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/keras/api/_tf_keras/keras/ops/numpy/__init__.py b/keras/api/_tf_keras/keras/ops/numpy/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/keras/api/ops/__init__.py b/keras/api/ops/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/keras/api/ops/numpy/__init__.py b/keras/api/ops/numpy/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/keras/src/backend/jax/nn.py b/keras/src/backend/jax/nn.py
Lines changed: 32 additions & 92 deletions
diff --git a/keras/src/backend/jax/numpy.py b/keras/src/backend/jax/numpy.py
Lines changed: 10 additions & 0 deletions
diff --git a/keras/src/backend/jax/rnn.py b/keras/src/backend/jax/rnn.py
Lines changed: 8 additions & 4 deletions
diff --git a/keras/src/backend/numpy/numpy.py b/keras/src/backend/numpy/numpy.py
Lines changed: 10 additions & 0 deletions
diff --git a/keras/src/backend/numpy/rnn.py b/keras/src/backend/numpy/rnn.py
Lines changed: 8 additions & 4 deletions
diff --git a/keras/src/backend/openvino/excluded_concrete_tests.txt b/keras/src/backend/openvino/excluded_concrete_tests.txt
Lines changed: 6 additions & 2 deletions
@@ -56,6 +56,6 @@ jobs:
 
       # Upload the results to GitHub's code scanning dashboard.
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@1b549b9259bda1cb5ddde3b41741a82a2d15a841 # v3.28.13
+        uses: github/codeql-action/upload-sarif@28deaeda66b76a05916b6923827895f2b14ab387 # v3.28.16
         with:
           sarif_file: results.sarif
@@ -38,7 +38,7 @@ def pytest_collection_modifyitems(config, items):
             ]
 
     requires_trainable_backend = pytest.mark.skipif(
-        backend() == "numpy" or backend() == "openvino",
+        backend() in ["numpy", "openvino"],
         reason="Trainer not implemented for NumPy and OpenVINO backend.",
     )
     for item in items:
 
@@ -135,6 +135,7 @@
 from keras.src.ops.numpy import argsort as argsort
 from keras.src.ops.numpy import array as array
 from keras.src.ops.numpy import average as average
+from keras.src.ops.numpy import bartlett as bartlett
 from keras.src.ops.numpy import bincount as bincount
 from keras.src.ops.numpy import bitwise_and as bitwise_and
 from keras.src.ops.numpy import bitwise_invert as bitwise_invert
@@ -143,6 +144,7 @@
 from keras.src.ops.numpy import bitwise_or as bitwise_or
 from keras.src.ops.numpy import bitwise_right_shift as bitwise_right_shift
 from keras.src.ops.numpy import bitwise_xor as bitwise_xor
+from keras.src.ops.numpy import blackman as blackman
 from keras.src.ops.numpy import broadcast_to as broadcast_to
 from keras.src.ops.numpy import ceil as ceil
 from keras.src.ops.numpy import clip as clip
 
@@ -27,6 +27,7 @@
 from keras.src.ops.numpy import argsort as argsort
 from keras.src.ops.numpy import array as array
 from keras.src.ops.numpy import average as average
+from keras.src.ops.numpy import bartlett as bartlett
 from keras.src.ops.numpy import bincount as bincount
 from keras.src.ops.numpy import bitwise_and as bitwise_and
 from keras.src.ops.numpy import bitwise_invert as bitwise_invert
@@ -35,6 +36,7 @@
 from keras.src.ops.numpy import bitwise_or as bitwise_or
 from keras.src.ops.numpy import bitwise_right_shift as bitwise_right_shift
 from keras.src.ops.numpy import bitwise_xor as bitwise_xor
+from keras.src.ops.numpy import blackman as blackman
 from keras.src.ops.numpy import broadcast_to as broadcast_to
 from keras.src.ops.numpy import ceil as ceil
 from keras.src.ops.numpy import clip as clip
 
@@ -135,6 +135,7 @@
 from keras.src.ops.numpy import argsort as argsort
 from keras.src.ops.numpy import array as array
 from keras.src.ops.numpy import average as average
+from keras.src.ops.numpy import bartlett as bartlett
 from keras.src.ops.numpy import bincount as bincount
 from keras.src.ops.numpy import bitwise_and as bitwise_and
 from keras.src.ops.numpy import bitwise_invert as bitwise_invert
@@ -143,6 +144,7 @@
 from keras.src.ops.numpy import bitwise_or as bitwise_or
 from keras.src.ops.numpy import bitwise_right_shift as bitwise_right_shift
 from keras.src.ops.numpy import bitwise_xor as bitwise_xor
+from keras.src.ops.numpy import blackman as blackman
 from keras.src.ops.numpy import broadcast_to as broadcast_to
 from keras.src.ops.numpy import ceil as ceil
 from keras.src.ops.numpy import clip as clip
 
@@ -27,6 +27,7 @@
 from keras.src.ops.numpy import argsort as argsort
 from keras.src.ops.numpy import array as array
 from keras.src.ops.numpy import average as average
+from keras.src.ops.numpy import bartlett as bartlett
 from keras.src.ops.numpy import bincount as bincount
 from keras.src.ops.numpy import bitwise_and as bitwise_and
 from keras.src.ops.numpy import bitwise_invert as bitwise_invert
@@ -35,6 +36,7 @@
 from keras.src.ops.numpy import bitwise_or as bitwise_or
 from keras.src.ops.numpy import bitwise_right_shift as bitwise_right_shift
 from keras.src.ops.numpy import bitwise_xor as bitwise_xor
+from keras.src.ops.numpy import blackman as blackman
 from keras.src.ops.numpy import broadcast_to as broadcast_to
 from keras.src.ops.numpy import ceil as ceil
 from keras.src.ops.numpy import clip as clip
 
@@ -1127,7 +1127,7 @@ def wrap_flash_attention(
     custom_mask=None,
     attn_logits_soft_cap=None,
     head_shards=1,
-    q_seq_shards=1
+    q_seq_shards=1,
 
 ):
     if decoder_segment_ids is not None:
@@ -1149,10 +1149,8 @@ def wrap_flash_attention(
     )
     splash_kernel = splash_attention_kernel.make_splash_mha(
         mask=multi_head_mask,
-        head_shards=head_shards,
-        q_seq_shards=q_seq_shards,
-        head_shards=head_shards,
-        q_seq_shards=q_seq_shards,
+        head_shards=1,
+        q_seq_shards=1,
         attn_logits_soft_cap=attn_logits_soft_cap,
     )
 
@@ -1185,92 +1183,34 @@ def dot_product_attention(
             f"Received: query.shape={query.shape}, key.shape={key.shape}, "
             f"value.shape={value.shape}."
         )
-            
-    # Check platform
-    platform = jax.devices()[0].platform
-    is_tpu = platform == "tpu"
-    
-    # Check if inputs use partial sharding (not fully replicated)
-    # Flash attention works well with fully replicated tensors on all platforms
-    # but may have issues with certain partial sharding patterns on non-TPU platforms
-    partially_sharded_inputs = any(
-        hasattr(t, "sharding") and not t.sharding.is_fully_replicated
-        for t in (query, key, value)
-    )
-    
-    # Determine flash attention compatibility
     if flash_attention is None:
-        # Auto-detect flash attention availability
-        if is_tpu:
-            # TPUs have specialized hardware for attention that works with any sharding pattern
-            flash_attention = True
-        else:
-            # For GPU/CPU with partially sharded inputs, we need multiple devices
-            # to efficiently handle the sharding
-            if partially_sharded_inputs and len(jax.devices()) <= 1:
-                flash_attention = False
-            else:
-                flash_attention = _can_use_flash_attention(query, key, value, bias)
-    elif flash_attention is True and not is_tpu:
-        # If flash attention is explicitly requested, validate compatibility
-        # Skip validation for TPU as it has specialized hardware support
-        try:
-            _can_use_flash_attention(query, key, value, bias, raise_error=True)
-        except Exception:
-            # Only disable flash attention on non-TPU platforms if validation fails
-            flash_attention = False
-    
-    # TPU-specific flash attention path
-    if is_tpu and flash_attention:
-        # Transpose to ('batch', 'heads', 'length', 'head_dim')
-        query_tpu_layout = jnp.transpose(query, axes=(0, 2, 1, 3))
-        key_tpu_layout = jnp.transpose(key, axes=(0, 2, 1, 3))
-        value_tpu_layout = jnp.transpose(value, axes=(0, 2, 1, 3))
-
-        bs, num_heads, q_len, head_dim = query_tpu_layout.shape
-
-        # Apply scale to query if provided
-        if scale is not None:
-            # TPU kernel applies 1/sqrt(head_dim) internally, to achieve 
-            # overall QK^T * scale, scale query by (scale * sqrt(head_dim))
-            query_tpu_layout = query_tpu_layout * (scale * math.sqrt(head_dim))
-
-        # Create segment IDs for Splash Attention (for packing/batching)
-        segment_ids = jnp.zeros([bs, q_len], dtype=jnp.int32)
-        decoder_segment_ids = splash_attention_kernel.SegmentIds(
-            q=segment_ids, kv=segment_ids
-        )
-
-        # Process mask for Splash Attention
-        custom_mask = None
-        if mask is not None:
-            mask_bool = mask.astype("bool") if mask.dtype != jnp.bool_ else mask
-            
-            if mask_bool.ndim == 3 and mask_bool.shape[0] == bs:
-                custom_mask = mask_bool[0]
-            elif mask_bool.ndim == 4 and mask_bool.shape[0] == bs:
-                custom_mask = mask_bool[0, 0]
-
-            if is_causal and custom_mask is not None:
-                causal_mask = jnp.tril(jnp.ones((q_len, q_len), dtype=jnp.bool_))
-                custom_mask = jnp.logical_and(custom_mask, causal_mask)
-        
-        if custom_mask is None and is_causal:
-            custom_mask = jnp.tril(jnp.ones((q_len, q_len), dtype=jnp.bool_))
+        flash_attention = _can_use_flash_attention(query, key, value, bias)
+    elif flash_attention is True:
+        # Use `raise_error=True` to provide more details if the inputs failed to
+        # use flash attention
+        _can_use_flash_attention(query, key, value, bias, raise_error=True)
 
-        try:
-            output = wrap_flash_attention(
-                query_tpu_layout,
-                key_tpu_layout,
-                value_tpu_layout,
-                decoder_segment_ids=decoder_segment_ids,
-                custom_mask=custom_mask,
-                attn_logits_soft_cap=attn_logits_soft_cap,
-            )
-            # Transpose output back to Keras layout
-            return jnp.transpose(output, axes=(0, 2, 1, 3))
-        except Exception:
-            flash_attention = False
+    if jax.devices()[0].platform == "tpu":
+        # Transpose to ('batch', 'heads', 'length', 'kv')
+        query = jnp.transpose(query, axes=(0, 2, 1, 3))
+        key = jnp.transpose(key, axes=(0, 2, 1, 3))
+        value = jnp.transpose(value, axes=(0, 2, 1, 3))
+        B, H, S, KV = query.shape
+
+        segment_ids = jnp.ones([B, S])
+        # {token_ids, padding_mask, segment_ids} enable packing
+        out = wrap_flash_attention(
+            query,
+            key,
+            value,
+            decoder_segment_ids=splash_attention_kernel.SegmentIds(
+                segment_ids, segment_ids
+            ),
+            custom_mask=mask,
+            attn_logits_soft_cap=attn_logits_soft_cap,
+        )
+        out = jnp.transpose(out, axes=(0, 2, 1, 3))
+        return out
 
     # JAX native dot_product_attention for GPU or fallback for TPU
     if hasattr(jax.nn, "dot_product_attention"):
@@ -1306,9 +1246,9 @@ def dot_product_attention(
             "Please update it by following the official guide: "
             "https://jax.readthedocs.io/en/latest/installation.html"
         )
-
-    # Fallback to custom XLA implementation
-    # This is the reference implementation from jax.nn.dot_product_attention
+    # Ref: jax.nn.dot_product_attention
+    # https://github.com/jax-ml/jax/blob/jax-v0.4.33/jax/_src/nn/functions.py#L886
+    # Does not support the `query_seq_lengths` and `key_value_seq_lengths` args
     output_shape = query.shape
     _, _, K, H = key.shape
     scale = (1.0 / jnp.sqrt(H)) if scale is None else scale
 
@@ -37,6 +37,11 @@ def add(x1, x2):
     return jnp.add(x1, x2)
 
 
+def bartlett(x):
+    x = convert_to_tensor(x)
+    return jnp.bartlett(x)
+
+
 def bincount(x, weights=None, minlength=0, sparse=False):
     # Note: bincount is never tracable / jittable because the output shape
     # depends on the values in x.
@@ -469,6 +474,11 @@ def right_shift(x, y):
     return bitwise_right_shift(x, y)
 
 
+def blackman(x):
+    x = convert_to_tensor(x)
+    return jnp.blackman(x)
+
+
 def broadcast_to(x, shape):
     x = convert_to_tensor(x)
     return jnp.broadcast_to(x, shape)
 
@@ -164,12 +164,16 @@ def _step(states, current_input):
                 else:
                     # Assume the first state is the previous output.
                     output_tm1 = states[0]
+                    if tree.is_nested(output_tm1):
+                        # Stacked RNN case: assume first state of last cell.
+                        output_tm1 = states[-1][0]
                     masked_outs = jnp.where(is_masked, output_tm1, output_t)
 
-                new_states = [
-                    jnp.where(is_masked, s, ns)
-                    for s, ns in zip(states, new_states)
-                ]
+                new_states = tree.map_structure(
+                    lambda s, ns: jnp.where(is_masked, s, ns),
+                    states,
+                    new_states,
+                )
                 return (new_states, masked_outs)
 
             scan_xs = (inputs, mask)
 
@@ -305,6 +305,11 @@ def average(x, axis=None, weights=None):
     return np.average(x, weights=weights, axis=axis)
 
 
+def bartlett(x):
+    x = convert_to_tensor(x)
+    return np.bartlett(x).astype(config.floatx())
+
+
 def bincount(x, weights=None, minlength=0, sparse=False):
     if sparse:
         raise ValueError("Unsupported value `sparse=True` with numpy backend")
@@ -385,6 +390,11 @@ def right_shift(x, y):
     return bitwise_right_shift(x, y)
 
 
+def blackman(x):
+    x = convert_to_tensor(x)
+    return np.blackman(x).astype(config.floatx())
+
+
 def broadcast_to(x, shape):
     return np.broadcast_to(x, shape)
 
 
@@ -160,12 +160,16 @@ def _step(states, current_input):
                 else:
                     # Assume the first state is the previous output.
                     output_tm1 = states[0]
+                    if tree.is_nested(output_tm1):
+                        # Stacked RNN case: assume first state of last cell.
+                        output_tm1 = states[-1][0]
                     masked_outs = np.where(is_masked, output_tm1, output_t)
 
-                new_states = [
-                    np.where(is_masked, s, ns)
-                    for s, ns in zip(states, new_states)
-                ]
+                new_states = tree.map_structure(
+                    lambda s, ns: np.where(is_masked, s, ns),
+                    states,
+                    new_states,
+                )
                 return (new_states, masked_outs)
 
             scan_xs = (inputs, mask)
 
@@ -8,6 +8,8 @@ NumpyDtypeTest::test_angle
 NumpyDtypeTest::test_any
 NumpyDtypeTest::test_argpartition
 NumpyDtypeTest::test_array
+NumpyDtypeTest::test_bartlett
+NumpyDtypeTest::test_blackman
 NumpyDtypeTest::test_bitwise
 NumpyDtypeTest::test_ceil
 NumpyDtypeTest::test_concatenate
@@ -42,7 +44,6 @@ NumpyDtypeTest::test_outer_
 NumpyDtypeTest::test_power
 NumpyDtypeTest::test_prod
 NumpyDtypeTest::test_quantile
-NumpyDtypeTest::test_ravel
 NumpyDtypeTest::test_repeat
 NumpyDtypeTest::test_roll
 NumpyDtypeTest::test_round
@@ -75,6 +76,8 @@ NumpyOneInputOpsCorrectnessTest::test_angle
 NumpyOneInputOpsCorrectnessTest::test_any
 NumpyOneInputOpsCorrectnessTest::test_argpartition
 NumpyOneInputOpsCorrectnessTest::test_array
+NumpyOneInputOpsCorrectnessTest::test_bartlett
+NumpyOneInputOpsCorrectnessTest::test_blackman
 NumpyOneInputOpsCorrectnessTest::test_bitwise_invert
 NumpyOneInputOpsCorrectnessTest::test_conj
 NumpyOneInputOpsCorrectnessTest::test_correlate
@@ -102,7 +105,6 @@ NumpyOneInputOpsCorrectnessTest::test_pad_int8_constant_2
 NumpyOneInputOpsCorrectnessTest::test_pad_uint8_constant_2
 NumpyOneInputOpsCorrectnessTest::test_pad_int32_constant_2
 NumpyOneInputOpsCorrectnessTest::test_prod
-NumpyOneInputOpsCorrectnessTest::test_ravel
 NumpyOneInputOpsCorrectnessTest::test_real
 NumpyOneInputOpsCorrectnessTest::test_reciprocal
 NumpyOneInputOpsCorrectnessTest::test_repeat
@@ -151,4 +153,6 @@ NumpyTwoInputOpsCorrectnessTest::test_tensordot
 NumpyTwoInputOpsCorrectnessTest::test_vdot
 NumpyTwoInputOpsCorrectnessTest::test_where
 NumpyOneInputOpsDynamicShapeTest::test_angle
+NumpyOneInputOpsDynamicShapeTest::test_bartlett
+NumpyOneInputOpsDynamicShapeTest::test_blackman
 NumpyOneInputOpsStaticShapeTest::test_angle
Original file line number	Diff line number	Diff line change
`@@ -38,7 +38,7 @@ def pytest_collection_modifyitems(config, items):`
`38`	`38`	`]`
`39`	`39`
`40`	`40`	`requires_trainable_backend = pytest.mark.skipif(`
`41`		`- backend() == "numpy" or backend() == "openvino",`
	`41`	`+ backend() in ["numpy", "openvino"],`
`42`	`42`	`reason="Trainer not implemented for NumPy and OpenVINO backend.",`
`43`	`43`	`)`
`44`	`44`	`for item in items:`