AI-Hypercomputer
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/AddLabel.yml b/.github/workflows/AddLabel.yml
Lines changed: 1 addition & 0 deletions
diff --git a/.github/workflows/CPUTests.yml b/.github/workflows/CPUTests.yml
Lines changed: 1 addition & 1 deletion
diff --git a/MaxText/common_types.py b/MaxText/common_types.py
Lines changed: 0 additions & 6 deletions
diff --git a/MaxText/experimental/rl/grpo_trainer.py b/MaxText/experimental/rl/grpo_trainer.py
Lines changed: 7 additions & 9 deletions
diff --git a/MaxText/inference/kvcache.py b/MaxText/inference/kvcache.py
Lines changed: 20 additions & 35 deletions
diff --git a/MaxText/inference/page_manager.py b/MaxText/inference/page_manager.py
Lines changed: 1 addition & 2 deletions
diff --git a/MaxText/inference/paged_attention.py b/MaxText/inference/paged_attention.py
Lines changed: 7 additions & 21 deletions
diff --git a/MaxText/kernels/megablox/gmm.py b/MaxText/kernels/megablox/gmm.py
Lines changed: 5 additions & 7 deletions
diff --git a/MaxText/kernels/ragged_attention.py b/MaxText/kernels/ragged_attention.py
Lines changed: 1 addition & 7 deletions
@@ -1,2 +1,2 @@
 # Changes in this file should match with requiredReviewers in file .github/workflows/AddLabel.yml
-* @gobbleturk @khatwanimohit @bvandermoon @vipannalla @RissyRan @richjames0 @rni418 @gagika @shralex @yangyuwei @SurbhiJainUSC @hengtaoguo @A9isha @wang2yn84 @wyzhang @mitalisi @gpolovets1 @mailvijayasingh @jrplatin @patemotter @lumosis
+* @gobbleturk @khatwanimohit @bvandermoon @vipannalla @RissyRan @richjames0 @rni418 @gagika @shralex @yangyuwei @SurbhiJainUSC @hengtaoguo @A9isha @wang2yn84 @wyzhang @mitalisi @gpolovets1 @mailvijayasingh @jrplatin @patemotter @lumosis @aireenmei
@@ -74,6 +74,7 @@ jobs:
               jrplatin: "",
               patemotter: "",
               lumosis: "",
+              aireenmei: "",
             }
             const reviews = await github.rest.pulls.listReviews({
               owner,
 
@@ -36,7 +36,7 @@ jobs:
         pytype --jobs auto --disable 'import-error,late-directive,wrong-arg-types,module-attr,unsupported-operands' MaxText/ || true
     - name: Analysing the code with pylint in Maxtext/
       run: |
-         pylint --verbose --msg-template='[{abspath}] {msg_id}:{line:3d},{column}: {obj}: {msg}' --disable R0401,R1701,R1703,R1710,R1711,R1735,R0917,R1714,R1716,R1719,R1721,R1728,R1728,W0102,W0107,W0201,W0212,W0221,W0223,W0237,W0404,W0611,W0612,W0613,W0621,W0622,W0631,W0707,W0718,W1201,W1203,W1309,W1514,W4901 MaxText/ && \
+         pylint --verbose --msg-template='[{abspath}] {msg_id}:{line:3d},{column}: {obj}: {msg}' --disable R0401,R0917,W0102,W0107,W0201,W0212,W0221,W0223,W0237,W0404,W0611,W0612,W0613,W0621,W0622,W0631,W0707,W0718,W1201,W1203,W1309,W1514,W4901 MaxText/ && \
          echo 'Maxtext PyLint check successful' || { echo \
          'PyLint check has failed. Please run bash code_style.sh to fix issues'; exit 20; }
     - name: Analysing the code with pylint in pedagogical_examples/
 
@@ -18,21 +18,15 @@
 
 import numpy as np
 
-import jax
 import jax.numpy as jnp
 
-from flax.linen import partitioning
-
 Config = Any
 
 Array = jnp.ndarray
 PRNGKey = jnp.ndarray
 DType = jnp.dtype
 Shape = Sequence[int]
 
-Mesh = jax.sharding.Mesh
-ScanIn = partitioning.ScanIn
-
 AxisNames = tuple[str, ...]
 AxisIdxes = tuple[int, ...]
 
 
@@ -14,10 +14,6 @@
 limitations under the License.
 """
 
-from collections.abc import Callable
-
-from MaxText.common_types import Array
-
 # pylint: disable=g-bad-todo, abstract-method, consider-using-with, attribute-error
 """
 This script implements Group Relative Policy Optimization (GRPO) training
@@ -32,6 +28,7 @@
 import functools
 import queue
 from typing import Sequence
+from collections.abc import Callable
 
 from absl import app
 
@@ -56,17 +53,17 @@
 import transformers
 
 from MaxText import checkpointing
+from MaxText import max_logging
 from MaxText import max_utils
+from MaxText import maxengine
 from MaxText import maxtext_utils
-from MaxText import max_logging
 from MaxText import profiler
 from MaxText import pyconfig
-from MaxText import maxengine
-from MaxText.metric_logger import MetricLogger
-from MaxText.vertex_tensorboard import VertexTensorboardManager
+from MaxText.common_types import Array
 from MaxText.experimental.rl import grpo_input_pipeline
-from MaxText.layers import models
 from MaxText.gcp_workload_monitor import GCPWorkloadMonitor
+from MaxText.layers import models
+from MaxText.metric_logger import MetricLogger
 from MaxText.train import (
     validate_train_config,
     get_first_step,
@@ -78,6 +75,7 @@
     check_example_batch,
     setup_mesh_and_model,
 )
+from MaxText.vertex_tensorboard import VertexTensorboardManager
 
 # pylint: disable=too-many-positional-arguments
 
 
@@ -23,30 +23,17 @@
 
 from aqt.jax.v2 import aqt_tensor
 from aqt.jax.v2 import config as aqt_config
+from aqt.jax.v2.aqt_tensor import QTensor as KVTensor
 from aqt.jax.v2.flax import aqt_flax
 
-from MaxText import common_types
+from MaxText.common_types import Array, AxisNames, AxisIdxes, Config, CACHE_BATCH_PREFILL, DType, MODEL_MODE_PREFILL, MODEL_MODE_TRAIN, MODEL_MODE_AUTOREGRESSIVE, CACHE_HEADS_NONE, DECODING_ACTIVE_SEQUENCE_INDICATOR
+from MaxText.common_types import CACHE_BATCH, CACHE_SEQUENCE, CACHE_HEADS, CACHE_KV, CACHE_SCALE_BATCH, CACHE_SCALE_SEQUENCE, CACHE_SCALE_HEADS, CACHE_SCALE_KV
 
-Array = common_types.Array
-AxisNames = common_types.AxisNames
-AxisIdxes = common_types.AxisIdxes
-Config = common_types.Config
-KVTensor = aqt_tensor.QTensor
 
 MAX_INT8 = 127.5
 MAX_INT4 = 7.5
 E4M3_MAX = jnp.finfo(jnp.float8_e4m3fn).max.astype(jnp.float32)
 
-CACHE_BATCH_PREFILL = common_types.CACHE_BATCH_PREFILL
-CACHE_BATCH = common_types.CACHE_BATCH
-CACHE_SEQUENCE = common_types.CACHE_SEQUENCE
-CACHE_HEADS = common_types.CACHE_HEADS
-CACHE_KV = common_types.CACHE_KV
-CACHE_SCALE_BATCH = common_types.CACHE_SCALE_BATCH
-CACHE_SCALE_SEQUENCE = common_types.CACHE_SCALE_SEQUENCE
-CACHE_SCALE_HEADS = common_types.CACHE_SCALE_HEADS
-CACHE_SCALE_KV = common_types.CACHE_SCALE_KV
-
 
 def reverse_transpose(transposed_array, transpose_axis_order):
   return jax.numpy.moveaxis(transposed_array, (0, 1, 2, 3), transpose_axis_order)
@@ -167,7 +154,7 @@ class KVCache(nn.Module):
 
   max_prefill_length: int
   max_target_length: int
-  dtype: common_types.DType
+  dtype: DType
   kv_quant: Optional[KVQuant] = None
   prefill_cache_logical_axis_names: AxisNames = (CACHE_BATCH_PREFILL, CACHE_SEQUENCE, CACHE_HEADS, CACHE_KV)
   cache_logical_axis_names: AxisNames = (CACHE_BATCH, CACHE_SEQUENCE, CACHE_HEADS, CACHE_KV)
@@ -194,7 +181,7 @@ def _get_prefill_cache_vars(self, batch, key_heads, value_heads, key_head_size,
     cache_length = self.max_prefill_length
     dtype = self._get_cached_kv_dtype()
 
-    if model_mode == common_types.MODEL_MODE_PREFILL:
+    if model_mode == MODEL_MODE_PREFILL:
       cache_logical_axis_names = self.prefill_cache_logical_axis_names
     else:
       cache_logical_axis_names = self.cache_logical_axis_names
@@ -219,7 +206,7 @@ def _get_prefill_cache_vars(self, batch, key_heads, value_heads, key_head_size,
         cache_shape_value,
         dtype,
     )
-    if model_mode == common_types.MODEL_MODE_PREFILL:
+    if model_mode == MODEL_MODE_PREFILL:
       segment_id_axis_names = (CACHE_BATCH_PREFILL, CACHE_SEQUENCE)
     else:
       segment_id_axis_names = (CACHE_BATCH, CACHE_SEQUENCE)
@@ -274,7 +261,7 @@ def _get_ar_cache_vars(self, batch, key_heads, value_heads, key_head_size, value
       )
     cache_length = self.max_target_length - self.max_prefill_length
 
-    if model_mode == common_types.MODEL_MODE_PREFILL:
+    if model_mode == MODEL_MODE_PREFILL:
       cache_logical_axis_names = self.prefill_cache_logical_axis_names
     else:
       cache_logical_axis_names = self.cache_logical_axis_names
@@ -311,7 +298,7 @@ def _get_ar_cache_vars(self, batch, key_heads, value_heads, key_head_size, value
         cache_axis_names,
     )
 
-    if model_mode == common_types.MODEL_MODE_PREFILL:
+    if model_mode == MODEL_MODE_PREFILL:
       segment_id_axis_names = (CACHE_BATCH_PREFILL, CACHE_SEQUENCE)
     else:
       segment_id_axis_names = (CACHE_BATCH, CACHE_SEQUENCE)
@@ -401,11 +388,11 @@ def kv_cache_chunked_prefill(
       next_pos = previous_chunk.shape[1]
 
     cached_prefill_key_vars, cached_prefill_value_vars, cached_prefill_segment_id_var = self._get_prefill_cache_vars(
-        batch, key_heads, value_heads, key_head_size, value_head_size, common_types.MODEL_MODE_PREFILL
+        batch, key_heads, value_heads, key_head_size, value_head_size, MODEL_MODE_PREFILL
     )
     # TODO: Find a way to not enable the ar cache for prefill mode.
     _ = self._get_ar_cache_vars(
-        batch, key_heads, value_heads, key_head_size, value_head_size, common_types.MODEL_MODE_PREFILL
+        batch, key_heads, value_heads, key_head_size, value_head_size, MODEL_MODE_PREFILL
     )  # initialize it now
 
     key_shaped_for_cache = jnp.transpose(key, self.prefill_cache_axis_order)
@@ -488,11 +475,11 @@ def kv_cache_prefill(
     assert key.dtype == value.dtype, "Key and Value Dtypes should match."
 
     cached_prefill_key_vars, cached_prefill_value_vars, cached_prefill_segment_id_var = self._get_prefill_cache_vars(
-        batch, key_heads, value_heads, key_head_size, value_head_size, common_types.MODEL_MODE_PREFILL
+        batch, key_heads, value_heads, key_head_size, value_head_size, MODEL_MODE_PREFILL
     )
     # TODO: Find a way to not enable the ar cache for prefill mode.
     _ = self._get_ar_cache_vars(
-        batch, key_heads, value_heads, key_head_size, value_head_size, common_types.MODEL_MODE_PREFILL
+        batch, key_heads, value_heads, key_head_size, value_head_size, MODEL_MODE_PREFILL
     )  # initialize it now
 
     key_shaped_for_cache = jnp.transpose(key, self.prefill_cache_axis_order)
@@ -652,9 +639,7 @@ def kv_cache_autoregressive(
       raise ValueError(f"Sequence length should be 1 during autoregression, got {sequence=}")
 
     cached_ar_key_vars, cached_ar_value_vars, cached_ar_segment_id_var, cache_ar_index_var, cache_ar_lengths_var = (
-        self._get_ar_cache_vars(
-            batch, key_heads, value_heads, key_head_size, value_head_size, common_types.MODEL_MODE_AUTOREGRESSIVE
-        )
+        self._get_ar_cache_vars(batch, key_heads, value_heads, key_head_size, value_head_size, MODEL_MODE_AUTOREGRESSIVE)
     )
 
     self.update_ar_key_value(
@@ -666,7 +651,7 @@ def kv_cache_autoregressive(
         cache_ar_lengths_var.value,
         use_ragged_attention,
     )
-    active_indicator = jnp.zeros((batch, 1), dtype=jnp.int32) + common_types.DECODING_ACTIVE_SEQUENCE_INDICATOR
+    active_indicator = jnp.zeros((batch, 1), dtype=jnp.int32) + DECODING_ACTIVE_SEQUENCE_INDICATOR
     cached_ar_segment_id_var.value = jax.lax.dynamic_update_index_in_dim(
         cached_ar_segment_id_var.value, active_indicator, jnp.squeeze(cache_ar_index_var.value), 1
     )
@@ -675,7 +660,7 @@ def kv_cache_autoregressive(
 
     # The below retrieves the existing prefill cache variables, not creating new ones
     cached_prefill_key_vars, cached_prefill_value_vars, cached_prefill_segment_id_var = self._get_prefill_cache_vars(
-        batch, key_heads, value_heads, key_head_size, value_head_size, common_types.MODEL_MODE_AUTOREGRESSIVE
+        batch, key_heads, value_heads, key_head_size, value_head_size, MODEL_MODE_AUTOREGRESSIVE
     )
 
     cached_prefill = (
@@ -719,12 +704,12 @@ def __call__(
       two tuples of (k, v, decoder_segments) -- either can be Nones
 
     """
-    if model_mode == common_types.MODEL_MODE_PREFILL:
+    if model_mode == MODEL_MODE_PREFILL:
       if self.use_chunked_prefill:
         return self.kv_cache_chunked_prefill(key, value, decoder_segment_ids, previous_chunk), None
       else:
         return self.kv_cache_prefill(key, value, decoder_segment_ids), None
-    elif model_mode == common_types.MODEL_MODE_AUTOREGRESSIVE:
+    elif model_mode == MODEL_MODE_AUTOREGRESSIVE:
       return self.kv_cache_autoregressive(key, value, use_ragged_attention)
     else:
       raise ValueError(f"Model Mode isn't supported! {model_mode=}")
@@ -736,13 +721,13 @@ class MlaKVCache(KVCache):
   prefill_cache_logical_axis_names: AxisNames = (
       CACHE_BATCH_PREFILL,
       CACHE_SEQUENCE,
-      common_types.CACHE_HEADS_NONE,
+      CACHE_HEADS_NONE,
       CACHE_KV,
   )
   cache_logical_axis_names: AxisNames = (
       CACHE_BATCH,
       CACHE_SEQUENCE,
-      common_types.CACHE_HEADS_NONE,
+      CACHE_HEADS_NONE,
       CACHE_KV,
   )
 
@@ -767,7 +752,7 @@ def __call__(
       Optional[Tuple[Array, Array, Array]],
       Optional[Tuple[Array, Array, Array, Array]],
   ]:
-    assert model_mode != common_types.MODEL_MODE_TRAIN, "incorrectly updating kvcache in train mode."
+    assert model_mode != MODEL_MODE_TRAIN, "incorrectly updating kvcache in train mode."
     assert self.kv_quant is None, "kvcache quantization not supported with mla."
     key_latent = self.key_latent_add_head_dim(key_latent)
     prefill_cache, ar_cache = super().__call__(key_latent, key_rope, decoder_segment_ids, model_mode)
 
@@ -32,9 +32,8 @@
 
 from jaxtyping import Array, Integer, Bool
 
-from MaxText import common_types
+from MaxText.common_types import Config
 
-Config = common_types.Config
 
 # Aliases using <Dims><Type><Rank>d convention
 # We use string names for dimensions as they are symbolic within the type hints.
 
@@ -21,33 +21,17 @@
 from typing import Optional
 
 import jax.numpy as jnp
-from jax.experimental import shard_map
+from jax.experimental.shard_map import shard_map
 from jax.experimental.pallas.ops.tpu.paged_attention import paged_attention_kernel
 from jax.sharding import PartitionSpec as P
+from jax.sharding import Mesh
 
 from flax import linen as nn
 
-from MaxText import common_types
 from MaxText.inference import page_manager
 from MaxText.inference import paged_attention_kernel_v2
+from MaxText.common_types import Array, DType, AxisNames, BATCH, LENGTH, HEAD, D_KV, MODEL_MODE_PREFILL, MODEL_MODE_AUTOREGRESSIVE
 
-# pytype: disable=attribute-error
-
-Mesh = common_types.Mesh
-
-Array = common_types.Array
-Config = common_types.Config
-DType = common_types.DType
-Mesh = common_types.Mesh
-PRNGKey = common_types.PRNGKey
-
-AxisNames = common_types.AxisNames
-BATCH = common_types.BATCH
-LENGTH = common_types.LENGTH
-HEAD = common_types.HEAD
-D_KV = common_types.D_KV
-
-shard_map = shard_map.shard_map
 _use_kernel_v2 = False
 
 
@@ -255,16 +239,18 @@ def __call__(
     key_pages_var, value_pages_var = self.init_or_get_kv_pages(model_mode)
 
     # update kv pages and call page attention kernel
-    if model_mode == common_types.MODEL_MODE_PREFILL:
+    if model_mode == MODEL_MODE_PREFILL:
       self.update_prefill_step_pages(key_pages_var, value_pages_var, key, value, slot, page_state)
       if _use_kernel_v2:
         return self.paged_attention_v2_prefill(query, key_pages_var, value_pages_var, page_state), None, None
       return self.paged_dot_product_attention_with_max_and_sum(query, key, value)
-    elif model_mode == common_types.MODEL_MODE_AUTOREGRESSIVE:
+    elif model_mode == MODEL_MODE_AUTOREGRESSIVE and page_state is not None:
       self.update_decode_step_pages(key_pages_var, value_pages_var, key, value, page_state)
       if _use_kernel_v2:
         return self.paged_attention_v2_decode(query, key_pages_var, value_pages_var, page_state), None, None
       return self.paged_attention_v1_decode(query, key_pages_var, value_pages_var, page_state), None, None
+    else:
+      raise NotImplementedError(model_mode)
 
   def update_prefill_step_pages(
       self,
 
@@ -17,24 +17,22 @@
 # pylint: disable=too-many-positional-arguments, unnecessary-lambda-assignment
 
 from collections.abc import Callable
+from functools import partial
+from typing import Any, Optional, Literal
 import dataclasses
 import functools
-from typing import Any, Optional, Literal
 
-import jax
-import jax.numpy as jnp
 from jax import lax
 from jax.experimental import pallas as pl
 from jax.experimental.pallas import tpu as pltpu
+import jax
+import jax.numpy as jnp
 
 from aqt.jax.v2 import pallas as aqt_pl
-from aqt.jax.v2 import aqt_tensor
+from aqt.jax.v2.aqt_tensor import QTensor
 
 from MaxText.kernels.megablox import common
 
-QTensor = aqt_tensor.QTensor
-partial = functools.partial
-
 
 def _validate_args(
     *,
 
@@ -25,14 +25,8 @@
 from jax.experimental import pallas as pl
 from jax.experimental.pallas import tpu as pltpu
 import jax.numpy as jnp
-from jax.experimental import shard_map
 
-from MaxText import common_types
-
-
-BATCH = common_types.BATCH
-DEFAULT_MASK_VALUE = common_types.DEFAULT_MASK_VALUE
-shard_map = shard_map.shard_map
+from MaxText.common_types import DEFAULT_MASK_VALUE
 
 
 def get_mha_cost_estimate(shape_dtype):
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`# Changes in this file should match with requiredReviewers in file .github/workflows/AddLabel.yml`
`2`		`-* @gobbleturk @khatwanimohit @bvandermoon @vipannalla @RissyRan @richjames0 @rni418 @gagika @shralex @yangyuwei @SurbhiJainUSC @hengtaoguo @A9isha @wang2yn84 @wyzhang @mitalisi @gpolovets1 @mailvijayasingh @jrplatin @patemotter @lumosis`
	`2`	`+* @gobbleturk @khatwanimohit @bvandermoon @vipannalla @RissyRan @richjames0 @rni418 @gagika @shralex @yangyuwei @SurbhiJainUSC @hengtaoguo @A9isha @wang2yn84 @wyzhang @mitalisi @gpolovets1 @mailvijayasingh @jrplatin @patemotter @lumosis @aireenmei`
Original file line number	Diff line number	Diff line change
`@@ -74,6 +74,7 @@ jobs:`
`74`	`74`	`jrplatin: "",`
`75`	`75`	`patemotter: "",`
`76`	`76`	`lumosis: "",`
	`77`	`+ aireenmei: "",`
`77`	`78`	`}`
`78`	`79`	`const reviews = await github.rest.pulls.listReviews({`
`79`	`80`	`owner,`