@@ -96,16 +96,11 @@ def init_distributed(job_config):
     os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"


-def get_num_params(model: torch.nn.Module, only_trainable: bool = False) -> int:
-    """
-    Get the total model params
-    Args: only_trainable: whether to only count trainable params
-    """
-    param_list = list(model.parameters())
-    if only_trainable:
-        param_list = [p for p in param_list if p.requires_grad]
-    # unique_params = {p.data_ptr(): p for p in param_list}.values()
-    return sum(p.numel() for p in param_list)
+def get_num_params(model: torch.nn.Module, exclude_embedding: bool = False) -> int:
+    num_params = sum(p.numel() for p in model.parameters())
+    if exclude_embedding:
+        num_params -= model.tok_embeddings.weight.numel()
+    return num_params


 def get_num_flop_per_token(num_params: int, model_config, seq_len) -> int:
@@ -115,7 +110,14 @@ def get_num_flop_per_token(num_params: int, model_config, seq_len) -> int:
         model_config.dim // model_config.n_heads,
         seq_len,
     )
+    # Reasoning behind the factor of 12 for the self-attention part of the formula:
+    # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6)
+    # 2. the flash attention does 1 more matmul recomputation in the backward
+    #    but recomputation should not be counted in calculating MFU (+0)
+    # 3. each matmul performs 1 multiplication and 1 addition (*2)
+    # 4. we follow the convention and do not account for sparsity in causal attention
     flop_per_token = 6 * num_params + 12 * l * h * q * t
+
     return flop_per_token

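For context, here is a minimal sketch of how the two helpers changed in this diff can be combined into an MFU estimate. The toy `ToyConfig`/`ToyModel` classes and the `tokens_per_second`/`peak_flops` numbers below are illustrative assumptions and not part of the diff; only `get_num_params` and `get_num_flop_per_token` come from the code above.

```python
import torch.nn as nn

# Hypothetical toy config/model just to exercise the helpers above;
# real model configs live elsewhere in the repo.
class ToyConfig:
    n_layers, n_heads, dim = 2, 4, 64

class ToyModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_embeddings = nn.Embedding(1000, cfg.dim)
        self.layers = nn.ModuleList(nn.Linear(cfg.dim, cfg.dim) for _ in range(cfg.n_layers))

cfg = ToyConfig()
model = ToyModel(cfg)
seq_len = 2048

# Non-embedding params feed the 6 * num_params term of the formula.
num_params = get_num_params(model, exclude_embedding=True)
flop_per_token = get_num_flop_per_token(num_params, cfg, seq_len)

# MFU = achieved FLOPS / peak FLOPS. tokens_per_second would be measured during
# training and peak_flops taken from the accelerator spec (both assumed here).
tokens_per_second = 1000.0
peak_flops = 312e12  # e.g. A100 BF16 peak, purely illustrative
mfu = flop_per_token * tokens_per_second / peak_flops
print(f"flop_per_token={flop_per_token:,}, estimated MFU={mfu:.2%}")
```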