
Commit 67dffdb

Update on "[DO NOT MERGE][example] fold batch and sequence dimensions to accelerate Sequence Parallel"
Note: This PR is for showcasing purposes only and is almost a reverse of #190. At the cost of a model code change, we can obtain better Sequence Parallel performance. Without folding and unfolding, all-gather and reduce-scatter are performed on dim 1 (the sequence dim) instead of dim 0 (the folded dim), which incurs an extra `aten.cat` after each collective.

Stats from awgu:

> for 8k seq len, batch size 1 on H100, these two cats take about 0.18 ms out of 3 ms of FFN compute (6%)

Experiment on the 8-layer `debug_model`:

before:

<img width="1023" alt="image" src="https://github.com/pytorch/torchtitan/assets/150487191/04e5ea4b-fa9e-48e5-92be-582841cb2796">

after:

<img width="1023" alt="image" src="https://github.com/pytorch/torchtitan/assets/150487191/38c39506-462d-485a-a16c-48770a28edb0">

[ghstack-poisoned]
1 parent 59773bb commit 67dffdb
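
For context, a minimal single-process sketch of the fold/unfold pattern this PR applies (the shapes `bs`/`seqlen`/`dim`/`vocab` and the plain `nn.Embedding`/`nn.Linear` stand-ins are illustrative assumptions, not the torchtitan model or its TP plan):

```python
import torch
import torch.nn as nn

# illustrative sizes and stand-in modules, not the torchtitan Transformer
bs, seqlen, dim, vocab = 2, 16, 32, 100
tokens = torch.randint(0, vocab, (bs, seqlen))

tok_embeddings = nn.Embedding(vocab, dim)
ffn = nn.Linear(dim, dim)            # stand-in for the transformer layers
output_proj = nn.Linear(dim, vocab)  # stand-in for the output projection

# fold (bs, seqlen) into a single leading dimension before the embedding,
# so Sequence Parallel all-gather / reduce-scatter run on the contiguous dim 0
h = tok_embeddings(tokens.view(-1))            # (bs * seqlen, dim)
h = ffn(h)                                     # layers operate on the folded dim 0
# unfold only once at the end, before the output projection
logits = output_proj(h.view(-1, seqlen, dim))  # (bs, seqlen, vocab)
print(logits.shape)                            # torch.Size([2, 16, 100])
```

With batch and sequence folded into dim 0, the Sequence Parallel collectives shard and gather along the leading, contiguous dimension, so no extra `aten.cat` is needed to stitch shards back together, unlike sharding on dim 1.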

File tree

2 files changed: +18 −16


torchtitan/models/llama/model.py

Lines changed: 17 additions & 11 deletions
```diff
@@ -185,7 +185,7 @@ def forward(
             torch.Tensor: Output tensor after attention.
 
         """
-        # dim 0 of x is a folded dimension of [bs, seqlen]
+        # dim 0 of x is a folded dimension of (bs, seqlen)
         seqlen, _ = freqs_cis.shape
         bs_seqlen, _ = x.shape
         bs = bs_seqlen // seqlen
```
```diff
@@ -427,21 +427,27 @@ def forward(self, tokens: torch.Tensor):
             torch.Tensor: Output logits after applying the Transformer model.
 
         """
-        # passthrough for nonexistent layers, allows easy configuration of pipeline parallel stages
-        h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens
-        # fold batch dimension and sequence dimension
-        # for more efficient allgather/reduce_scatter
-        h = h.view(-1, self.model_args.dim)
+        # passthrough for nonexistent layers, allows easy configuration of pipeline parallel stage
+        if self.tok_embeddings:
+            # fold batch dimension and sequence dimension
+            # for more efficient allgather/reduce_scatter
+            tokens = tokens.view(-1)
+            h = self.tok_embeddings(tokens)
+        else:
+            h = tokens
 
-        freqs_cis = self.freqs_cis[0 : self.model_args.max_seq_len]
+        seqlen = self.model_args.max_seq_len
+        freqs_cis = self.freqs_cis[0:seqlen]
         for layer in self.layers.values():
             h = layer(h, freqs_cis)
 
         h = self.norm(h) if self.norm else h
-        # unfold batch and sequence dimension
-        bs, seqlen = tokens.shape
-        h = h.view(bs, seqlen, self.model_args.dim)
-        output = self.output(h).float() if self.output else h
+        if self.output:
+            # unfold batch and sequence dimension
+            h = h.view(-1, seqlen, self.model_args.dim)
+            output = self.output(h).float()
+        else:
+            output = h
         return output
 
     @classmethod
```

torchtitan/parallelisms/parallelize_llama.py

Lines changed: 1 addition & 5 deletions
```diff
@@ -350,18 +350,14 @@ def apply_tp(model, world_mesh, parallel_dims, job_config: JobConfig):
         {
             "tok_embeddings": RowwiseParallel(
                 input_layouts=Replicate(),
+                output_layouts=Shard(0),
             ),
             "output": col_parallel_strategy(
                 input_layouts=Shard(0),
                 output_layouts=Shard(-1) if loss_parallel else Replicate(),
                 use_local_output=not loss_parallel,
             ),
             "norm": SequenceParallel(sequence_dim=0),
-            "layers.0": PrepareModuleInput(
-                input_layouts=(Replicate(), None),
-                desired_input_layouts=(Shard(0), None),
-                use_local_output=True,
-            ),
         },
     )
```
