
Commit 6a430d8

WIP integrate pippy's tracer frontend
- dcp load seems to work now
- need to pull in schedule object

ghstack-source-id: cbbb8c9
Pull Request resolved: #161

6 files changed: 69 additions (+), 17 deletions (-)

run_llama_train.sh

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ TRAINER_DIR=${1:-/home/$USER/local/torchtrain}
 # e.g.
 # LOG_RANK=0,1 NGPU=4 ./run_llama_train.sh
 
-NGPU=${NGPU:-"8"}
+NGPU=${NGPU:-"2"}
 
 # by default log just rank 0 output,
 LOG_RANK=${LOG_RANK:-0}

torchtrain/meta_init.py

Lines changed: 6 additions & 0 deletions
@@ -46,3 +46,9 @@ def meta_to_real_init_fn(module: nn.Module):
                     torch.randn_like(param, device=torch.device("cuda"))
                 )
                 setattr(submodule, param_name, materialized_param)
+        for param_name, param in submodule.named_buffers(recurse=False):
+            if param.is_meta:
+                materialized_param = nn.Parameter(
+                    torch.randn_like(param, device=torch.device("cuda"))
+                )
+                setattr(submodule, param_name, materialized_param)
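
Note: the new loop mirrors the existing parameter path for meta-device buffers. As a standalone illustration of the idea (materializing meta buffers onto CUDA), here is a minimal sketch; RoPECache and materialize_meta_buffers are illustrative names, not part of this PR, and the sketch re-registers the tensor as a buffer rather than wrapping it in nn.Parameter as the diff does. Assumes a CUDA device is available.

import torch
import torch.nn as nn

class RoPECache(nn.Module):
    """Toy module with a non-trainable buffer, analogous to freqs_cis."""
    def __init__(self, dim: int = 16, seq_len: int = 32):
        super().__init__()
        self.register_buffer("freqs", torch.randn(seq_len, dim))

def materialize_meta_buffers(module: nn.Module) -> None:
    # Same idea as the loop added above: replace any meta-device buffer
    # with a real CUDA tensor of the same shape/dtype.
    for submodule in module.modules():
        for name, buf in submodule.named_buffers(recurse=False):
            if buf.is_meta:
                submodule.register_buffer(name, torch.randn_like(buf, device="cuda"))

with torch.device("meta"):
    m = RoPECache()
assert m.freqs.is_meta
materialize_meta_buffers(m)
print(m.freqs.device)  # cuda:0 (values are random; real init / checkpoint load happens later)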

torchtrain/models/llama/model.py

Lines changed: 11 additions & 8 deletions
@@ -334,13 +334,16 @@ def __init__(self, model_args: ModelArgs):
         self.model_args = model_args
         self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)
 
-        self.freqs_cis = precompute_freqs_cis(
-            # Note that self.model_args.max_seq_len is multiplied by 2 because the token limit for the Llama 2 generation
-            # of models is 4096.
-            # Adding this multiplier instead of using 4096 directly allows for dynamism of token lengths while training
-            # or fine-tuning.
-            self.model_args.dim // self.model_args.n_heads,
-            self.model_args.max_seq_len * 2,
+        self.register_buffer(
+            "freqs_cis",
+            precompute_freqs_cis(
+                # Note that self.model_args.max_seq_len is multiplied by 2 because the token limit for the Llama 2 generation
+                # of models is 4096.
+                # Adding this multiplier instead of using 4096 directly allows for dynamism of token lengths while training
+                # or fine-tuning.
+                self.model_args.dim // self.model_args.n_heads,
+                self.model_args.max_seq_len * 2,
+            ),
         )
 
     def forward(self, tokens: torch.Tensor):
@@ -355,7 +358,7 @@ def forward(self, tokens: torch.Tensor):
         """
         _bsz, seqlen = tokens.shape
         h = self.tok_embeddings(tokens)
-        self.freqs_cis = self.freqs_cis.to(h.device)
+        # self.freqs_cis = self.freqs_cis.to(h.device)
         freqs_cis = self.freqs_cis[0:seqlen]
         return h, freqs_cis
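
A minimal sketch (toy modules, not the Transformer itself, assuming a CUDA device) of why the explicit .to(h.device) can be commented out once freqs_cis is a registered buffer: buffers follow the module through .to()/.cuda() and appear in state_dict(), which also makes them visible to checkpointing and tracing frontends, whereas a plain tensor attribute does not.

import torch
import torch.nn as nn

class WithBuffer(nn.Module):
    def __init__(self):
        super().__init__()
        # Registered buffer: moved by .to()/.cuda(), saved in state_dict,
        # but not returned by parameters() and not trained.
        self.register_buffer("freqs_cis", torch.randn(4096, 64))

class WithPlainAttr(nn.Module):
    def __init__(self):
        super().__init__()
        # Plain attribute: ignored by .to()/.cuda() and by state_dict.
        self.freqs_cis = torch.randn(4096, 64)

a, b = WithBuffer().cuda(), WithPlainAttr().cuda()
print(a.freqs_cis.device)             # cuda:0 -- no manual .to(device) needed
print(b.freqs_cis.device)             # cpu    -- would need the old .to(h.device)
print("freqs_cis" in a.state_dict())  # True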

torchtrain/parallelisms/parallelize_llama.py

Lines changed: 35 additions & 6 deletions
@@ -8,6 +8,7 @@
 from typing import Tuple
 
 import torch
+from pippy import annotate_split_points, Pipe, PipeSplitWrapper
 from torch.distributed._tensor import Replicate, Shard
 
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
@@ -143,7 +144,31 @@ def parallelize_llama(model, world_mesh, parallel_dims, job_config: JobConfig):
     """
     # apply PTD parallelisms
     if parallel_dims.pp_enabled:
-        raise NotImplementedError("PP not implemented yet.")
+        pp_mesh = world_mesh["pp"]
+        stage_idx = pp_mesh.get_local_rank()
+        layers_per_rank = len(model.layers) // parallel_dims.pp
+        for i in range(1, parallel_dims.pp):
+            annotate_split_points(
+                model,
+                {
+                    f"layers.{i * layers_per_rank}": PipeSplitWrapper.SplitPoint.BEGINNING
+                },
+            )
+
+        # Get example input
+        label_shape = input_shape = (8, 2048)  # TODO
+        input_ids = torch.randint(
+            model.vocab_size, input_shape, dtype=torch.int64, device="meta"
+        )
+        labels = torch.randint(
+            model.vocab_size, label_shape, dtype=torch.int64, device="meta"
+        )
+        print("input_ids: ", input_ids.shape, input_ids.dtype)
+        print("labels: ", labels.shape, labels.dtype)
+
+        # Create a pipeline representation from the model
+        pipe = Pipe.from_tracing(model, parallel_dims.pp, example_args=(input_ids,))
+        model = pipe.get_stage_module(stage_idx)
 
     # First we apply Tensor Parallelism if it's enabled
     if parallel_dims.tp_enabled:
@@ -256,10 +281,14 @@ def parallelize_llama(model, world_mesh, parallel_dims, job_config: JobConfig):
     meta_to_real_init_fn(model)
     model.cuda()
 
-    # TODO(whc) - proposal: remove this call, and assert that we always load a checkpoint
-    # we have now moved from meta to device,
-    # reset parameters for proper initialization
-    model.reset_parameters()
-    logger.info("Model fully initialized via reset_parameters")
+    if parallel_dims.pp_enabled:
+        setattr(pipe.split_gm, f"submod_{stage_idx}", model)
+        return pipe
+    else:
+        # TODO(whc) - proposal: remove this call, and assert that we always load a checkpoint
+        # we have now moved from meta to device,
+        # reset parameters for proper initialization
+        model.reset_parameters()
+        logger.info("Model fully initialized via reset_parameters")
 
     return model
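
For orientation, a condensed sketch of the tracer-frontend flow used above (annotate_split_points -> Pipe.from_tracing -> get_stage_module) on a toy model. The Toy class, pp_degree, and the fixed stage index are illustrative; exact pippy APIs vary by version, so treat this as an outline mirroring the calls in the diff rather than a tested recipe.

import torch
import torch.nn as nn
from pippy import annotate_split_points, Pipe, PipeSplitWrapper

class Toy(nn.Module):
    def __init__(self, n_layers: int = 4, dim: int = 16):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(dim, dim) for _ in range(n_layers)])

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

pp_degree = 2
model = Toy()

# Mark stage boundaries by fully-qualified module name, as the diff does for layers.N.
layers_per_rank = len(model.layers) // pp_degree
for i in range(1, pp_degree):
    annotate_split_points(
        model,
        {f"layers.{i * layers_per_rank}": PipeSplitWrapper.SplitPoint.BEGINNING},
    )

# Trace once with example inputs; each rank then extracts only its own stage module.
example = torch.randn(8, 16)
pipe = Pipe.from_tracing(model, pp_degree, example_args=(example,))
stage_module = pipe.get_stage_module(0)  # in the real code: pp_mesh.get_local_rank()
print(stage_module)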

train.py

Lines changed: 15 additions & 1 deletion
@@ -187,6 +187,18 @@ def main(job_config: JobConfig):
         model, world_mesh, parallel_dims, job_config
     )
 
+    # TODO(whc) everything below needs to become a function that can be applied to each 'virtual stage' of PP, if
+    # there are virtual stages
+    if parallel_dims.pp_enabled:
+        pmod = model
+        pp_mesh = world_mesh["pp"]
+        pp_degree = pp_mesh.size()
+        pp_rank = pp_mesh.get_local_rank()
+        logger.info(
+            f"{Color.blue}Extracting pipeline module for stage {pp_mesh.get_local_rank()}{Color.reset}"
+        )
+        model = pmod.get_stage_module(pp_mesh.get_local_rank())
+
     # build optimizer after applying parallelisms to the model
     optimizer = build_optimizer(model, job_config)
     scheduler = get_lr_scheduler(optimizer, job_config)
@@ -258,10 +270,12 @@ def main(job_config: JobConfig):
 
             input_ids = input_ids.cuda()
             labels = labels.cuda()
-
+            print("i", input_ids.shape)
+            print("l", labels.shape)
             optimizer.zero_grad()
 
             # forward
+            # TODO - integrate pp batch splitter
            pred = model(input_ids)
 
            with loss_parallel() if parallel_dims.loss_parallel_enabled else contextlib.nullcontext():
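
On the "integrate pp batch splitter" TODO: a pipeline schedule feeds the stage one microbatch at a time rather than the whole global batch. The sketch below only illustrates that splitting step; split_into_microbatches is a hypothetical helper, not the pippy schedule object the commit message says still needs to be pulled in.

import torch

def split_into_microbatches(input_ids: torch.Tensor, labels: torch.Tensor, n_microbatches: int):
    # Illustrative only: chop the global batch along dim 0 so a PP schedule
    # can pipeline one microbatch per stage at a time.
    return list(
        zip(
            torch.chunk(input_ids, n_microbatches, dim=0),
            torch.chunk(labels, n_microbatches, dim=0),
        )
    )

input_ids = torch.randint(32000, (8, 2048))
labels = torch.randint(32000, (8, 2048))
for micro_inputs, micro_labels in split_into_microbatches(input_ids, labels, n_microbatches=4):
    print(micro_inputs.shape, micro_labels.shape)  # torch.Size([2, 2048]) each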

train_configs/debug_model.toml

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ max_norm = 1.0 # grad norm clipping
 steps = 10
 data_parallel_degree = -1
 tensor_parallel_degree = 1
-pipeline_parallel_degree = 1
+pipeline_parallel_degree = 2
 fp8_linear = ""
 compile = false
 checkpoint_interval = 3600
