# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

import os

import torch.distributed.checkpoint as DCP

from torchtrain.config_manager import JobConfig
from torchtrain.datasets import create_tokenizer
from torchtrain.float8_linear import build_fp8_linear
from torchtrain.logging_utils import init_logger, logger
from torchtrain.models import model_name_to_cls, model_name_to_tokenizer, models_config

_is_local_logging = True
if "SLURM_JOB_ID" in os.environ:
    _is_local_logging = False


def main(job_config: JobConfig):
    init_logger()

    model_name = job_config.model.name

    # build tokenizer
    tokenizer_type = model_name_to_tokenizer[model_name]
    tokenizer = create_tokenizer(tokenizer_type, job_config.model.tokenizer_path)

    # build model (using meta init)
    model_cls = model_name_to_cls[model_name]
    model_config = models_config[model_name][job_config.model.flavor]
    model_config.vocab_size = tokenizer.n_words
    logger.info(f"Building {model_name} {job_config.model.flavor} with {model_config}")
    model = model_cls.from_model_args(model_config)

    # apply fp8 linear module swap
    if job_config.training.fp8_linear:
        build_fp8_linear(model, job_config)

    model.reset_parameters()

    checkpoint_id = os.path.join(job_config.training.checkpoint_folder, "step-0")
    logger.info(f"Creating seed (step-0) checkpoint in {checkpoint_id}")
    DCP.save(
        state_dict={
            "model": model.state_dict(),
        },
        checkpoint_id=checkpoint_id,
    )


"""
1. How do I serialize enough info about the model config to ensure I don't try to load an incompatible checkpoint later?
   - Maybe skip this. Users are responsible for managing their checkpoints, and we can partially help by managing their 'dump folder'?

2. Would I apply the fp8 linear swap before creating the seed checkpoint or not? I think probably before.
3. Can I skip the optimizer in the seed checkpoint? I think so. The optimizer can later create its states from the model post-sharding (see the sketch below).
"""
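
# A minimal, illustrative sketch (not part of this script's required behavior) of
# how a later training run might consume the seed checkpoint, relating to question
# 3 above: only the model weights are loaded via DCP, and the optimizer would be
# constructed afterwards so it creates its states from the (possibly already
# sharded) parameters. Assumes a DCP version that exposes `DCP.load` alongside
# `DCP.save`; the helper name and `checkpoint_folder` argument are hypothetical.
def _load_seed_checkpoint_sketch(model, checkpoint_folder: str) -> None:
    # Mirror the state_dict layout written by DCP.save above (a single "model" key).
    state_dict = {"model": model.state_dict()}
    # DCP.load reads the checkpoint shards and fills the tensors in state_dict in place.
    DCP.load(
        state_dict=state_dict,
        checkpoint_id=os.path.join(checkpoint_folder, "step-0"),
    )
    model.load_state_dict(state_dict["model"])
    # No optimizer state is restored here; the caller builds the optimizer after
    # this point, so it starts with freshly created states post-sharding.
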
if __name__ == "__main__":
    config = JobConfig()
    config.parse_args()
    main(config)