
Commit e457deb

Bring LLaMa 3.1 405B to TorchTitan family (#481)
With the official launch of the Llama 3.1 model, we want to add its config to TorchTitan. There is, of course, more work to be done, but we want to proceed incrementally, so more PRs will follow. For now, we tried the current config (TP=8, FSDP=16) on 128 GPUs. The perf numbers are wps: 109, mfu: 29%.

Loss curve for 3000 steps with 600 warmup steps (lr = 0.8e-4):
<img width="1037" alt="image" src="https://github.com/user-attachments/assets/f57dd3fa-07d8-4ef4-8f68-8f7a08e9652e">

Loss curve for 3000 steps with 600 warmup steps (lr = 1.1e-4):
![image](https://github.com/user-attachments/assets/429b9738-94cb-4b37-90ef-049a5587ddd0)
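As a rough cross-check (an illustration, not part of the commit), the reported mfu can be approximated from the per-GPU wps using ~6 FLOPs per parameter per token for the dense matmuls. The H100 BF16 peak (~989 TFLOPS) and the interpretation of wps as tokens/sec per GPU are assumptions here; attention FLOPs account for most of the remaining gap to 29%.

```python
# Back-of-the-envelope MFU check (assumptions: wps is tokens/sec per GPU,
# H100 BF16 dense peak is ~989 TFLOPS, ~6 FLOPs per parameter per token).
params = 405e9         # Llama 3.1 405B parameter count
wps_per_gpu = 109      # from the commit message
peak_flops = 989e12    # per-GPU peak, FLOPs/sec

matmul_flops_per_sec = 6 * params * wps_per_gpu
print(f"MFU (matmuls only) ~= {matmul_flops_per_sec / peak_flops:.1%}")  # ~26.8%
# Adding attention FLOPs brings the estimate close to the reported 29%.
```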

File tree

4 files changed: +65 −3 lines changed


README.md

Lines changed: 1 addition & 1 deletion
@@ -74,7 +74,7 @@ Once you have confirmed access, you can run the following command to download th
 ```bash
 # Get your HF token from https://huggingface.co/settings/tokens
 
-# llama3 tokenizer.model
+# llama3 or 3.1 tokenizer.model
 python torchtitan/datasets/download_tokenizer.py --repo_id meta-llama/Meta-Llama-3-8B --tokenizer_path "original" --hf_token=...
 
 # llama2 tokenizer.model

torchtitan/datasets/download_tokenizer.py

Lines changed: 2 additions & 2 deletions
@@ -20,8 +20,8 @@ def hf_download(
 
     try:
         hf_hub_download(
-            repo_id,
-            tokenizer_path,
+            repo_id=repo_id,
+            filename=tokenizer_path,
             local_dir=local_dir,
             local_dir_use_symlinks=False,
             token=hf_token,
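For reference, a minimal standalone sketch of the same `huggingface_hub` call with the arguments passed by keyword, mirroring the README example above; the exact file path and local directory are illustrative and may differ from what the script constructs internally:

```python
from huggingface_hub import hf_hub_download

# Hypothetical standalone equivalent of the fixed call above; repo and
# file names mirror the README example rather than the script's internals.
hf_hub_download(
    repo_id="meta-llama/Meta-Llama-3-8B",
    filename="original/tokenizer.model",
    local_dir="torchtitan/datasets/tokenizer",
    local_dir_use_symlinks=False,
    token="hf_...",  # your HF access token
)
```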

torchtitan/models/llama/__init__.py

Lines changed: 9 additions & 0 deletions
@@ -48,4 +48,13 @@
         multiple_of=4096,
         rope_theta=500000,
     ),
+    "405B": ModelArgs(
+        dim=16384,
+        n_layers=126,
+        n_heads=128,
+        n_kv_heads=8,
+        ffn_dim_multiplier=1.2,
+        multiple_of=4096,
+        rope_theta=500000,
+    ),
 }
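For context (not part of the diff), `ffn_dim_multiplier` and `multiple_of` determine the FFN hidden size; a minimal sketch of that arithmetic for the 405B flavor, assuming the standard Llama-style FeedForward sizing rule used by the model code:

```python
def ffn_hidden_size(dim: int, ffn_dim_multiplier: float, multiple_of: int) -> int:
    """Llama-style FFN sizing: 2/3 of 4*dim, scaled, rounded up to a multiple."""
    hidden = int(2 * (4 * dim) / 3)
    hidden = int(ffn_dim_multiplier * hidden)
    return multiple_of * ((hidden + multiple_of - 1) // multiple_of)

# 405B flavor: dim=16384, ffn_dim_multiplier=1.2, multiple_of=4096
print(ffn_hidden_size(16384, 1.2, 4096))  # 53248
```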

train_configs/llama3_405b.toml

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+# torchtitan Config.toml
+# NOTE: this toml config is a preset for 128 H100 GPUs.
+
+[job]
+dump_folder = "./outputs"
+description = "Llama 3 405B training"
+
+[profiling]
+enable_profiling = true
+save_traces_folder = "profile_trace"
+profile_freq = 100
+
+[metrics]
+log_freq = 10
+enable_tensorboard = true
+save_tb_folder = "tb"
+
+[model]
+name = "llama3"
+flavor = "405B"
+norm_type = "rmsnorm"  # layernorm / np_layernorm / rmsnorm / compiled_rmsnorm / fused_rmsnorm
+tokenizer_path = "./torchtitan/datasets/tokenizer/original/tokenizer.model"
+
+[optimizer]
+name = "AdamW"
+lr = 0.8e-4
+
+[training]
+batch_size = 2
+seq_len = 8192
+warmup_steps = 600  # lr scheduler warm up, normally 20% of the train steps
+max_norm = 1.0  # grad norm clipping
+steps = 3000
+data_parallel_degree = -1
+tensor_parallel_degree = 8  # 8-way TP
+enable_float8_linear = false
+compile = false
+dataset = "c4"
+
+[experimental]
+pipeline_parallel_degree = 1
+
+[checkpoint]
+enable_checkpoint = false
+folder = "checkpoint"
+interval_type = "steps"
+interval = 500
+model_weights_only = false
+export_dtype = "float32"
+async_mode = "disabled"  # ["disabled", "async", "async_with_pinned_mem"]
+
+[activation_checkpoint]
+mode = 'full'  # ['none', 'selective', 'full']
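As a side note (an illustration, not code from this commit), `data_parallel_degree = -1` leaves the data-parallel size to be inferred from whatever remains after the other parallelisms; a minimal sketch of that arithmetic for this preset, assuming the 128-GPU world size from the NOTE at the top:

```python
# Hypothetical illustration of how the degrees in this preset combine on 128 GPUs.
world_size = 128        # 128 H100s, per the NOTE in the config
tensor_parallel = 8     # tensor_parallel_degree
pipeline_parallel = 1   # pipeline_parallel_degree

data_parallel = world_size // (tensor_parallel * pipeline_parallel)
print(data_parallel)    # 16 -> the FSDP=16 mentioned in the commit message
```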
