4 files changed: +9 −18 lines

run_llama_train.sh
@@ -6,24 +6,15 @@ TRAINER_DIR=${1:-/home/$USER/local/torchtrain}
# use envs as local overrides for convenience
# e.g.
- # LOG_RANK=0,1 NGPU=4 SP=2 ./run_llama_train.sh
+ # LOG_RANK=0,1 NGPU=4 ./run_llama_train.sh

- MODEL=${MODEL:-"llama"}
- MODEL_CONF=${MODEL_CONF:-"debugmodel"}
NGPU=${NGPU:-"8"}
- PP=${PP:-"1"}
- SP=${SP:-"1"}
- DP=${DP:-"-1"}

# by default log just rank 0 output,
LOG_RANK=${LOG_RANK:-0}

- # Change this string to a meaningful one to enable checkpoint
- CHECKPOINT_FOLDER=${CHECKPOINT_FOLDER:-""}
- # Please adjust this to a longer interval period. The unit of measurement is in steps.
- CHECKPOINT_INTERVAL=${CHECKPOINT_INTERVAL:-5}

- CONFIG_FILE=${CONFIG_FILE:-"./torchtrain/train_configs/train_config.toml"}
+ CONFIG_FILE=${CONFIG_FILE:-"./train_configs/debug_model.toml"}

torchrun --nproc_per_node=${NGPU} --rdzv_endpoint="localhost:5972" \
  --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \
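For readers less familiar with the ${VAR:-default} idiom the script leans on, below is a rough Python sketch of how the remaining overrides resolve. The variable names and defaults come from the diff above; the sketch itself is illustrative and not part of run_llama_train.sh.

# Rough Python equivalent of the script's ${VAR:-default} overrides:
# an environment variable, if set, wins over the hard-coded default.
import os

ngpu = os.environ.get("NGPU", "8")
log_rank = os.environ.get("LOG_RANK", "0")
config_file = os.environ.get("CONFIG_FILE", "./train_configs/debug_model.toml")

# Mirrors the torchrun invocation at the bottom of the script (sketch only).
print(f"torchrun --nproc_per_node={ngpu} --local-ranks-filter {log_rank} ...")
print(f"CONFIG_FILE={config_file}")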
Test file for torchtrain.config_manager.JobConfig
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
import pytest
from torchtrain.config_manager import JobConfig

@@ -10,9 +13,7 @@ def test_command_line_args(self):

    def test_job_config_file(self):
        config = JobConfig()
-         config.parse_args(
-             ["--job.config_file", "./torchtrain/train_configs/train_config.toml"]
-         )
+         config.parse_args(["--job.config_file", "./train_configs/debug_model.toml"])
        assert config.model.name == "llama"

    def test_job_file_does_not_exist(self):
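As a minimal usage sketch of what the updated test exercises, assuming torchtrain is importable and ./train_configs/debug_model.toml exists in the working directory:

# Build a JobConfig, point it at the debug TOML via the CLI-style flag
# shown in the test, and read back a parsed field.
from torchtrain.config_manager import JobConfig

config = JobConfig()
config.parse_args(["--job.config_file", "./train_configs/debug_model.toml"])
print(config.model.name)  # the test above expects "llama"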
This file was deleted.
TorchTrain config TOML file

# TorchTrain Config.toml
[job]
- dump_folder = "./torchtrain/outputs"
+ dump_folder = "./outputs"

[profiling]
run_profiler = true
@@ -26,8 +26,8 @@ lr = 8e-4
[training]
batch_size = 8
seq_len = 2048
- warmup_pct = 0.20
- max_norm = 1.0
+ warmup_pct = 0.20  # lr scheduler warm up
+ max_norm = 1.0  # grad norm clipping
steps = 10
data_parallel_degree = -1
sequence_parallel_degree = 1
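To sanity-check values like these outside the trainer, the config can be read directly; here is a small sketch using Python's standard-library tomllib (3.11+), with the path and key names taken from the diff above.

# Load the TOML config and print a few [training] fields.
import tomllib

with open("./train_configs/debug_model.toml", "rb") as f:
    cfg = tomllib.load(f)

training = cfg["training"]
print(training["warmup_pct"])  # 0.20  (lr scheduler warm up)
print(training["max_norm"])    # 1.0   (grad norm clipping)
print(training["steps"])       # 10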