4 files changed: +9 −18 lines

run_llama_train.sh
@@ -6,24 +6,15 @@ TRAINER_DIR=${1:-/home/$USER/local/torchtrain}
# use envs as local overrides for convenience
# e.g.
- # LOG_RANK=0,1 NGPU=4 SP=2 ./run_llama_train.sh
+ # LOG_RANK=0,1 NGPU=4 ./run_llama_train.sh

- MODEL=${MODEL:-"llama"}
- MODEL_CONF=${MODEL_CONF:-"debugmodel"}
NGPU=${NGPU:-"8"}
- PP=${PP:-"1"}
- SP=${SP:-"1"}
- DP=${DP:-"-1"}

# by default log just rank 0 output,
LOG_RANK=${LOG_RANK:-0}

- # Change this string to a meaningful one to enable checkpoint
- CHECKPOINT_FOLDER=${CHECKPOINT_FOLDER:-""}
- # Please adjust this to a longer interval period. The unit of measurement is in steps.
- CHECKPOINT_INTERVAL=${CHECKPOINT_INTERVAL:-5}

- CONFIG_FILE=${CONFIG_FILE:-"./torchtrain/train_configs/train_config.toml"}
+ CONFIG_FILE=${CONFIG_FILE:-"./train_configs/debug_model.toml"}

torchrun --nproc_per_node=${NGPU} --rdzv_endpoint="localhost:5972" \
  --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \
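For readers less familiar with the ${VAR:-default} idiom the script leans on, below is a rough Python sketch of how the remaining overrides resolve. The variable names and defaults come from the diff above; the sketch itself is illustrative and not part of run_llama_train.sh.

# Rough Python equivalent of the script's ${VAR:-default} overrides:
# an environment variable, if set, wins over the hard-coded default.
import os

ngpu = os.environ.get("NGPU", "8")
log_rank = os.environ.get("LOG_RANK", "0")
config_file = os.environ.get("CONFIG_FILE", "./train_configs/debug_model.toml")

# Mirrors the torchrun invocation at the bottom of the script (sketch only).
print(f"torchrun --nproc_per_node={ngpu} --local-ranks-filter {log_rank} ...")
print(f"CONFIG_FILE={config_file}")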
Test file for torchtrain.config_manager.JobConfig
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
import pytest
from torchtrain.config_manager import JobConfig

@@ -10,9 +13,7 @@ def test_command_line_args(self):

    def test_job_config_file(self):
        config = JobConfig()
-         config.parse_args(
-             ["--job.config_file", "./torchtrain/train_configs/train_config.toml"]
-         )
+         config.parse_args(["--job.config_file", "./train_configs/debug_model.toml"])
        assert config.model.name == "llama"

    def test_job_file_does_not_exist(self):
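As a minimal usage sketch of what the updated test exercises, assuming torchtrain is importable and ./train_configs/debug_model.toml exists in the working directory:

# Build a JobConfig, point it at the debug TOML via the CLI-style flag
# shown in the test, and read back a parsed field.
from torchtrain.config_manager import JobConfig

config = JobConfig()
config.parse_args(["--job.config_file", "./train_configs/debug_model.toml"])
print(config.model.name)  # the test above expects "llama"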
This file was deleted.
TorchTrain config TOML file

# TorchTrain Config.toml
[job]
- dump_folder = "./torchtrain/outputs"
+ dump_folder = "./outputs"

[profiling]
run_profiler = true
@@ -26,8 +26,8 @@ lr = 8e-4
[training]
batch_size = 8
seq_len = 2048
- warmup_pct = 0.20
- max_norm = 1.0
+ warmup_pct = 0.20  # lr scheduler warm up
+ max_norm = 1.0  # grad norm clipping
steps = 10
data_parallel_degree = -1
sequence_parallel_degree = 1
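To sanity-check values like these outside the trainer, the config can be read directly; here is a small sketch using Python's standard-library tomllib (3.11+), with the path and key names taken from the diff above.

# Load the TOML config and print a few [training] fields.
import tomllib

with open("./train_configs/debug_model.toml", "rb") as f:
    cfg = tomllib.load(f)

training = cfg["training"]
print(training["warmup_pct"])  # 0.20  (lr scheduler warm up)
print(training["max_norm"])    # 1.0   (grad norm clipping)
print(training["steps"])       # 10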