
Commit 6722657

add basic AC configs for 13B and 70B (#169)

As titled. Currently the 13B config uses selective op-level AC and the 70B config uses selective per-layer AC; we can run more experiments and adjust the configs later.

1 parent e28832e commit 6722657

File tree: 2 files changed (+10, −2 lines)


train_configs/llama_13b.toml

Lines changed: 5 additions & 1 deletion
@@ -30,7 +30,6 @@ seq_len = 4096
 warmup_steps = 200 # lr scheduler warm up
 max_norm = 1.0 # grad norm clipping
 steps = 1000
-# only dp would be sufficient for 7B
 data_parallel_degree = -1
 # 8-way TP, adjust to 2/4 for local(single host) runs
 tensor_parallel_degree = 8
@@ -41,3 +40,8 @@ checkpoint_interval = 3600
 checkpoint_interval_type = "steps"
 checkpoint_folder = ""
 dataset = "openwebtext"
+
+
+[activation_checkpoint]
+mode = 'selective' # ['none', 'full', 'selective']
+selective_ac_option = 'op' # 'int' = ac every positive int layer or 'op', ac based on ops policy
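For the 13B config, `selective_ac_option = 'op'` selects an op-based policy: rather than checkpointing whole layers, only the outputs of ops that are expensive to recompute are saved, and cheap ops are recomputed in the backward pass. A minimal sketch of that idea in plain Python (the op names and the `save_output` helper are illustrative assumptions, not torchtitan's actual implementation):

```python
# Hypothetical op-level selective AC policy (illustrative names, not
# torchtitan's actual code). Matmul-family ops are costly to redo in
# the backward pass, so their outputs are saved; everything else
# (e.g. pointwise ops) is recomputed instead of stored.
EXPENSIVE_OPS = {"aten.mm", "aten.addmm", "aten.bmm"}

def save_output(op_name: str) -> bool:
    """True if this op's output should be stashed for the backward
    pass rather than recomputed."""
    return op_name in EXPENSIVE_OPS
```

The trade-off is memory for compute: saving matmul outputs avoids the most expensive recomputation while still freeing the activations of cheap ops.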

train_configs/llama_70b.toml

Lines changed: 5 additions & 1 deletion
@@ -30,7 +30,6 @@ seq_len = 4096
 warmup_steps = 200 # lr scheduler warm up
 max_norm = 1.0 # grad norm clipping
 steps = 1000
-# only dp would be sufficient for 7B
 data_parallel_degree = -1
 # 8-way TP
 tensor_parallel_degree = 8
@@ -41,3 +40,8 @@ checkpoint_interval = 3600
 checkpoint_interval_type = "steps"
 checkpoint_folder = ""
 dataset = "openwebtext"
+
+
+[activation_checkpoint]
+mode = 'selective' # ['none', 'full', 'selective']
+selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy
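For the 70B config, `selective_ac_option = '2'` means "checkpoint every 2nd layer". A hedged sketch of how a trainer might interpret the `[activation_checkpoint]` settings (the function and its arguments are assumptions for illustration, not torchtitan's actual API):

```python
# Hypothetical interpretation of the [activation_checkpoint] config
# (illustrative only, not torchtitan's actual implementation).
def should_checkpoint_layer(layer_idx: int, mode: str, option: str) -> bool:
    """Decide whether to wrap a transformer layer in activation
    checkpointing.

    mode:   'none' | 'full' | 'selective'
    option: 'op' (op-level policy applied inside every layer), or a
            positive int as a string, meaning "AC every N-th layer".
    """
    if mode == "none":
        return False
    if mode == "full":
        return True
    # mode == 'selective'
    if option == "op":
        # Op-level selective AC: every layer is wrapped; a separate
        # policy picks which ops inside the layer to recompute.
        return True
    ac_freq = int(option)  # e.g. '2' -> checkpoint every 2nd layer
    return layer_idx % ac_freq == 0
```

With `option = '2'`, layers 0, 2, 4, … get checkpointed, halving activation memory for roughly half the recompute cost of full AC.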
