@@ -26,6 +26,8 @@ class OverrideDefinitions:
26
26
27
27
override_args : Sequence [Sequence [str ]] = tuple (tuple (" " ))
28
28
test_descr : str = "default"
29
+ requires_seed_checkpoint : bool = False
30
+ ngpu : int = 4
29
31
30
32
31
33
CONFIG_DIR = "./train_configs"
@@ -85,25 +87,104 @@ class OverrideDefinitions:
85
87
],
86
88
"Checkpoint Integration Test - Save Model Weights Only bf16" ,
87
89
),
90
+ OverrideDefinitions (
91
+ [
92
+ [
93
+ "--checkpoint.enable_checkpoint" ,
94
+ f"--checkpoint.folder { test_checkpoint_dir } _pp" ,
95
+ "--experimental.pipeline_parallel_degree 2" ,
96
+ "--experimental.pipeline_parallel_split_points layers.1" ,
97
+ "--training.data_parallel_degree 1" ,
98
+ "--model.norm_type rmsnorm" , # TODO fix fused_rmsnorm issue
99
+ ],
100
+ ],
101
+ "PP 1D test" ,
102
+ requires_seed_checkpoint = True ,
103
+ ngpu = 2 ,
104
+ ),
105
+ OverrideDefinitions (
106
+ [
107
+ [
108
+ "--checkpoint.enable_checkpoint" ,
109
+ f"--checkpoint.folder { test_checkpoint_dir } _pp_dp" ,
110
+ "--experimental.pipeline_parallel_degree 2" ,
111
+ "--experimental.pipeline_parallel_split_points layers.1" ,
112
+ "--training.data_parallel_degree 2" ,
113
+ "--model.norm_type fused_rmsnorm" ,
114
+ ],
115
+ ],
116
+ "PP+DP 2D test" ,
117
+ requires_seed_checkpoint = True ,
118
+ ),
119
+ OverrideDefinitions (
120
+ [
121
+ [
122
+ "--checkpoint.enable_checkpoint" ,
123
+ f"--checkpoint.folder { test_checkpoint_dir } _pp_tp" ,
124
+ "--experimental.pipeline_parallel_degree 2" ,
125
+ "--experimental.pipeline_parallel_split_points layers.1" ,
126
+ "--training.tensor_parallel_degree 2" ,
127
+ "--model.norm_type rmsnorm" , # TODO fix fused_rmsnorm issue
128
+ ],
129
+ ],
130
+ "PP+TP 2D test" ,
131
+ requires_seed_checkpoint = True ,
132
+ ),
133
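+ # TODO: 3D test disabled for now -- PP(2) x DP(2) x TP(2) would need 8 ranks, but ngpu defaults to 4 (presumably the test machine does not have enough GPUs; confirm before re-enabling).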
134
+ # OverrideDefinitions(
135
+ # [
136
+ # [
137
+ # "--checkpoint.enable_checkpoint",
138
+ # f"--checkpoint.folder {test_checkpoint_dir}_pp_dp_tp",
139
+ # "--experimental.pipeline_parallel_degree 2",
140
+ # "--experimental.pipeline_parallel_split_points layers.1",
141
+ # "--training.data_parallel_degree 2",
142
+ # "--training.tensor_parallel_degree 2",
143
+ # "--model.norm_type rmsnorm", # TODO fix fused_rmsnorm issue
144
+ # ],
145
+ # ],
146
+ # "PP+DP+TP 3D test",
147
+ # requires_seed_checkpoint=True,
148
+ # ),
88
149
]
89
150
90
151
152
+ def _run_cmd (cmd ):
153
+ return subprocess .run (
154
+ [cmd ],
155
+ stdout = subprocess .PIPE ,
156
+ stderr = subprocess .STDOUT ,
157
+ text = True ,
158
+ shell = True ,
159
+ )
160
+
161
+
91
162
def run_test (test_flavor : OverrideDefinitions , full_path : str ):
92
163
# run_test supports sequence of tests.
93
164
for override_arg in test_flavor .override_args :
94
- cmd = f"CONFIG_FILE={ full_path } NGPU=4 LOG_RANK=0,1,2,3 ./run_llama_train.sh"
165
+
166
+ cmd = f"CONFIG_FILE={ full_path } NGPU={ test_flavor .ngpu } LOG_RANK=0,1,2,3 ./run_llama_train.sh"
95
167
if override_arg :
96
168
cmd += " " + " " .join (override_arg )
97
169
print (
98
170
f"=====Integration test, flavor : { test_flavor .test_descr } , command : { cmd } ====="
99
171
)
100
- result = subprocess .run (
101
- [cmd ],
102
- stdout = subprocess .PIPE ,
103
- stderr = subprocess .STDOUT ,
104
- text = True ,
105
- shell = True ,
106
- )
172
+
173
+ if test_flavor .requires_seed_checkpoint :
174
+ checkpoint_folder_arg = None
175
+ for arg in override_arg :
176
+ if "--checkpoint.folder" in arg :
177
+ checkpoint_folder_arg = arg
178
+ assert (
179
+ checkpoint_folder_arg is not None
180
+ ), "Can't use seed checkpoint if folder is not specified"
181
+ print ("Creating seed checkpoint" )
182
+ result = _run_cmd (
183
+ f"CONFIG_FILE={ full_path } ./create_seed_checkpoint.sh { checkpoint_folder_arg } "
184
+ )
185
+ print (result .stdout )
186
+
187
+ result = _run_cmd (cmd )
107
188
print (result .stdout )
108
189
if result .returncode != 0 :
109
190
raise Exception (
0 commit comments