@@ -30,6 +30,8 @@ class OverrideDefinitions:
30
30
31
31
override_args : Sequence [Sequence [str ]] = tuple (tuple (" " ))
32
32
test_descr : str = "default"
33
+ requires_seed_checkpoint : bool = False
34
+ ngpu : int = 4
33
35
34
36
35
37
CONFIG_DIR = "./train_configs"
@@ -102,25 +104,104 @@ class OverrideDefinitions:
102
104
],
103
105
"Checkpoint Integration Test - Save Model Weights Only bf16" ,
104
106
),
107
+ OverrideDefinitions (
108
+ [
109
+ [
110
+ "--checkpoint.enable_checkpoint" ,
111
+ f"--job.dump_folder { args .output_dir } /pp/" ,
112
+ "--experimental.pipeline_parallel_degree 2" ,
113
+ "--experimental.pipeline_parallel_split_points layers.1" ,
114
+ "--training.data_parallel_degree 1" ,
115
+ "--model.norm_type rmsnorm" , # TODO fix fused_rmsnorm issue
116
+ ],
117
+ ],
118
+ "PP 1D test" ,
119
+ requires_seed_checkpoint = True ,
120
+ ngpu = 2 ,
121
+ ),
122
+ OverrideDefinitions (
123
+ [
124
+ [
125
+ "--checkpoint.enable_checkpoint" ,
126
+ f"--job.dump_folder { args .output_dir } /pp_dp/" ,
127
+ "--experimental.pipeline_parallel_degree 2" ,
128
+ "--experimental.pipeline_parallel_split_points layers.1" ,
129
+ "--training.data_parallel_degree 2" ,
130
+ "--model.norm_type fused_rmsnorm" ,
131
+ ],
132
+ ],
133
+ "PP+DP 2D test" ,
134
+ requires_seed_checkpoint = True ,
135
+ ),
136
+ OverrideDefinitions (
137
+ [
138
+ [
139
+ "--checkpoint.enable_checkpoint" ,
140
+ f"--job.dump_folder { args .output_dir } /pp_tp/" ,
141
+ "--experimental.pipeline_parallel_degree 2" ,
142
+ "--experimental.pipeline_parallel_split_points layers.1" ,
143
+ "--training.tensor_parallel_degree 2" ,
144
+ "--model.norm_type rmsnorm" , # TODO fix fused_rmsnorm issue
145
+ ],
146
+ ],
147
+ "PP+TP 2D test" ,
148
+ requires_seed_checkpoint = True ,
149
+ ),
150
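+ # Disabled for now: presumably needs more GPUs than are available (PP 2 x DP 2 x TP 2 = 8, default ngpu is 4) - TODO confirm and re-enable.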
151
+ # OverrideDefinitions(
152
+ # [
153
+ # [
154
+ # "--checkpoint.enable_checkpoint",
155
+ # f"--job.dump_folder {args.output_dir}/pp_dp_tp/",
156
+ # "--experimental.pipeline_parallel_degree 2",
157
+ # "--experimental.pipeline_parallel_split_points layers.1",
158
+ # "--training.data_parallel_degree 2",
159
+ # "--training.tensor_parallel_degree 2",
160
+ # "--model.norm_type rmsnorm", # TODO fix fused_rmsnorm issue
161
+ # ],
162
+ # ],
163
+ # "PP+DP+TP 3D test",
164
+ # requires_seed_checkpoint=True,
165
+ # ),
105
166
]
106
167
107
168
169
+ def _run_cmd (cmd ):
170
+ return subprocess .run (
171
+ [cmd ],
172
+ stdout = subprocess .PIPE ,
173
+ stderr = subprocess .STDOUT ,
174
+ text = True ,
175
+ shell = True ,
176
+ )
177
+
178
+
108
179
def run_test (test_flavor : OverrideDefinitions , full_path : str ):
109
180
# run_test supports sequence of tests.
110
181
for override_arg in test_flavor .override_args :
111
- cmd = f"CONFIG_FILE={ full_path } NGPU=4 LOG_RANK=0,1,2,3 ./run_llama_train.sh"
182
+
183
+ cmd = f"CONFIG_FILE={ full_path } NGPU={ test_flavor .ngpu } LOG_RANK=0,1,2,3 ./run_llama_train.sh"
112
184
if override_arg :
113
185
cmd += " " + " " .join (override_arg )
114
186
print (
115
187
f"=====Integration test, flavor : { test_flavor .test_descr } , command : { cmd } ====="
116
188
)
117
- result = subprocess .run (
118
- [cmd ],
119
- stdout = subprocess .PIPE ,
120
- stderr = subprocess .STDOUT ,
121
- text = True ,
122
- shell = True ,
123
- )
189
+
190
+ if test_flavor .requires_seed_checkpoint :
191
+ dump_folder_arg = None
192
+ for arg in override_arg :
193
+ if "--job.dump_folder" in arg :
194
+ dump_folder_arg = arg
195
+ assert (
196
+ dump_folder_arg is not None
197
+ ), "Can't use seed checkpoint if folder is not specified"
198
+ print ("Creating seed checkpoint" )
199
+ result = _run_cmd (
200
+ f"CONFIG_FILE={ full_path } ./create_seed_checkpoint.sh { dump_folder_arg } "
201
+ )
202
+ print (result .stdout )
203
+
204
+ result = _run_cmd (cmd )
124
205
print (result .stdout )
125
206
if result .returncode != 0 :
126
207
raise Exception (
0 commit comments