[EZ] Add logs for some basic training params so that we can verify in… (#491)

fduwjj · web-flow · commit f13fe3f301cb · 2024-07-29T19:38:01.000-07:00
As the title says: while testing on the 405B model, I found that we need logs for some basic training params so that they can be verified, so I added some here. Tested locally; the resulting log output is shown in the screenshot: <img width="900" alt="image" src="https://github.com/user-attachments/assets/b94e34f5-3e88-4c5f-94ed-75f50dde9786">
diff --git a/train.py b/train.py
@@ -355,7 +355,12 @@ def loss_fn(pred, labels):
     gpu_memory_monitor.reset_peak_stats()
 
     # train loop
-    logger.info(f"Training starts at step {train_state.step + 1}")
+    logger.info(
+        f"Training starts at step {train_state.step + 1}, "
+        f"with local batch size: {job_config.training.batch_size}, "
+        f"sequence length: {job_config.training.seq_len}, "
+        f"total steps: {job_config.training.steps}({job_config.training.warmup_steps}), "
+    )
     with maybe_enable_profiling(
         job_config, global_step=train_state.step
     ) as torch_profiler, maybe_enable_memory_snapshot(