
Commit b4ab627

gnadathur and gnadathur authored
Add integration test with compile enabled (#183)
Summary: same as title

Test Plan:
```
+ export USE_LIBUV=1
+ USE_LIBUV=1
+ TRAINER_DIR=/home/gnadathur/local/torchtrain
+ NGPU=4
+ LOG_RANK=0,1
+ CONFIG_FILE=./train_configs/debug_model_compile.toml
+ torchrun --nproc_per_node=4 --rdzv_endpoint=localhost:5972 --local-ranks-filter 0,1 --role rank --tee 3 train.py --job.config_file ./train_configs/debug_model_compile.toml
W0401 17:54:33.567000 139955931223040 torch/distributed/run.py:757]
W0401 17:54:33.567000 139955931223040 torch/distributed/run.py:757] *****************************************
W0401 17:54:33.567000 139955931223040 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0401 17:54:33.567000 139955931223040 torch/distributed/run.py:757] *****************************************
[rank0]:2024-04-01 17:54:35,779 - root - INFO - Starting job: LLaMA debug training
[rank1]:2024-04-01 17:54:35,797 - root - INFO - Starting job: LLaMA debug training
[rank0]:2024-04-01 17:54:36,063 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
[rank0]:2024-04-01 17:54:36,069 - root - INFO - Building 1-D device mesh with ['dp'], [4]
[rank0]:2024-04-01 17:54:36,071 - root - INFO - Building sentencepiece tokenizer locally from ./torchtrain/datasets/tokenizer/tokenizer.model
[rank0]:2024-04-01 17:54:36,078 - root - INFO - SentencePieceTokenizer built: #words 32000, BOS ID 1, EOS ID 2
[rank0]:2024-04-01 17:54:36,078 - root - INFO - Preparing alpaca dataset from HuggingFace
[rank1]:2024-04-01 17:54:36,449 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
[rank1]:2024-04-01 17:54:36,454 - root - INFO - Building 1-D device mesh with ['dp'], [4]
[rank1]:2024-04-01 17:54:36,456 - root - INFO - Building sentencepiece tokenizer locally from ./torchtrain/datasets/tokenizer/tokenizer.model
[rank1]:2024-04-01 17:54:36,463 - root - INFO - SentencePieceTokenizer built: #words 32000, BOS ID 1, EOS ID 2
[rank1]:2024-04-01 17:54:36,463 - root - INFO - Preparing alpaca dataset from HuggingFace
[rank0]:2024-04-01 17:54:37,631 - root - INFO - Building llama debugmodel with ModelArgs(dim=256, n_layers=2, n_heads=16, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=32768, depth_init=True)
[rank0]:2024-04-01 17:54:37,643 - root - INFO - Model llama debugmodel size: 18,089,216 total parameters
[rank0]:2024-04-01 17:54:37,644 - root - INFO - GPU capacity: NVIDIA H100 (0) with 95.04GiB memory
[rank0]:2024-04-01 17:54:37,653 - root - INFO - Applied selective activation checkpointing to the model
[rank0]:2024-04-01 17:54:37,653 - root - INFO - Applied FSDP to the model
[rank1]:2024-04-01 17:54:38,310 - root - INFO - Building llama debugmodel with ModelArgs(dim=256, n_layers=2, n_heads=16, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=32768, depth_init=True)
[rank1]:2024-04-01 17:54:38,324 - root - INFO - Model llama debugmodel size: 18,089,216 total parameters
[rank1]:2024-04-01 17:54:38,325 - root - INFO - GPU capacity: NVIDIA H100 (1) with 95.04GiB memory
[rank1]:2024-04-01 17:54:38,335 - root - INFO - Applied selective activation checkpointing to the model
[rank1]:2024-04-01 17:54:38,335 - root - INFO - Applied FSDP to the model
[rank1]:2024-04-01 17:54:38,699 - root - INFO - Gradient scaling not enabled
[rank1]:2024-04-01 17:54:38,699 - root - INFO - Metrics logging active. Tensorboard logs will be saved at ./outputs/tb/20240401-1754
[rank1]:2024-04-01 17:54:38,701 - root - INFO - Compiling model with torch.compile
[rank0]:2024-04-01 17:54:38,692 - root - INFO - Gradient scaling not enabled
[rank0]:2024-04-01 17:54:38,693 - root - INFO - Metrics logging active. Tensorboard logs will be saved at ./outputs/tb/20240401-1754
[rank0]:2024-04-01 17:54:38,694 - root - INFO - Compiling model with torch.compile
[rank0]:2024-04-01 17:54:39,390 - root - INFO - Profiling active. Traces will be saved at ./outputs/profiling/traces
[rank1]:2024-04-01 17:54:39,390 - root - INFO - Profiling active. Traces will be saved at ./outputs/profiling/traces
[rank1]:/data/users/gnadathur/a/pytorch/torch/_inductor/lowering.py:1789: UserWarning: Torchinductor does not support code generation for complex operators. Performance may be worse than eager.
[rank1]:  warnings.warn(
[rank0]:/data/users/gnadathur/a/pytorch/torch/_inductor/lowering.py:1789: UserWarning: Torchinductor does not support code generation for complex operators. Performance may be worse than eager.
[rank0]:  warnings.warn(
[rank1]:2024-04-01 17:54:40,498 - root - INFO - running build_ext
[rank0]:2024-04-01 17:54:40,493 - root - INFO - running build_ext
[rank1]:2024-04-01 17:54:41,992 - root - INFO - running build_ext
[rank0]:2024-04-01 17:54:41,985 - root - INFO - running build_ext
[rank1]:2024-04-01 17:54:42,180 - root - INFO - running build_ext
[rank0]:2024-04-01 17:54:42,187 - root - INFO - running build_ext
[rank1]:2024-04-01 17:54:43,947 - root - INFO - running build_ext
[rank1]:2024-04-01 17:54:43,963 - root - INFO - running build_ext
[rank1]:2024-04-01 17:54:43,971 - root - INFO - running build_ext
[rank0]:2024-04-01 17:54:43,920 - root - INFO - running build_ext
[rank0]:2024-04-01 17:54:43,951 - root - INFO - running build_ext
[rank0]:2024-04-01 17:54:43,974 - root - INFO - running build_ext
[rank1]:2024-04-01 17:54:44,029 - root - INFO - running build_ext
[rank0]:2024-04-01 17:54:44,033 - root - INFO - running build_ext
[rank1]:2024-04-01 17:54:45,907 - root - INFO - running build_ext
[rank0]:2024-04-01 17:54:45,933 - root - INFO - running build_ext
[rank1]:2024-04-01 17:54:47,561 - root - INFO - running build_ext
[rank1]:2024-04-01 17:54:47,667 - root - INFO - running build_ext
[rank0]:2024-04-01 17:54:47,649 - root - INFO - running build_ext
[rank0]:2024-04-01 17:54:47,706 - root - INFO - running build_ext
[rank1]:2024-04-01 17:54:49,084 - root - INFO - running build_ext
[rank1]:2024-04-01 17:54:49,108 - root - INFO - running build_ext
[rank1]:2024-04-01 17:54:49,110 - root - INFO - running build_ext
[rank0]:2024-04-01 17:54:49,086 - root - INFO - running build_ext
[rank0]:2024-04-01 17:54:49,114 - root - INFO - running build_ext
[rank0]:2024-04-01 17:54:49,131 - root - INFO - running build_ext
[rank0]:2024-04-01 17:54:50,546 - root - INFO - running build_ext
[rank1]:2024-04-01 17:54:50,638 - root - INFO - running build_ext
[rank0]:2024-04-01 17:54:51,901 - root - INFO - running build_ext
[rank1]:2024-04-01 17:54:52,025 - root - INFO - running build_ext
[rank1]:2024-04-01 17:54:52,734 - root - INFO - step: 1  loss: 10.9746  memory: 9.53GiB(10.03%)  wps: 1,228  mfu: 0.02%
[rank1]:2024-04-01 17:54:52,734 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:00:05
[rank1]:2024-04-01 17:54:52,813 - root - INFO - step: 2  loss: 10.9091  memory: 9.54GiB(10.03%)  wps: 208,739  mfu: 2.56%
[rank0]:2024-04-01 17:54:52,734 - root - INFO - step: 1  loss: 10.9746  memory: 9.53GiB(10.03%)  wps: 1,228  mfu: 0.02%
[rank0]:2024-04-01 17:54:52,734 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:00:05
[rank0]:2024-04-01 17:54:52,813 - root - INFO - step: 2  loss: 10.9091  memory: 9.54GiB(10.03%)  wps: 208,501  mfu: 2.55%
[rank1]:2024-04-01 17:54:52,889 - root - INFO - step: 3  loss: 10.7722  memory: 9.54GiB(10.03%)  wps: 219,416  mfu: 2.69%
[rank0]:2024-04-01 17:54:52,889 - root - INFO - step: 3  loss: 10.7722  memory: 9.54GiB(10.03%)  wps: 219,182  mfu: 2.68%
[rank1]:2024-04-01 17:54:52,965 - root - INFO - step: 4  loss: 10.5428  memory: 9.54GiB(10.03%)  wps: 218,226  mfu: 2.67%
[rank0]:2024-04-01 17:54:52,965 - root - INFO - step: 4  loss: 10.5428  memory: 9.54GiB(10.03%)  wps: 218,015  mfu: 2.67%
[rank1]:2024-04-01 17:54:53,045 - root - INFO - step: 5  loss: 10.3063  memory: 9.54GiB(10.03%)  wps: 207,094  mfu: 2.54%
[rank0]:2024-04-01 17:54:53,045 - root - INFO - step: 5  loss: 10.3063  memory: 9.54GiB(10.03%)  wps: 207,220  mfu: 2.54%
[rank1]:2024-04-01 17:54:53,123 - root - INFO - step: 6  loss: 10.0707  memory: 9.54GiB(10.03%)  wps: 210,814  mfu: 2.58%
[rank1]:2024-04-01 17:54:53,202 - root - INFO - step: 7  loss: 9.8302  memory: 9.54GiB(10.03%)  wps: 209,649  mfu: 2.57%
[rank0]:2024-04-01 17:54:53,123 - root - INFO - step: 6  loss: 10.0707  memory: 9.54GiB(10.03%)  wps: 210,849  mfu: 2.58%
[rank0]:2024-04-01 17:54:53,202 - root - INFO - step: 7  loss: 9.8302  memory: 9.54GiB(10.03%)  wps: 209,542  mfu: 2.57%
[rank0]:2024-04-01 17:54:53,281 - root - INFO - step: 8  loss: 9.5918  memory: 9.54GiB(10.03%)  wps: 211,690  mfu: 2.59%
[rank1]:2024-04-01 17:54:53,281 - root - INFO - step: 8  loss: 9.5918  memory: 9.54GiB(10.03%)  wps: 211,786  mfu: 2.59%
[rank1]:2024-04-01 17:54:53,412 - root - INFO - step: 9  loss: 9.4299  memory: 9.54GiB(10.03%)  wps: 125,833  mfu: 1.54%
[rank1]:[rank1]:[W401 17:54:53.242673953 CPUAllocator.cpp:249] Memory block of unknown size was allocated before the profiling started, profiler results will not include the deallocation event
[rank0]:2024-04-01 17:54:53,412 - root - INFO - step: 9  loss: 9.4299  memory: 9.54GiB(10.03%)  wps: 125,765  mfu: 1.54%
[rank0]:[rank0]:[W401 17:54:53.240925776 CPUAllocator.cpp:249] Memory block of unknown size was allocated before the profiling started, profiler results will not include the deallocation event
[rank1]:2024-04-01 17:54:53,492 - root - INFO - step: 10  loss: 9.2955  memory: 9.54GiB(10.03%)  wps: 207,661  mfu: 2.54%
[rank0]:2024-04-01 17:54:53,492 - root - INFO - step: 10  loss: 9.2955  memory: 9.54GiB(10.03%)  wps: 207,426  mfu: 2.54%
[rank0]:NCCL version 2.20.5+cuda12.0
```

Reviewers:

Subscribers:

Tasks:

Tags:

---------

Co-authored-by: gnadathur <[email protected]>
1 parent dca7657 commit b4ab627

File tree

3 files changed: +53 additions, -60 deletions

run_llama_train.sh

Lines changed: 6 additions & 1 deletion

```diff
@@ -19,6 +19,11 @@ LOG_RANK=${LOG_RANK:-0}
 
 CONFIG_FILE=${CONFIG_FILE:-"./train_configs/debug_model.toml"}
 
+overrides=""
+if [ $# -ne 0 ]; then
+    overrides="$*"
+fi
+
 torchrun --nproc_per_node=${NGPU} --rdzv_endpoint="localhost:5972" \
 --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \
-train.py --job.config_file ${CONFIG_FILE}
+train.py --job.config_file ${CONFIG_FILE} $overrides
```
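
With this change, any extra arguments passed to the script are collected into `$overrides` and appended to the `train.py` invocation after `--job.config_file`. A minimal usage sketch (the config path and the `--training.compile` flag below are the ones exercised by this PR's test runner; adjust for your setup):

```sh
# Forward a CLI override through run_llama_train.sh; the script appends
# "$overrides" to the train.py command built by torchrun.
CONFIG_FILE=./train_configs/debug_model.toml NGPU=4 ./run_llama_train.sh --training.compile
```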

test/test_runner.py

Lines changed: 47 additions & 10 deletions

```diff
@@ -5,27 +5,64 @@
 # All rights reserved.
 import os
 import subprocess
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Sequence
 
 try:
     import tomllib
 except ModuleNotFoundError:
     import tomli as tomllib
 
+
+@dataclass
+class OverrideDefinitions:
+    """
+    This class is used to define the override definitions for the integration tests.
+    """
+
+    override_args: Sequence[str] = tuple()
+    test_descr: str = "default"
+
+
 CONFIG_DIR = "./train_configs"
+
+"""
+key is the config file name and value is a list of OverrideDefinitions
+that is used to generate variations of integration tests based on the
+same root config file.
+"""
+integration_tests_flavors = defaultdict(list)
+integration_tests_flavors["debug_model.toml"] = [
+    OverrideDefinitions(["--training.compile"], "1D compile"),
+    OverrideDefinitions(
+        ["--training.tensor_parallel_degree 2"], "Eager mode 2DParallel"
+    ),
+]
+
+
 for config_file in os.listdir(CONFIG_DIR):
     if config_file.endswith(".toml"):
         full_path = os.path.join(CONFIG_DIR, config_file)
         with open(full_path, "rb") as f:
             config = tomllib.load(f)
             is_integration_test = config["job"].get("use_for_integration_test", False)
             if is_integration_test:
-                cmd = f"CONFIG_FILE={full_path} NGPU=4 ./run_llama_train.sh"
-                print(f"=====Integration test: {cmd}=====")
-                result = subprocess.run(
-                    [cmd],
-                    stdout=subprocess.PIPE,
-                    stderr=subprocess.STDOUT,
-                    text=True,
-                    shell=True,
-                )
-                print(result.stdout)
+                test_flavors = [OverrideDefinitions()] + integration_tests_flavors[
+                    config_file
+                ]
+                for test_flavor in test_flavors:
+                    cmd = f"CONFIG_FILE={full_path} NGPU=4 ./run_llama_train.sh"
+                    if test_flavor.override_args:
+                        cmd += " " + " ".join(test_flavor.override_args)
+                    print(
+                        f"=====Integration test, flavor : {test_flavor.test_descr}, command : {cmd}====="
+                    )
+                    result = subprocess.run(
+                        [cmd],
+                        stdout=subprocess.PIPE,
+                        stderr=subprocess.STDOUT,
+                        text=True,
+                        shell=True,
+                    )
+                    print(result.stdout)
```
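
Given the flavors registered above for `debug_model.toml`, the runner builds one command per flavor (the default flavor first, then each override set). A sketch of the commands it is expected to execute, assuming `use_for_integration_test = true` is set in that config:

```sh
# Default flavor (no overrides), then the "1D compile" and
# "Eager mode 2DParallel" variants registered in test_runner.py.
CONFIG_FILE=./train_configs/debug_model.toml NGPU=4 ./run_llama_train.sh
CONFIG_FILE=./train_configs/debug_model.toml NGPU=4 ./run_llama_train.sh --training.compile
CONFIG_FILE=./train_configs/debug_model.toml NGPU=4 ./run_llama_train.sh --training.tensor_parallel_degree 2
```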

train_configs/debug_model_2d.toml

Lines changed: 0 additions & 49 deletions
This file was deleted.
