Closed
Changes from all commits
151 commits
054f088
check in tokenizer.model for ease of dev setup (#59)
wanchaol Feb 13, 2024
bfe2b58
Add truncated llama style model init via reset parameters() (#54)
lessw2020 Feb 14, 2024
60f021a
add model num params display, gpu memory metrics (#56)
lessw2020 Feb 15, 2024
ad69e62
add TensorBoard logging with loss and wps
tianyu-l Feb 15, 2024
a4663b1
add memory metrics to TensorBoard
tianyu-l Feb 17, 2024
2daf53f
modify data split to use HF api
tianyu-l Feb 21, 2024
50d69f6
add multinode support via slurm trainer, large scale race condition f…
lessw2020 Feb 22, 2024
8ad4dcb
add configurable unique layer init, clean up lr and loss display (#64)
lessw2020 Feb 22, 2024
8097c26
add bunch of cleanups and design principle section (#71)
wanchaol Feb 23, 2024
28f431f
delete the linter to see if re-adding it helps (#80)
wconstab Feb 23, 2024
ebbb1cb
Unified config manager for toml and command line (#76)
gnadathur Feb 24, 2024
bccad90
Whc/add linter (#81)
wconstab Feb 24, 2024
ab75dbd
Add 4GPU unit test (#82)
wconstab Feb 24, 2024
468ce8f
update readme (#74)
wanchaol Feb 24, 2024
3fce6bb
move config folder to root and adjust options (#83)
wanchaol Feb 24, 2024
3b48039
add iter time tracking via cuda events, add data loading times, add c…
lessw2020 Feb 26, 2024
df77f4e
Fill missing options in toml file with argparse defaults (#91)
gnadathur Feb 26, 2024
325951f
support infinite loop over alpaca dataset
tianyu-l Feb 26, 2024
b12b6dd
Add color to console output if local logging, auto avoid color loggin…
lessw2020 Feb 27, 2024
254279f
update GPU metrics logging to GiB (gibibytes) (#95)
lessw2020 Feb 27, 2024
4c03475
improve TensorBoard instructions in README
tianyu-l Feb 27, 2024
7ea0679
Enable libUV for torchtrain (#98)
gnadathur Feb 28, 2024
e60c573
use warmup steps for lr scheduler, ban steps == -1 (#99)
wanchaol Feb 29, 2024
900b215
Add llama 7B config (#100)
wanchaol Feb 29, 2024
6e87471
add selective activation checkpointing
tianyu-l Feb 29, 2024
1b343f2
Add job description field in toml (#101)
gnadathur Mar 1, 2024
42f8907
fix 2D parallel crash caused by all-reduce on 2D world_mesh
tianyu-l Mar 2, 2024
4042b05
Load missing keys default from argparse (#111)
gnadathur Mar 5, 2024
6529af1
Add meta_init, enable it as default init process (#84)
lessw2020 Mar 5, 2024
5f0eaea
Fix feedback from PR 111 (#113)
gnadathur Mar 5, 2024
1ce8188
fix SP minor issues
tianyu-l Mar 5, 2024
bb5c4c6
enable loss parallel in SP
tianyu-l Mar 6, 2024
f31adb0
Float8_experimental option for training (#102)
drisspg Mar 6, 2024
6927e45
add miniPile dataset for pretraining, 1M entries (solves the 'out of …
lessw2020 Mar 7, 2024
d902a47
add data loading option to load from local file system
tianyu-l Mar 7, 2024
422910b
add llama 13B configs
wanchaol Mar 9, 2024
af221ce
add llama 70B toml
wanchaol Mar 9, 2024
5e36c74
set betas and weight decay for optimizers
wanchaol Mar 9, 2024
08b332c
Add c4 dataset (177M, streaming), update multi-node support for lates…
lessw2020 Mar 9, 2024
1d11cf5
Add openwebtext dataset for larger scale training without shuffling (…
lessw2020 Mar 12, 2024
2722865
[TorchTrain][Checkpoint] Fix TrainState state_dict to unblock loading…
wz337 Mar 12, 2024
2369861
improve logging
tianyu-l Mar 13, 2024
3262a8b
use SequenceParallel style in tp/sp (#133)
wanchaol Mar 13, 2024
d9253ee
support TP-only parallelism
tianyu-l Mar 13, 2024
b42ce91
disable verbose print from profiling
tianyu-l Mar 13, 2024
3ac610b
add Selective layer activation checkpointing, single control for tur…
lessw2020 Mar 14, 2024
af56ae0
remove per iter synchronize
tianyu-l Mar 14, 2024
073909b
Shorten nccl comm timeout and enable flight recorder dumping (#103)
wconstab Mar 15, 2024
e3204c6
fix up gpu memory monitoring and logging
tianyu-l Mar 15, 2024
a257bc3
Separate timeout during init and training (#149)
wconstab Mar 15, 2024
1d6100c
Update activation check with updates to config manager (#152)
drisspg Mar 20, 2024
ae9a966
Refactor to clean up parallelisms/__init__.py
wconstab Mar 20, 2024
47bb509
enable gc control scheduling to help avoid stragglers (#148)
lessw2020 Mar 20, 2024
fcca670
Add float8 specific parallel strategies (#153)
drisspg Mar 20, 2024
5d28009
add MFU to metrics
tianyu-l Mar 20, 2024
35d881e
disable buffer reuse for compile for now (#156)
wanchaol Mar 21, 2024
f080027
refactor config manager and support cmd overrides (#157)
wanchaol Mar 22, 2024
34732f5
Add support for generating debug traces on failure
chauhang Mar 24, 2024
e008027
rename sequence_parallel to tensor_parallel (#162)
wanchaol Mar 25, 2024
44808f9
add basic AC configs for 13B and 70B (#169)
wanchaol Mar 27, 2024
bb61af0
[TorchTrain][Checkpoint] Update train state to include global_avg_los…
wz337 Mar 27, 2024
6500bc6
Basic integration test infra (#170)
gnadathur Mar 27, 2024
479694f
Add 2D integration test (FSDP + TP) (#171)
gnadathur Mar 27, 2024
02923f0
Used per-parameter FSDP (#165)
awgu Mar 28, 2024
615f9c1
plot losses in loaded TrainState to TensorBoard
tianyu-l Mar 28, 2024
b1349da
Removed setting global flag for `swap_tensors` since not needed anymore
Mar 29, 2024
65f0297
Add integration test with compile enabled (#183)
gnadathur Apr 2, 2024
e1e17c9
remove folding and unfolding of sequence dim in model.py
tianyu-l Apr 3, 2024
b9a4548
bump comm.train_timeout_seconds (#189)
wanchaol Apr 4, 2024
3686897
fix checkpoint parser
wz337 Apr 5, 2024
7872248
support sequence of tests and add checkpoint test
wz337 Apr 5, 2024
5ac3aa6
Make freqs_cis a persistent buffer for pp init
wconstab Apr 5, 2024
5379282
Delete grad scaler, which is unsupported/unused
wconstab Apr 5, 2024
d8e64cc
Factor out loss_fn to share code with pipeline par
wconstab Apr 5, 2024
0397fef
[TorchTrain] Minor fix for #197 (#204)
wz337 Apr 5, 2024
cd1e5e8
Add FusedRMSNorm (Triton kernel, +15% eager), Add NPLayerNorm, Enable…
lessw2020 Apr 5, 2024
f795361
remove .item() per iter
tianyu-l Apr 5, 2024
946780a
Removed cache_k and cache_v comments
Apr 10, 2024
18adb2f
Some more cleanups
Apr 10, 2024
ef4c5d2
avoid record streams and make color printing a config
tianyu-l Apr 10, 2024
6629659
fix SAC to use the correct reduce_scatter op (#215)
wanchaol Apr 10, 2024
ddf916e
Test runner raises exception on failures (#216)
gnadathur Apr 10, 2024
ecdbacc
Revert "Separate TransformerEmbedding layer (#33)"
wconstab Apr 10, 2024
656be68
Fix 2DParallel test (#219)
gnadathur Apr 10, 2024
97fe9a4
Added initial FSDP readme
Apr 10, 2024
ce05f65
[TorchTrain][Checkpoint] Add model_weights_only option to train_confi…
wz337 Apr 11, 2024
00293cb
Rename to torchtitan (#221)
wanchaol Apr 11, 2024
c08f617
[TorchTitan] Add destroy process group at the end of training (#223)
wz337 Apr 12, 2024
7712f72
Add 1 sec delay to rank 0 cleanup (#224)
gnadathur Apr 12, 2024
71621a2
[Torchtrain][Checkpoint] Add support to allow dtype conversion (#222)
wz337 Apr 12, 2024
5aa0aec
[TorchTitan] Remove checkpoint folder at the end in test_runner.py (#…
wz337 Apr 12, 2024
cb24eb5
codebase cleanup
tianyu-l Apr 15, 2024
3cfdbf2
Update README to reflect positioning (#229)
wanchaol Apr 16, 2024
db04c7e
First release readme (#227)
lessw2020 Apr 16, 2024
f504816
Update licenses and headers (#231)
wanchaol Apr 16, 2024
41fb267
use permalink for logo image (#232)
lessw2020 Apr 16, 2024
82c2518
[TorchTitan][Checkpoint] Move checkpoint folder under dump_folder and…
wz337 Apr 16, 2024
d42a7d1
use combo of html and local file src for logo (#234)
lessw2020 Apr 16, 2024
80103a9
add performance -- infra metrics and loss curves (#237) (#238)
lessw2020 Apr 16, 2024
09e7bec
add license section in readme (#239)
wanchaol Apr 16, 2024
22aa488
[TorchTitan][Checkpoint] Add a step-by-step instruction for checkpoin…
wz337 Apr 16, 2024
81138d6
more license headers (#240)
wanchaol Apr 16, 2024
04f5b82
Update README (#242)
wanchaol Apr 16, 2024
4f6ed9a
Add torchtune checkpoint link, modify product position statement loca…
lessw2020 Apr 16, 2024
cd55a38
Add pyproject and upgrade version (#236)
wanchaol Apr 16, 2024
78b843b
minor doc updates - remove asynch checkpt ref, grammar on prod positi…
lessw2020 Apr 16, 2024
ce0fff0
Fix multi-line string usage (#244)
gnadathur Apr 16, 2024
7b353c8
polish toml files
tianyu-l Apr 16, 2024
bc7fec5
[torchtitan][checkpoint][doc] Minor fix checkpoint doc (#246)
wz337 Apr 16, 2024
a682505
fix default max_seq_len for freq_cis init (#248)
wanchaol Apr 17, 2024
1ea4dee
set max_seq_len before training to make it align with input data (#249)
wanchaol Apr 17, 2024
55c8e48
fix pypi docs
tianyu-l Apr 17, 2024
fd9b498
update dataset to use c4
tianyu-l Apr 18, 2024
978c5c6
Add c4_mini, a local 45K dataset (subset of c4) (#253)
lessw2020 Apr 18, 2024
4020e92
remove logo, update pre-release date to 4/18 (#254)
lessw2020 Apr 18, 2024
51a6f6f
add intro video (#233)
lessw2020 Apr 18, 2024
6aafe3c
add performance file to show convergence with 64 a100s (#255)
lessw2020 Apr 18, 2024
35470ca
Support Llama3 8b/70b (#256)
wanchaol Apr 20, 2024
960e70f
polish llama 3 setup
tianyu-l Apr 22, 2024
e1c116a
reenable integration tests with a test tokenizer (#259)
wanchaol Apr 23, 2024
be432e1
warn supported dataset checks instead of throw (#260)
wanchaol Apr 24, 2024
192ed48
De-dup repeated `freqs_cis` computation code
Apr 24, 2024
f38766e
update readme.md and performance.md
tianyu-l Apr 24, 2024
0eacbae
followup changes to allow unsupported datasets
tianyu-l Apr 24, 2024
217cc94
fix ac 'checkpointing' spelling, minor spacing tweaks (#265)
lessw2020 Apr 24, 2024
e3b47ea
Update legal terms (#269)
lessw2020 Apr 25, 2024
eed7495
apply less heavy profiling
tianyu-l Apr 25, 2024
3393c2a
Showcase where the product positioning lies more clearly (#272)
soumith Apr 25, 2024
568dad6
Doc Fixes (#273)
msaroufim Apr 25, 2024
3e13e24
fix lr scheduling by checkpointing scheduler
tianyu-l Apr 26, 2024
f03c128
insert barrier to profiler to resolve collectives timeout
tianyu-l Apr 25, 2024
42549a9
some misc changes (#278)
wanchaol Apr 26, 2024
0d09a32
inherit stateful protocol where appropriate
tianyu-l Apr 26, 2024
06da6c2
Fixed docs on HSDP sharding/replication dims
Apr 29, 2024
a843abf
Add more Float8 description (#284)
drisspg Apr 29, 2024
d442743
Remove unneeded torchvision/audio deps
wconstab Apr 29, 2024
e7f2d28
fix 3d mesh order (#288)
wanchaol Apr 30, 2024
4e5ffaf
unify data loading from HF and from disk
tianyu-l Apr 30, 2024
58b1169
Add periodic integration test with signal (#289)
gnadathur May 1, 2024
4d8c245
exclude embedding in MFU computation
tianyu-l Apr 26, 2024
17cda29
Add support for seed checkpoint creation for meta-init flow
wconstab May 2, 2024
1a6caf2
remove unnecessary install of torchtitan
tianyu-l May 2, 2024
787a571
Remove unnecessary .to() inside model forward
wconstab May 2, 2024
695bd01
Fix the incorrect step log for profiler after resuming from a checkpo…
fegin May 3, 2024
143b586
turn off dynamic shape for torch.compile (#297)
wanchaol May 3, 2024
f72a2a0
Renamed `bsz` to `bs` for consistency; removed dead code
May 3, 2024
3295448
Implement async_checkpoint
fegin May 7, 2024
f5a3ad7
simplify embedding + first transformer block TP (#314)
wanchaol May 8, 2024
a08a70b
selective compilation - norm layers only
lessw2020 May 10, 2024
02cc5c4
lint
lessw2020 May 10, 2024
f249e26
update config mgr and other tomls
lessw2020 May 10, 2024
3 changes: 2 additions & 1 deletion .flake8
@@ -7,8 +7,9 @@ max-line-length = 120
# N812 ignored because import torch.nn.functional as F is PyTorch convention
# N817 ignored because importing using acronyms is convention (DistributedDataParallel as DDP)
# E731 allow usage of assigning lambda expressions
# N803,N806 allow caps and mixed case in function params. This is to work with Triton kernel coding style.
ignore =
E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,N812,N817,E731
E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,N812,N817,E731,N803,N806
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit
EXE001,
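For context on the new N803/N806 entries: Triton-style kernels conventionally use upper-case names for compile-time parameters and locals, which pep8-naming would otherwise flag. A minimal plain-Python illustration of the naming pattern (illustrative only; not code from this PR):

```python
# Illustrative only: Triton kernels typically name compile-time parameters in caps
# (e.g. BLOCK_SIZE), which N803 (argument should be lowercase) and
# N806 (variable in function should be lowercase) would otherwise flag.
def vector_add(x, y, N, BLOCK_SIZE=1024):            # N803 would fire on N and BLOCK_SIZE
    NUM_BLOCKS = (N + BLOCK_SIZE - 1) // BLOCK_SIZE  # N806 would fire on NUM_BLOCKS
    out = [x[i] + y[i] for i in range(N)]
    return out, NUM_BLOCKS
```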
42 changes: 42 additions & 0 deletions .github/workflows/integration_test_periodic.yaml
@@ -0,0 +1,42 @@
name: GPU Integration Test

on:
schedule:
# Runs hourly
- cron: '0 * * * *'

concurrency:
group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
cancel-in-progress: true

defaults:
run:
shell: bash -l -eo pipefail {0}

jobs:
unit_tests_4gpu:
runs-on: linux.g5.12xlarge.nvidia.gpu
strategy:
matrix:
python-version: ['3.10']
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Setup conda env
uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
miniconda-version: "latest"
activate-environment: test
python-version: ${{ matrix.python-version }}
- name: Update pip
run: python -m pip install --upgrade pip
- name: Install dependencies
run: |
pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
python -m pip install -r requirements.txt
python -m pip install -r dev-requirements.txt
- name: Run test_runner.py
run: python ./test_runner.py
- name: Upload Coverage to Codecov
uses: codecov/codecov-action@v3
12 changes: 5 additions & 7 deletions .github/workflows/lint.yaml
@@ -16,7 +16,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.11']
python-version: ['3.10']
steps:
- name: Check out repo
uses: actions/checkout@v3
@@ -30,10 +30,8 @@ jobs:
run: |
python -m pip install pre-commit
pre-commit install-hooks
- id: file_changes
uses: trilom/[email protected]
with:
prNumber: ${{ github.event.number }}
output: ' '
- name: Get changed files
id: changed-files
uses: tj-actions/[email protected]
- name: Lint modified files
run: pre-commit run --files ${{ steps.file_changes.outputs.files }}
run: pre-commit run --files ${{ steps.changed-files.outputs.all_changed_files }}
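For contributors who want to approximate this CI step locally, the job is essentially "run pre-commit on the files changed relative to main". A sketch, assuming `git` and `pre-commit` are available (not part of the workflow itself):

```python
# Rough local equivalent of the lint job: lint only files changed vs. origin/main.
import subprocess

changed = subprocess.run(
    ["git", "diff", "--name-only", "origin/main...HEAD"],
    capture_output=True, text=True, check=True,
).stdout.split()

if changed:
    subprocess.run(["pre-commit", "run", "--files", *changed], check=False)
else:
    print("no changed files to lint")
```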
42 changes: 42 additions & 0 deletions .github/workflows/unit_test_4gpu.yaml
@@ -0,0 +1,42 @@
name: 4 GPU Unit Test

on:
push:
branches: [ main ]
pull_request:

concurrency:
group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
cancel-in-progress: true

defaults:
run:
shell: bash -l -eo pipefail {0}

jobs:
unit_tests_4gpu:
runs-on: linux.g5.12xlarge.nvidia.gpu
strategy:
matrix:
python-version: ['3.10']
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Setup conda env
uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
miniconda-version: "latest"
activate-environment: test
python-version: ${{ matrix.python-version }}
- name: Update pip
run: python -m pip install --upgrade pip
- name: Install dependencies
run: |
pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
python -m pip install -r requirements.txt
python -m pip install -r dev-requirements.txt
- name: Run test_runner.py
run: python ./test_runner.py
- name: Upload Coverage to Codecov
uses: codecov/codecov-action@v3
@@ -1,4 +1,4 @@
name: Unit Test
name: CPU Unit Test

on:
push:
@@ -14,7 +14,7 @@ defaults:
shell: bash -l -eo pipefail {0}

jobs:
unit_tests:
cpu_unit_tests:
runs-on: ubuntu-latest
strategy:
matrix:
@@ -33,10 +33,9 @@ jobs:
run: python -m pip install --upgrade pip
- name: Install dependencies
run: |
pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
python -m pip install -r requirements.txt
python -m pip install -r dev-requirements.txt
python -m pip install -e .
- name: Run unit tests with coverage
run: pytest test --cov=. --cov-report=xml --durations=20 -vv
- name: Upload Coverage to Codecov
5 changes: 3 additions & 2 deletions .gitignore
@@ -4,10 +4,11 @@ __pycache__
*.egg-info
build
outputs
dist/*

# data
data
out
wandb
*.model
*.json

torchtitan/datasets/**/*.model
4 changes: 2 additions & 2 deletions CONTRIBUTING.md
@@ -1,4 +1,4 @@
# Contributing to torchtrain
# Contributing to torchtitan
We want to make contributing to this project as easy and transparent as
possible.

@@ -28,5 +28,5 @@ disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.

## License
By contributing to `torchtrain`, you agree that your contributions will be licensed
By contributing to `torchtitan`, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.
28 changes: 28 additions & 0 deletions LICENSE
@@ -0,0 +1,28 @@
BSD 3-Clause License

Copyright 2024 Meta

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice,this list
of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors may
be used to endorse or promote products derived from this software without specific
prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
DAMAGE.
123 changes: 112 additions & 11 deletions README.md
@@ -1,26 +1,127 @@
# torchtrain
[![GPU Integration Test](https://github.com/pytorch/torchtitan/actions/workflows/unit_test_4gpu.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/unit_test_4gpu.yaml)

Note: This repository is currently under heavy development.
# torchtitan

torchtrain contains PyTorch native parallelisms, tools and utilities to train large models.
`torchtitan` is currently in a pre-release state and under extensive development.

# Installation
`torchtitan` is a proof-of-concept for large-scale LLM training using native PyTorch. It is (and will continue to be) a repo to showcase PyTorch's latest distributed training features in a clean, minimal codebase. torchtitan is complementary to and not a replacement for any of the great large-scale LLM training codebases such as Megatron, MegaBlocks, LLM Foundry, DeepSpeed, etc. Instead, we hope that the features showcased in torchtitan will be adopted by these codebases quickly. torchtitan is unlikely to ever grow a large community around it.

install PyTorch from source or install the latest pytorch nightly, then install requirements by
Our guiding principles when building `torchtitan`:

```python
* Designed to be easy to understand, use and extend for different training purposes.
* Minimal changes to the model code when applying 1D, 2D, or (soon) 3D Parallel.
* Modular components instead of a monolithic codebase.
* Get started in minutes, not hours!

### Intro video - learn more about torchtitan in under 4 mins:

[![Welcome to torchtitan!](assets/images/titan_play_video.png)](https://youtu.be/ee5DOEqD35I?si=_B94PbVv0V5ZnNKE "Welcome to torchtitan!")

## Pre-Release Updates:
#### (4/25/2024): `torchtitan` is now public but in a pre-release state and under development.
Currently we showcase pre-training **Llama 3 and Llama 2** LLMs of various sizes from scratch. `torchtitan` is tested and verified with the PyTorch nightly version `torch-2.4.0.dev20240412`. (We recommend latest PyTorch nightly).

### Key features available

1. [FSDP2 with per param sharding](docs/fsdp.md)
2. [Tensor Parallel](https://pytorch.org/docs/stable/distributed.tensor.parallel.html)
3. Selective layer and operator activation checkpointing
4. Distributed checkpointing
5. 2 datasets pre-configured (45K - 144M)
6. GPU usage, MFU, tokens per second and more displayed via TensorBoard (see the MFU sketch below)
7. Learning rate scheduler, meta init, optional Fused RMSNorm
8. All options easily configured via [toml files](train_configs/)
9. [Interoperable checkpoints](docs/checkpoint.md) which can be loaded directly into [`torchtune`](https://github.com/pytorch/torchtune) for fine tuning

We report our [Performance](docs/performance.md), verified on 64 A100 GPUs.
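As a rough guide to how an MFU number like the one surfaced in TensorBoard is derived, here is the standard 6·N·tokens/sec approximation (torchtitan's exact accounting, e.g. excluding embeddings, may differ):

```python
# Standard MFU approximation: training FLOPs/token ~ 6 * num_params,
# so MFU = achieved FLOPs/sec divided by the hardware peak.
def mfu(num_params: int, tokens_per_sec: float, peak_flops: float = 312e12) -> float:
    # 312 TFLOPS is the A100 BF16 dense peak; adjust for other hardware.
    return (6 * num_params * tokens_per_sec) / peak_flops

# e.g. a 7B-parameter model at 3,000 tokens/sec per A100 -> ~40% MFU
print(f"{mfu(7_000_000_000, 3_000) * 100:.1f}% MFU")
```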


### Coming soon
1. Async checkpointing
2. FP8 support
3. Context Parallel
4. 3D Pipeline Parallel
5. `torch.compile` support
6. Scalable data loading solution


## Installation

```bash
git clone https://github.com/pytorch/torchtitan
cd torchtitan
pip install -r requirements.txt
pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121 # or cu118
```

### Downloading a tokenizer

`torchtitan` currently supports training Llama 3 (8B, 70B), and Llama 2 (7B, 13B, 70B) out of the box. To get started training these models, we need to download a tokenizer.model. Follow the instructions on the official [meta-llama](https://huggingface.co/meta-llama/Meta-Llama-3-8B) repository to ensure you have access to the Llama model weights.

Once you have confirmed access, you can run the following command to download the Llama 3 / Llama 2 tokenizer to your local machine.

```bash
# Get your HF token from https://huggingface.co/settings/tokens

# llama3 tokenizer.model
python torchtitan/datasets/download_tokenizer.py --repo_id meta-llama/Meta-Llama-3-8B --tokenizer_path "original" --hf_token=...

# llama2 tokenizer.model
python torchtitan/datasets/download_tokenizer.py --repo_id meta-llama/Llama-2-13b-hf --hf_token=...
```
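For reference, a download helper like the one invoked above is essentially a thin wrapper over `huggingface_hub`. A minimal sketch, assuming the file lives at `<tokenizer_path>/tokenizer.model` in the gated repo (the actual `download_tokenizer.py` may be organized differently):

```python
# Minimal sketch of a tokenizer downloader; the real download_tokenizer.py may differ.
from huggingface_hub import hf_hub_download

def download_tokenizer(repo_id: str, hf_token: str, tokenizer_path: str = "") -> str:
    # e.g. repo_id="meta-llama/Meta-Llama-3-8B", tokenizer_path="original"
    filename = f"{tokenizer_path}/tokenizer.model" if tokenizer_path else "tokenizer.model"
    return hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        local_dir="torchtitan/datasets/tokenizer",  # assumed target folder
        token=hf_token,
    )

if __name__ == "__main__":
    print(download_tokenizer("meta-llama/Meta-Llama-3-8B", hf_token="hf_...", tokenizer_path="original"))
```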

### Start a training run
To train the Llama 3 8B model locally on 8 GPUs:

```bash
CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh
```


## TensorBoard

To visualize TensorBoard metrics of models trained on a remote server via a local web browser:

1. Make sure the `metrics.enable_tensorboard` option is set to true for the training run (either in the .toml file or from the CLI).

2. Set up SSH tunneling by running the following from your local CLI:
```
ssh -L 6006:127.0.0.1:6006 [username]@[hostname]
```

3. In the SSH session on the remote server, go to the torchtitan repo and start the TensorBoard backend:
```
tensorboard --logdir=./outputs/tb
```

4. In your local web browser, go to the URL it provides or to http://localhost:6006/.
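For orientation, the metrics that show up under `./outputs/tb` (loss, wps, MFU, memory) are written with a standard `SummaryWriter`. A minimal sketch of that pattern, with placeholder tag names rather than torchtitan's actual ones:

```python
# Minimal sketch of TensorBoard metric logging; torchtitan's metric logger may differ.
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="./outputs/tb")
for step in range(1, 11):
    loss = 10.0 / step   # placeholder standing in for the real training loss
    wps = 3000.0         # placeholder tokens-per-second measurement
    writer.add_scalar("train/loss", loss, step)
    writer.add_scalar("train/wps", wps, step)
writer.close()
```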


download tokenizer from HF
This part is needed first time if there's no tokenizer locally by run:
## Multi-Node Training
For training on ParallelCluster/Slurm type configurations, you can use the `multinode_trainer.slurm` file to submit your sbatch job.

To get started, adjust the number of nodes and GPUs:
```
python torchtrain/datasets/download_tokenizer.py --hf_token your_token
#SBATCH --ntasks=2
#SBATCH --nodes=2
```

run the llama debug model locally to verify the setup is correct:
Then start a run where `nnodes` is your total node count, matching the sbatch node count above.

```
./run_llama_train.sh
srun torchrun --nnodes 2
```

If your GPU count per node is not 8, adjust:

```--nproc_per_node```

in the torchrun command and

```#SBATCH --gpus-per-task```

in the SBATCH command section.

## License

This code is made available under the [BSD 3-Clause license](./LICENSE). However, you may have other legal obligations that govern your use of other content, such as the terms of service for third-party models, data, etc.
Binary file added assets/images/TorchTitan_logo_main.jpg
Binary file added assets/images/llama2_loss_curves.png
Binary file added assets/images/llama3_loss_curves.png
1 change: 1 addition & 0 deletions assets/images/readme.md
@@ -0,0 +1 @@
images folder for main repo
Binary file added assets/images/titan_play_video.png
36 changes: 36 additions & 0 deletions create_seed_checkpoint.sh
@@ -0,0 +1,36 @@
#!/usr/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

#
# create_seed_checkpoint.sh
#
# Run this script to create a seed checkpoint used to initialize a model from step-0.
# Seed checkpoints are used to initialize pipeline-parallel models since the model initializer
# functions don't cleanly run on chunked model parts after meta-initialization.
#
# Use the same model config to generate your seed checkpoint as you use for training.
# e.g.
# CONFIG_FILE=<path to model_config> ./create_seed_checkpoint.sh

set -ex

export USE_LIBUV=1
TRAINER_DIR=${1:-/home/$USER/local/torchtitan}
NGPU=1
LOG_RANK=0
CONFIG_FILE=${CONFIG_FILE:-"./train_configs/debug_model.toml"}

seed_checkpoint="--checkpoint.enable_checkpoint --checkpoint.create_seed_checkpoint"
force_1d="--training.data_parallel_degree 1 --training.tensor_parallel_degree 1 --training.pipeline_parallel_degree 1"
overrides=""
if [ $# -ne 0 ]; then
overrides="$*"
fi

torchrun --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \
--local-ranks-filter ${LOG_RANK} --role rank --tee 3 \
train.py --job.config_file ${CONFIG_FILE} $seed_checkpoint $force_1d $overrides