Commit 011b738

Update base for Update on "[WIP] Add more NVFuser microbenchmarks"
Waiting on pytorch/pytorch#73627 to land, because some of these don't pass without it. [ghstack-poisoned]
2 parents b19f72b + 032f079 commit 011b738

133 files changed (+1670 additions, −1066 deletions)


.github/workflows/correctness.yml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 name: TorchBench Correctness Testing
 on:
-  pull_request:
+  workflow_dispatch:
 env:
   PYTHON_VERSION: "3.8"
   CUDA_VERSION: "11.3"

.github/workflows/v1-bisection.yml

Lines changed: 5 additions & 1 deletion
@@ -28,9 +28,13 @@ jobs:
       run: |
         conda create -y -n "$BISECT_CONDA_ENV" python="${PYTHON_VERSION}"
         . activate "$BISECT_CONDA_ENV"
-        conda install -y numpy requests=2.22 ninja pyyaml mkl mkl-include setuptools cmake cffi \
+        # pytorch doesn't support cmake>3.22
+        # See: https://github.com/pytorch/pytorch/issues/74985
+        conda install -y numpy requests ninja pyyaml mkl mkl-include setuptools cmake=3.22 cffi \
           typing_extensions future six dataclasses tabulate gitpython tqdm
         conda install -y -c pytorch "${MAGMA_VERSION}"
+        # Pin ffmpeg version to 4.4.1. See: https://github.com/pytorch/vision/issues/5616
+        conda install -y ffmpeg=4.4.1
     - name: Bisection
       run: |
         export BISECT_ISSUE="${{ github.event.inputs.issue_name }}"

.github/workflows/v2-bisection.yml

Lines changed: 5 additions & 1 deletion
@@ -28,10 +28,14 @@ jobs:
       run: |
         conda create -y -n "${BISECT_CONDA_ENV}" python="${PYTHON_VER}"
         . activate "${BISECT_CONDA_ENV}"
-        conda install -y numpy requests ninja pyyaml mkl mkl-include setuptools cmake cffi \
+        # pytorch doesn't support cmake>3.22
+        # See: https://github.com/pytorch/pytorch/issues/74985
+        conda install -y numpy requests ninja pyyaml mkl mkl-include setuptools cmake=3.22 cffi \
           typing_extensions future six dataclasses tabulate gitpython git-lfs tqdm
         # Install magma
         conda install -y -c pytorch "${MAGMA_VERSION}"
+        # Pin ffmpeg version to 4.4.1. See: https://github.com/pytorch/vision/issues/5616
+        conda install -y ffmpeg=4.4.1
     - name: Bisection
       run: |
         export BISECT_ISSUE="${{ github.event.inputs.issue_name }}"

.gitignore

Lines changed: 0 additions & 4 deletions
@@ -14,9 +14,5 @@ build/
 .idea
 old.json
 te.json
-Video_data_train_processed.csv
-labels.json
-results.txt
-checkpoints
 logs/
 scripts/scribe.py

.gitmodules

Lines changed: 3 additions & 18 deletions
@@ -1,18 +1,3 @@
-[submodule "third_party/benchmark"]
-    path = legacy/third_party/benchmark
-    url = https://github.com/google/benchmark
-[submodule "setup/third_party/pytorch-dockerfiles"]
-    path = setup/third_party/pytorch-dockerfiles
-    url = https://github.com/pietern/pytorch-dockerfiles
-[submodule "python/third_party_benchmarks/PyTorch-benchmark"]
-    path = python/third_party_benchmarks/PyTorch-benchmark
-    url = https://github.com/MlWoo/PyTorch-benchmark
-[submodule "third_party/eigen"]
-    path = legacy/third_party/eigen
-    url = https://github.com/eigenteam/eigen-git-mirror
-[submodule "third_party/sleef"]
-    path = legacy/third_party/sleef
-    url = https://github.com/shibatch/sleef
-[submodule "third_party/tbb"]
-    path = legacy/third_party/tbb
-    url = https://github.com/01org/tbb
+[submodule "submodules/FAMBench"]
+    path = submodules/FAMBench
+    url = https://github.com/facebookresearch/FAMBench.git

README.md

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ except for the torch products which are intended to be installed separately so
 different torch versions can be benchmarked.

 ### Using Pre-built Packages
-We support python 3.7 and 3.8, and 3.8 is recommended. Currently, there are compatibility issues with 3.9+. Conda is optional but suggested. To switch to python 3.8 in conda:
+We support python 3.7+, and 3.8 is recommended. Conda is optional but suggested. To start with python 3.8 in conda:
 ```
 # using your current conda environment:
 conda install -y python=3.8

bisection.py

Lines changed: 9 additions & 8 deletions
@@ -27,8 +27,8 @@
 TORCH_GITREPO="https://github.com/pytorch/pytorch.git"
 TORCHBENCH_GITREPO="https://github.com/pytorch/benchmark.git"
 TORCHBENCH_DEPS = {
-    "torchtext": os.path.expandvars("${HOME}/text"),
-    "torchvision": os.path.expandvars("${HOME}/vision"),
+    "torchtext": (os.path.expandvars("${HOME}/text"), "main"),
+    "torchvision": (os.path.expandvars("${HOME}/vision"), "main"),
 }

 def exist_dir_path(string):
@@ -152,8 +152,7 @@ def prep(self) -> bool:
     # Update pytorch, torchtext, and torchvision repo
     def update_repos(self):
         repos = [(self.srcpath, "master")]
-        for value in TORCHBENCH_DEPS.values():
-            repos.append((value, "main"))
+        repos.extend(TORCHBENCH_DEPS.values())
         for (repo, branch) in repos:
             gitutils.clean_git_repo(repo)
             assert gitutils.update_git_repo(repo, branch), f"Failed to update {branch} branch of repository {repo}."
@@ -195,23 +194,25 @@ def setup_build_env(self, env) -> Dict[str, str]:
     # Checkout the last commit of dependencies on date
     def checkout_deps(self, cdate: datetime):
         for pkg in TORCHBENCH_DEPS:
-            dep_commit = gitutils.get_git_commit_on_date(TORCHBENCH_DEPS[pkg], cdate)
+            pkg_path, branch = TORCHBENCH_DEPS[pkg]
+            gitutils.checkout_git_branch(pkg_path, branch)
+            dep_commit = gitutils.get_git_commit_on_date(pkg_path, cdate)
             print(f"Checking out {pkg} commit {dep_commit} ...", end="", flush=True)
             assert dep_commit, "Failed to find the commit on {cdate} of {pkg}"
-            assert gitutils.checkout_git_commit(TORCHBENCH_DEPS[pkg], dep_commit), "Failed to checkout commit {commit} of {pkg}"
+            assert gitutils.checkout_git_commit(pkg_path, dep_commit), "Failed to checkout commit {commit} of {pkg}"
             print("done.")

     # Install dependencies such as torchtext and torchvision
     def build_install_deps(self, build_env):
         # Build torchvision
         print(f"Building torchvision ...", end="", flush=True)
         command = "python setup.py install"
-        subprocess.check_call(command, cwd=TORCHBENCH_DEPS["torchvision"], env=build_env, shell=True)
+        subprocess.check_call(command, cwd=TORCHBENCH_DEPS["torchvision"][0], env=build_env, shell=True)
         print("done")
         # Build torchtext
         print(f"Building torchtext ...", end="", flush=True)
         command = "python setup.py clean install"
-        subprocess.check_call(command, cwd=TORCHBENCH_DEPS["torchtext"], env=build_env, shell=True)
+        subprocess.check_call(command, cwd=TORCHBENCH_DEPS["torchtext"][0], env=build_env, shell=True)
         print("done")

     def _build_lazy_tensor(self, commit: Commit, build_env: Dict[str, str]):
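With this change each TorchBench dependency is stored as a (path, branch) pair, so the bisector first checks out the branch and then resolves the last commit on or before the bisection date. For readers unfamiliar with that step, here is a minimal sketch of what a helper like gitutils.get_git_commit_on_date presumably does; the actual implementation lives in TorchBench's gitutils module and may differ in details (timezone handling, error reporting, etc.).

```python
# Illustration only, not the real gitutils code.
import subprocess
from datetime import datetime

def last_commit_on_date(repo_path: str, branch: str, cdate: datetime) -> str:
    # `git rev-list -1 --before=<date> <branch>` returns the newest commit on
    # <branch> whose commit date is not later than <date>.
    out = subprocess.check_output(
        ["git", "rev-list", "-1", f"--before={cdate.isoformat()}", branch],
        cwd=repo_path,
    )
    return out.decode("utf-8").strip()
```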

install.py

Lines changed: 2 additions & 2 deletions
@@ -51,9 +51,9 @@ def pip_install_requirements():

 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument("--continue_on_fail", action="store_true")
-    parser.add_argument("--models", nargs='+', default=[],
+    parser.add_argument("models", nargs='*', default=[],
                         help="Specify one or more models to install. If not set, install all models.")
+    parser.add_argument("--continue_on_fail", action="store_true")
     parser.add_argument("--verbose", "-v", action="store_true")
     args = parser.parse_args()
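The model list is now a positional argument, so the old `--models` flag is gone and running the script with no arguments installs everything. A small sketch of the resulting argparse behavior; the model names below are placeholders, not a statement about which models exist in the repo.

```python
# Same argument layout as the new install.py; model names are illustrative.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("models", nargs='*', default=[],
                    help="Specify one or more models to install. If not set, install all models.")
parser.add_argument("--continue_on_fail", action="store_true")

# `python install.py` -> install all models
print(parser.parse_args([]).models)  # []
# `python install.py resnet50 BERT_pytorch --continue_on_fail` -> install the listed models
print(parser.parse_args(["resnet50", "BERT_pytorch", "--continue_on_fail"]).models)
```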

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+accelerate
 bs4
 patch
 py-cpuinfo

run_e2e.py

Lines changed: 4 additions & 2 deletions
@@ -9,13 +9,15 @@
 SUPPORT_DEVICE_LIST = ["cpu", "cuda"]

 def run(func) -> Dict[str, float]:
-    torch.cuda.synchronize()
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
     result = {}
     # Collect time_ns() instead of time() which does not provide better precision than 1
     # second according to https://docs.python.org/3/library/time.html#time.time.
     t0 = time.time_ns()
     func()
-    torch.cuda.synchronize()
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
     t2 = time.time_ns()
     result["latency_ms"] = (t2 - t0) / 1_000_000.0
     return result
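The guard matters for two reasons: CUDA kernels launch asynchronously, so without a synchronize before reading the clock only the launch cost is measured, and on CPU-only machines the old unconditional torch.cuda.synchronize() call would fail. A self-contained helper in the same spirit (the function name here is illustrative, not part of run_e2e.py):

```python
import time
import torch

def time_callable_ms(func) -> float:
    """Wall-clock latency of func() in milliseconds, safe on CPU-only hosts."""
    # Drain any pending GPU work so t0 marks a clean starting point.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t0 = time.time_ns()
    func()
    # Wait for the kernels func() launched before stopping the clock;
    # otherwise only the asynchronous launch overhead is timed.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return (time.time_ns() - t0) / 1_000_000.0
```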

run_microbenchmarks.py

Lines changed: 16 additions & 7 deletions
@@ -1,15 +1,24 @@
+import os
 import argparse
-from torchbenchmark.microbenchmarks.nvfuser import run_nvfuser_microbenchmarks
+import importlib
+from typing import List

+CURRENT_DIR = os.path.dirname(os.path.realpath(__file__))
+MICROBENCHMARKS_DIR = os.path.join(CURRENT_DIR, "torchbenchmark", "microbenchmarks")

-def run():
-    parser = argparse.ArgumentParser(description="Run nvfuser microbenchmarks")
-    parser.add_argument("--filter", nargs="*", default=[], help='List of benchmarks to test')
-    args, extra_args = parser.parse_known_args()
-    args = parser.parse_args()
+def list_microbenchmarks() -> List[str]:
+    return os.listdir(MICROBENCHMARKS_DIR)

-    run_nvfuser_microbenchmarks(args.filter, extra_args)
+def run():
+    parser = argparse.ArgumentParser(description="Run TorchBench microbenchmarks")
+    parser.add_argument("bm_name", choices=list_microbenchmarks(), help='name of the microbenchmark')
+    args, bm_args = parser.parse_known_args()

+    try:
+        microbenchmark = importlib.import_module(f"torchbenchmark.microbenchmarks.{args.bm_name}")
+        microbenchmark.run(bm_args)
+    except ImportError as e:
+        print(f"Failed to import microbenchmark module {args.bm_name}, error: {str(e)}")

 if __name__ == "__main__":
     run()
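The entry point is now generic: every directory under torchbenchmark/microbenchmarks becomes a selectable benchmark, and the imported module only has to expose a run(args) callable that receives the leftover command-line arguments. A minimal sketch of a conforming module under these assumptions; the package name `my_bench` and its flags are hypothetical.

```python
# torchbenchmark/microbenchmarks/my_bench/__init__.py  (hypothetical example)
# The dispatcher in run_microbenchmarks.py effectively does:
#   importlib.import_module("torchbenchmark.microbenchmarks.my_bench").run(bm_args)
# so the package only needs to expose a `run(args)` entry point.
import argparse
from typing import List

def run(args: List[str]) -> None:
    parser = argparse.ArgumentParser(description="my_bench microbenchmark")
    parser.add_argument("--repeat", type=int, default=10, help="number of iterations")
    opts = parser.parse_args(args)
    for _ in range(opts.repeat):
        pass  # the actual measurement loop would go here
```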

run_sweep.py

Lines changed: 3 additions & 1 deletion
@@ -52,6 +52,7 @@ class ModelTestResult:
     extra_args: List[str]
     status: str
     batch_size: Optional[int]
+    precision: str
     results: Dict[str, Any]

 def _list_model_paths(models: List[str]) -> List[str]:
@@ -80,7 +81,7 @@ def _validate_devices(devices: str) -> List[str]:

 def _run_model_test(model_path: pathlib.Path, test: str, device: str, jit: bool, batch_size: Optional[int], extra_args: List[str]) -> ModelTestResult:
     assert test == "train" or test == "eval", f"Test must be either 'train' or 'eval', but get {test}."
-    result = ModelTestResult(name=model_path.name, test=test, device=device, extra_args=extra_args, batch_size=None,
+    result = ModelTestResult(name=model_path.name, test=test, device=device, extra_args=extra_args, batch_size=None, precision="fp32",
                              status="OK", results={})
     # Run the benchmark test in a separate process
     print(f"Running model {model_path.name} ... ", end='', flush=True)
@@ -96,6 +97,7 @@ def _run_model_test(model_path: pathlib.Path, test: str, device: str, jit: bool,
     task.make_model_instance(test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args)
     # Check the batch size in the model matches the specified value
     result.batch_size = task.get_model_attribute(bs_name)
+    result.precision = task.get_model_attribute("dargs", "precision")
     if batch_size and (not result.batch_size == batch_size):
         raise ValueError(f"User specify batch size {batch_size}, but model {result.name} runs with batch size {result.batch_size}. Please report a bug.")
     result.results["latency_ms"] = run_one_step(task.invoke, device)
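Each sweep record now carries the model's precision next to its batch size, defaulting to "fp32" and overwritten with the value read from the model's dargs. A sketch of the resulting record shape, assuming the fields shown in this diff; the concrete values are made up for illustration.

```python
# Shape of one sweep record after this change; values are illustrative only.
from dataclasses import dataclass, field, asdict
from typing import Any, Dict, List, Optional

@dataclass
class ModelTestResult:
    name: str
    test: str
    device: str
    extra_args: List[str]
    status: str
    batch_size: Optional[int]
    precision: str
    results: Dict[str, Any] = field(default_factory=dict)

record = ModelTestResult(name="resnet50", test="eval", device="cuda", extra_args=[],
                         status="OK", batch_size=32, precision="fp32",
                         results={"latency_ms": 12.3})
print(asdict(record))
```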

scripts/install_cuda.sh

Lines changed: 5 additions & 5 deletions
@@ -18,14 +18,14 @@ chmod +x "${CUDA_INSTALLER}"
 rm -f "${CUDA_INSTALLER}"
 rm -f /usr/local/cuda && ln -s /usr/local/cuda-11.3 /usr/local/cuda

-# install CUDA 11.3 CuDNN 8.2.0
+# install CUDA 11.3 CuDNN 8.3.2
 # cuDNN download archive: https://developer.nvidia.com/rdp/cudnn-archive
 # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
 mkdir tmp_cudnn && cd tmp_cudnn
-wget -q https://developer.download.nvidia.com/compute/redist/cudnn/v8.2.0/cudnn-11.3-linux-x64-v8.2.0.53.tgz -O cudnn-8.2.tgz
-tar xf cudnn-8.2.tgz
-cp -a cuda/include/* /usr/local/cuda/include/
-cp -a cuda/lib64/* /usr/local/cuda/lib64/
+wget -q https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz -O cudnn-8.3.2.tar.xz
+tar xJf cudnn-8.3.2.tar.xz
+cp -a cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive/include/* /usr/local/cuda/include/
+cp -a cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive/lib/* /usr/local/cuda/lib64/
 cd ..
 rm -rf tmp_cudnn
 ldconfig

submodules/FAMBench

Submodule FAMBench added at a0f12ca

test.py

Lines changed: 19 additions & 15 deletions
@@ -14,6 +14,7 @@

 import torch
 from torchbenchmark import _list_model_paths, ModelTask, get_metadata_from_yaml
+from torchbenchmark.util.metadata_utils import skip_by_metadata


 # Some of the models have very heavyweight setup, so we have to set a very
@@ -24,14 +25,6 @@
 # unresponsive for 5 minutes the parent will presume it dead / incapacitated.)
 TIMEOUT = 300  # Seconds

-# Skip this list of unit tests. One reason may be that the original batch size
-# used in the paper is too large to fit on the CI's GPU.
-EXCLUDELIST = {("densenet121", "train", "cuda"),  # GPU train runs out of memory on CI.
-               ("densenet121", "train", "cpu"),  # CPU train runs for too long on CI.
-               ("densenet121", "example", "cuda"),  # GPU train runs out of memory on CI.
-               ("densenet121", "example", "cpu")}  # CPU train runs for too long on CI.
-
-
 class TestBenchmark(unittest.TestCase):

     def setUp(self):
@@ -51,27 +44,37 @@ def test_fx_profile(self):
         main(["--repeat=1", "--filter=pytorch_struct", "--device=cpu"])
         self.assertGreaterEqual(mock_save.call_count, 1)

+def _create_example_model_instance(task: ModelTask, device: str):
+    skip = False
+    try:
+        task.make_model_instance(test="eval", device=device, jit=False)
+    except NotImplementedError:
+        try:
+            task.make_model_instance(test="train", device=device, jit=False)
+        except NotImplementedError:
+            skip = True
+    finally:
+        if skip:
+            raise NotImplementedError(f"Model is not implemented on the device {device}")

 def _load_test(path, device):

     def example_fn(self):
         task = ModelTask(path, timeout=TIMEOUT)
         with task.watch_cuda_memory(skip=(device != "cuda"), assert_equal=self.assertEqual):
             try:
-                task.make_model_instance(test="eval", device=device, jit=False)
+                _create_example_model_instance(task, device)
                 task.check_example()
                 task.del_model_instance()
-
             except NotImplementedError:
-                self.skipTest('Method get_module is not implemented, skipping...')
+                self.skipTest(f'Method `get_module()` on {device} is not implemented, skipping...')

     def train_fn(self):
         metadata = get_metadata_from_yaml(path)
         task = ModelTask(path, timeout=TIMEOUT)
         with task.watch_cuda_memory(skip=(device != "cuda"), assert_equal=self.assertEqual):
             try:
                 task.make_model_instance(test="train", device=device, jit=False)
-                task.set_train()
                 task.invoke()
                 task.check_details_train(device=device, md=metadata)
                 task.del_model_instance()
@@ -84,8 +87,6 @@ def eval_fn(self):
         with task.watch_cuda_memory(skip=(device != "cuda"), assert_equal=self.assertEqual):
             try:
                 task.make_model_instance(test="eval", device=device, jit=False)
-
-                task.set_eval()
                 task.invoke()
                 task.check_details_eval(device=device, md=metadata)
                 task.check_eval_output()
@@ -104,10 +105,13 @@ def check_device_fn(self):
             self.skipTest(f'Method check_device on {device} is not implemented, skipping...')

     name = os.path.basename(path)
+    metadata = get_metadata_from_yaml(path)
     for fn, fn_name in zip([example_fn, train_fn, eval_fn, check_device_fn],
                            ["example", "train", "eval", "check_device"]):
+        # set exclude list based on metadata
         setattr(TestBenchmark, f'test_{name}_{fn_name}_{device}',
-                (unittest.skipIf((name, fn_name, device) in EXCLUDELIST, "This test is on the EXCLUDELIST")(fn)))
+                (unittest.skipIf(skip_by_metadata(test=fn_name, device=device,\
+                    jit=False, extra_args=[], metadata=metadata), "This test is skipped by its metadata")(fn)))


 def _load_tests():

test_bench.py

Lines changed: 8 additions & 4 deletions
@@ -15,9 +15,9 @@
 import pytest
 import time
 from components._impl.workers import subprocess_worker
-from torchbenchmark import _list_model_paths, ModelTask
+from torchbenchmark import _list_model_paths, ModelTask, get_metadata_from_yaml
 from torchbenchmark.util.machine_config import get_machine_state
-
+from torchbenchmark.util.metadata_utils import skip_by_metadata

 def pytest_generate_tests(metafunc):
     # This is where the list of models to test can be configured
@@ -48,12 +48,14 @@ class TestBenchNetwork:

     def test_train(self, model_path, device, compiler, benchmark):
         try:
+            if skip_by_metadata(test="train", device=device, jit=(compiler == 'jit'), \
+                extra_args=[], metadata=get_metadata_from_yaml(model_path)):
+                raise NotImplementedError("Test skipped by its metadata.")
             task = ModelTask(model_path)
             if not task.model_details.exists:
                 return  # Model is not supported.

             task.make_model_instance(test="train", device=device, jit=(compiler == 'jit'))
-            task.set_train()
             benchmark(task.invoke)
             benchmark.extra_info['machine_state'] = get_machine_state()

@@ -62,14 +64,16 @@ def test_train(self, model_path, device, compiler, benchmark):

     def test_eval(self, model_path, device, compiler, benchmark, pytestconfig):
         try:
+            if skip_by_metadata(test="eval", device=device, jit=(compiler == 'jit'), \
+                extra_args=[], metadata=get_metadata_from_yaml(model_path)):
+                raise NotImplementedError("Test skipped by its metadata.")
             task = ModelTask(model_path)
             if not task.model_details.exists:
                 return  # Model is not supported.

             task.make_model_instance(test="eval", device=device, jit=(compiler == 'jit'))

             with task.no_grad(disable_nograd=pytestconfig.getoption("disable_nograd")):
-                task.set_eval()
                 benchmark(task.invoke)
                 benchmark.extra_info['machine_state'] = get_machine_state()
                 if pytestconfig.getoption("check_opt_vs_noopt_jit"):
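Across test.py and test_bench.py the hard-coded EXCLUDELIST is replaced by skip decisions derived from each model's metadata via skip_by_metadata. A rough sketch of how such a predicate could be structured; the metadata schema shown here (a `not_implemented` list whose entries may pin `test`, `device`, or `jit`) is an assumption for illustration, not necessarily the actual torchbenchmark format.

```python
# Hypothetical sketch of a metadata-driven skip predicate. The real
# skip_by_metadata lives in torchbenchmark/util/metadata_utils.py and its
# metadata schema may differ from the one assumed below.
from typing import Any, Dict, List

def skip_by_metadata_sketch(test: str, device: str, jit: bool,
                            extra_args: List[str], metadata: Dict[str, Any]) -> bool:
    # A rule matches when every key it specifies (test, device, jit) equals the
    # current configuration; unspecified keys act as wildcards.
    for rule in metadata.get("not_implemented", []):
        if rule.get("test", test) == test and \
           rule.get("device", device) == device and \
           rule.get("jit", jit) == jit:
            return True
    return False

# Example: skip densenet121 training on CUDA (runs out of memory on CI).
md = {"not_implemented": [{"test": "train", "device": "cuda"}]}
assert skip_by_metadata_sketch("train", "cuda", False, [], md) is True
assert skip_by_metadata_sketch("eval", "cuda", False, [], md) is False
```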