diff --git a/mednist_tutorial.ipynb b/2d_classification/mednist_tutorial.ipynb similarity index 99% rename from mednist_tutorial.ipynb rename to 2d_classification/mednist_tutorial.ipynb index 5edba6ac70..d26558f41c 100644 --- a/mednist_tutorial.ipynb +++ b/2d_classification/mednist_tutorial.ipynb @@ -15,7 +15,7 @@ "* Train the model with a PyTorch program\n", "* Evaluate on test dataset\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/mednist_tutorial.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/2d_classification/mednist_tutorial.ipynb)" ] }, { @@ -683,7 +683,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/2d_segmentation/torch/unet_evaluation_array.py b/2d_segmentation/torch/unet_evaluation_array.py new file mode 100644 index 0000000000..cbd8c0da47 --- /dev/null +++ b/2d_segmentation/torch/unet_evaluation_array.py @@ -0,0 +1,84 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import sys +import tempfile +from glob import glob + +import torch +from PIL import Image +from torch.utils.data import DataLoader + +from monai import config +from monai.data import ArrayDataset, PNGSaver, create_test_image_2d +from monai.inferers import sliding_window_inference +from monai.metrics import DiceMetric +from monai.networks.nets import UNet +from monai.transforms import AddChannel, Compose, LoadImage, ScaleIntensity, ToTensor + + +def main(tempdir): + config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + print(f"generating synthetic data to {tempdir} (this may take a while)") + for i in range(5): + im, seg = create_test_image_2d(128, 128, num_seg_classes=1) + Image.fromarray(im.astype("uint8")).save(os.path.join(tempdir, f"img{i:d}.png")) + Image.fromarray(seg.astype("uint8")).save(os.path.join(tempdir, f"seg{i:d}.png")) + + images = sorted(glob(os.path.join(tempdir, "img*.png"))) + segs = sorted(glob(os.path.join(tempdir, "seg*.png"))) + + # define transforms for image and segmentation + imtrans = Compose([LoadImage(image_only=True), ScaleIntensity(), AddChannel(), ToTensor()]) + segtrans = Compose([LoadImage(image_only=True), AddChannel(), ToTensor()]) + val_ds = ArrayDataset(images, imtrans, segs, segtrans) + # sliding window inference for one image at every iteration + val_loader = DataLoader(val_ds, batch_size=1, num_workers=1, pin_memory=torch.cuda.is_available()) + dice_metric = DiceMetric(include_background=True, to_onehot_y=False, sigmoid=True, reduction="mean") + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = UNet( + dimensions=2, + in_channels=1, + out_channels=1, + channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + 
num_res_units=2, + ).to(device) + + model.load_state_dict(torch.load("best_metric_model_segmentation2d_array.pth")) + model.eval() + with torch.no_grad(): + metric_sum = 0.0 + metric_count = 0 + saver = PNGSaver(output_dir="./output") + for val_data in val_loader: + val_images, val_labels = val_data[0].to(device), val_data[1].to(device) + # define sliding window size and batch size for windows inference + roi_size = (96, 96) + sw_batch_size = 4 + val_outputs = sliding_window_inference(val_images, roi_size, sw_batch_size, model) + value = dice_metric(y_pred=val_outputs, y=val_labels) + metric_count += len(value) + metric_sum += value.item() * len(value) + val_outputs = val_outputs.sigmoid() >= 0.5 + saver.save_batch(val_outputs) + metric = metric_sum / metric_count + print("evaluation metric:", metric) + + +if __name__ == "__main__": + with tempfile.TemporaryDirectory() as tempdir: + main(tempdir) diff --git a/2d_segmentation/torch/unet_evaluation_dict.py b/2d_segmentation/torch/unet_evaluation_dict.py new file mode 100644 index 0000000000..f20152921c --- /dev/null +++ b/2d_segmentation/torch/unet_evaluation_dict.py @@ -0,0 +1,92 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import sys +import tempfile +from glob import glob + +import torch +from PIL import Image +from torch.utils.data import DataLoader + +import monai +from monai.data import PNGSaver, create_test_image_2d, list_data_collate +from monai.inferers import sliding_window_inference +from monai.metrics import DiceMetric +from monai.networks.nets import UNet +from monai.transforms import AddChanneld, Compose, LoadImaged, ScaleIntensityd, ToTensord + + +def main(tempdir): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + print(f"generating synthetic data to {tempdir} (this may take a while)") + for i in range(5): + im, seg = create_test_image_2d(128, 128, num_seg_classes=1) + Image.fromarray(im.astype("uint8")).save(os.path.join(tempdir, f"img{i:d}.png")) + Image.fromarray(seg.astype("uint8")).save(os.path.join(tempdir, f"seg{i:d}.png")) + + images = sorted(glob(os.path.join(tempdir, "img*.png"))) + segs = sorted(glob(os.path.join(tempdir, "seg*.png"))) + val_files = [{"img": img, "seg": seg} for img, seg in zip(images, segs)] + + # define transforms for image and segmentation + val_transforms = Compose( + [ + LoadImaged(keys=["img", "seg"]), + AddChanneld(keys=["img", "seg"]), + ScaleIntensityd(keys="img"), + ToTensord(keys=["img", "seg"]), + ] + ) + val_ds = monai.data.Dataset(data=val_files, transform=val_transforms) + # sliding window inference need to input 1 image in every iteration + val_loader = DataLoader(val_ds, batch_size=1, num_workers=4, collate_fn=list_data_collate) + dice_metric = DiceMetric(include_background=True, to_onehot_y=False, sigmoid=True, reduction="mean") + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = UNet( + dimensions=2, + in_channels=1, + out_channels=1, + 
channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + num_res_units=2, + ).to(device) + + model.load_state_dict(torch.load("best_metric_model_segmentation2d_dict.pth")) + + model.eval() + with torch.no_grad(): + metric_sum = 0.0 + metric_count = 0 + saver = PNGSaver(output_dir="./output") + for val_data in val_loader: + val_images, val_labels = val_data["img"].to(device), val_data["seg"].to(device) + # define sliding window size and batch size for windows inference + roi_size = (96, 96) + sw_batch_size = 4 + val_outputs = sliding_window_inference(val_images, roi_size, sw_batch_size, model) + value = dice_metric(y_pred=val_outputs, y=val_labels) + metric_count += len(value) + metric_sum += value.item() * len(value) + val_outputs = val_outputs.sigmoid() >= 0.5 + saver.save_batch(val_outputs) + metric = metric_sum / metric_count + print("evaluation metric:", metric) + + +if __name__ == "__main__": + with tempfile.TemporaryDirectory() as tempdir: + main(tempdir) diff --git a/2d_segmentation/torch/unet_training_array.py b/2d_segmentation/torch/unet_training_array.py new file mode 100644 index 0000000000..9249fd1aaf --- /dev/null +++ b/2d_segmentation/torch/unet_training_array.py @@ -0,0 +1,166 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
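Both evaluation scripts above run the trained 2D UNet through monai.inferers.sliding_window_inference with roi_size=(96, 96) and sw_batch_size=4, so each 128x128 test image is tiled into overlapping 96x96 windows, four windows are inferred per forward pass, and the window outputs are blended back to the full image size. A minimal sketch of that call, using a random tensor and an identity function as stand-ins for the real image and the trained UNet (both stand-ins are assumptions for illustration, not part of the scripts above):

import torch
from monai.inferers import sliding_window_inference

dummy_image = torch.rand(1, 1, 128, 128)      # stand-in for one loaded validation image
identity_net = lambda x: x                    # stand-in for the trained UNet

# 96x96 windows are inferred 4 at a time and stitched back to the 128x128 input size
output = sliding_window_inference(dummy_image, roi_size=(96, 96), sw_batch_size=4, predictor=identity_net)
print(output.shape)                           # torch.Size([1, 1, 128, 128])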
+ +import logging +import os +import sys +import tempfile +from glob import glob + +import torch +from PIL import Image +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +import monai +from monai.data import ArrayDataset, create_test_image_2d +from monai.inferers import sliding_window_inference +from monai.metrics import DiceMetric +from monai.transforms import AddChannel, Compose, LoadImage, RandRotate90, RandSpatialCrop, ScaleIntensity, ToTensor +from monai.visualize import plot_2d_or_3d_image + + +def main(tempdir): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + # create a temporary directory and 40 random image, mask pairs + print(f"generating synthetic data to {tempdir} (this may take a while)") + for i in range(40): + im, seg = create_test_image_2d(128, 128, num_seg_classes=1) + Image.fromarray(im.astype("uint8")).save(os.path.join(tempdir, f"img{i:d}.png")) + Image.fromarray(seg.astype("uint8")).save(os.path.join(tempdir, f"seg{i:d}.png")) + + images = sorted(glob(os.path.join(tempdir, "img*.png"))) + segs = sorted(glob(os.path.join(tempdir, "seg*.png"))) + train_files = [{"img": img, "seg": seg} for img, seg in zip(images[:20], segs[:20])] + val_files = [{"img": img, "seg": seg} for img, seg in zip(images[-20:], segs[-20:])] + + # define transforms for image and segmentation + train_imtrans = Compose( + [ + LoadImage(image_only=True), + ScaleIntensity(), + AddChannel(), + RandSpatialCrop((96, 96), random_size=False), + RandRotate90(prob=0.5, spatial_axes=(0, 1)), + ToTensor(), + ] + ) + train_segtrans = Compose( + [ + LoadImage(image_only=True), + AddChannel(), + RandSpatialCrop((96, 96), random_size=False), + RandRotate90(prob=0.5, spatial_axes=(0, 1)), + ToTensor(), + ] + ) + val_imtrans = Compose([LoadImage(image_only=True), ScaleIntensity(), AddChannel(), ToTensor()]) + val_segtrans = Compose([LoadImage(image_only=True), AddChannel(), ToTensor()]) + + # define array dataset, data loader + check_ds = ArrayDataset(images, train_imtrans, segs, train_segtrans) + check_loader = DataLoader(check_ds, batch_size=10, num_workers=2, pin_memory=torch.cuda.is_available()) + im, seg = monai.utils.misc.first(check_loader) + print(im.shape, seg.shape) + + # create a training data loader + train_ds = ArrayDataset(images[:20], train_imtrans, segs[:20], train_segtrans) + train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, num_workers=8, pin_memory=torch.cuda.is_available()) + # create a validation data loader + val_ds = ArrayDataset(images[-20:], val_imtrans, segs[-20:], val_segtrans) + val_loader = DataLoader(val_ds, batch_size=1, num_workers=4, pin_memory=torch.cuda.is_available()) + dice_metric = DiceMetric(include_background=True, to_onehot_y=False, sigmoid=True, reduction="mean") + + # create UNet, DiceLoss and Adam optimizer + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = monai.networks.nets.UNet( + dimensions=2, + in_channels=1, + out_channels=1, + channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + num_res_units=2, + ).to(device) + loss_function = monai.losses.DiceLoss(sigmoid=True) + optimizer = torch.optim.Adam(model.parameters(), 1e-3) + + # start a typical PyTorch training + val_interval = 2 + best_metric = -1 + best_metric_epoch = -1 + epoch_loss_values = list() + metric_values = list() + writer = SummaryWriter() + for epoch in range(10): + print("-" * 10) + print(f"epoch {epoch + 1}/{10}") + model.train() + epoch_loss = 0 + step = 0 + for 
batch_data in train_loader: + step += 1 + inputs, labels = batch_data[0].to(device), batch_data[1].to(device) + optimizer.zero_grad() + outputs = model(inputs) + loss = loss_function(outputs, labels) + loss.backward() + optimizer.step() + epoch_loss += loss.item() + epoch_len = len(train_ds) // train_loader.batch_size + print(f"{step}/{epoch_len}, train_loss: {loss.item():.4f}") + writer.add_scalar("train_loss", loss.item(), epoch_len * epoch + step) + epoch_loss /= step + epoch_loss_values.append(epoch_loss) + print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}") + + if (epoch + 1) % val_interval == 0: + model.eval() + with torch.no_grad(): + metric_sum = 0.0 + metric_count = 0 + val_images = None + val_labels = None + val_outputs = None + for val_data in val_loader: + val_images, val_labels = val_data[0].to(device), val_data[1].to(device) + roi_size = (96, 96) + sw_batch_size = 4 + val_outputs = sliding_window_inference(val_images, roi_size, sw_batch_size, model) + value = dice_metric(y_pred=val_outputs, y=val_labels) + metric_count += len(value) + metric_sum += value.item() * len(value) + metric = metric_sum / metric_count + metric_values.append(metric) + if metric > best_metric: + best_metric = metric + best_metric_epoch = epoch + 1 + torch.save(model.state_dict(), "best_metric_model_segmentation2d_array.pth") + print("saved new best metric model") + print( + "current epoch: {} current mean dice: {:.4f} best mean dice: {:.4f} at epoch {}".format( + epoch + 1, metric, best_metric, best_metric_epoch + ) + ) + writer.add_scalar("val_mean_dice", metric, epoch + 1) + # plot the last model output as GIF image in TensorBoard with the corresponding image and label + plot_2d_or_3d_image(val_images, epoch + 1, writer, index=0, tag="image") + plot_2d_or_3d_image(val_labels, epoch + 1, writer, index=0, tag="label") + plot_2d_or_3d_image(val_outputs, epoch + 1, writer, index=0, tag="output") + + print(f"train completed, best_metric: {best_metric:.4f} at epoch: {best_metric_epoch}") + writer.close() + + +if __name__ == "__main__": + with tempfile.TemporaryDirectory() as tempdir: + main(tempdir) diff --git a/2d_segmentation/torch/unet_training_dict.py b/2d_segmentation/torch/unet_training_dict.py new file mode 100644 index 0000000000..3945ba7809 --- /dev/null +++ b/2d_segmentation/torch/unet_training_dict.py @@ -0,0 +1,182 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
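In the training loop above, epoch_len * epoch + step simply converts (epoch, step-within-epoch) into a single monotonically increasing global step for the TensorBoard writer. With the numbers used by this script (20 training pairs, batch size 4) the arithmetic works out as follows; the snippet is only an illustration of the indexing, not output from the script:

num_train_images = 20
batch_size = 4
epoch_len = num_train_images // batch_size    # 5 iterations per epoch

for epoch in range(2):
    for step in range(1, epoch_len + 1):
        global_step = epoch_len * epoch + step
        print(epoch + 1, step, global_step)   # global step runs 1..5 in epoch 1, 6..10 in epoch 2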
+ +import logging +import os +import sys +import tempfile +from glob import glob + +import torch +from PIL import Image +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +import monai +from monai.data import create_test_image_2d, list_data_collate +from monai.inferers import sliding_window_inference +from monai.metrics import DiceMetric +from monai.transforms import ( + AddChanneld, + Compose, + LoadImaged, + RandCropByPosNegLabeld, + RandRotate90d, + ScaleIntensityd, + ToTensord, +) +from monai.visualize import plot_2d_or_3d_image + + +def main(tempdir): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + # create a temporary directory and 40 random image, mask pairs + print(f"generating synthetic data to {tempdir} (this may take a while)") + for i in range(40): + im, seg = create_test_image_2d(128, 128, num_seg_classes=1) + Image.fromarray(im.astype("uint8")).save(os.path.join(tempdir, f"img{i:d}.png")) + Image.fromarray(seg.astype("uint8")).save(os.path.join(tempdir, f"seg{i:d}.png")) + + images = sorted(glob(os.path.join(tempdir, "img*.png"))) + segs = sorted(glob(os.path.join(tempdir, "seg*.png"))) + train_files = [{"img": img, "seg": seg} for img, seg in zip(images[:20], segs[:20])] + val_files = [{"img": img, "seg": seg} for img, seg in zip(images[-20:], segs[-20:])] + + # define transforms for image and segmentation + train_transforms = Compose( + [ + LoadImaged(keys=["img", "seg"]), + AddChanneld(keys=["img", "seg"]), + ScaleIntensityd(keys="img"), + RandCropByPosNegLabeld( + keys=["img", "seg"], label_key="seg", spatial_size=[96, 96], pos=1, neg=1, num_samples=4 + ), + RandRotate90d(keys=["img", "seg"], prob=0.5, spatial_axes=[0, 1]), + ToTensord(keys=["img", "seg"]), + ] + ) + val_transforms = Compose( + [ + LoadImaged(keys=["img", "seg"]), + AddChanneld(keys=["img", "seg"]), + ScaleIntensityd(keys="img"), + ToTensord(keys=["img", "seg"]), + ] + ) + + # define dataset, data loader + check_ds = monai.data.Dataset(data=train_files, transform=train_transforms) + # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training + check_loader = DataLoader(check_ds, batch_size=2, num_workers=4, collate_fn=list_data_collate) + check_data = monai.utils.misc.first(check_loader) + print(check_data["img"].shape, check_data["seg"].shape) + + # create a training data loader + train_ds = monai.data.Dataset(data=train_files, transform=train_transforms) + # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training + train_loader = DataLoader( + train_ds, + batch_size=2, + shuffle=True, + num_workers=4, + collate_fn=list_data_collate, + pin_memory=torch.cuda.is_available(), + ) + # create a validation data loader + val_ds = monai.data.Dataset(data=val_files, transform=val_transforms) + val_loader = DataLoader(val_ds, batch_size=1, num_workers=4, collate_fn=list_data_collate) + dice_metric = DiceMetric(include_background=True, to_onehot_y=False, sigmoid=True, reduction="mean") + + # create UNet, DiceLoss and Adam optimizer + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = monai.networks.nets.UNet( + dimensions=2, + in_channels=1, + out_channels=1, + channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + num_res_units=2, + ).to(device) + loss_function = monai.losses.DiceLoss(sigmoid=True) + optimizer = torch.optim.Adam(model.parameters(), 1e-3) + + # start a typical PyTorch 
training + val_interval = 2 + best_metric = -1 + best_metric_epoch = -1 + epoch_loss_values = list() + metric_values = list() + writer = SummaryWriter() + for epoch in range(10): + print("-" * 10) + print(f"epoch {epoch + 1}/{10}") + model.train() + epoch_loss = 0 + step = 0 + for batch_data in train_loader: + step += 1 + inputs, labels = batch_data["img"].to(device), batch_data["seg"].to(device) + optimizer.zero_grad() + outputs = model(inputs) + loss = loss_function(outputs, labels) + loss.backward() + optimizer.step() + epoch_loss += loss.item() + epoch_len = len(train_ds) // train_loader.batch_size + print(f"{step}/{epoch_len}, train_loss: {loss.item():.4f}") + writer.add_scalar("train_loss", loss.item(), epoch_len * epoch + step) + epoch_loss /= step + epoch_loss_values.append(epoch_loss) + print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}") + + if (epoch + 1) % val_interval == 0: + model.eval() + with torch.no_grad(): + metric_sum = 0.0 + metric_count = 0 + val_images = None + val_labels = None + val_outputs = None + for val_data in val_loader: + val_images, val_labels = val_data["img"].to(device), val_data["seg"].to(device) + roi_size = (96, 96) + sw_batch_size = 4 + val_outputs = sliding_window_inference(val_images, roi_size, sw_batch_size, model) + value = dice_metric(y_pred=val_outputs, y=val_labels) + metric_count += len(value) + metric_sum += value.item() * len(value) + metric = metric_sum / metric_count + metric_values.append(metric) + if metric > best_metric: + best_metric = metric + best_metric_epoch = epoch + 1 + torch.save(model.state_dict(), "best_metric_model_segmentation2d_dict.pth") + print("saved new best metric model") + print( + "current epoch: {} current mean dice: {:.4f} best mean dice: {:.4f} at epoch {}".format( + epoch + 1, metric, best_metric, best_metric_epoch + ) + ) + writer.add_scalar("val_mean_dice", metric, epoch + 1) + # plot the last model output as GIF image in TensorBoard with the corresponding image and label + plot_2d_or_3d_image(val_images, epoch + 1, writer, index=0, tag="image") + plot_2d_or_3d_image(val_labels, epoch + 1, writer, index=0, tag="label") + plot_2d_or_3d_image(val_outputs, epoch + 1, writer, index=0, tag="output") + + print(f"train completed, best_metric: {best_metric:.4f} at epoch: {best_metric_epoch}") + writer.close() + + +if __name__ == "__main__": + with tempfile.TemporaryDirectory() as tempdir: + main(tempdir) diff --git a/3d_classification/ignite/densenet_evaluation_array.py b/3d_classification/ignite/densenet_evaluation_array.py new file mode 100644 index 0000000000..9692fecfda --- /dev/null +++ b/3d_classification/ignite/densenet_evaluation_array.py @@ -0,0 +1,94 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
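The dict-based training script above relies on RandCropByPosNegLabeld producing num_samples=4 crops per loaded image and on list_data_collate flattening those per-sample lists, which is why a DataLoader batch_size of 2 yields 2 x 4 = 8 patches of 96x96 per iteration. A self-contained sketch of that behaviour, with synthetic arrays standing in for the PNG pairs (the random data here is an assumption purely for illustration):

import numpy as np
from torch.utils.data import DataLoader
import monai
from monai.data import list_data_collate
from monai.transforms import AddChanneld, Compose, RandCropByPosNegLabeld, ToTensord

# two synthetic image/label pairs standing in for the loaded PNG files
data = [
    {"img": np.random.rand(128, 128).astype(np.float32),
     "seg": (np.random.rand(128, 128) > 0.5).astype(np.float32)}
    for _ in range(2)
]
transforms = Compose([
    AddChanneld(keys=["img", "seg"]),
    # each input sample is expanded into 4 random 96x96 crops
    RandCropByPosNegLabeld(keys=["img", "seg"], label_key="seg",
                           spatial_size=[96, 96], pos=1, neg=1, num_samples=4),
    ToTensord(keys=["img", "seg"]),
])
ds = monai.data.Dataset(data=data, transform=transforms)
# list_data_collate flattens the per-sample crop lists into one batch
loader = DataLoader(ds, batch_size=2, collate_fn=list_data_collate)
batch = monai.utils.misc.first(loader)
print(batch["img"].shape)    # torch.Size([8, 1, 96, 96]): 2 loaded images x 4 crops each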
+ +import logging +import os +import sys + +import numpy as np +import torch +from ignite.engine import _prepare_batch, create_supervised_evaluator +from ignite.metrics import Accuracy +from torch.utils.data import DataLoader + +import monai +from monai.data import NiftiDataset +from monai.handlers import CheckpointLoader, ClassificationSaver, StatsHandler +from monai.transforms import AddChannel, Compose, Resize, ScaleIntensity, ToTensor + + +def main(): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + # IXI dataset as a demo, downloadable from https://brain-development.org/ixi-dataset/ + images = [ + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI607-Guys-1097-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI175-HH-1570-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI385-HH-2078-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI344-Guys-0905-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI409-Guys-0960-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI584-Guys-1129-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI253-HH-1694-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI092-HH-1436-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI574-IOP-1156-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI585-Guys-1130-T1.nii.gz"]), + ] + + # 2 binary labels for gender classification: man and woman + labels = np.array([0, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=np.int64) + + # define transforms for image + val_transforms = Compose([ScaleIntensity(), AddChannel(), Resize((96, 96, 96)), ToTensor()]) + # define nifti dataset + val_ds = NiftiDataset(image_files=images, labels=labels, transform=val_transforms, image_only=False) + # create DenseNet121 + net = monai.networks.nets.densenet.densenet121(spatial_dims=3, in_channels=1, out_channels=2) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + metric_name = "Accuracy" + # add evaluation metric to the evaluator engine + val_metrics = {metric_name: Accuracy()} + + def prepare_batch(batch, device=None, non_blocking=False): + return _prepare_batch((batch[0], batch[1]), device, non_blocking) + + # Ignite evaluator expects batch=(img, label) and returns output=(y_pred, y) at every iteration, + # user can add output_transform to return other values + evaluator = create_supervised_evaluator(net, val_metrics, device, True, prepare_batch=prepare_batch) + + # add stats event handler to print validation stats via evaluator + val_stats_handler = StatsHandler( + name="evaluator", + output_transform=lambda x: None, # no need to print loss value, so disable per iteration output + ) + val_stats_handler.attach(evaluator) + + # for the array data format, assume the 3rd item of batch data is the meta_data + prediction_saver = ClassificationSaver( + output_dir="tempdir", + batch_transform=lambda batch: batch[2], + output_transform=lambda output: output[0].argmax(1), + ) + prediction_saver.attach(evaluator) + + # the model was trained by "densenet_training_array" example + CheckpointLoader(load_path="./runs_array/net_checkpoint_20.pth", load_dict={"net": net}).attach(evaluator) + + # create a validation data loader + val_loader = DataLoader(val_ds, batch_size=2, num_workers=4, 
pin_memory=torch.cuda.is_available()) + + state = evaluator.run(val_loader) + print(state) + + +if __name__ == "__main__": + main() diff --git a/3d_classification/ignite/densenet_evaluation_dict.py b/3d_classification/ignite/densenet_evaluation_dict.py new file mode 100644 index 0000000000..69b917be2d --- /dev/null +++ b/3d_classification/ignite/densenet_evaluation_dict.py @@ -0,0 +1,102 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import sys + +import numpy as np +import torch +from ignite.engine import _prepare_batch, create_supervised_evaluator +from ignite.metrics import Accuracy +from torch.utils.data import DataLoader + +import monai +from monai.handlers import CheckpointLoader, ClassificationSaver, StatsHandler +from monai.transforms import AddChanneld, Compose, LoadNiftid, Resized, ScaleIntensityd, ToTensord + + +def main(): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + # IXI dataset as a demo, downloadable from https://brain-development.org/ixi-dataset/ + images = [ + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI607-Guys-1097-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI175-HH-1570-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI385-HH-2078-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI344-Guys-0905-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI409-Guys-0960-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI584-Guys-1129-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI253-HH-1694-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI092-HH-1436-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI574-IOP-1156-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI585-Guys-1130-T1.nii.gz"]), + ] + + # 2 binary labels for gender classification: man and woman + labels = np.array([0, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=np.int64) + val_files = [{"img": img, "label": label} for img, label in zip(images, labels)] + + # define transforms for image + val_transforms = Compose( + [ + LoadNiftid(keys=["img"]), + AddChanneld(keys=["img"]), + ScaleIntensityd(keys=["img"]), + Resized(keys=["img"], spatial_size=(96, 96, 96)), + ToTensord(keys=["img"]), + ] + ) + + # create DenseNet121 + net = monai.networks.nets.densenet.densenet121(spatial_dims=3, in_channels=1, out_channels=2) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + def prepare_batch(batch, device=None, non_blocking=False): + return _prepare_batch((batch["img"], batch["label"]), device, non_blocking) + + metric_name = "Accuracy" + # add evaluation metric to the evaluator engine + val_metrics = {metric_name: Accuracy()} + # Ignite evaluator expects batch=(img, label) and returns output=(y_pred, y) 
at every iteration, + # user can add output_transform to return other values + evaluator = create_supervised_evaluator(net, val_metrics, device, True, prepare_batch=prepare_batch) + + # add stats event handler to print validation stats via evaluator + val_stats_handler = StatsHandler( + name="evaluator", + output_transform=lambda x: None, # no need to print loss value, so disable per iteration output + ) + val_stats_handler.attach(evaluator) + + # for the array data format, assume the 3rd item of batch data is the meta_data + prediction_saver = ClassificationSaver( + output_dir="tempdir", + name="evaluator", + batch_transform=lambda batch: batch["img_meta_dict"], + output_transform=lambda output: output[0].argmax(1), + ) + prediction_saver.attach(evaluator) + + # the model was trained by "densenet_training_dict" example + CheckpointLoader(load_path="./runs_dict/net_checkpoint_20.pth", load_dict={"net": net}).attach(evaluator) + + # create a validation data loader + val_ds = monai.data.Dataset(data=val_files, transform=val_transforms) + val_loader = DataLoader(val_ds, batch_size=2, num_workers=4, pin_memory=torch.cuda.is_available()) + + state = evaluator.run(val_loader) + print(state) + + +if __name__ == "__main__": + main() diff --git a/3d_classification/ignite/densenet_training_array.py b/3d_classification/ignite/densenet_training_array.py new file mode 100644 index 0000000000..a57f28f0b8 --- /dev/null +++ b/3d_classification/ignite/densenet_training_array.py @@ -0,0 +1,144 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
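In both Ignite evaluation scripts above, create_supervised_evaluator stores (y_pred, y) as engine.state.output, so the output_transform handed to ClassificationSaver, lambda output: output[0].argmax(1), reduces the raw class scores to one predicted label per scan before saving. A tiny illustration of that reduction on made-up scores (the tensors below are invented for the example):

import torch

y_pred = torch.tensor([[2.3, -1.1],     # class scores for 2 scans, 2 classes (output[0])
                       [0.2,  0.9]])
y = torch.tensor([0, 1])                # ground-truth labels (output[1])
output = (y_pred, y)

predicted_class = output[0].argmax(1)   # same transform as passed to ClassificationSaver
print(predicted_class)                  # tensor([0, 1])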
+ +import logging +import os +import sys + +import numpy as np +import torch +from ignite.engine import Events, create_supervised_evaluator, create_supervised_trainer +from ignite.handlers import EarlyStopping, ModelCheckpoint +from ignite.metrics import Accuracy +from torch.utils.data import DataLoader + +import monai +from monai.data import NiftiDataset +from monai.handlers import StatsHandler, TensorBoardStatsHandler, stopping_fn_from_metric +from monai.transforms import AddChannel, Compose, RandRotate90, Resize, ScaleIntensity, ToTensor + + +def main(): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + # IXI dataset as a demo, downloadable from https://brain-development.org/ixi-dataset/ + images = [ + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI314-IOP-0889-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI249-Guys-1072-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI609-HH-2600-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI173-HH-1590-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI020-Guys-0700-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI342-Guys-0909-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI134-Guys-0780-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI577-HH-2661-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI066-Guys-0731-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI130-HH-1528-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI607-Guys-1097-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI175-HH-1570-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI385-HH-2078-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI344-Guys-0905-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI409-Guys-0960-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI584-Guys-1129-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI253-HH-1694-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI092-HH-1436-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI574-IOP-1156-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI585-Guys-1130-T1.nii.gz"]), + ] + + # 2 binary labels for gender classification: man and woman + labels = np.array([0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=np.int64) + + # define transforms + train_transforms = Compose([ScaleIntensity(), AddChannel(), Resize((96, 96, 96)), RandRotate90(), ToTensor()]) + val_transforms = Compose([ScaleIntensity(), AddChannel(), Resize((96, 96, 96)), ToTensor()]) + + # define nifti dataset, data loader + check_ds = NiftiDataset(image_files=images, labels=labels, transform=train_transforms) + check_loader = DataLoader(check_ds, batch_size=2, num_workers=2, pin_memory=torch.cuda.is_available()) + im, label = monai.utils.misc.first(check_loader) + print(type(im), im.shape, label) + + # create DenseNet121, CrossEntropyLoss and Adam optimizer + net = monai.networks.nets.densenet.densenet121(spatial_dims=3, in_channels=1, out_channels=2) + loss = 
torch.nn.CrossEntropyLoss() + lr = 1e-5 + opt = torch.optim.Adam(net.parameters(), lr) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Ignite trainer expects batch=(img, label) and returns output=loss at every iteration, + # user can add output_transform to return other values, like: y_pred, y, etc. + trainer = create_supervised_trainer(net, opt, loss, device, False) + + # adding checkpoint handler to save models (network params and optimizer stats) during training + checkpoint_handler = ModelCheckpoint("./runs_array/", "net", n_saved=10, require_empty=False) + trainer.add_event_handler( + event_name=Events.EPOCH_COMPLETED, handler=checkpoint_handler, to_save={"net": net, "opt": opt} + ) + + # StatsHandler prints loss at every iteration and print metrics at every epoch, + # we don't set metrics for trainer here, so just print loss, user can also customize print functions + # and can use output_transform to convert engine.state.output if it's not loss value + train_stats_handler = StatsHandler(name="trainer") + train_stats_handler.attach(trainer) + + # TensorBoardStatsHandler plots loss at every iteration and plots metrics at every epoch, same as StatsHandler + train_tensorboard_stats_handler = TensorBoardStatsHandler() + train_tensorboard_stats_handler.attach(trainer) + + # set parameters for validation + validation_every_n_epochs = 1 + + metric_name = "Accuracy" + # add evaluation metric to the evaluator engine + val_metrics = {metric_name: Accuracy()} + # Ignite evaluator expects batch=(img, label) and returns output=(y_pred, y) at every iteration, + # user can add output_transform to return other values + evaluator = create_supervised_evaluator(net, val_metrics, device, True) + + # add stats event handler to print validation stats via evaluator + val_stats_handler = StatsHandler( + name="evaluator", + output_transform=lambda x: None, # no need to print loss value, so disable per iteration output + global_epoch_transform=lambda x: trainer.state.epoch, + ) # fetch global epoch number from trainer + val_stats_handler.attach(evaluator) + + # add handler to record metrics to TensorBoard at every epoch + val_tensorboard_stats_handler = TensorBoardStatsHandler( + output_transform=lambda x: None, # no need to plot loss value, so disable per iteration output + global_epoch_transform=lambda x: trainer.state.epoch, + ) # fetch global epoch number from trainer + val_tensorboard_stats_handler.attach(evaluator) + + # add early stopping handler to evaluator + early_stopper = EarlyStopping(patience=4, score_function=stopping_fn_from_metric(metric_name), trainer=trainer) + evaluator.add_event_handler(event_name=Events.EPOCH_COMPLETED, handler=early_stopper) + + # create a validation data loader + val_ds = NiftiDataset(image_files=images[-10:], labels=labels[-10:], transform=val_transforms) + val_loader = DataLoader(val_ds, batch_size=2, num_workers=2, pin_memory=torch.cuda.is_available()) + + @trainer.on(Events.EPOCH_COMPLETED(every=validation_every_n_epochs)) + def run_validation(engine): + evaluator.run(val_loader) + + # create a training data loader + train_ds = NiftiDataset(image_files=images[:10], labels=labels[:10], transform=train_transforms) + train_loader = DataLoader(train_ds, batch_size=2, shuffle=True, num_workers=2, pin_memory=torch.cuda.is_available()) + + train_epochs = 30 + state = trainer.run(train_loader, train_epochs) + print(state) + + +if __name__ == "__main__": + main() diff --git a/3d_classification/ignite/densenet_training_dict.py 
b/3d_classification/ignite/densenet_training_dict.py new file mode 100644 index 0000000000..e116542386 --- /dev/null +++ b/3d_classification/ignite/densenet_training_dict.py @@ -0,0 +1,166 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import sys + +import numpy as np +import torch +from ignite.engine import Events, _prepare_batch, create_supervised_evaluator, create_supervised_trainer +from ignite.handlers import EarlyStopping, ModelCheckpoint +from ignite.metrics import Accuracy +from torch.utils.data import DataLoader + +import monai +from monai.handlers import ROCAUC, StatsHandler, TensorBoardStatsHandler, stopping_fn_from_metric +from monai.transforms import AddChanneld, Compose, LoadNiftid, RandRotate90d, Resized, ScaleIntensityd, ToTensord + + +def main(): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + # IXI dataset as a demo, downloadable from https://brain-development.org/ixi-dataset/ + images = [ + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI314-IOP-0889-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI249-Guys-1072-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI609-HH-2600-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI173-HH-1590-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI020-Guys-0700-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI342-Guys-0909-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI134-Guys-0780-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI577-HH-2661-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI066-Guys-0731-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI130-HH-1528-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI607-Guys-1097-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI175-HH-1570-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI385-HH-2078-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI344-Guys-0905-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI409-Guys-0960-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI584-Guys-1129-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI253-HH-1694-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI092-HH-1436-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI574-IOP-1156-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI585-Guys-1130-T1.nii.gz"]), + ] + + # 2 binary labels for gender classification: man and woman + labels = np.array([0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 
0, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=np.int64) + train_files = [{"img": img, "label": label} for img, label in zip(images[:10], labels[:10])] + val_files = [{"img": img, "label": label} for img, label in zip(images[-10:], labels[-10:])] + + # define transforms for image + train_transforms = Compose( + [ + LoadNiftid(keys=["img"]), + AddChanneld(keys=["img"]), + ScaleIntensityd(keys=["img"]), + Resized(keys=["img"], spatial_size=(96, 96, 96)), + RandRotate90d(keys=["img"], prob=0.8, spatial_axes=[0, 2]), + ToTensord(keys=["img"]), + ] + ) + val_transforms = Compose( + [ + LoadNiftid(keys=["img"]), + AddChanneld(keys=["img"]), + ScaleIntensityd(keys=["img"]), + Resized(keys=["img"], spatial_size=(96, 96, 96)), + ToTensord(keys=["img"]), + ] + ) + + # define dataset, data loader + check_ds = monai.data.Dataset(data=train_files, transform=train_transforms) + check_loader = DataLoader(check_ds, batch_size=2, num_workers=4, pin_memory=torch.cuda.is_available()) + check_data = monai.utils.misc.first(check_loader) + print(check_data["img"].shape, check_data["label"]) + + # create DenseNet121, CrossEntropyLoss and Adam optimizer + net = monai.networks.nets.densenet.densenet121(spatial_dims=3, in_channels=1, out_channels=2) + loss = torch.nn.CrossEntropyLoss() + lr = 1e-5 + opt = torch.optim.Adam(net.parameters(), lr) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Ignite trainer expects batch=(img, label) and returns output=loss at every iteration, + # user can add output_transform to return other values, like: y_pred, y, etc. + def prepare_batch(batch, device=None, non_blocking=False): + + return _prepare_batch((batch["img"], batch["label"]), device, non_blocking) + + trainer = create_supervised_trainer(net, opt, loss, device, False, prepare_batch=prepare_batch) + + # adding checkpoint handler to save models (network params and optimizer stats) during training + checkpoint_handler = ModelCheckpoint("./runs_dict/", "net", n_saved=10, require_empty=False) + trainer.add_event_handler( + event_name=Events.EPOCH_COMPLETED, handler=checkpoint_handler, to_save={"net": net, "opt": opt} + ) + + # StatsHandler prints loss at every iteration and print metrics at every epoch, + # we don't set metrics for trainer here, so just print loss, user can also customize print functions + # and can use output_transform to convert engine.state.output if it's not loss value + train_stats_handler = StatsHandler(name="trainer") + train_stats_handler.attach(trainer) + + # TensorBoardStatsHandler plots loss at every iteration and plots metrics at every epoch, same as StatsHandler + train_tensorboard_stats_handler = TensorBoardStatsHandler() + train_tensorboard_stats_handler.attach(trainer) + + # set parameters for validation + validation_every_n_epochs = 1 + + metric_name = "Accuracy" + # add evaluation metric to the evaluator engine + val_metrics = {metric_name: Accuracy(), "AUC": ROCAUC(to_onehot_y=True, softmax=True)} + # Ignite evaluator expects batch=(img, label) and returns output=(y_pred, y) at every iteration, + # user can add output_transform to return other values + evaluator = create_supervised_evaluator(net, val_metrics, device, True, prepare_batch=prepare_batch) + + # add stats event handler to print validation stats via evaluator + val_stats_handler = StatsHandler( + name="evaluator", + output_transform=lambda x: None, # no need to print loss value, so disable per iteration output + global_epoch_transform=lambda x: trainer.state.epoch, + ) # fetch global epoch number from trainer + 
val_stats_handler.attach(evaluator) + + # add handler to record metrics to TensorBoard at every epoch + val_tensorboard_stats_handler = TensorBoardStatsHandler( + output_transform=lambda x: None, # no need to plot loss value, so disable per iteration output + global_epoch_transform=lambda x: trainer.state.epoch, + ) # fetch global epoch number from trainer + val_tensorboard_stats_handler.attach(evaluator) + + # add early stopping handler to evaluator + early_stopper = EarlyStopping(patience=4, score_function=stopping_fn_from_metric(metric_name), trainer=trainer) + evaluator.add_event_handler(event_name=Events.EPOCH_COMPLETED, handler=early_stopper) + + # create a validation data loader + val_ds = monai.data.Dataset(data=val_files, transform=val_transforms) + val_loader = DataLoader(val_ds, batch_size=2, num_workers=4, pin_memory=torch.cuda.is_available()) + + @trainer.on(Events.EPOCH_COMPLETED(every=validation_every_n_epochs)) + def run_validation(engine): + evaluator.run(val_loader) + + # create a training data loader + train_ds = monai.data.Dataset(data=train_files, transform=train_transforms) + train_loader = DataLoader(train_ds, batch_size=2, shuffle=True, num_workers=4, pin_memory=torch.cuda.is_available()) + + train_epochs = 30 + state = trainer.run(train_loader, train_epochs) + print(state) + + +if __name__ == "__main__": + main() diff --git a/3d_classification/torch/densenet_evaluation_array.py b/3d_classification/torch/densenet_evaluation_array.py new file mode 100644 index 0000000000..43428ba4d5 --- /dev/null +++ b/3d_classification/torch/densenet_evaluation_array.py @@ -0,0 +1,77 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
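The early stopping wiring above hinges on Ignite's EarlyStopping calling a score_function(engine) after each validation run and halting training once the returned score has not improved for patience=4 evaluations. monai.handlers.stopping_fn_from_metric("Accuracy") builds such a function by reading that metric from the evaluator's state; a rough, self-contained sketch of the idea (the SimpleNamespace engine is a stand-in, not part of the script):

from types import SimpleNamespace

# stand-in for the evaluator after a validation pass has populated its metrics
fake_engine = SimpleNamespace(state=SimpleNamespace(metrics={"Accuracy": 0.85}))

def stopping_fn(engine):
    # higher is better: EarlyStopping keeps training while this value keeps improving
    return engine.state.metrics["Accuracy"]

print(stopping_fn(fake_engine))    # 0.85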
+ +import logging +import os +import sys + +import numpy as np +import torch +from torch.utils.data import DataLoader + +import monai +from monai.data import CSVSaver, NiftiDataset +from monai.transforms import AddChannel, Compose, Resize, ScaleIntensity, ToTensor + + +def main(): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + # IXI dataset as a demo, downloadable from https://brain-development.org/ixi-dataset/ + images = [ + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI607-Guys-1097-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI175-HH-1570-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI385-HH-2078-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI344-Guys-0905-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI409-Guys-0960-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI584-Guys-1129-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI253-HH-1694-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI092-HH-1436-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI574-IOP-1156-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI585-Guys-1130-T1.nii.gz"]), + ] + + # 2 binary labels for gender classification: man and woman + labels = np.array([0, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=np.int64) + + # Define transforms for image + val_transforms = Compose([ScaleIntensity(), AddChannel(), Resize((96, 96, 96)), ToTensor()]) + + # Define nifti dataset + val_ds = NiftiDataset(image_files=images, labels=labels, transform=val_transforms, image_only=False) + # create a validation data loader + val_loader = DataLoader(val_ds, batch_size=2, num_workers=4, pin_memory=torch.cuda.is_available()) + + # Create DenseNet121 + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = monai.networks.nets.densenet.densenet121(spatial_dims=3, in_channels=1, out_channels=2).to(device) + + model.load_state_dict(torch.load("best_metric_model_classification3d_array.pth")) + model.eval() + with torch.no_grad(): + num_correct = 0.0 + metric_count = 0 + saver = CSVSaver(output_dir="./output") + for val_data in val_loader: + val_images, val_labels = val_data[0].to(device), val_data[1].to(device) + val_outputs = model(val_images).argmax(dim=1) + value = torch.eq(val_outputs, val_labels) + metric_count += len(value) + num_correct += value.sum().item() + saver.save_batch(val_outputs, val_data[2]) + metric = num_correct / metric_count + print("evaluation metric:", metric) + saver.finalize() + + +if __name__ == "__main__": + main() diff --git a/3d_classification/torch/densenet_evaluation_dict.py b/3d_classification/torch/densenet_evaluation_dict.py new file mode 100644 index 0000000000..0d6b2420fb --- /dev/null +++ b/3d_classification/torch/densenet_evaluation_dict.py @@ -0,0 +1,85 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import sys + +import numpy as np +import torch +from torch.utils.data import DataLoader + +import monai +from monai.data import CSVSaver +from monai.transforms import AddChanneld, Compose, LoadNiftid, Resized, ScaleIntensityd, ToTensord + + +def main(): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + # IXI dataset as a demo, downloadable from https://brain-development.org/ixi-dataset/ + images = [ + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI607-Guys-1097-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI175-HH-1570-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI385-HH-2078-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI344-Guys-0905-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI409-Guys-0960-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI584-Guys-1129-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI253-HH-1694-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI092-HH-1436-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI574-IOP-1156-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI585-Guys-1130-T1.nii.gz"]), + ] + + # 2 binary labels for gender classification: man and woman + labels = np.array([0, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=np.int64) + val_files = [{"img": img, "label": label} for img, label in zip(images, labels)] + + # Define transforms for image + val_transforms = Compose( + [ + LoadNiftid(keys=["img"]), + AddChanneld(keys=["img"]), + ScaleIntensityd(keys=["img"]), + Resized(keys=["img"], spatial_size=(96, 96, 96)), + ToTensord(keys=["img"]), + ] + ) + + # create a validation data loader + val_ds = monai.data.Dataset(data=val_files, transform=val_transforms) + val_loader = DataLoader(val_ds, batch_size=2, num_workers=4, pin_memory=torch.cuda.is_available()) + + # Create DenseNet121 + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = monai.networks.nets.densenet.densenet121(spatial_dims=3, in_channels=1, out_channels=2).to(device) + + model.load_state_dict(torch.load("best_metric_model_classification3d_dict.pth")) + model.eval() + with torch.no_grad(): + num_correct = 0.0 + metric_count = 0 + saver = CSVSaver(output_dir="./output") + for val_data in val_loader: + val_images, val_labels = val_data["img"].to(device), val_data["label"].to(device) + val_outputs = model(val_images).argmax(dim=1) + value = torch.eq(val_outputs, val_labels) + metric_count += len(value) + num_correct += value.sum().item() + saver.save_batch(val_outputs, val_data["img_meta_dict"]) + metric = num_correct / metric_count + print("evaluation metric:", metric) + saver.finalize() + + +if __name__ == "__main__": + main() diff --git a/3d_classification/torch/densenet_training_array.py b/3d_classification/torch/densenet_training_array.py new file mode 100644 index 0000000000..a0712a4a57 --- /dev/null +++ b/3d_classification/torch/densenet_training_array.py @@ -0,0 +1,139 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import sys + +import numpy as np +import torch +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +import monai +from monai.data import NiftiDataset +from monai.transforms import AddChannel, Compose, RandRotate90, Resize, ScaleIntensity, ToTensor + + +def main(): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + # IXI dataset as a demo, downloadable from https://brain-development.org/ixi-dataset/ + images = [ + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI314-IOP-0889-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI249-Guys-1072-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI609-HH-2600-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI173-HH-1590-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI020-Guys-0700-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI342-Guys-0909-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI134-Guys-0780-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI577-HH-2661-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI066-Guys-0731-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI130-HH-1528-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI607-Guys-1097-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI175-HH-1570-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI385-HH-2078-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI344-Guys-0905-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI409-Guys-0960-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI584-Guys-1129-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI253-HH-1694-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI092-HH-1436-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI574-IOP-1156-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI585-Guys-1130-T1.nii.gz"]), + ] + + # 2 binary labels for gender classification: man and woman + labels = np.array([0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=np.int64) + + # Define transforms + train_transforms = Compose([ScaleIntensity(), AddChannel(), Resize((96, 96, 96)), RandRotate90(), ToTensor()]) + val_transforms = Compose([ScaleIntensity(), AddChannel(), Resize((96, 96, 96)), ToTensor()]) + + # Define nifti dataset, data loader + check_ds = NiftiDataset(image_files=images, labels=labels, transform=train_transforms) + check_loader = DataLoader(check_ds, batch_size=2, num_workers=2, pin_memory=torch.cuda.is_available()) + im, label = monai.utils.misc.first(check_loader) + print(type(im), im.shape, 
label) + + # create a training data loader + train_ds = NiftiDataset(image_files=images[:10], labels=labels[:10], transform=train_transforms) + train_loader = DataLoader(train_ds, batch_size=2, shuffle=True, num_workers=2, pin_memory=torch.cuda.is_available()) + + # create a validation data loader + val_ds = NiftiDataset(image_files=images[-10:], labels=labels[-10:], transform=val_transforms) + val_loader = DataLoader(val_ds, batch_size=2, num_workers=2, pin_memory=torch.cuda.is_available()) + + # Create DenseNet121, CrossEntropyLoss and Adam optimizer + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = monai.networks.nets.densenet.densenet121(spatial_dims=3, in_channels=1, out_channels=2).to(device) + loss_function = torch.nn.CrossEntropyLoss() + optimizer = torch.optim.Adam(model.parameters(), 1e-5) + + # start a typical PyTorch training + val_interval = 2 + best_metric = -1 + best_metric_epoch = -1 + epoch_loss_values = list() + metric_values = list() + writer = SummaryWriter() + for epoch in range(5): + print("-" * 10) + print(f"epoch {epoch + 1}/{5}") + model.train() + epoch_loss = 0 + step = 0 + for batch_data in train_loader: + step += 1 + inputs, labels = batch_data[0].to(device), batch_data[1].to(device) + optimizer.zero_grad() + outputs = model(inputs) + loss = loss_function(outputs, labels) + loss.backward() + optimizer.step() + epoch_loss += loss.item() + epoch_len = len(train_ds) // train_loader.batch_size + print(f"{step}/{epoch_len}, train_loss: {loss.item():.4f}") + writer.add_scalar("train_loss", loss.item(), epoch_len * epoch + step) + epoch_loss /= step + epoch_loss_values.append(epoch_loss) + print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}") + + if (epoch + 1) % val_interval == 0: + model.eval() + with torch.no_grad(): + num_correct = 0.0 + metric_count = 0 + for val_data in val_loader: + val_images, val_labels = val_data[0].to(device), val_data[1].to(device) + val_outputs = model(val_images) + value = torch.eq(val_outputs.argmax(dim=1), val_labels) + metric_count += len(value) + num_correct += value.sum().item() + metric = num_correct / metric_count + metric_values.append(metric) + if metric > best_metric: + best_metric = metric + best_metric_epoch = epoch + 1 + torch.save(model.state_dict(), "best_metric_model_classification3d_array.pth") + print("saved new best metric model") + print( + "current epoch: {} current accuracy: {:.4f} best accuracy: {:.4f} at epoch {}".format( + epoch + 1, metric, best_metric, best_metric_epoch + ) + ) + writer.add_scalar("val_accuracy", metric, epoch + 1) + print(f"train completed, best_metric: {best_metric:.4f} at epoch: {best_metric_epoch}") + writer.close() + + +if __name__ == "__main__": + main() diff --git a/3d_classification/torch/densenet_training_dict.py b/3d_classification/torch/densenet_training_dict.py new file mode 100644 index 0000000000..6d4a590e87 --- /dev/null +++ b/3d_classification/torch/densenet_training_dict.py @@ -0,0 +1,155 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import sys + +import numpy as np +import torch +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +import monai +from monai.metrics import compute_roc_auc +from monai.transforms import AddChanneld, Compose, LoadNiftid, RandRotate90d, Resized, ScaleIntensityd, ToTensord + + +def main(): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + # IXI dataset as a demo, downloadable from https://brain-development.org/ixi-dataset/ + images = [ + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI314-IOP-0889-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI249-Guys-1072-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI609-HH-2600-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI173-HH-1590-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI020-Guys-0700-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI342-Guys-0909-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI134-Guys-0780-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI577-HH-2661-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI066-Guys-0731-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI130-HH-1528-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI607-Guys-1097-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI175-HH-1570-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI385-HH-2078-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI344-Guys-0905-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI409-Guys-0960-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI584-Guys-1129-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI253-HH-1694-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI092-HH-1436-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI574-IOP-1156-T1.nii.gz"]), + os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", "IXI585-Guys-1130-T1.nii.gz"]), + ] + + # 2 binary labels for gender classification: man and woman + labels = np.array([0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=np.int64) + train_files = [{"img": img, "label": label} for img, label in zip(images[:10], labels[:10])] + val_files = [{"img": img, "label": label} for img, label in zip(images[-10:], labels[-10:])] + + # Define transforms for image + train_transforms = Compose( + [ + LoadNiftid(keys=["img"]), + AddChanneld(keys=["img"]), + ScaleIntensityd(keys=["img"]), + Resized(keys=["img"], spatial_size=(96, 96, 96)), + RandRotate90d(keys=["img"], prob=0.8, spatial_axes=[0, 2]), + ToTensord(keys=["img"]), + ] + ) + val_transforms = Compose( + [ + LoadNiftid(keys=["img"]), + AddChanneld(keys=["img"]), + ScaleIntensityd(keys=["img"]), + Resized(keys=["img"], spatial_size=(96, 96, 96)), + ToTensord(keys=["img"]), + ] + ) + + # Define dataset, data loader + check_ds = monai.data.Dataset(data=train_files, transform=train_transforms) + 
check_loader = DataLoader(check_ds, batch_size=2, num_workers=4, pin_memory=torch.cuda.is_available()) + check_data = monai.utils.misc.first(check_loader) + print(check_data["img"].shape, check_data["label"]) + + # create a training data loader + train_ds = monai.data.Dataset(data=train_files, transform=train_transforms) + train_loader = DataLoader(train_ds, batch_size=2, shuffle=True, num_workers=4, pin_memory=torch.cuda.is_available()) + + # create a validation data loader + val_ds = monai.data.Dataset(data=val_files, transform=val_transforms) + val_loader = DataLoader(val_ds, batch_size=2, num_workers=4, pin_memory=torch.cuda.is_available()) + + # Create DenseNet121, CrossEntropyLoss and Adam optimizer + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = monai.networks.nets.densenet.densenet121(spatial_dims=3, in_channels=1, out_channels=2).to(device) + loss_function = torch.nn.CrossEntropyLoss() + optimizer = torch.optim.Adam(model.parameters(), 1e-5) + + # start a typical PyTorch training + val_interval = 2 + best_metric = -1 + best_metric_epoch = -1 + writer = SummaryWriter() + for epoch in range(5): + print("-" * 10) + print(f"epoch {epoch + 1}/{5}") + model.train() + epoch_loss = 0 + step = 0 + for batch_data in train_loader: + step += 1 + inputs, labels = batch_data["img"].to(device), batch_data["label"].to(device) + optimizer.zero_grad() + outputs = model(inputs) + loss = loss_function(outputs, labels) + loss.backward() + optimizer.step() + epoch_loss += loss.item() + epoch_len = len(train_ds) // train_loader.batch_size + print(f"{step}/{epoch_len}, train_loss: {loss.item():.4f}") + writer.add_scalar("train_loss", loss.item(), epoch_len * epoch + step) + epoch_loss /= step + print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}") + + if (epoch + 1) % val_interval == 0: + model.eval() + with torch.no_grad(): + y_pred = torch.tensor([], dtype=torch.float32, device=device) + y = torch.tensor([], dtype=torch.long, device=device) + for val_data in val_loader: + val_images, val_labels = val_data["img"].to(device), val_data["label"].to(device) + y_pred = torch.cat([y_pred, model(val_images)], dim=0) + y = torch.cat([y, val_labels], dim=0) + + acc_value = torch.eq(y_pred.argmax(dim=1), y) + acc_metric = acc_value.sum().item() / len(acc_value) + auc_metric = compute_roc_auc(y_pred, y, to_onehot_y=True, softmax=True) + if acc_metric > best_metric: + best_metric = acc_metric + best_metric_epoch = epoch + 1 + torch.save(model.state_dict(), "best_metric_model_classification3d_dict.pth") + print("saved new best metric model") + print( + "current epoch: {} current accuracy: {:.4f} current AUC: {:.4f} best accuracy: {:.4f} at epoch {}".format( + epoch + 1, acc_metric, auc_metric, best_metric, best_metric_epoch + ) + ) + writer.add_scalar("val_accuracy", acc_metric, epoch + 1) + print(f"train completed, best_metric: {best_metric:.4f} at epoch: {best_metric_epoch}") + writer.close() + + +if __name__ == "__main__": + main() diff --git a/brats_segmentation_3d.ipynb b/3d_segmentation/brats_segmentation_3d.ipynb similarity index 99% rename from brats_segmentation_3d.ipynb rename to 3d_segmentation/brats_segmentation_3d.ipynb index 60f6c686e3..b8ea848d14 100644 --- a/brats_segmentation_3d.ipynb +++ b/3d_segmentation/brats_segmentation_3d.ipynb @@ -27,7 +27,7 @@ "Below figure shows image patches with the tumor sub-regions that are annotated in the different modalities (top left) and the final labels for the whole dataset (right).\n", "(Figure taken from the [BraTS IEEE 
TMI paper](https://ieeexplore.ieee.org/document/6975210/))\n", "\n", - "![image](./images/brats_tasks.png)\n", + "![image](../figures/brats_tasks.png)\n", "\n", "The image patches show from left to right:\n", "1. the whole tumor (yellow) visible in T2-FLAIR (Fig.A).\n", @@ -35,7 +35,7 @@ "1. the enhancing tumor structures (light blue) visible in T1Gd, surrounding the cystic/necrotic components of the core (green) (Fig. C).\n", "1. The segmentations are combined to generate the final labels of the tumor sub-regions (Fig.D): edema (yellow), non-enhancing solid core (red), necrotic/cystic core (green), enhancing core (blue).\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/brats_segmentation_3d.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/3d_segmentation/brats_segmentation_3d.ipynb)" ] }, { diff --git a/3d_segmentation/ignite/unet_evaluation_array.py b/3d_segmentation/ignite/unet_evaluation_array.py new file mode 100644 index 0000000000..20d3661d97 --- /dev/null +++ b/3d_segmentation/ignite/unet_evaluation_array.py @@ -0,0 +1,113 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
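# Editor's note -- illustrative sketch only, not part of the PR's files: the evaluation script
# below wraps sliding-window inference in an ignite Engine. An Engine simply calls its process
# function once per batch and keeps the latest return value in engine.state.output; the MONAI
# handlers used below (MeanDice, StatsHandler, SegmentationSaver, CheckpointLoader) are then
# attached to that engine. A minimal, self-contained illustration of the same pattern
# (the names _toy_process / toy_evaluator are invented for this sketch):

import torch
from ignite.engine import Engine


def _toy_process(engine, batch):
    # any callable with signature (engine, batch) -> output can drive an Engine
    return batch.mean()


toy_evaluator = Engine(_toy_process)
toy_state = toy_evaluator.run([torch.ones(4), torch.zeros(4)])  # any iterable of batches
print(toy_state.output)  # tensor(0.) -- the output produced for the last batch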
+ +import logging +import os +import sys +import tempfile +from glob import glob + +import nibabel as nib +import numpy as np +import torch +from ignite.engine import Engine +from torch.utils.data import DataLoader + +from monai import config +from monai.data import NiftiDataset, create_test_image_3d +from monai.handlers import CheckpointLoader, MeanDice, SegmentationSaver, StatsHandler +from monai.inferers import sliding_window_inference +from monai.networks import predict_segmentation +from monai.networks.nets import UNet +from monai.transforms import AddChannel, Compose, ScaleIntensity, ToTensor + + +def main(tempdir): + config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + print(f"generating synthetic data to {tempdir} (this may take a while)") + for i in range(5): + im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1) + + n = nib.Nifti1Image(im, np.eye(4)) + nib.save(n, os.path.join(tempdir, f"im{i:d}.nii.gz")) + + n = nib.Nifti1Image(seg, np.eye(4)) + nib.save(n, os.path.join(tempdir, f"seg{i:d}.nii.gz")) + + images = sorted(glob(os.path.join(tempdir, "im*.nii.gz"))) + segs = sorted(glob(os.path.join(tempdir, "seg*.nii.gz"))) + + # define transforms for image and segmentation + imtrans = Compose([ScaleIntensity(), AddChannel(), ToTensor()]) + segtrans = Compose([AddChannel(), ToTensor()]) + ds = NiftiDataset(images, segs, transform=imtrans, seg_transform=segtrans, image_only=False) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + net = UNet( + dimensions=3, + in_channels=1, + out_channels=1, + channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + num_res_units=2, + ) + net.to(device) + + # define sliding window size and batch size for windows inference + roi_size = (96, 96, 96) + sw_batch_size = 4 + + def _sliding_window_processor(engine, batch): + net.eval() + with torch.no_grad(): + val_images, val_labels = batch[0].to(device), batch[1].to(device) + seg_probs = sliding_window_inference(val_images, roi_size, sw_batch_size, net) + return seg_probs, val_labels + + evaluator = Engine(_sliding_window_processor) + + # add evaluation metric to the evaluator engine + MeanDice(sigmoid=True, to_onehot_y=False).attach(evaluator, "Mean_Dice") + + # StatsHandler prints loss at every iteration and print metrics at every epoch, + # we don't need to print loss for evaluator, so just print metrics, user can also customize print functions + val_stats_handler = StatsHandler( + name="evaluator", + output_transform=lambda x: None, # no need to print loss value, so disable per iteration output + ) + val_stats_handler.attach(evaluator) + + # for the array data format, assume the 3rd item of batch data is the meta_data + file_saver = SegmentationSaver( + output_dir="tempdir", + output_ext=".nii.gz", + output_postfix="seg", + name="evaluator", + batch_transform=lambda x: x[2], + output_transform=lambda output: predict_segmentation(output[0]), + ) + file_saver.attach(evaluator) + + # the model was trained by "unet_training_array" example + ckpt_saver = CheckpointLoader(load_path="./runs_array/net_checkpoint_100.pth", load_dict={"net": net}) + ckpt_saver.attach(evaluator) + + # sliding window inference for one image at every iteration + loader = DataLoader(ds, batch_size=1, num_workers=1, pin_memory=torch.cuda.is_available()) + state = evaluator.run(loader) + print(state) + + +if __name__ == "__main__": + with tempfile.TemporaryDirectory() as tempdir: + main(tempdir) diff --git a/3d_segmentation/ignite/unet_evaluation_dict.py 
b/3d_segmentation/ignite/unet_evaluation_dict.py new file mode 100644 index 0000000000..5dbc305661 --- /dev/null +++ b/3d_segmentation/ignite/unet_evaluation_dict.py @@ -0,0 +1,119 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import sys +import tempfile +from glob import glob + +import nibabel as nib +import numpy as np +import torch +from ignite.engine import Engine +from torch.utils.data import DataLoader + +import monai +from monai.data import create_test_image_3d, list_data_collate +from monai.handlers import CheckpointLoader, MeanDice, SegmentationSaver, StatsHandler +from monai.inferers import sliding_window_inference +from monai.networks import predict_segmentation +from monai.networks.nets import UNet +from monai.transforms import AsChannelFirstd, Compose, LoadNiftid, ScaleIntensityd, ToTensord + + +def main(tempdir): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + print(f"generating synthetic data to {tempdir} (this may take a while)") + for i in range(5): + im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1) + + n = nib.Nifti1Image(im, np.eye(4)) + nib.save(n, os.path.join(tempdir, f"im{i:d}.nii.gz")) + + n = nib.Nifti1Image(seg, np.eye(4)) + nib.save(n, os.path.join(tempdir, f"seg{i:d}.nii.gz")) + + images = sorted(glob(os.path.join(tempdir, "im*.nii.gz"))) + segs = sorted(glob(os.path.join(tempdir, "seg*.nii.gz"))) + val_files = [{"img": img, "seg": seg} for img, seg in zip(images, segs)] + + # define transforms for image and segmentation + val_transforms = Compose( + [ + LoadNiftid(keys=["img", "seg"]), + AsChannelFirstd(keys=["img", "seg"], channel_dim=-1), + ScaleIntensityd(keys="img"), + ToTensord(keys=["img", "seg"]), + ] + ) + val_ds = monai.data.Dataset(data=val_files, transform=val_transforms) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + net = UNet( + dimensions=3, + in_channels=1, + out_channels=1, + channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + num_res_units=2, + ) + net.to(device) + + # define sliding window size and batch size for windows inference + roi_size = (96, 96, 96) + sw_batch_size = 4 + + def _sliding_window_processor(engine, batch): + net.eval() + with torch.no_grad(): + val_images, val_labels = batch["img"].to(device), batch["seg"].to(device) + seg_probs = sliding_window_inference(val_images, roi_size, sw_batch_size, net) + return seg_probs, val_labels + + evaluator = Engine(_sliding_window_processor) + + # add evaluation metric to the evaluator engine + MeanDice(sigmoid=True, to_onehot_y=False).attach(evaluator, "Mean_Dice") + + # StatsHandler prints loss at every iteration and print metrics at every epoch, + # we don't need to print loss for evaluator, so just print metrics, user can also customize print functions + val_stats_handler = StatsHandler( + name="evaluator", + output_transform=lambda x: None, # no need to print loss value, so disable per iteration output + ) + 
val_stats_handler.attach(evaluator) + + # convert the necessary metadata from batch data + SegmentationSaver( + output_dir="tempdir", + output_ext=".nii.gz", + output_postfix="seg", + name="evaluator", + batch_transform=lambda batch: batch["img_meta_dict"], + output_transform=lambda output: predict_segmentation(output[0]), + ).attach(evaluator) + # the model was trained by "unet_training_dict" example + CheckpointLoader(load_path="./runs_dict/net_checkpoint_50.pth", load_dict={"net": net}).attach(evaluator) + + # sliding window inference for one image at every iteration + val_loader = DataLoader( + val_ds, batch_size=1, num_workers=4, collate_fn=list_data_collate, pin_memory=torch.cuda.is_available() + ) + state = evaluator.run(val_loader) + print(state) + + +if __name__ == "__main__": + with tempfile.TemporaryDirectory() as tempdir: + main(tempdir) diff --git a/3d_segmentation/ignite/unet_training_array.py b/3d_segmentation/ignite/unet_training_array.py new file mode 100644 index 0000000000..bf4c9d21e8 --- /dev/null +++ b/3d_segmentation/ignite/unet_training_array.py @@ -0,0 +1,160 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import sys +import tempfile +from glob import glob + +import nibabel as nib +import numpy as np +import torch +from ignite.engine import Events, create_supervised_evaluator, create_supervised_trainer +from ignite.handlers import EarlyStopping, ModelCheckpoint +from torch.utils.data import DataLoader + +import monai +from monai.data import NiftiDataset, create_test_image_3d +from monai.handlers import ( + MeanDice, + StatsHandler, + TensorBoardImageHandler, + TensorBoardStatsHandler, + stopping_fn_from_metric, +) +from monai.networks import predict_segmentation +from monai.transforms import AddChannel, Compose, RandSpatialCrop, Resize, ScaleIntensity, ToTensor + + +def main(tempdir): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + # create a temporary directory and 40 random image, mask pairs + print(f"generating synthetic data to {tempdir} (this may take a while)") + for i in range(40): + im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1) + + n = nib.Nifti1Image(im, np.eye(4)) + nib.save(n, os.path.join(tempdir, f"im{i:d}.nii.gz")) + + n = nib.Nifti1Image(seg, np.eye(4)) + nib.save(n, os.path.join(tempdir, f"seg{i:d}.nii.gz")) + + images = sorted(glob(os.path.join(tempdir, "im*.nii.gz"))) + segs = sorted(glob(os.path.join(tempdir, "seg*.nii.gz"))) + + # define transforms for image and segmentation + train_imtrans = Compose( + [ScaleIntensity(), AddChannel(), RandSpatialCrop((96, 96, 96), random_size=False), ToTensor()] + ) + train_segtrans = Compose([AddChannel(), RandSpatialCrop((96, 96, 96), random_size=False), ToTensor()]) + val_imtrans = Compose([ScaleIntensity(), AddChannel(), Resize((96, 96, 96)), ToTensor()]) + val_segtrans = Compose([AddChannel(), Resize((96, 96, 96)), ToTensor()]) + + # define nifti dataset, data loader + check_ds = 
NiftiDataset(images, segs, transform=train_imtrans, seg_transform=train_segtrans) + check_loader = DataLoader(check_ds, batch_size=10, num_workers=2, pin_memory=torch.cuda.is_available()) + im, seg = monai.utils.misc.first(check_loader) + print(im.shape, seg.shape) + + # create a training data loader + train_ds = NiftiDataset(images[:20], segs[:20], transform=train_imtrans, seg_transform=train_segtrans) + train_loader = DataLoader(train_ds, batch_size=5, shuffle=True, num_workers=8, pin_memory=torch.cuda.is_available()) + # create a validation data loader + val_ds = NiftiDataset(images[-20:], segs[-20:], transform=val_imtrans, seg_transform=val_segtrans) + val_loader = DataLoader(val_ds, batch_size=5, num_workers=8, pin_memory=torch.cuda.is_available()) + + # create UNet, DiceLoss and Adam optimizer + net = monai.networks.nets.UNet( + dimensions=3, + in_channels=1, + out_channels=1, + channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + num_res_units=2, + ) + loss = monai.losses.DiceLoss(sigmoid=True) + lr = 1e-3 + opt = torch.optim.Adam(net.parameters(), lr) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Ignite trainer expects batch=(img, seg) and returns output=loss at every iteration, + # user can add output_transform to return other values, like: y_pred, y, etc. + trainer = create_supervised_trainer(net, opt, loss, device, False) + + # adding checkpoint handler to save models (network params and optimizer stats) during training + checkpoint_handler = ModelCheckpoint("./runs_array/", "net", n_saved=10, require_empty=False) + trainer.add_event_handler( + event_name=Events.EPOCH_COMPLETED, handler=checkpoint_handler, to_save={"net": net, "opt": opt} + ) + + # StatsHandler prints loss at every iteration and print metrics at every epoch, + # we don't set metrics for trainer here, so just print loss, user can also customize print functions + # and can use output_transform to convert engine.state.output if it's not a loss value + train_stats_handler = StatsHandler(name="trainer") + train_stats_handler.attach(trainer) + + # TensorBoardStatsHandler plots loss at every iteration and plots metrics at every epoch, same as StatsHandler + train_tensorboard_stats_handler = TensorBoardStatsHandler() + train_tensorboard_stats_handler.attach(trainer) + + validation_every_n_epochs = 1 + # Set parameters for validation + metric_name = "Mean_Dice" + # add evaluation metric to the evaluator engine + val_metrics = {metric_name: MeanDice(sigmoid=True, to_onehot_y=False)} + + # Ignite evaluator expects batch=(img, seg) and returns output=(y_pred, y) at every iteration, + # user can add output_transform to return other values + evaluator = create_supervised_evaluator(net, val_metrics, device, True) + + @trainer.on(Events.EPOCH_COMPLETED(every=validation_every_n_epochs)) + def run_validation(engine): + evaluator.run(val_loader) + + # add early stopping handler to evaluator + early_stopper = EarlyStopping(patience=4, score_function=stopping_fn_from_metric(metric_name), trainer=trainer) + evaluator.add_event_handler(event_name=Events.EPOCH_COMPLETED, handler=early_stopper) + + # add stats event handler to print validation stats via evaluator + val_stats_handler = StatsHandler( + name="evaluator", + output_transform=lambda x: None, # no need to print loss value, so disable per iteration output + global_epoch_transform=lambda x: trainer.state.epoch, + ) # fetch global epoch number from trainer + val_stats_handler.attach(evaluator) + + # add handler to record metrics to 
TensorBoard at every validation epoch + val_tensorboard_stats_handler = TensorBoardStatsHandler( + output_transform=lambda x: None, # no need to plot loss value, so disable per iteration output + global_epoch_transform=lambda x: trainer.state.epoch, + ) # fetch global epoch number from trainer + val_tensorboard_stats_handler.attach(evaluator) + + # add handler to draw the first image and the corresponding label and model output in the last batch + # here we draw the 3D output as GIF format along Depth axis, at every validation epoch + val_tensorboard_image_handler = TensorBoardImageHandler( + batch_transform=lambda batch: (batch[0], batch[1]), + output_transform=lambda output: predict_segmentation(output[0]), + global_iter_transform=lambda x: trainer.state.epoch, + ) + evaluator.add_event_handler(event_name=Events.EPOCH_COMPLETED, handler=val_tensorboard_image_handler) + + train_epochs = 30 + state = trainer.run(train_loader, train_epochs) + print(state) + + +if __name__ == "__main__": + with tempfile.TemporaryDirectory() as tempdir: + main(tempdir) diff --git a/3d_segmentation/ignite/unet_training_dict.py b/3d_segmentation/ignite/unet_training_dict.py new file mode 100644 index 0000000000..fcdce7efdd --- /dev/null +++ b/3d_segmentation/ignite/unet_training_dict.py @@ -0,0 +1,200 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
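# Editor's note -- illustrative sketch only, not part of the PR's files: the dictionary-based
# training script below relies on RandCropByPosNegLabeld(num_samples=4) returning a *list* of
# cropped samples per loaded item, and on list_data_collate flattening those lists, so a
# DataLoader batch_size of 2 yields 2 x 4 = 8 patches of size 96^3 per iteration. A small
# stand-alone illustration of the collate behaviour (dummy tensors, shapes chosen for the example):

import torch
from monai.data import list_data_collate

patch = {"img": torch.zeros(1, 96, 96, 96), "seg": torch.zeros(1, 96, 96, 96)}
item_a = [dict(patch) for _ in range(4)]  # what the random crop returns for one loaded item
item_b = [dict(patch) for _ in range(4)]  # ... and for a second item
batch = list_data_collate([item_a, item_b])
print(batch["img"].shape)  # torch.Size([8, 1, 96, 96, 96]) -- 2 items x 4 crops each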
+ +import logging +import os +import sys +import tempfile +from glob import glob + +import nibabel as nib +import numpy as np +import torch +from ignite.engine import Events, _prepare_batch, create_supervised_evaluator, create_supervised_trainer +from ignite.handlers import EarlyStopping, ModelCheckpoint +from torch.utils.data import DataLoader + +import monai +from monai.data import create_test_image_3d, list_data_collate +from monai.handlers import ( + MeanDice, + StatsHandler, + TensorBoardImageHandler, + TensorBoardStatsHandler, + stopping_fn_from_metric, +) +from monai.networks import predict_segmentation +from monai.transforms import ( + AsChannelFirstd, + Compose, + LoadNiftid, + RandCropByPosNegLabeld, + RandRotate90d, + ScaleIntensityd, + ToTensord, +) + + +def main(tempdir): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + # create a temporary directory and 40 random image, mask pairs + print(f"generating synthetic data to {tempdir} (this may take a while)") + for i in range(40): + im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1) + + n = nib.Nifti1Image(im, np.eye(4)) + nib.save(n, os.path.join(tempdir, f"img{i:d}.nii.gz")) + + n = nib.Nifti1Image(seg, np.eye(4)) + nib.save(n, os.path.join(tempdir, f"seg{i:d}.nii.gz")) + + images = sorted(glob(os.path.join(tempdir, "img*.nii.gz"))) + segs = sorted(glob(os.path.join(tempdir, "seg*.nii.gz"))) + train_files = [{"img": img, "seg": seg} for img, seg in zip(images[:20], segs[:20])] + val_files = [{"img": img, "seg": seg} for img, seg in zip(images[-20:], segs[-20:])] + + # define transforms for image and segmentation + train_transforms = Compose( + [ + LoadNiftid(keys=["img", "seg"]), + AsChannelFirstd(keys=["img", "seg"], channel_dim=-1), + ScaleIntensityd(keys="img"), + RandCropByPosNegLabeld( + keys=["img", "seg"], label_key="seg", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4 + ), + RandRotate90d(keys=["img", "seg"], prob=0.5, spatial_axes=[0, 2]), + ToTensord(keys=["img", "seg"]), + ] + ) + val_transforms = Compose( + [ + LoadNiftid(keys=["img", "seg"]), + AsChannelFirstd(keys=["img", "seg"], channel_dim=-1), + ScaleIntensityd(keys="img"), + ToTensord(keys=["img", "seg"]), + ] + ) + + # define dataset, data loader + check_ds = monai.data.Dataset(data=train_files, transform=train_transforms) + # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training + check_loader = DataLoader( + check_ds, batch_size=2, num_workers=4, collate_fn=list_data_collate, pin_memory=torch.cuda.is_available() + ) + check_data = monai.utils.misc.first(check_loader) + print(check_data["img"].shape, check_data["seg"].shape) + + # create a training data loader + train_ds = monai.data.Dataset(data=train_files, transform=train_transforms) + # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training + train_loader = DataLoader( + train_ds, + batch_size=2, + shuffle=True, + num_workers=4, + collate_fn=list_data_collate, + pin_memory=torch.cuda.is_available(), + ) + # create a validation data loader + val_ds = monai.data.Dataset(data=val_files, transform=val_transforms) + val_loader = DataLoader( + val_ds, batch_size=5, num_workers=8, collate_fn=list_data_collate, pin_memory=torch.cuda.is_available() + ) + + # create UNet, DiceLoss and Adam optimizer + net = monai.networks.nets.UNet( + dimensions=3, + in_channels=1, + out_channels=1, + channels=(16, 32, 64, 128, 256), + 
strides=(2, 2, 2, 2), + num_res_units=2, + ) + loss = monai.losses.DiceLoss(sigmoid=True) + lr = 1e-3 + opt = torch.optim.Adam(net.parameters(), lr) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Ignite trainer expects batch=(img, seg) and returns output=loss at every iteration, + # user can add output_transform to return other values, like: y_pred, y, etc. + def prepare_batch(batch, device=None, non_blocking=False): + return _prepare_batch((batch["img"], batch["seg"]), device, non_blocking) + + trainer = create_supervised_trainer(net, opt, loss, device, False, prepare_batch=prepare_batch) + + # adding checkpoint handler to save models (network params and optimizer stats) during training + checkpoint_handler = ModelCheckpoint("./runs_dict/", "net", n_saved=10, require_empty=False) + trainer.add_event_handler( + event_name=Events.EPOCH_COMPLETED, handler=checkpoint_handler, to_save={"net": net, "opt": opt} + ) + + # StatsHandler prints loss at every iteration and print metrics at every epoch, + # we don't set metrics for trainer here, so just print loss, user can also customize print functions + # and can use output_transform to convert engine.state.output if it's not loss value + train_stats_handler = StatsHandler(name="trainer") + train_stats_handler.attach(trainer) + + # TensorBoardStatsHandler plots loss at every iteration and plots metrics at every epoch, same as StatsHandler + train_tensorboard_stats_handler = TensorBoardStatsHandler() + train_tensorboard_stats_handler.attach(trainer) + + validation_every_n_iters = 5 + # set parameters for validation + metric_name = "Mean_Dice" + # add evaluation metric to the evaluator engine + val_metrics = {metric_name: MeanDice(sigmoid=True, to_onehot_y=False)} + + # Ignite evaluator expects batch=(img, seg) and returns output=(y_pred, y) at every iteration, + # user can add output_transform to return other values + evaluator = create_supervised_evaluator(net, val_metrics, device, True, prepare_batch=prepare_batch) + + @trainer.on(Events.ITERATION_COMPLETED(every=validation_every_n_iters)) + def run_validation(engine): + evaluator.run(val_loader) + + # add early stopping handler to evaluator + early_stopper = EarlyStopping(patience=4, score_function=stopping_fn_from_metric(metric_name), trainer=trainer) + evaluator.add_event_handler(event_name=Events.EPOCH_COMPLETED, handler=early_stopper) + + # add stats event handler to print validation stats via evaluator + val_stats_handler = StatsHandler( + name="evaluator", + output_transform=lambda x: None, # no need to print loss value, so disable per iteration output + global_epoch_transform=lambda x: trainer.state.epoch, + ) # fetch global epoch number from trainer + val_stats_handler.attach(evaluator) + + # add handler to record metrics to TensorBoard at every validation epoch + val_tensorboard_stats_handler = TensorBoardStatsHandler( + output_transform=lambda x: None, # no need to plot loss value, so disable per iteration output + global_epoch_transform=lambda x: trainer.state.iteration, + ) # fetch global iteration number from trainer + val_tensorboard_stats_handler.attach(evaluator) + + # add handler to draw the first image and the corresponding label and model output in the last batch + # here we draw the 3D output as GIF format along the depth axis, every 2 validation iterations. 
+ val_tensorboard_image_handler = TensorBoardImageHandler( + batch_transform=lambda batch: (batch["img"], batch["seg"]), + output_transform=lambda output: predict_segmentation(output[0]), + global_iter_transform=lambda x: trainer.state.epoch, + ) + evaluator.add_event_handler(event_name=Events.ITERATION_COMPLETED(every=2), handler=val_tensorboard_image_handler) + + train_epochs = 5 + state = trainer.run(train_loader, train_epochs) + print(state) + + +if __name__ == "__main__": + with tempfile.TemporaryDirectory() as tempdir: + main(tempdir) diff --git a/spleen_segmentation_3d.ipynb b/3d_segmentation/spleen_segmentation_3d.ipynb similarity index 99% rename from spleen_segmentation_3d.ipynb rename to 3d_segmentation/spleen_segmentation_3d.ipynb index ee1d2d733f..a9b151140d 100644 --- a/spleen_segmentation_3d.ipynb +++ b/3d_segmentation/spleen_segmentation_3d.ipynb @@ -29,7 +29,7 @@ "Source: Memorial Sloan Kettering Cancer Center \n", "Challenge: Large ranging foreground size\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/spleen_segmentation_3d.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/3d_segmentation/spleen_segmentation_3d.ipynb)" ] }, { @@ -765,7 +765,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/spleen_segmentation_3d_lightning.ipynb b/3d_segmentation/spleen_segmentation_3d_lightning.ipynb similarity index 99% rename from spleen_segmentation_3d_lightning.ipynb rename to 3d_segmentation/spleen_segmentation_3d_lightning.ipynb index 4128300909..648a7f26ef 100644 --- a/spleen_segmentation_3d_lightning.ipynb +++ b/3d_segmentation/spleen_segmentation_3d_lightning.ipynb @@ -34,7 +34,7 @@ "Source: Memorial Sloan Kettering Cancer Center \n", "Challenge: Large ranging foreground size\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/spleen_segmentation_3d_lightning.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/3d_segmentation/spleen_segmentation_3d_lightning.ipynb)" ] }, { @@ -708,7 +708,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/3d_segmentation/torch/unet_evaluation_array.py b/3d_segmentation/torch/unet_evaluation_array.py new file mode 100644 index 0000000000..09fcf42103 --- /dev/null +++ b/3d_segmentation/torch/unet_evaluation_array.py @@ -0,0 +1,89 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
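# Editor's note -- illustrative sketch only, not part of the PR's files: the 3D evaluation and
# training scripts in this PR validate with monai.inferers.sliding_window_inference, which tiles
# the full volume into roi_size windows, runs the network on sw_batch_size windows at a time and
# stitches the window predictions back to the input's spatial shape, so the whole 128^3 volume
# never has to pass through the UNet in one shot. A minimal stand-alone call (_fake_net is a
# placeholder predictor used only to keep the sketch runnable):

import torch
from monai.inferers import sliding_window_inference


def _fake_net(x):
    # stand-in predictor: returns one channel with the same spatial size as its input window
    return x * 0.5


volume = torch.rand(1, 1, 128, 128, 128)  # (batch, channel, D, H, W)
pred = sliding_window_inference(volume, (96, 96, 96), 4, _fake_net)
print(pred.shape)  # torch.Size([1, 1, 128, 128, 128]) -- same spatial shape as the input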
+ +import logging +import os +import sys +import tempfile +from glob import glob + +import nibabel as nib +import numpy as np +import torch +from torch.utils.data import DataLoader + +from monai import config +from monai.data import NiftiDataset, NiftiSaver, create_test_image_3d +from monai.inferers import sliding_window_inference +from monai.metrics import DiceMetric +from monai.networks.nets import UNet +from monai.transforms import AddChannel, Compose, ScaleIntensity, ToTensor + + +def main(tempdir): + config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + print(f"generating synthetic data to {tempdir} (this may take a while)") + for i in range(5): + im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1) + + n = nib.Nifti1Image(im, np.eye(4)) + nib.save(n, os.path.join(tempdir, f"im{i:d}.nii.gz")) + + n = nib.Nifti1Image(seg, np.eye(4)) + nib.save(n, os.path.join(tempdir, f"seg{i:d}.nii.gz")) + + images = sorted(glob(os.path.join(tempdir, "im*.nii.gz"))) + segs = sorted(glob(os.path.join(tempdir, "seg*.nii.gz"))) + + # define transforms for image and segmentation + imtrans = Compose([ScaleIntensity(), AddChannel(), ToTensor()]) + segtrans = Compose([AddChannel(), ToTensor()]) + val_ds = NiftiDataset(images, segs, transform=imtrans, seg_transform=segtrans, image_only=False) + # sliding window inference for one image at every iteration + val_loader = DataLoader(val_ds, batch_size=1, num_workers=1, pin_memory=torch.cuda.is_available()) + dice_metric = DiceMetric(include_background=True, to_onehot_y=False, sigmoid=True, reduction="mean") + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = UNet( + dimensions=3, + in_channels=1, + out_channels=1, + channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + num_res_units=2, + ).to(device) + + model.load_state_dict(torch.load("best_metric_model_segmentation3d_array.pth")) + model.eval() + with torch.no_grad(): + metric_sum = 0.0 + metric_count = 0 + saver = NiftiSaver(output_dir="./output") + for val_data in val_loader: + val_images, val_labels = val_data[0].to(device), val_data[1].to(device) + # define sliding window size and batch size for windows inference + roi_size = (96, 96, 96) + sw_batch_size = 4 + val_outputs = sliding_window_inference(val_images, roi_size, sw_batch_size, model) + value = dice_metric(y_pred=val_outputs, y=val_labels) + metric_count += len(value) + metric_sum += value.item() * len(value) + val_outputs = (val_outputs.sigmoid() >= 0.5).float() + saver.save_batch(val_outputs, val_data[2]) + metric = metric_sum / metric_count + print("evaluation metric:", metric) + + +if __name__ == "__main__": + with tempfile.TemporaryDirectory() as tempdir: + main(tempdir) diff --git a/3d_segmentation/torch/unet_evaluation_dict.py b/3d_segmentation/torch/unet_evaluation_dict.py new file mode 100644 index 0000000000..cc39e82232 --- /dev/null +++ b/3d_segmentation/torch/unet_evaluation_dict.py @@ -0,0 +1,103 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import sys +import tempfile +from glob import glob + +import nibabel as nib +import numpy as np +import torch +from torch.utils.data import DataLoader + +import monai +from monai.data import NiftiSaver, create_test_image_3d, list_data_collate +from monai.engines import get_devices_spec +from monai.inferers import sliding_window_inference +from monai.metrics import DiceMetric +from monai.networks.nets import UNet +from monai.transforms import AsChannelFirstd, Compose, LoadNiftid, ScaleIntensityd, ToTensord + + +def main(tempdir): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + print(f"generating synthetic data to {tempdir} (this may take a while)") + for i in range(5): + im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1) + + n = nib.Nifti1Image(im, np.eye(4)) + nib.save(n, os.path.join(tempdir, f"im{i:d}.nii.gz")) + + n = nib.Nifti1Image(seg, np.eye(4)) + nib.save(n, os.path.join(tempdir, f"seg{i:d}.nii.gz")) + + images = sorted(glob(os.path.join(tempdir, "im*.nii.gz"))) + segs = sorted(glob(os.path.join(tempdir, "seg*.nii.gz"))) + val_files = [{"img": img, "seg": seg} for img, seg in zip(images, segs)] + + # define transforms for image and segmentation + val_transforms = Compose( + [ + LoadNiftid(keys=["img", "seg"]), + AsChannelFirstd(keys=["img", "seg"], channel_dim=-1), + ScaleIntensityd(keys="img"), + ToTensord(keys=["img", "seg"]), + ] + ) + val_ds = monai.data.Dataset(data=val_files, transform=val_transforms) + # sliding window inference need to input 1 image in every iteration + val_loader = DataLoader(val_ds, batch_size=1, num_workers=4, collate_fn=list_data_collate) + dice_metric = DiceMetric(include_background=True, to_onehot_y=False, sigmoid=True, reduction="mean") + + # try to use all the available GPUs + devices = get_devices_spec(None) + model = UNet( + dimensions=3, + in_channels=1, + out_channels=1, + channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + num_res_units=2, + ).to(devices[0]) + + model.load_state_dict(torch.load("best_metric_model_segmentation3d_dict.pth")) + + # if we have multiple GPUs, set data parallel to execute sliding window inference + if len(devices) > 1: + model = torch.nn.DataParallel(model, device_ids=devices) + + model.eval() + with torch.no_grad(): + metric_sum = 0.0 + metric_count = 0 + saver = NiftiSaver(output_dir="./output") + for val_data in val_loader: + val_images, val_labels = val_data["img"].to(devices[0]), val_data["seg"].to(devices[0]) + # define sliding window size and batch size for windows inference + roi_size = (96, 96, 96) + sw_batch_size = 4 + val_outputs = sliding_window_inference(val_images, roi_size, sw_batch_size, model) + value = dice_metric(y_pred=val_outputs, y=val_labels) + metric_count += len(value) + metric_sum += value.item() * len(value) + val_outputs = (val_outputs.sigmoid() >= 0.5).float() + saver.save_batch(val_outputs, val_data["img_meta_dict"]) + metric = metric_sum / metric_count + print("evaluation metric:", metric) + + +if __name__ == "__main__": + with tempfile.TemporaryDirectory() as tempdir: + main(tempdir) diff --git a/3d_segmentation/torch/unet_training_array.py b/3d_segmentation/torch/unet_training_array.py new file mode 100644 index 0000000000..1fc82089a9 --- /dev/null +++ b/3d_segmentation/torch/unet_training_array.py @@ -0,0 +1,167 @@ +# Copyright 2020 MONAI Consortium +# Licensed under 
the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import sys +import tempfile +from glob import glob + +import nibabel as nib +import numpy as np +import torch +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +import monai +from monai.data import NiftiDataset, create_test_image_3d +from monai.inferers import sliding_window_inference +from monai.metrics import DiceMetric +from monai.transforms import AddChannel, Compose, RandRotate90, RandSpatialCrop, ScaleIntensity, ToTensor +from monai.visualize import plot_2d_or_3d_image + + +def main(tempdir): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + # create a temporary directory and 40 random image, mask pairs + print(f"generating synthetic data to {tempdir} (this may take a while)") + for i in range(40): + im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1) + + n = nib.Nifti1Image(im, np.eye(4)) + nib.save(n, os.path.join(tempdir, f"im{i:d}.nii.gz")) + + n = nib.Nifti1Image(seg, np.eye(4)) + nib.save(n, os.path.join(tempdir, f"seg{i:d}.nii.gz")) + + images = sorted(glob(os.path.join(tempdir, "im*.nii.gz"))) + segs = sorted(glob(os.path.join(tempdir, "seg*.nii.gz"))) + + # define transforms for image and segmentation + train_imtrans = Compose( + [ + ScaleIntensity(), + AddChannel(), + RandSpatialCrop((96, 96, 96), random_size=False), + RandRotate90(prob=0.5, spatial_axes=(0, 2)), + ToTensor(), + ] + ) + train_segtrans = Compose( + [ + AddChannel(), + RandSpatialCrop((96, 96, 96), random_size=False), + RandRotate90(prob=0.5, spatial_axes=(0, 2)), + ToTensor(), + ] + ) + val_imtrans = Compose([ScaleIntensity(), AddChannel(), ToTensor()]) + val_segtrans = Compose([AddChannel(), ToTensor()]) + + # define nifti dataset, data loader + check_ds = NiftiDataset(images, segs, transform=train_imtrans, seg_transform=train_segtrans) + check_loader = DataLoader(check_ds, batch_size=10, num_workers=2, pin_memory=torch.cuda.is_available()) + im, seg = monai.utils.misc.first(check_loader) + print(im.shape, seg.shape) + + # create a training data loader + train_ds = NiftiDataset(images[:20], segs[:20], transform=train_imtrans, seg_transform=train_segtrans) + train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, num_workers=8, pin_memory=torch.cuda.is_available()) + # create a validation data loader + val_ds = NiftiDataset(images[-20:], segs[-20:], transform=val_imtrans, seg_transform=val_segtrans) + val_loader = DataLoader(val_ds, batch_size=1, num_workers=4, pin_memory=torch.cuda.is_available()) + dice_metric = DiceMetric(include_background=True, to_onehot_y=False, sigmoid=True, reduction="mean") + + # create UNet, DiceLoss and Adam optimizer + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = monai.networks.nets.UNet( + dimensions=3, + in_channels=1, + out_channels=1, + channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + num_res_units=2, + ).to(device) + loss_function = monai.losses.DiceLoss(sigmoid=True) + 
optimizer = torch.optim.Adam(model.parameters(), 1e-3) + + # start a typical PyTorch training + val_interval = 2 + best_metric = -1 + best_metric_epoch = -1 + epoch_loss_values = list() + metric_values = list() + writer = SummaryWriter() + for epoch in range(5): + print("-" * 10) + print(f"epoch {epoch + 1}/{5}") + model.train() + epoch_loss = 0 + step = 0 + for batch_data in train_loader: + step += 1 + inputs, labels = batch_data[0].to(device), batch_data[1].to(device) + optimizer.zero_grad() + outputs = model(inputs) + loss = loss_function(outputs, labels) + loss.backward() + optimizer.step() + epoch_loss += loss.item() + epoch_len = len(train_ds) // train_loader.batch_size + print(f"{step}/{epoch_len}, train_loss: {loss.item():.4f}") + writer.add_scalar("train_loss", loss.item(), epoch_len * epoch + step) + epoch_loss /= step + epoch_loss_values.append(epoch_loss) + print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}") + + if (epoch + 1) % val_interval == 0: + model.eval() + with torch.no_grad(): + metric_sum = 0.0 + metric_count = 0 + val_images = None + val_labels = None + val_outputs = None + for val_data in val_loader: + val_images, val_labels = val_data[0].to(device), val_data[1].to(device) + roi_size = (96, 96, 96) + sw_batch_size = 4 + val_outputs = sliding_window_inference(val_images, roi_size, sw_batch_size, model) + value = dice_metric(y_pred=val_outputs, y=val_labels) + metric_count += len(value) + metric_sum += value.item() * len(value) + metric = metric_sum / metric_count + metric_values.append(metric) + if metric > best_metric: + best_metric = metric + best_metric_epoch = epoch + 1 + torch.save(model.state_dict(), "best_metric_model_segmentation3d_array.pth") + print("saved new best metric model") + print( + "current epoch: {} current mean dice: {:.4f} best mean dice: {:.4f} at epoch {}".format( + epoch + 1, metric, best_metric, best_metric_epoch + ) + ) + writer.add_scalar("val_mean_dice", metric, epoch + 1) + # plot the last model output as GIF image in TensorBoard with the corresponding image and label + plot_2d_or_3d_image(val_images, epoch + 1, writer, index=0, tag="image") + plot_2d_or_3d_image(val_labels, epoch + 1, writer, index=0, tag="label") + plot_2d_or_3d_image(val_outputs, epoch + 1, writer, index=0, tag="output") + + print(f"train completed, best_metric: {best_metric:.4f} at epoch: {best_metric_epoch}") + writer.close() + + +if __name__ == "__main__": + with tempfile.TemporaryDirectory() as tempdir: + main(tempdir) diff --git a/3d_segmentation/torch/unet_training_dict.py b/3d_segmentation/torch/unet_training_dict.py new file mode 100644 index 0000000000..381be34409 --- /dev/null +++ b/3d_segmentation/torch/unet_training_dict.py @@ -0,0 +1,187 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
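# Editor's note -- illustrative sketch only, not part of the PR's files: the UNet in these
# segmentation scripts has out_channels=1 and returns raw logits, which is why DiceLoss and
# DiceMetric are constructed with sigmoid=True -- the sigmoid is applied inside the loss/metric
# rather than in the network. A tiny self-contained check of that behaviour (random tensors only):

import torch
from monai.losses import DiceLoss

dice_loss = DiceLoss(sigmoid=True)  # sigmoid applied internally to the raw logits
logits = torch.randn(2, 1, 96, 96, 96, requires_grad=True)  # (batch, channel, D, H, W)
target = torch.randint(0, 2, (2, 1, 96, 96, 96)).float()
loss = dice_loss(logits, target)
loss.backward()  # gradients flow back to the logits, as in the training loops in these scripts
print(loss.item())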
+ +import logging +import os +import sys +import tempfile +from glob import glob + +import nibabel as nib +import numpy as np +import torch +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +import monai +from monai.data import create_test_image_3d, list_data_collate +from monai.inferers import sliding_window_inference +from monai.metrics import DiceMetric +from monai.transforms import ( + AsChannelFirstd, + Compose, + LoadNiftid, + RandCropByPosNegLabeld, + RandRotate90d, + ScaleIntensityd, + ToTensord, +) +from monai.visualize import plot_2d_or_3d_image + + +def main(tempdir): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + # create a temporary directory and 40 random image, mask pairs + print(f"generating synthetic data to {tempdir} (this may take a while)") + for i in range(40): + im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1) + + n = nib.Nifti1Image(im, np.eye(4)) + nib.save(n, os.path.join(tempdir, f"img{i:d}.nii.gz")) + + n = nib.Nifti1Image(seg, np.eye(4)) + nib.save(n, os.path.join(tempdir, f"seg{i:d}.nii.gz")) + + images = sorted(glob(os.path.join(tempdir, "img*.nii.gz"))) + segs = sorted(glob(os.path.join(tempdir, "seg*.nii.gz"))) + train_files = [{"img": img, "seg": seg} for img, seg in zip(images[:20], segs[:20])] + val_files = [{"img": img, "seg": seg} for img, seg in zip(images[-20:], segs[-20:])] + + # define transforms for image and segmentation + train_transforms = Compose( + [ + LoadNiftid(keys=["img", "seg"]), + AsChannelFirstd(keys=["img", "seg"], channel_dim=-1), + ScaleIntensityd(keys="img"), + RandCropByPosNegLabeld( + keys=["img", "seg"], label_key="seg", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4 + ), + RandRotate90d(keys=["img", "seg"], prob=0.5, spatial_axes=[0, 2]), + ToTensord(keys=["img", "seg"]), + ] + ) + val_transforms = Compose( + [ + LoadNiftid(keys=["img", "seg"]), + AsChannelFirstd(keys=["img", "seg"], channel_dim=-1), + ScaleIntensityd(keys="img"), + ToTensord(keys=["img", "seg"]), + ] + ) + + # define dataset, data loader + check_ds = monai.data.Dataset(data=train_files, transform=train_transforms) + # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training + check_loader = DataLoader(check_ds, batch_size=2, num_workers=4, collate_fn=list_data_collate) + check_data = monai.utils.misc.first(check_loader) + print(check_data["img"].shape, check_data["seg"].shape) + + # create a training data loader + train_ds = monai.data.Dataset(data=train_files, transform=train_transforms) + # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training + train_loader = DataLoader( + train_ds, + batch_size=2, + shuffle=True, + num_workers=4, + collate_fn=list_data_collate, + pin_memory=torch.cuda.is_available(), + ) + # create a validation data loader + val_ds = monai.data.Dataset(data=val_files, transform=val_transforms) + val_loader = DataLoader(val_ds, batch_size=1, num_workers=4, collate_fn=list_data_collate) + dice_metric = DiceMetric(include_background=True, to_onehot_y=False, sigmoid=True, reduction="mean") + + # create UNet, DiceLoss and Adam optimizer + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = monai.networks.nets.UNet( + dimensions=3, + in_channels=1, + out_channels=1, + channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + num_res_units=2, + ).to(device) + loss_function = 
monai.losses.DiceLoss(sigmoid=True) + optimizer = torch.optim.Adam(model.parameters(), 1e-3) + + # start a typical PyTorch training + val_interval = 2 + best_metric = -1 + best_metric_epoch = -1 + epoch_loss_values = list() + metric_values = list() + writer = SummaryWriter() + for epoch in range(5): + print("-" * 10) + print(f"epoch {epoch + 1}/{5}") + model.train() + epoch_loss = 0 + step = 0 + for batch_data in train_loader: + step += 1 + inputs, labels = batch_data["img"].to(device), batch_data["seg"].to(device) + optimizer.zero_grad() + outputs = model(inputs) + loss = loss_function(outputs, labels) + loss.backward() + optimizer.step() + epoch_loss += loss.item() + epoch_len = len(train_ds) // train_loader.batch_size + print(f"{step}/{epoch_len}, train_loss: {loss.item():.4f}") + writer.add_scalar("train_loss", loss.item(), epoch_len * epoch + step) + epoch_loss /= step + epoch_loss_values.append(epoch_loss) + print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}") + + if (epoch + 1) % val_interval == 0: + model.eval() + with torch.no_grad(): + metric_sum = 0.0 + metric_count = 0 + val_images = None + val_labels = None + val_outputs = None + for val_data in val_loader: + val_images, val_labels = val_data["img"].to(device), val_data["seg"].to(device) + roi_size = (96, 96, 96) + sw_batch_size = 4 + val_outputs = sliding_window_inference(val_images, roi_size, sw_batch_size, model) + value = dice_metric(y_pred=val_outputs, y=val_labels) + metric_count += len(value) + metric_sum += value.item() * len(value) + metric = metric_sum / metric_count + metric_values.append(metric) + if metric > best_metric: + best_metric = metric + best_metric_epoch = epoch + 1 + torch.save(model.state_dict(), "best_metric_model_segmentation3d_dict.pth") + print("saved new best metric model") + print( + "current epoch: {} current mean dice: {:.4f} best mean dice: {:.4f} at epoch {}".format( + epoch + 1, metric, best_metric, best_metric_epoch + ) + ) + writer.add_scalar("val_mean_dice", metric, epoch + 1) + # plot the last model output as GIF image in TensorBoard with the corresponding image and label + plot_2d_or_3d_image(val_images, epoch + 1, writer, index=0, tag="image") + plot_2d_or_3d_image(val_labels, epoch + 1, writer, index=0, tag="label") + plot_2d_or_3d_image(val_outputs, epoch + 1, writer, index=0, tag="output") + + print(f"train completed, best_metric: {best_metric:.4f} at epoch: {best_metric_epoch}") + writer.close() + + +if __name__ == "__main__": + with tempfile.TemporaryDirectory() as tempdir: + main(tempdir) diff --git a/unet_segmentation_3d_catalyst.ipynb b/3d_segmentation/unet_segmentation_3d_catalyst.ipynb similarity index 99% rename from unet_segmentation_3d_catalyst.ipynb rename to 3d_segmentation/unet_segmentation_3d_catalyst.ipynb index a4ffebdd76..130cf87570 100644 --- a/unet_segmentation_3d_catalyst.ipynb +++ b/3d_segmentation/unet_segmentation_3d_catalyst.ipynb @@ -24,7 +24,7 @@ "\n", "This tutorial is based on [unet_training_dict.py](https://github.com/Project-MONAI/MONAI/blob/master/examples/segmentation_3d/unet_training_dict.py) and [spleen_segmentation_3d.ipynb](https://github.com/Project-MONAI/Tutorials/blob/master/spleen_segmentation_3d.ipynb).\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/unet_segmentation_3d_catalyst.ipynb)" + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/3d_segmentation/unet_segmentation_3d_catalyst.ipynb)" ] }, { @@ -653,7 +653,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/unet_segmentation_3d_ignite.ipynb b/3d_segmentation/unet_segmentation_3d_ignite.ipynb similarity index 99% rename from unet_segmentation_3d_ignite.ipynb rename to 3d_segmentation/unet_segmentation_3d_ignite.ipynb index b67bbef971..0ef31ca87b 100644 --- a/unet_segmentation_3d_ignite.ipynb +++ b/3d_segmentation/unet_segmentation_3d_ignite.ipynb @@ -6,7 +6,7 @@ "source": [ "# 3D Segmentation with UNet\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/unet_segmentation_3d_ignite.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/3d_segmentation/unet_segmentation_3d_ignite.ipynb)" ] }, { @@ -478,7 +478,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/README.md b/README.md index 91d178d644..4f619ed50a 100644 --- a/README.md +++ b/README.md @@ -18,65 +18,104 @@ Or install all optional requirements by: ``` pip install -r https://raw.githubusercontent.com/Project-MONAI/MONAI/master/requirements-dev.txt ``` -### 2. List of notebooks -#### [3d_image_transforms](./3d_image_transforms.ipynb) -This notebook demonstrates the transformations on volumetric images. -#### [automatic_mixed_precision](./automatic_mixed_precision.ipynb) +### 2. List of notebooks and examples +**2D classification** +#### [mednist_tutorial](./2d_classification/mednist_tutorial.ipynb) +This notebook shows how to easily integrate MONAI features into existing PyTorch programs. +It's based on the MedNIST dataset which is very suitable for beginners as a tutorial. +The content is also available as [a Colab tutorial](https://colab.research.google.com/drive/1wy8XUSnNWlhDNazFdvGBHLfdkGvOHBKe). + +**2D segmentation** +#### [torch examples](./2d_segmentation/torch) +Training and evaluation examples of 2D segmentation based on UNet and synthetic dataset. +The examples are standard PyTorch programs and have both dictionary-based and array-based versions. + +**3D classification** +#### [ignite examples](./3d_classification/ignite) +Training and evaluation examples of 3D classification based on DenseNet3D and [IXI dataset](https://brain-development.org/ixi-dataset). +The examples are PyTorch Ignite programs and have both dictionary-based and array-based transformation versions. +#### [torch examples](./3d_classification/torch) +Training and evaluation examples of 3D classification based on DenseNet3D and [IXI dataset](https://brain-development.org/ixi-dataset). +The examples are standard PyTorch programs and have both dictionary-based and array-based transformation versions. + +**3D segmentation** +#### [ignite examples](./3d_segmentation/ignite) +Training and evaluation examples of 3D segmentation based on UNet3D and synthetic dataset. +The examples are PyTorch Ignite programs and have both dictionary-base and array-based transformations. 
+#### [torch examples](./3d_segmentation/torch)
+Training and evaluation examples of 3D segmentation based on UNet3D and synthetic dataset.
+The examples are standard PyTorch programs and have both dictionary-based and array-based versions.
+#### [brats_segmentation_3d](./3d_segmentation/brats_segmentation_3d.ipynb)
+This tutorial shows how to construct a training workflow of multi-labels segmentation task based on [MSD Brain Tumor dataset](http://medicaldecathlon.com).
+#### [spleen_segmentation_3d_lightning](./3d_segmentation/spleen_segmentation_3d_lightning.ipynb)
+This notebook shows how MONAI may be used in conjunction with the [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning) framework.
+#### [spleen_segmentation_3d](./3d_segmentation/spleen_segmentation_3d.ipynb)
+This notebook is an end-to-end training and evaluation example of 3D segmentation based on [MSD Spleen dataset](http://medicaldecathlon.com).
+The example shows the flexibility of MONAI modules in a PyTorch-based program:
+- Transforms for dictionary-based training data structure.
+- Load NIfTI images with metadata.
+- Scale medical image intensity with expected range.
+- Crop out a batch of balanced image patch samples based on positive / negative label ratio.
+- Cache IO and transforms to accelerate training and validation.
+- 3D UNet, Dice loss function, Mean Dice metric for 3D segmentation task.
+- Sliding window inference.
+- Deterministic training for reproducibility.
+#### [unet_segmentation_3d_catalyst](./3d_segmentation/unet_segmentation_3d_catalyst.ipynb)
+This notebook shows how MONAI may be used in conjunction with the [Catalyst](https://github.com/catalyst-team/catalyst) framework.
+#### [unet_segmentation_3d_ignite](./3d_segmentation/unet_segmentation_3d_ignite.ipynb)
+This notebook is an end-to-end training & evaluation example of 3D segmentation based on synthetic dataset.
+The example is a PyTorch Ignite program and shows several key features of MONAI, especially with medical domain specific transforms and event handlers.
+
+**acceleration**
+#### [distributed_training](./acceleration/distributed_training)
+The examples show how to execute distributed training and evaluation based on 3 different frameworks:
+- PyTorch native `DistributedDataParallel` module with `torch.distributed.launch`.
+- Horovod APIs with `horovodrun`.
+- PyTorch Ignite and MONAI workflows.
+
+They can run on several distributed nodes with multiple GPU devices on every node.
+#### [automatic_mixed_precision](./acceleration/automatic_mixed_precision.ipynb)
 This tutorial shows how to apply the automatic mixed precision(AMP) feature of PyTorch into training and evaluation programs.
 And compares the training speed and memory usage with/without AMP.
-#### [brats_segmentation_3d](./brats_segmentation_3d.ipynb)
-This tutorial shows how to construct a training workflow of multi-labels segmentation task based on [MSD Brain Tumor dataset](http://medicaldecathlon.com).
-#### [dataset_type_performance](./dataset_type_performance.ipynb)
+#### [dataset_type_performance](./acceleration/dataset_type_performance.ipynb)
 This notebook compares the performance of `Dataset`, `CacheDataset` and `PersistentDataset`.
 These classes differ in how data is stored (in memory or on disk), and at which moment transforms are applied.
-#### [fast_training_tutorial](./fast_training_tutorial.ipynb)
+#### [fast_training_tutorial](./acceleration/fast_training_tutorial.ipynb)
 This tutorial compares the training performance of pure PyTorch program and optimized program in MONAI based on NVIDIA GPU device and latest CUDA library.
 The optimization methods mainly include: `AMP`, `CacheDataset` and `Novograd`.
-#### [integrate_3rd_party_transforms](./integrate_3rd_party_transforms.ipynb)
+#### [multi_gpu_test](./acceleration/multi_gpu_test.ipynb)
+This notebook is a quick demo that runs the Ignite trainer engine on CPU, GPU and multiple GPUs.
+#### [transform_speed](./acceleration/transform_speed.ipynb)
+Illustrate reading NIfTI files and test speed of different transforms on different devices.
+
+**modules**
+#### [workflows](./modules/workflows)
+Training and evaluation examples of 3D segmentation based on UNet3D and synthetic dataset, and GAN training and evaluation examples for a medical image generative adversarial network. An easy-to-run training script uses `GanTrainer` to train a 2D CT scan reconstruction network, and the evaluation script generates random samples from a trained network.
+
+The examples are built with MONAI workflows and mainly contain: trainer/evaluator, handlers, post_transforms, etc.
+#### [3d_image_transforms](./modules/3d_image_transforms.ipynb)
+This notebook demonstrates the transformations on volumetric images.
+#### [integrate_3rd_party_transforms](./modules/integrate_3rd_party_transforms.ipynb)
 This tutorial shows how to integrate 3rd party transforms into MONAI program.
 Mainly shows transforms from BatchGenerator, TorchIO, Rising and ITK.
-#### [load_medical_imagesl](./load_medical_images.ipynb)
+#### [load_medical_images](./modules/load_medical_images.ipynb)
 This notebook introduces how to easily load different formats of medical images in MONAI and execute many additional operations.
-#### [mednist_GAN_tutorial](./mednist_GAN_tutorial.ipynb)
+#### [mednist_GAN_tutorial](./modules/mednist_GAN_tutorial.ipynb)
 This notebook illustrates the use of MONAI for training a network to generate images from a random input tensor.
 A simple GAN is employed to do with a separate Generator and Discriminator networks.
-#### [mednist_GAN_workflow](./mednist_GAN_workflow.ipynb)
+#### [mednist_GAN_workflow](./modules/mednist_GAN_workflow.ipynb)
 This notebook shows the `GanTrainer`, a MONAI workflow engine for modularized adversarial learning. Train a medical image reconstruction network using the MedNIST hand CT scan dataset. Based on the tutorial.
-#### [mednist_tutorial](./mednist_tutorial.ipynb)
-This notebook shows how to easily integrate MONAI features into existing PyTorch programs.
-It's based on the MedNIST dataset which is very suitable for beginners as a tutorial.
-The content is also available as [a Colab tutorial](https://colab.research.google.com/drive/1wy8XUSnNWlhDNazFdvGBHLfdkGvOHBKe).
-#### [models_ensemble](./models_ensemble.ipynb)
+#### [models_ensemble](./modules/models_ensemble.ipynb)
 This tutorial shows how to leverage `EnsembleEvaluator`, `MeanEnsemble` and `VoteEnsemble` modules in MONAI to set up ensemble program.
-#### [multi_gpu_test](./multi_gpu_test.ipynb)
-This notebook is a quick demo for devices, run the Ignite trainer engine on CPU, GPU and multiple GPUs.
-#### [nifti_read_example](./nifti_read_example.ipynb) +#### [nifti_read_example](./modules/nifti_read_example.ipynb) Illustrate reading NIfTI files and iterating over image patches of the volumes loaded from them. -#### [dynunet_tutorial](./dynunet_tutorial.ipynb) +#### [dynunet_tutorial](./modules/dynunet_tutorial.ipynb) This tutorial shows how to train 3D segmentation tasks on all the 10 decathlon datasets with the reimplementation of dynUNet in MONAI. -#### [post_transforms](./post_transforms.ipynb) +#### [post_transforms](./modules/post_transforms.ipynb) This notebook shows the usage of several post transforms based on the model output of spleen segmentation task. -#### [public_datasets](./public_datasets.ipynb) +#### [public_datasets](./modules/public_datasets.ipynb) This notebook shows how to quickly set up training workflow based on `MedNISTDataset` and `DecathlonDataset`, and how to create a new dataset. -#### [spleen_segmentation_3d](./spleen_segmentation_3d.ipynb) -This notebook is an end-to-end training and evaluation example of 3D segmentation based on [MSD Spleen dataset](http://medicaldecathlon.com). -The example shows the flexibility of MONAI modules in a PyTorch-based program: -- Transforms for dictionary-based training data structure. -- Load NIfTI images with metadata. -- Scale medical image intensity with expected range. -- Crop out a batch of balanced image patch samples based on positive / negative label ratio. -- Cache IO and transforms to accelerate training and validation. -- 3D UNet, Dice loss function, Mean Dice metric for 3D segmentation task. -- Sliding window inference. -- Deterministic training for reproducibility. -#### [spleen_segmentation_3d_lightning](./spleen_segmentation_3d_lightning.ipynb) -This notebook shows how MONAI may be used in conjunction with the [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning) framework. -#### [unet_segmentation_3d_catalyst](./unet_segmentation_3d_catalyst.ipynb) -This notebook shows how MONAI may be used in conjunction with the [Catalyst](https://github.com/catalyst-team/catalyst) framework. -#### [transform_speed](./transform_speed.ipynb) -Illustrate reading NIfTI files and test speed of different transforms on different devices. -#### [transforms_demo_2d](./transforms_demo_2d.ipynb) +#### [transforms_demo_2d](./modules/transforms_demo_2d.ipynb) This notebook demonstrates the image transformations on histology images using [the GlaS Contest dataset](https://warwick.ac.uk/fac/sci/dcs/research/tia/glascontest/download/). -#### [unet_segmentation_3d_ignite](./unet_segmentation_3d_ignite.ipynb) -This notebook is an end-to-end training & evaluation example of 3D segmentation based on synthetic dataset. -The example is a PyTorch Ignite program and shows several key features of MONAI, especially with medical domain specific transforms and event handlers. 
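The `distributed_training` examples added in this change all follow the same PyTorch `DistributedDataParallel` recipe: initialize the process group, shard the data across ranks with `DistributedSampler`, and wrap the network in `DistributedDataParallel` before running an otherwise ordinary train or evaluate loop. A minimal sketch of that recipe is shown below, assuming a CUDA node with NCCL and the `torch.distributed.launch` launcher; the script name, the toy tensors and the linear model are illustrative stand-ins, not part of the new scripts:

```
# sketch_ddp.py -- minimal sketch of the DDP pattern used by the distributed_training examples
# (illustrative only; the real scripts use MONAI datasets, dictionary transforms and 3D networks)
# launch: python -m torch.distributed.launch --nproc_per_node=NUM_GPUS sketch_ddp.py
import argparse

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler


def main():
    parser = argparse.ArgumentParser()
    # torch.distributed.launch passes --local_rank to every spawned process
    parser.add_argument("--local_rank", type=int, default=0)
    args = parser.parse_args()

    # every GPU runs in its own process; NCCL backend, connection info taken from env vars
    dist.init_process_group(backend="nccl", init_method="env://")
    device = torch.device(f"cuda:{args.local_rank}")

    # toy data: DistributedSampler gives each rank its own shard of the dataset
    ds = TensorDataset(torch.randn(64, 10), torch.randn(64, 1))
    sampler = DistributedSampler(ds)
    loader = DataLoader(ds, batch_size=8, sampler=sampler, pin_memory=True)

    # wrap the model with DistributedDataParallel after moving it to the expected device
    model = DistributedDataParallel(torch.nn.Linear(10, 1).to(device), device_ids=[args.local_rank])
    optimizer = torch.optim.Adam(model.parameters(), 1e-3)
    loss_function = torch.nn.MSELoss()

    for epoch in range(2):
        sampler.set_epoch(epoch)  # reshuffle the shards between epochs
        for x, y in loader:
            optimizer.zero_grad()
            loss = loss_function(model(x.to(device)), y.to(device))
            loss.backward()  # gradients are all-reduced across ranks here
            optimizer.step()

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```

The new `unet_training_ddp.py`, `unet_evaluation_ddp.py` and `brats_training_ddp.py` scripts below fill this skeleton in with synthetic or Decathlon data, dictionary transforms and 3D networks, while the Horovod and MONAI-workflow variants swap the launcher and the engine but keep the same data-sharding idea.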
diff --git a/automatic_mixed_precision.ipynb b/acceleration/automatic_mixed_precision.ipynb similarity index 99% rename from automatic_mixed_precision.ipynb rename to acceleration/automatic_mixed_precision.ipynb index 7d63e2bd96..043e169ec1 100644 --- a/automatic_mixed_precision.ipynb +++ b/acceleration/automatic_mixed_precision.ipynb @@ -16,7 +16,7 @@ "\n", "The Spleen dataset can be downloaded from http://medicaldecathlon.com/.\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/automatic_mixed_precision.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/acceleration/automatic_mixed_precision.ipynb)" ] }, { diff --git a/dataset_type_performance.ipynb b/acceleration/dataset_type_performance.ipynb similarity index 99% rename from dataset_type_performance.ipynb rename to acceleration/dataset_type_performance.ipynb index 2b658e08e9..18782c6393 100644 --- a/dataset_type_performance.ipynb +++ b/acceleration/dataset_type_performance.ipynb @@ -17,7 +17,7 @@ "\n", "It's modified from the Spleen 3D segmentation tutorial notebook.\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/persistent_dataset_speed.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/acceleration/persistent_dataset_speed.ipynb)" ] }, { @@ -694,7 +694,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/acceleration/distributed_training/brats_training_ddp.py b/acceleration/distributed_training/brats_training_ddp.py new file mode 100644 index 0000000000..1dcbb084b8 --- /dev/null +++ b/acceleration/distributed_training/brats_training_ddp.py @@ -0,0 +1,475 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This example shows how to execute distributed training based on PyTorch native `DistributedDataParallel` module. +It can run on several nodes with multiple GPU devices on every node. + +This example is a real-world task based on Decathlon challenge Task01: Brain Tumor segmentation. +So it's more complicated than other distributed training demo examples. + +Main steps to set up the distributed training: + +- Execute `torch.distributed.launch` to create processes on every node for every GPU. + It receives parameters as below: + `--nproc_per_node=NUM_GPUS_PER_NODE` + `--nnodes=NUM_NODES` + `--node_rank=INDEX_CURRENT_NODE` + `--master_addr="192.168.1.1"` + `--master_port=1234` + For more details, refer to https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py. 
+ Alternatively, we can also use `torch.multiprocessing.spawn` to start program, but it that case, need to handle + all the above parameters and compute `rank` manually, then set to `init_process_group`, etc. + `torch.distributed.launch` is even more efficient than `torch.multiprocessing.spawn` during training. +- Use `init_process_group` to initialize every process, every GPU runs in a separate process with unique rank. + Here we use `NVIDIA NCCL` as the backend and must set `init_method="env://"` if use `torch.distributed.launch`. +- Wrap the model with `DistributedDataParallel` after moving to expected device. +- Partition dataset before training, so every rank process will only handle its own data partition. + +Note: + `torch.distributed.launch` will launch `nnodes * nproc_per_node = world_size` processes in total. + Suggest setting exactly the same software environment for every node, especially `PyTorch`, `nccl`, etc. + A good practice is to use the same MONAI docker image for all nodes directly. + Example script to execute this program on every node: + python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_PER_NODE + --nnodes=NUM_NODES --node_rank=INDEX_CURRENT_NODE + --master_addr="192.168.1.1" --master_port=1234 + brats_training_ddp.py -d DIR_OF_TESTDATA + + This example was tested with [Ubuntu 16.04/20.04], [NCCL 2.6.3]. + +Referring to: https://pytorch.org/tutorials/intermediate/ddp_tutorial.html + +Some codes are taken from https://github.com/pytorch/examples/blob/master/imagenet/main.py + +""" + +import argparse +import os +import sys +import time +import warnings + +import numpy as np +import torch +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel +from torch.utils.data.distributed import DistributedSampler +from torch.utils.tensorboard import SummaryWriter + +from monai.apps import DecathlonDataset +from monai.data import DataLoader +from monai.losses import DiceLoss +from monai.metrics import DiceMetric +from monai.networks.nets import SegResNet, UNet +from monai.transforms import ( + AsChannelFirstd, + CenterSpatialCropd, + Compose, + LoadNiftid, + MapTransform, + NormalizeIntensityd, + Orientationd, + RandFlipd, + RandScaleIntensityd, + RandShiftIntensityd, + RandSpatialCropd, + Spacingd, + ToTensord, +) +from monai.utils import set_determinism + + +class ConvertToMultiChannelBasedOnBratsClassesd(MapTransform): + """ + Convert labels to multi channels based on brats classes: + label 1 is the peritumoral edema + label 2 is the GD-enhancing tumor + label 3 is the necrotic and non-enhancing tumor core + The possible classes are TC (Tumor core), WC (Whole tumor) + and ET (Enhancing tumor). + + """ + + def __call__(self, data): + d = dict(data) + for key in self.keys: + result = list() + # merge label 2 and label 3 to construct TC + result.append(np.logical_or(d[key] == 2, d[key] == 3)) + # merge labels 1, 2 and 3 to construct WC + result.append(np.logical_or(np.logical_or(d[key] == 2, d[key] == 3), d[key] == 1)) + # label 2 is ET + result.append(d[key] == 2) + d[key] = np.stack(result, axis=0).astype(np.float32) + return d + + +def partition_dataset(data, shuffle: bool = False, seed: int = 0): + """ + Partition the dataset for distributed training, every rank process only train with its own data partition. + It can be useful for `CacheDataset` or `SmartCacheDataset`, because every rank process can only compute and + cache its own data. 
+ Note that every rank process will shuffle data only in its own partition if set `shuffle=True` to DataLoader. + The alternative solution is to use `DistributedSampler`, which supports global shuffle before every epoch. + But if using `CacheDataset` or `SmartCacheDataset`, every rank process will cache duplicated data content and + raise system memory usage. + Args: + data: data list to partition, assumed to be of constant size. + shuffle: if true, will shuffle the indices of data list before partition. + seed: random seed to shuffle the indices if `shuffle=True`. + this number should be identical across all processes in the distributed group. + """ + sampler: DistributedSampler = DistributedSampler(dataset=data, shuffle=shuffle) # type: ignore + sampler.set_epoch(seed) + return [data[i] for i in sampler] + + +class BratsCacheDataset(DecathlonDataset): + def __init__( + self, + root_dir, + section, + transform=LoadNiftid(["image", "label"]), + cache_rate=1.0, + num_workers=0, + shuffle=False, + ) -> None: + + if not os.path.isdir(root_dir): + raise ValueError("Root directory root_dir must be a directory.") + self.section = section + self.shuffle = shuffle + self.val_frac = 0.2 + self.set_random_state(seed=0) + dataset_dir = os.path.join(root_dir, "Task01_BrainTumour") + if not os.path.exists(dataset_dir): + raise RuntimeError( + f"Cannot find dataset directory: {dataset_dir}, please download it from Decathlon challenge." + ) + data = self._generate_data_list(dataset_dir) + super(DecathlonDataset, self).__init__(data, transform, cache_rate=cache_rate, num_workers=num_workers) + + def _generate_data_list(self, dataset_dir): + data = super()._generate_data_list(dataset_dir) + return partition_dataset(data, shuffle=self.shuffle, seed=0) + + +def main_worker(args): + # disable logging for processes except 0 on every node + if args.local_rank != 0: + f = open(os.devnull, "w") + sys.stdout = sys.stderr = f + if not os.path.exists(args.dir): + raise FileNotFoundError(f"Missing directory {args.dir}") + + # initialize the distributed training process, every GPU runs in a process + dist.init_process_group(backend="nccl", init_method="env://") + + total_start = time.time() + train_transforms = Compose( + [ + # load 4 Nifti images and stack them together + LoadNiftid(keys=["image", "label"]), + AsChannelFirstd(keys="image"), + ConvertToMultiChannelBasedOnBratsClassesd(keys="label"), + Spacingd(keys=["image", "label"], pixdim=(1.5, 1.5, 2.0), mode=("bilinear", "nearest")), + Orientationd(keys=["image", "label"], axcodes="RAS"), + RandSpatialCropd(keys=["image", "label"], roi_size=[128, 128, 64], random_size=False), + NormalizeIntensityd(keys="image", nonzero=True, channel_wise=True), + RandFlipd(keys=["image", "label"], prob=0.5, spatial_axis=0), + RandScaleIntensityd(keys="image", factors=0.1, prob=0.5), + RandShiftIntensityd(keys="image", offsets=0.1, prob=0.5), + ToTensord(keys=["image", "label"]), + ] + ) + + # create a training data loader + train_ds = BratsCacheDataset( + root_dir=args.dir, + transform=train_transforms, + section="training", + num_workers=4, + cache_rate=args.cache_rate, + shuffle=True, + ) + train_loader = DataLoader( + train_ds, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True + ) + + # validation transforms and dataset + val_transforms = Compose( + [ + LoadNiftid(keys=["image", "label"]), + AsChannelFirstd(keys="image"), + ConvertToMultiChannelBasedOnBratsClassesd(keys="label"), + Spacingd(keys=["image", "label"], pixdim=(1.5, 1.5, 2.0), 
mode=("bilinear", "nearest")), + Orientationd(keys=["image", "label"], axcodes="RAS"), + CenterSpatialCropd(keys=["image", "label"], roi_size=[128, 128, 64]), + NormalizeIntensityd(keys="image", nonzero=True, channel_wise=True), + ToTensord(keys=["image", "label"]), + ] + ) + val_ds = BratsCacheDataset( + root_dir=args.dir, + transform=val_transforms, + section="validation", + num_workers=4, + cache_rate=args.cache_rate, + shuffle=False, + ) + val_loader = DataLoader( + val_ds, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True + ) + + if dist.get_rank() == 0: + # Logging for TensorBoard + writer = SummaryWriter(log_dir=args.log_dir) + + # create UNet, DiceLoss and Adam optimizer + device = torch.device(f"cuda:{args.local_rank}") + if args.network == "UNet": + model = UNet( + dimensions=3, + in_channels=4, + out_channels=3, + channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + num_res_units=2, + ).to(device) + else: + model = SegResNet(in_channels=4, out_channels=3, init_filters=16, dropout_prob=0.2).to(device) + loss_function = DiceLoss(to_onehot_y=False, sigmoid=True, squared_pred=True) + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-5, amsgrad=True) + # wrap the model with DistributedDataParallel module + model = DistributedDataParallel(model, device_ids=[args.local_rank]) + + # start a typical PyTorch training + total_epoch = args.epochs + best_metric = -1000000 + best_metric_epoch = -1 + epoch_time = AverageMeter("Time", ":6.3f") + progress = ProgressMeter(total_epoch, [epoch_time], prefix="Epoch: ") + end = time.time() + print(f"Time elapsed before training: {end-total_start}") + for epoch in range(total_epoch): + + train_loss = train(train_loader, model, loss_function, optimizer, epoch, args, device) + epoch_time.update(time.time() - end) + + if epoch % args.print_freq == 0: + progress.display(epoch) + + if dist.get_rank() == 0: + writer.add_scalar("Loss/train", train_loss, epoch) + + if (epoch + 1) % args.val_interval == 0: + metric, metric_tc, metric_wt, metric_et = evaluate(model, val_loader, device) + + if dist.get_rank() == 0: + writer.add_scalar("Mean Dice/val", metric, epoch) + writer.add_scalar("Mean Dice TC/val", metric_tc, epoch) + writer.add_scalar("Mean Dice WT/val", metric_wt, epoch) + writer.add_scalar("Mean Dice ET/val", metric_et, epoch) + if metric > best_metric: + best_metric = metric + best_metric_epoch = epoch + 1 + print( + f"current epoch: {epoch + 1} current mean dice: {metric:.4f}" + f" tc: {metric_tc:.4f} wt: {metric_wt:.4f} et: {metric_et:.4f}" + f"\nbest mean dice: {best_metric:.4f} at epoch: {best_metric_epoch}" + ) + end = time.time() + print(f"Time elapsed after epoch {epoch + 1} is {end - total_start}") + + if dist.get_rank() == 0: + print(f"train completed, best_metric: {best_metric:.4f} at epoch: {best_metric_epoch}") + # all processes should see same parameters as they all start from same + # random parameters and gradients are synchronized in backward passes, + # therefore, saving it in one process is sufficient + torch.save(model.state_dict(), "final_model.pth") + writer.flush() + dist.destroy_process_group() + + +def train(train_loader, model, criterion, optimizer, epoch, args, device): + batch_time = AverageMeter("Time", ":6.3f") + data_time = AverageMeter("Data", ":6.3f") + losses = AverageMeter("Loss", ":.4e") + progress = ProgressMeter(len(train_loader), [batch_time, data_time, losses], prefix="Epoch: [{}]".format(epoch)) + + # switch to train mode + model.train() + + end 
= time.time() + for i, batch_data in enumerate(train_loader): + image = batch_data["image"].to(device, non_blocking=True) + target = batch_data["label"].to(device, non_blocking=True) + + # measure data loading time + data_time.update(time.time() - end) + + # compute output + optimizer.zero_grad() + output = model(image) + loss = criterion(output, target) + + # record loss + losses.update(loss.item(), image.size(0)) + + # compute gradient and do GD step + loss.backward() + optimizer.step() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % 10 == 0: + progress.display(i) + return losses.avg + + +def evaluate(model, data_loader, device): + metric = torch.zeros(8, dtype=torch.float, device=device) + + model.eval() + with torch.no_grad(): + dice_metric = DiceMetric(include_background=True, sigmoid=True, reduction="mean") + for val_data in data_loader: + val_inputs, val_labels = ( + val_data["image"].to(device, non_blocking=True), + val_data["label"].to(device, non_blocking=True), + ) + val_outputs = model(val_inputs) + # compute overall mean dice + value = dice_metric(y_pred=val_outputs, y=val_labels).squeeze() + metric[0] += value * dice_metric.not_nans + metric[1] += dice_metric.not_nans + # compute mean dice for TC + value_tc = dice_metric(y_pred=val_outputs[:, 0:1], y=val_labels[:, 0:1]).squeeze() + metric[2] += value_tc * dice_metric.not_nans + metric[3] += dice_metric.not_nans + # compute mean dice for WT + value_wt = dice_metric(y_pred=val_outputs[:, 1:2], y=val_labels[:, 1:2]).squeeze() + metric[4] += value_wt * dice_metric.not_nans + metric[5] += dice_metric.not_nans + # compute mean dice for ET + value_et = dice_metric(y_pred=val_outputs[:, 2:3], y=val_labels[:, 2:3]).squeeze() + metric[6] += value_et * dice_metric.not_nans + metric[7] += dice_metric.not_nans + + # synchronizes all processes and reduce results + dist.barrier() + dist.all_reduce(metric, op=torch.distributed.ReduceOp.SUM) + metric = metric.tolist() + + return metric[0] / metric[1], metric[2] / metric[3], metric[4] / metric[5], metric[6] / metric[7] + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--dir", default="./testdata", type=str, help="directory of Brain Tumor dataset.") + # must parse the command-line argument: ``--local_rank=LOCAL_PROCESS_RANK``, which will be provided by DDP + parser.add_argument("--local_rank", type=int, help="node rank for distributed training") + parser.add_argument( + "-j", "--workers", default=1, type=int, metavar="N", help="number of data loading workers (default: 1)" + ) + parser.add_argument("--epochs", default=90, type=int, metavar="N", help="number of total epochs to run") + parser.add_argument("--lr", default=1e-4, type=float, help="learning rate") + parser.add_argument( + "-b", + "--batch_size", + default=4, + type=int, + metavar="N", + help="mini-batch size (default: 256), this is the total " + "batch size of all GPUs on the current node when " + "using Data Parallel or Distributed Data Parallel", + ) + parser.add_argument("-p", "--print_freq", default=10, type=int, metavar="N", help="print frequency (default: 10)") + parser.add_argument( + "-e", "--evaluate", dest="evaluate", action="store_true", help="evaluate model on validation set" + ) + parser.add_argument("--seed", default=None, type=int, help="seed for initializing training.") + parser.add_argument("--cache_rate", type=float, default=1.0) + parser.add_argument("--val_interval", type=int, default=5) + parser.add_argument("--network", type=str, 
default="UNet", choices=["UNet", "SegResNet"]) + parser.add_argument("--log_dir", type=str, default=None) + args = parser.parse_args() + + if args.seed is not None: + set_determinism(seed=args.seed) + warnings.warn( + "You have chosen to seed training. " + "This will turn on the CUDNN deterministic setting, " + "which can slow down your training considerably! " + "You may see unexpected behavior when restarting " + "from checkpoints." + ) + + main_worker(args=args) + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self, name, fmt=":f"): + self.name = name + self.fmt = fmt + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print("\t".join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = "{:" + str(num_digits) + "d}" + return "[" + fmt + "/" + fmt.format(num_batches) + "]" + + +# usage example(refer to https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py): + +# python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_PER_NODE +# --nnodes=NUM_NODES --node_rank=INDEX_CURRENT_NODE +# --master_addr="10.110.44.150" --master_port=1234 +# brats_training_ddp.py -d DIR_OF_TESTDATA + +if __name__ == "__main__": + main() diff --git a/acceleration/distributed_training/unet_evaluation_ddp.py b/acceleration/distributed_training/unet_evaluation_ddp.py new file mode 100644 index 0000000000..9cc1851a0c --- /dev/null +++ b/acceleration/distributed_training/unet_evaluation_ddp.py @@ -0,0 +1,166 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This example shows how to execute distributed evaluation based on PyTorch native `DistributedDataParallel` module. +It can run on several nodes with multiple GPU devices on every node. +Main steps to set up the distributed evaluation: + +- Execute `torch.distributed.launch` to create processes on every node for every GPU. + It receives parameters as below: + `--nproc_per_node=NUM_GPUS_PER_NODE` + `--nnodes=NUM_NODES` + `--node_rank=INDEX_CURRENT_NODE` + `--master_addr="192.168.1.1"` + `--master_port=1234` + For more details, refer to https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py. 
+ Alternatively, we can also use `torch.multiprocessing.spawn` to start program, but it that case, need to handle + all the above parameters and compute `rank` manually, then set to `init_process_group`, etc. + `torch.distributed.launch` is even more efficient than `torch.multiprocessing.spawn`. +- Use `init_process_group` to initialize every process, every GPU runs in a separate process with unique rank. + Here we use `NVIDIA NCCL` as the backend and must set `init_method="env://"` if use `torch.distributed.launch`. +- Wrap the model with `DistributedDataParallel` after moving to expected device. +- Put model file on every node, then load and map to expected GPU device in every process. +- Wrap Dataset with `DistributedSampler`, disable the `shuffle` in sampler and DataLoader. +- Compute `Dice Metric` on every process, reduce the results after synchronization. + +Note: + `torch.distributed.launch` will launch `nnodes * nproc_per_node = world_size` processes in total. + Suggest setting exactly the same software environment for every node, especially `PyTorch`, `nccl`, etc. + A good practice is to use the same MONAI docker image for all nodes directly. + Example script to execute this program on every node: + python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_PER_NODE + --nnodes=NUM_NODES --node_rank=INDEX_CURRENT_NODE + --master_addr="192.168.1.1" --master_port=1234 + unet_evaluation_ddp.py -d DIR_OF_TESTDATA + + This example was tested with [Ubuntu 16.04/20.04], [NCCL 2.6.3]. + +Referring to: https://pytorch.org/tutorials/intermediate/ddp_tutorial.html + +""" + +import argparse +import os +from glob import glob + +import nibabel as nib +import numpy as np +import torch +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel +from torch.utils.data.distributed import DistributedSampler + +import monai +from monai.data import DataLoader, Dataset, create_test_image_3d +from monai.inferers import sliding_window_inference +from monai.metrics import DiceMetric +from monai.transforms import AsChannelFirstd, Compose, LoadNiftid, ScaleIntensityd, ToTensord + + +def evaluate(args): + if args.local_rank == 0 and not os.path.exists(args.dir): + # create 16 random image, mask paris for evaluation + print(f"generating synthetic data to {args.dir} (this may take a while)") + os.makedirs(args.dir) + # set random seed to generate same random data for every node + np.random.seed(seed=0) + for i in range(16): + im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1) + n = nib.Nifti1Image(im, np.eye(4)) + nib.save(n, os.path.join(args.dir, f"img{i:d}.nii.gz")) + n = nib.Nifti1Image(seg, np.eye(4)) + nib.save(n, os.path.join(args.dir, f"seg{i:d}.nii.gz")) + + # initialize the distributed evaluation process, every GPU runs in a process + dist.init_process_group(backend="nccl", init_method="env://") + + images = sorted(glob(os.path.join(args.dir, "img*.nii.gz"))) + segs = sorted(glob(os.path.join(args.dir, "seg*.nii.gz"))) + val_files = [{"img": img, "seg": seg} for img, seg in zip(images, segs)] + + # define transforms for image and segmentation + val_transforms = Compose( + [ + LoadNiftid(keys=["img", "seg"]), + AsChannelFirstd(keys=["img", "seg"], channel_dim=-1), + ScaleIntensityd(keys="img"), + ToTensord(keys=["img", "seg"]), + ] + ) + + # create a evaluation data loader + val_ds = Dataset(data=val_files, transform=val_transforms) + # create a evaluation data sampler + val_sampler = DistributedSampler(val_ds, shuffle=False) + # sliding 
window inference need to input 1 image in every iteration + val_loader = DataLoader(val_ds, batch_size=1, shuffle=False, num_workers=2, pin_memory=True, sampler=val_sampler) + dice_metric = DiceMetric(include_background=True, to_onehot_y=False, sigmoid=True, reduction="mean") + + # create UNet, DiceLoss and Adam optimizer + device = torch.device(f"cuda:{args.local_rank}") + model = monai.networks.nets.UNet( + dimensions=3, + in_channels=1, + out_channels=1, + channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + num_res_units=2, + ).to(device) + # wrap the model with DistributedDataParallel module + model = DistributedDataParallel(model, device_ids=[args.local_rank]) + # config mapping to expected GPU device + map_location = {"cuda:0": f"cuda:{args.local_rank}"} + # load model parameters to GPU device + model.load_state_dict(torch.load("final_model.pth", map_location=map_location)) + + model.eval() + with torch.no_grad(): + # define PyTorch Tensor to record metrics result at each GPU + # the first value is `sum` of all dice metric, the second value is `count` of not_nan items + metric = torch.zeros(2, dtype=torch.float, device=device) + for val_data in val_loader: + val_images, val_labels = val_data["img"].to(device), val_data["seg"].to(device) + # define sliding window size and batch size for windows inference + roi_size = (96, 96, 96) + sw_batch_size = 4 + val_outputs = sliding_window_inference(val_images, roi_size, sw_batch_size, model) + value = dice_metric(y_pred=val_outputs, y=val_labels).squeeze() + metric[0] += value * dice_metric.not_nans + metric[1] += dice_metric.not_nans + # synchronizes all processes and reduce results + dist.barrier() + dist.all_reduce(metric, op=torch.distributed.ReduceOp.SUM) + metric = metric.tolist() + if dist.get_rank() == 0: + print("evaluation metric:", metric[0] / metric[1]) + dist.destroy_process_group() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--dir", default="./testdata", type=str, help="directory to create random data") + # must parse the command-line argument: ``--local_rank=LOCAL_PROCESS_RANK``, which will be provided by DDP + parser.add_argument("--local_rank", type=int) + args = parser.parse_args() + + evaluate(args=args) + + +# usage example(refer to https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py): + +# python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_PER_NODE +# --nnodes=NUM_NODES --node_rank=INDEX_CURRENT_NODE +# --master_addr="192.168.1.1" --master_port=1234 +# unet_evaluation_ddp.py -d DIR_OF_TESTDATA + +if __name__ == "__main__": + main() diff --git a/acceleration/distributed_training/unet_evaluation_horovod.py b/acceleration/distributed_training/unet_evaluation_horovod.py new file mode 100644 index 0000000000..463e5bcc6a --- /dev/null +++ b/acceleration/distributed_training/unet_evaluation_horovod.py @@ -0,0 +1,165 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +This example shows how to execute distributed evaluation based on Horovod APIs. +It can run on several nodes with multiple GPU devices on every node. +Main steps to set up the distributed evaluation: + +- Install Horovod referring to the guide: https://github.com/horovod/horovod/blob/master/docs/gpus.rst + If using MONAI docker, which already has NCCL and MPI, can quickly install Horovod with command: + `HOROVOD_NCCL_INCLUDE=/usr/include HOROVOD_NCCL_LIB=/usr/lib/x86_64-linux-gnu HOROVOD_GPU_OPERATIONS=NCCL \ + pip install --no-cache-dir horovod` +- Set SSH permissions for root login without password at all nodes except master, referring to: + http://www.linuxproblem.org/art_9.html +- Run `hvd.init()` to initialize Horovod. +- Pin each GPU to a single process to avoid resource contention, use `hvd.local_rank()` to get GPU index. + And use `hvd.rank()` to get the overall rank index. +- Wrap Dataset with `DistributedSampler`, disable `shuffle` for sampler and DataLoader. +- Broadcast the model parameters from rank 0 to all other processes. + +Note: + Suggest setting exactly the same software environment for every node, especially `mpi`, `nccl`, etc. + A good practice is to use the same MONAI docker image for all nodes directly, if using docker, need + to set SSH permissions both at the node and in docker, referring to Horovod guide for more details: + https://github.com/horovod/horovod/blob/master/docs/docker.rst + + Example script to execute this program, only need to run on the master node: + `horovodrun -np 16 -H server1:4,server2:4,server3:4,server4:4 python unet_evaluation_horovod.py -d "./testdata"` + + This example was tested with [Ubuntu 16.04/20.04], [NCCL 2.6.3], [horovod 0.19.5]. + +Referring to: https://github.com/horovod/horovod/blob/master/examples/pytorch_mnist.py + +""" + +import argparse +import os +from glob import glob + +import horovod.torch as hvd +import nibabel as nib +import numpy as np +import torch +import torch.multiprocessing as mp +from torch.utils.data.distributed import DistributedSampler + +import monai +from monai.data import DataLoader, Dataset, create_test_image_3d +from monai.inferers import sliding_window_inference +from monai.metrics import DiceMetric +from monai.transforms import AsChannelFirstd, Compose, LoadNiftid, ScaleIntensityd, ToTensord + + +def evaluate(args): + # initialize Horovod library + hvd.init() + # Horovod limits CPU threads to be used per worker + torch.set_num_threads(1) + + if hvd.local_rank() == 0 and not os.path.exists(args.dir): + # create 16 random image, mask paris for evaluation + print(f"generating synthetic data to {args.dir} (this may take a while)") + os.makedirs(args.dir) + # set random seed to generate same random data for every node + np.random.seed(seed=0) + for i in range(16): + im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1) + n = nib.Nifti1Image(im, np.eye(4)) + nib.save(n, os.path.join(args.dir, f"img{i:d}.nii.gz")) + n = nib.Nifti1Image(seg, np.eye(4)) + nib.save(n, os.path.join(args.dir, f"seg{i:d}.nii.gz")) + + images = sorted(glob(os.path.join(args.dir, "img*.nii.gz"))) + segs = sorted(glob(os.path.join(args.dir, "seg*.nii.gz"))) + val_files = [{"img": img, "seg": seg} for img, seg in zip(images, segs)] + + # define transforms for image and segmentation + val_transforms = Compose( + [ + LoadNiftid(keys=["img", "seg"]), + AsChannelFirstd(keys=["img", "seg"], channel_dim=-1), + ScaleIntensityd(keys="img"), + ToTensord(keys=["img", "seg"]), + ] + ) + + # create a evaluation 
data loader + val_ds = Dataset(data=val_files, transform=val_transforms) + # create a evaluation data sampler + val_sampler = DistributedSampler(val_ds, shuffle=False, num_replicas=hvd.size(), rank=hvd.rank()) + # when supported, use "forkserver" to spawn dataloader workers instead of "fork" to prevent + # issues with Infiniband implementations that are not fork-safe + multiprocessing_context = None + if hasattr(mp, "_supports_context") and mp._supports_context and "forkserver" in mp.get_all_start_methods(): + multiprocessing_context = "forkserver" + # sliding window inference need to input 1 image in every iteration + val_loader = DataLoader( + val_ds, + batch_size=1, + shuffle=False, + num_workers=2, + pin_memory=True, + sampler=val_sampler, + multiprocessing_context=multiprocessing_context, + ) + dice_metric = DiceMetric(include_background=True, to_onehot_y=False, sigmoid=True, reduction="mean") + + # create UNet, DiceLoss and Adam optimizer + device = torch.device(f"cuda:{hvd.local_rank()}") + model = monai.networks.nets.UNet( + dimensions=3, + in_channels=1, + out_channels=1, + channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + num_res_units=2, + ).to(device) + if hvd.rank() == 0: + # load model parameters for evaluation + model.load_state_dict(torch.load("final_model.pth")) + # Horovod broadcasts parameters + hvd.broadcast_parameters(model.state_dict(), root_rank=0) + + model.eval() + with torch.no_grad(): + # define PyTorch Tensor to record metrics result at each GPU + # the first value is `sum` of all dice metric, the second value is `count` of not_nan items + metric = torch.zeros(2, dtype=torch.float, device=device) + for val_data in val_loader: + val_images, val_labels = val_data["img"].to(device), val_data["seg"].to(device) + # define sliding window size and batch size for windows inference + roi_size = (96, 96, 96) + sw_batch_size = 4 + val_outputs = sliding_window_inference(val_images, roi_size, sw_batch_size, model) + value = dice_metric(y_pred=val_outputs, y=val_labels).squeeze() + metric[0] += value * dice_metric.not_nans + metric[1] += dice_metric.not_nans + # synchronizes all processes and reduce results + print(f"metric in rank {hvd.rank()}: sum={metric[0].item()}, count={metric[1].item()}") + avg_metric = hvd.allreduce(metric, name="mean_dice") + if hvd.rank() == 0: + print(f"average metric: sum={avg_metric[0].item()}, count={avg_metric[1].item()}") + print("evaluation metric:", (avg_metric[0] / avg_metric[1]).item()) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--dir", default="./testdata", type=str, help="directory to create random data") + args = parser.parse_args() + + evaluate(args=args) + + +# Example script to execute this program only on the master node: +# horovodrun -np 16 -H server1:4,server2:4,server3:4,server4:4 python unet_evaluation_horovod.py -d "./testdata" +if __name__ == "__main__": + main() diff --git a/acceleration/distributed_training/unet_evaluation_workflows.py b/acceleration/distributed_training/unet_evaluation_workflows.py new file mode 100644 index 0000000000..22b41206a4 --- /dev/null +++ b/acceleration/distributed_training/unet_evaluation_workflows.py @@ -0,0 +1,203 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This example shows how to execute distributed evaluation based on PyTorch native `DistributedDataParallel` module
+and MONAI workflows. It can run on several nodes with multiple GPU devices on every node.
+Main steps to set up the distributed evaluation:
+
+- Execute `torch.distributed.launch` to create processes on every node for every GPU.
+  It receives parameters as below:
+  `--nproc_per_node=NUM_GPUS_PER_NODE`
+  `--nnodes=NUM_NODES`
+  `--node_rank=INDEX_CURRENT_NODE`
+  `--master_addr="192.168.1.1"`
+  `--master_port=1234`
+  For more details, refer to https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py.
+  Alternatively, we can also use `torch.multiprocessing.spawn` to start the program, but in that case we need to handle
+  all the above parameters and compute `rank` manually, then pass them to `init_process_group`, etc.
+  `torch.distributed.launch` is more efficient than `torch.multiprocessing.spawn`.
+- Use `init_process_group` to initialize every process, every GPU runs in a separate process with unique rank.
+  Here we use `NVIDIA NCCL` as the backend and must set `init_method="env://"` if using `torch.distributed.launch`.
+- Wrap the model with `DistributedDataParallel` after moving to expected device.
+- Put the model file on every node, then load and map it to the expected GPU device in every process.
+- Wrap Dataset with `DistributedSampler`, disable the `shuffle` in sampler and DataLoader.
+- Add `StatsHandler` and `SegmentationSaver` to the master process which is `dist.get_rank() == 0`.
+- ignite can automatically reduce metrics for distributed evaluation, refer to:
+  https://github.com/pytorch/ignite/blob/v0.3.0/ignite/metrics/metric.py#L85
+
+Note:
+    `torch.distributed.launch` will launch `nnodes * nproc_per_node = world_size` processes in total.
+    Suggest setting exactly the same software environment for every node, especially `PyTorch`, `nccl`, etc.
+    A good practice is to use the same MONAI docker image for all nodes directly.
+    Example script to execute this program on every node:
+    python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_PER_NODE
+           --nnodes=NUM_NODES --node_rank=INDEX_CURRENT_NODE
+           --master_addr="192.168.1.1" --master_port=1234
+           unet_evaluation_workflows.py -d DIR_OF_TESTDATA
+
+    This example was tested with [Ubuntu 16.04/20.04], [NCCL 2.6.3].
+ +Referring to: https://pytorch.org/tutorials/intermediate/ddp_tutorial.html + +""" + +import argparse +import logging +import os +import sys +from glob import glob + +import nibabel as nib +import numpy as np +import torch +import torch.distributed as dist +from ignite.metrics import Accuracy +from torch.nn.parallel import DistributedDataParallel +from torch.utils.data.distributed import DistributedSampler + +import monai +from monai.data import DataLoader, Dataset, create_test_image_3d +from monai.engines import SupervisedEvaluator +from monai.handlers import CheckpointLoader, MeanDice, SegmentationSaver, StatsHandler +from monai.inferers import SlidingWindowInferer +from monai.transforms import ( + Activationsd, + AsChannelFirstd, + AsDiscreted, + Compose, + KeepLargestConnectedComponentd, + LoadNiftid, + ScaleIntensityd, + ToTensord, +) + + +def evaluate(args): + if args.local_rank == 0 and not os.path.exists(args.dir): + # create 16 random image, mask paris for evaluation + print(f"generating synthetic data to {args.dir} (this may take a while)") + os.makedirs(args.dir) + # set random seed to generate same random data for every node + np.random.seed(seed=0) + for i in range(16): + im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1) + n = nib.Nifti1Image(im, np.eye(4)) + nib.save(n, os.path.join(args.dir, f"img{i:d}.nii.gz")) + n = nib.Nifti1Image(seg, np.eye(4)) + nib.save(n, os.path.join(args.dir, f"seg{i:d}.nii.gz")) + + # initialize the distributed evaluation process, every GPU runs in a process + dist.init_process_group(backend="nccl", init_method="env://") + + images = sorted(glob(os.path.join(args.dir, "img*.nii.gz"))) + segs = sorted(glob(os.path.join(args.dir, "seg*.nii.gz"))) + val_files = [{"image": img, "label": seg} for img, seg in zip(images, segs)] + + # define transforms for image and segmentation + val_transforms = Compose( + [ + LoadNiftid(keys=["image", "label"]), + AsChannelFirstd(keys=["image", "label"], channel_dim=-1), + ScaleIntensityd(keys="image"), + ToTensord(keys=["image", "label"]), + ] + ) + + # create a evaluation data loader + val_ds = Dataset(data=val_files, transform=val_transforms) + # create a evaluation data sampler + val_sampler = DistributedSampler(val_ds, shuffle=False) + # sliding window inference need to input 1 image in every iteration + val_loader = DataLoader(val_ds, batch_size=1, shuffle=False, num_workers=2, pin_memory=True, sampler=val_sampler) + + # create UNet, DiceLoss and Adam optimizer + device = torch.device(f"cuda:{args.local_rank}") + net = monai.networks.nets.UNet( + dimensions=3, + in_channels=1, + out_channels=1, + channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + num_res_units=2, + ).to(device) + # wrap the model with DistributedDataParallel module + net = DistributedDataParallel(net, device_ids=[args.local_rank]) + + val_post_transforms = Compose( + [ + Activationsd(keys="pred", sigmoid=True), + AsDiscreted(keys="pred", threshold_values=True), + KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]), + ] + ) + val_handlers = [ + CheckpointLoader( + load_path="./runs/checkpoint_epoch=4.pth", + load_dict={"net": net}, + # config mapping to expected GPU device + map_location={"cuda:0": f"cuda:{args.local_rank}"}, + ), + ] + if dist.get_rank() == 0: + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + val_handlers.extend( + [ + StatsHandler(output_transform=lambda x: None), + SegmentationSaver( + output_dir="./runs/", + batch_transform=lambda batch: 
batch["image_meta_dict"], + output_transform=lambda output: output["pred"], + ), + ] + ) + + evaluator = SupervisedEvaluator( + device=device, + val_data_loader=val_loader, + network=net, + inferer=SlidingWindowInferer(roi_size=(96, 96, 96), sw_batch_size=4, overlap=0.5), + post_transform=val_post_transforms, + key_val_metric={ + "val_mean_dice": MeanDice( + include_background=True, + output_transform=lambda x: (x["pred"], x["label"]), + device=device, + ) + }, + additional_metrics={"val_acc": Accuracy(output_transform=lambda x: (x["pred"], x["label"]), device=device)}, + val_handlers=val_handlers, + # if no FP16 support in GPU or PyTorch version < 1.6, will not enable AMP evaluation + amp=True if monai.config.get_torch_version_tuple() >= (1, 6) else False, + ) + evaluator.run() + dist.destroy_process_group() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--dir", default="./testdata", type=str, help="directory to create random data") + # must parse the command-line argument: ``--local_rank=LOCAL_PROCESS_RANK``, which will be provided by DDP + parser.add_argument("--local_rank", type=int) + args = parser.parse_args() + + evaluate(args=args) + + +# usage example(refer to https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py): + +# python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_PER_NODE +# --nnodes=NUM_NODES --node_rank=INDEX_CURRENT_NODE +# --master_addr="192.168.1.1" --master_port=1234 +# unet_evaluation_workflows.py -d DIR_OF_TESTDATA + +if __name__ == "__main__": + main() diff --git a/acceleration/distributed_training/unet_training_ddp.py b/acceleration/distributed_training/unet_training_ddp.py new file mode 100644 index 0000000000..1a2db6fc8a --- /dev/null +++ b/acceleration/distributed_training/unet_training_ddp.py @@ -0,0 +1,193 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This example shows how to execute distributed training based on PyTorch native `DistributedDataParallel` module. +It can run on several nodes with multiple GPU devices on every node. +Main steps to set up the distributed training: + +- Execute `torch.distributed.launch` to create processes on every node for every GPU. + It receives parameters as below: + `--nproc_per_node=NUM_GPUS_PER_NODE` + `--nnodes=NUM_NODES` + `--node_rank=INDEX_CURRENT_NODE` + `--master_addr="192.168.1.1"` + `--master_port=1234` + For more details, refer to https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py. + Alternatively, we can also use `torch.multiprocessing.spawn` to start program, but it that case, need to handle + all the above parameters and compute `rank` manually, then set to `init_process_group`, etc. + `torch.distributed.launch` is even more efficient than `torch.multiprocessing.spawn` during training. +- Use `init_process_group` to initialize every process, every GPU runs in a separate process with unique rank. 
+ Here we use `NVIDIA NCCL` as the backend and must set `init_method="env://"` if use `torch.distributed.launch`. +- Wrap the model with `DistributedDataParallel` after moving to expected device. +- Wrap Dataset with `DistributedSampler`, and disable the `shuffle` in DataLoader. + Instead, shuffle data by `train_sampler.set_epoch(epoch)` before every epoch. + +Note: + `torch.distributed.launch` will launch `nnodes * nproc_per_node = world_size` processes in total. + Suggest setting exactly the same software environment for every node, especially `PyTorch`, `nccl`, etc. + A good practice is to use the same MONAI docker image for all nodes directly. + Example script to execute this program on every node: + python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_PER_NODE + --nnodes=NUM_NODES --node_rank=INDEX_CURRENT_NODE + --master_addr="192.168.1.1" --master_port=1234 + unet_training_ddp.py -d DIR_OF_TESTDATA + + This example was tested with [Ubuntu 16.04/20.04], [NCCL 2.6.3]. + +Referring to: https://pytorch.org/tutorials/intermediate/ddp_tutorial.html + +""" + +import argparse +import os +import sys +from glob import glob + +import nibabel as nib +import numpy as np +import torch +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel +from torch.utils.data.distributed import DistributedSampler + +import monai +from monai.data import DataLoader, Dataset, create_test_image_3d +from monai.transforms import ( + AsChannelFirstd, + Compose, + LoadNiftid, + RandCropByPosNegLabeld, + RandRotate90d, + ScaleIntensityd, + ToTensord, +) + + +def train(args): + # disable logging for processes except 0 on every node + if args.local_rank != 0: + f = open(os.devnull, "w") + sys.stdout = sys.stderr = f + elif not os.path.exists(args.dir): + # create 40 random image, mask paris for training + print(f"generating synthetic data to {args.dir} (this may take a while)") + os.makedirs(args.dir) + # set random seed to generate same random data for every node + np.random.seed(seed=0) + for i in range(40): + im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1) + n = nib.Nifti1Image(im, np.eye(4)) + nib.save(n, os.path.join(args.dir, f"img{i:d}.nii.gz")) + n = nib.Nifti1Image(seg, np.eye(4)) + nib.save(n, os.path.join(args.dir, f"seg{i:d}.nii.gz")) + + # initialize the distributed training process, every GPU runs in a process + dist.init_process_group(backend="nccl", init_method="env://") + + images = sorted(glob(os.path.join(args.dir, "img*.nii.gz"))) + segs = sorted(glob(os.path.join(args.dir, "seg*.nii.gz"))) + train_files = [{"img": img, "seg": seg} for img, seg in zip(images, segs)] + + # define transforms for image and segmentation + train_transforms = Compose( + [ + LoadNiftid(keys=["img", "seg"]), + AsChannelFirstd(keys=["img", "seg"], channel_dim=-1), + ScaleIntensityd(keys="img"), + RandCropByPosNegLabeld( + keys=["img", "seg"], label_key="seg", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4 + ), + RandRotate90d(keys=["img", "seg"], prob=0.5, spatial_axes=[0, 2]), + ToTensord(keys=["img", "seg"]), + ] + ) + + # create a training data loader + train_ds = Dataset(data=train_files, transform=train_transforms) + # create a training data sampler + train_sampler = DistributedSampler(train_ds) + # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training + train_loader = DataLoader( + train_ds, + batch_size=2, + shuffle=False, + num_workers=2, + pin_memory=True, + sampler=train_sampler, + 
) + + # create UNet, DiceLoss and Adam optimizer + device = torch.device(f"cuda:{args.local_rank}") + model = monai.networks.nets.UNet( + dimensions=3, + in_channels=1, + out_channels=1, + channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + num_res_units=2, + ).to(device) + loss_function = monai.losses.DiceLoss(sigmoid=True).to(device) + optimizer = torch.optim.Adam(model.parameters(), 1e-3) + # wrap the model with DistributedDataParallel module + model = DistributedDataParallel(model, device_ids=[args.local_rank]) + + # start a typical PyTorch training + epoch_loss_values = list() + for epoch in range(5): + print("-" * 10) + print(f"epoch {epoch + 1}/{5}") + model.train() + epoch_loss = 0 + step = 0 + train_sampler.set_epoch(epoch) + for batch_data in train_loader: + step += 1 + inputs, labels = batch_data["img"].to(device), batch_data["seg"].to(device) + optimizer.zero_grad() + outputs = model(inputs) + loss = loss_function(outputs, labels) + loss.backward() + optimizer.step() + epoch_loss += loss.item() + epoch_len = len(train_ds) // train_loader.batch_size + print(f"{step}/{epoch_len}, train_loss: {loss.item():.4f}") + epoch_loss /= step + epoch_loss_values.append(epoch_loss) + print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}") + print(f"train completed, epoch losses: {epoch_loss_values}") + if dist.get_rank() == 0: + # all processes should see same parameters as they all start from same + # random parameters and gradients are synchronized in backward passes, + # therefore, saving it in one process is sufficient + torch.save(model.state_dict(), "final_model.pth") + dist.destroy_process_group() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--dir", default="./testdata", type=str, help="directory to create random data") + # must parse the command-line argument: ``--local_rank=LOCAL_PROCESS_RANK``, which will be provided by DDP + parser.add_argument("--local_rank", type=int) + args = parser.parse_args() + + train(args=args) + + +# usage example(refer to https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py): + +# python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_PER_NODE +# --nnodes=NUM_NODES --node_rank=INDEX_CURRENT_NODE +# --master_addr="192.168.1.1" --master_port=1234 +# unet_training_ddp.py -d DIR_OF_TESTDATA + +if __name__ == "__main__": + main() diff --git a/acceleration/distributed_training/unet_training_horovod.py b/acceleration/distributed_training/unet_training_horovod.py new file mode 100644 index 0000000000..4462fb6507 --- /dev/null +++ b/acceleration/distributed_training/unet_training_horovod.py @@ -0,0 +1,193 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This example shows how to execute distributed training based on Horovod APIs. +It can run on several nodes with multiple GPU devices on every node. 
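One practical detail about the `torch.save(model.state_dict(), "final_model.pth")` call in `unet_training_ddp.py` above: because the model is wrapped in `DistributedDataParallel`, the saved keys carry a `module.` prefix. A hedged sketch of loading that checkpoint back into a plain, unwrapped UNet (built with the same constructor arguments the script uses) looks like this:

```python
import torch
import monai

plain_model = monai.networks.nets.UNet(
    dimensions=3, in_channels=1, out_channels=1,
    channels=(16, 32, 64, 128, 256), strides=(2, 2, 2, 2), num_res_units=2,
)
state = torch.load("final_model.pth", map_location="cpu")
# strip the "module." prefix added by the DistributedDataParallel wrapper
state = {k[len("module."):] if k.startswith("module.") else k: v for k, v in state.items()}
plain_model.load_state_dict(state)
```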
+Main steps to set up the distributed training: + +- Install Horovod referring to the guide: https://github.com/horovod/horovod/blob/master/docs/gpus.rst + If using MONAI docker, which already has NCCL and MPI, can quickly install Horovod with command: + `HOROVOD_NCCL_INCLUDE=/usr/include HOROVOD_NCCL_LIB=/usr/lib/x86_64-linux-gnu HOROVOD_GPU_OPERATIONS=NCCL \ + pip install --no-cache-dir horovod` +- Set SSH permissions for root login without password at all nodes except master, referring to: + http://www.linuxproblem.org/art_9.html +- Run `hvd.init()` to initialize Horovod. +- Pin each GPU to a single process to avoid resource contention, use `hvd.local_rank()` to get GPU index. + And use `hvd.rank()` to get the overall rank index. +- Wrap Dataset with `DistributedSampler`, and disable the `shuffle` in DataLoader. + Instead, shuffle data by `train_sampler.set_epoch(epoch)` before every epoch. +- Wrap the optimizer in hvd.DistributedOptimizer. The distributed optimizer delegates gradient + computation to the original optimizer, averages gradients using allreduce or allgather, + and then applies those averaged gradients. +- Broadcast the initial variable states from rank 0 to all other processes. + +Note: + Suggest setting exactly the same software environment for every node, especially `mpi`, `nccl`, etc. + A good practice is to use the same MONAI docker image for all nodes directly, if using docker, need + to set SSH permissions both at the node and in docker, referring to Horovod guide for more details: + https://github.com/horovod/horovod/blob/master/docs/docker.rst + + Example script to execute this program, only need to run on the master node: + `horovodrun -np 16 -H server1:4,server2:4,server3:4,server4:4 python unet_training_horovod.py -d "./testdata"` + + This example was tested with [Ubuntu 16.04/20.04], [NCCL 2.6.3], [horovod 0.19.5]. 
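To make the bullet list above concrete, here is a condensed sketch of the Horovod calls the script below strings together. The `torch.nn.Linear` model is only a placeholder (the real script uses a MONAI UNet), and the snippet is meant to be run under `horovodrun`:

```python
import horovod.torch as hvd
import torch

hvd.init()                               # one process per GPU, started by horovodrun
torch.cuda.set_device(hvd.local_rank())  # pin this process to its local GPU

model = torch.nn.Linear(10, 1).cuda()    # placeholder network
optimizer = torch.optim.Adam(model.parameters(), 1e-3)

# every worker starts from rank 0's weights; gradients are then averaged by allreduce
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
```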
+ +Referring to: https://github.com/horovod/horovod/blob/master/examples/pytorch_mnist.py + +""" + +import argparse +import os +import sys +from glob import glob + +import horovod.torch as hvd +import nibabel as nib +import numpy as np +import torch +import torch.multiprocessing as mp +from torch.utils.data.distributed import DistributedSampler + +import monai +from monai.data import DataLoader, Dataset, create_test_image_3d +from monai.transforms import ( + AsChannelFirstd, + Compose, + LoadNiftid, + RandCropByPosNegLabeld, + RandRotate90d, + ScaleIntensityd, + ToTensord, +) + + +def train(args): + # initialize Horovod library + hvd.init() + # Horovod limits CPU threads to be used per worker + torch.set_num_threads(1) + # disable logging for processes except 0 on every node + if hvd.local_rank() != 0: + f = open(os.devnull, "w") + sys.stdout = sys.stderr = f + elif not os.path.exists(args.dir): + # create 40 random image, mask paris on master node for training + print(f"generating synthetic data to {args.dir} (this may take a while)") + os.makedirs(args.dir) + # set random seed to generate same random data for every node + np.random.seed(seed=0) + for i in range(40): + im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1) + n = nib.Nifti1Image(im, np.eye(4)) + nib.save(n, os.path.join(args.dir, f"img{i:d}.nii.gz")) + n = nib.Nifti1Image(seg, np.eye(4)) + nib.save(n, os.path.join(args.dir, f"seg{i:d}.nii.gz")) + + images = sorted(glob(os.path.join(args.dir, "img*.nii.gz"))) + segs = sorted(glob(os.path.join(args.dir, "seg*.nii.gz"))) + train_files = [{"img": img, "seg": seg} for img, seg in zip(images, segs)] + + # define transforms for image and segmentation + train_transforms = Compose( + [ + LoadNiftid(keys=["img", "seg"]), + AsChannelFirstd(keys=["img", "seg"], channel_dim=-1), + ScaleIntensityd(keys="img"), + RandCropByPosNegLabeld( + keys=["img", "seg"], label_key="seg", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4 + ), + RandRotate90d(keys=["img", "seg"], prob=0.5, spatial_axes=[0, 2]), + ToTensord(keys=["img", "seg"]), + ] + ) + + # create a training data loader + train_ds = Dataset(data=train_files, transform=train_transforms) + # create a training data sampler + train_sampler = DistributedSampler(train_ds, num_replicas=hvd.size(), rank=hvd.rank()) + # when supported, use "forkserver" to spawn dataloader workers instead of "fork" to prevent + # issues with Infiniband implementations that are not fork-safe + multiprocessing_context = None + if hasattr(mp, "_supports_context") and mp._supports_context and "forkserver" in mp.get_all_start_methods(): + multiprocessing_context = "forkserver" + # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training + train_loader = DataLoader( + train_ds, + batch_size=2, + shuffle=False, + num_workers=2, + pin_memory=True, + sampler=train_sampler, + multiprocessing_context=multiprocessing_context, + ) + + # create UNet, DiceLoss and Adam optimizer + device = torch.device(f"cuda:{hvd.local_rank()}") + model = monai.networks.nets.UNet( + dimensions=3, + in_channels=1, + out_channels=1, + channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + num_res_units=2, + ).to(device) + loss_function = monai.losses.DiceLoss(sigmoid=True).to(device) + optimizer = torch.optim.Adam(model.parameters(), 1e-3) + # Horovod broadcasts parameters & optimizer state + hvd.broadcast_parameters(model.state_dict(), root_rank=0) + hvd.broadcast_optimizer_state(optimizer, root_rank=0) + 
# Horovod wraps optimizer with DistributedOptimizer + optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters()) + + # start a typical PyTorch training + epoch_loss_values = list() + for epoch in range(5): + print("-" * 10) + print(f"epoch {epoch + 1}/{5}") + model.train() + epoch_loss = 0 + step = 0 + train_sampler.set_epoch(epoch) + for batch_data in train_loader: + step += 1 + inputs, labels = batch_data["img"].to(device), batch_data["seg"].to(device) + optimizer.zero_grad() + outputs = model(inputs) + loss = loss_function(outputs, labels) + loss.backward() + optimizer.step() + epoch_loss += loss.item() + epoch_len = len(train_ds) // train_loader.batch_size + print(f"{step}/{epoch_len}, train_loss: {loss.item():.4f}") + epoch_loss /= step + epoch_loss_values.append(epoch_loss) + print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}") + print(f"train completed, epoch losses: {epoch_loss_values}") + if hvd.rank() == 0: + # all processes should see same parameters as they all start from same + # random parameters and gradients are synchronized in backward passes, + # therefore, saving it in one process is sufficient + torch.save(model.state_dict(), "final_model.pth") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--dir", default="./testdata", type=str, help="directory to create random data") + args = parser.parse_args() + + train(args=args) + + +# Example script to execute this program only on the master node: +# horovodrun -np 16 -H server1:4,server2:4,server3:4,server4:4 python unet_training_horovod.py -d "./testdata" +if __name__ == "__main__": + main() diff --git a/acceleration/distributed_training/unet_training_smartcache.py b/acceleration/distributed_training/unet_training_smartcache.py new file mode 100644 index 0000000000..1a2fad1866 --- /dev/null +++ b/acceleration/distributed_training/unet_training_smartcache.py @@ -0,0 +1,259 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This example shows how to execute distributed training based on PyTorch native module and SmartCacheDataset. +It can run on several nodes with multiple GPU devices on every node. +It splits data into partitions, every rank only cache and train with its own partition. + +Main steps to set up the distributed training: + +- Execute `torch.distributed.launch` to create processes on every node for every GPU. + It receives parameters as below: + `--nproc_per_node=NUM_GPUS_PER_NODE` + `--nnodes=NUM_NODES` + `--node_rank=INDEX_CURRENT_NODE` + `--master_addr="192.168.1.1"` + `--master_port=1234` + For more details, refer to https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py. + Alternatively, we can also use `torch.multiprocessing.spawn` to start program, but it that case, need to handle + all the above parameters and compute `rank` manually, then set to `init_process_group`, etc. + `torch.distributed.launch` is even more efficient than `torch.multiprocessing.spawn` during training. 
+- Use `init_process_group` to initialize every process, every GPU runs in a separate process with unique rank. + Here we use `NVIDIA NCCL` as the backend and must set `init_method="env://"` if use `torch.distributed.launch`. +- Wrap the model with `DistributedDataParallel` after moving to expected device. +- Execute `partition_dataset` to load data only for current rank, no need `DistributedSampler` anymore. +- `SmartCacheDataset` computes and caches the data for the first epoch. +- Call `start()` function of `SmartCacheDataset` to start the replacement thread. +- Call `update_cache()` function of `SmartCacheDataset` before every epoch to replace part of cache content. +- Call `shutdown()` function of `SmartCacheDataset` to stop replacement thread when training ends. + +Note: + `torch.distributed.launch` will launch `nnodes * nproc_per_node = world_size` processes in total. + Suggest setting exactly the same software environment for every node, especially `PyTorch`, `nccl`, etc. + A good practice is to use the same MONAI docker image for all nodes directly. + Example script to execute this program on every node: + python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_PER_NODE + --nnodes=NUM_NODES --node_rank=INDEX_CURRENT_NODE + --master_addr="192.168.1.1" --master_port=1234 + unet_training_smartcache.py -d DIR_OF_TESTDATA + + This example was tested with [Ubuntu 16.04/20.04], [NCCL 2.6.3]. + +Referring to: https://pytorch.org/tutorials/intermediate/ddp_tutorial.html + +""" + +import argparse +import math +import os +import sys +from glob import glob + +import nibabel as nib +import numpy as np +import torch +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel + +import monai +from monai.data import DataLoader, SmartCacheDataset, create_test_image_3d +from monai.transforms import ( + AsChannelFirstd, + Compose, + LoadNiftid, + RandCropByPosNegLabeld, + RandRotate90d, + ScaleIntensityd, + ToTensord, +) + + +def partition_dataset(data, num_replicas=None, rank=None, shuffle=False, seed=0, drop_last=False): + """ + Partition the dataset for distributed training, every rank process only train with its own data partition. + It can be useful for `CacheDataset` or `SmartCacheDataset`, because every rank process can only compute and + cache its own data. + Note that every rank process will shuffle data only in its own partition if set `shuffle=True` to DataLoader. + + The alternative solution is to use `DistributedSampler`, which supports global shuffle before every epoch. + But if using `CacheDataset` or `SmartCacheDataset`, every rank process will cache duplicated data content and + raise system memory usage. + + Args: + data: data list to partition, assumed to be of constant size. + num_replicas: number of processes participating in the distributed training. + if None, retrieve the `world_size` from current distributed group. + rank: rank of the current process within `num_replicas`. + if None, retrieve the rank index from current distributed group. + shuffle: if true, will shuffle the indices of data list before partition. + seed: random seed to shuffle the indices if `shuffle=True`, default is `0`. + this number should be identical across all processes in the distributed group. + drop_last: if `True`, will drop the tail of the data to make it evenly divisible across the number of replicas. + if `False`, add extra indices to make the data evenly divisible across the replicas. default is `False`. 
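A small worked example of the partitioning behaviour documented above, using an evenly divisible list so no padding or dropping is involved. Passing `num_replicas` and `rank` explicitly means no process group needs to be initialized:

```python
data = ["img0.nii.gz", "img1.nii.gz", "img2.nii.gz", "img3.nii.gz"]
# rank 0 of 2 receives indices 0 and 2, rank 1 receives indices 1 and 3
print(partition_dataset(data, num_replicas=2, rank=0))  # ['img0.nii.gz', 'img2.nii.gz']
print(partition_dataset(data, num_replicas=2, rank=1))  # ['img1.nii.gz', 'img3.nii.gz']
```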
+ + """ + if num_replicas is None or rank is None: + if not dist.is_available(): + raise RuntimeError("require distributed package to be available.") + if num_replicas is None: + num_replicas = dist.get_world_size() + if rank is None: + rank = dist.get_rank() + + if drop_last and len(data) % num_replicas != 0: + # split to nearest available length that is evenly divisible + num_samples = math.ceil((len(data) - num_replicas) / num_replicas) + else: + num_samples = math.ceil(len(data) / num_replicas) + total_size = num_samples * num_replicas + + indices = np.array(list(range(len(data)))) + if shuffle: + # deterministically shuffle based on fixed seed for every process + np.random.seed(seed) + np.random.shuffle(indices) + + if not drop_last and total_size - len(indices) > 0: + # add extra samples to make it evenly divisible + indices += indices[: (total_size - len(indices))] + else: + # remove tail of data to make it evenly divisible + indices = indices[:total_size] + + indices = indices[rank:total_size:num_replicas] + return [data[i] for i in indices] + + +def train(args): + # disable logging for processes except 0 on every node + if args.local_rank != 0: + f = open(os.devnull, "w") + sys.stdout = sys.stderr = f + elif not os.path.exists(args.dir): + # create 40 random image, mask paris for training + print(f"generating synthetic data to {args.dir} (this may take a while)") + os.makedirs(args.dir) + # set random seed to generate same random data for every node + np.random.seed(seed=0) + for i in range(40): + im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1) + n = nib.Nifti1Image(im, np.eye(4)) + nib.save(n, os.path.join(args.dir, f"img{i:d}.nii.gz")) + n = nib.Nifti1Image(seg, np.eye(4)) + nib.save(n, os.path.join(args.dir, f"seg{i:d}.nii.gz")) + + # initialize the distributed training process, every GPU runs in a process + dist.init_process_group(backend="nccl", init_method="env://") + + images = sorted(glob(os.path.join(args.dir, "img*.nii.gz"))) + segs = sorted(glob(os.path.join(args.dir, "seg*.nii.gz"))) + train_files = [{"img": img, "seg": seg} for img, seg in zip(images, segs)] + + # define transforms for image and segmentation + train_transforms = Compose( + [ + LoadNiftid(keys=["img", "seg"]), + AsChannelFirstd(keys=["img", "seg"], channel_dim=-1), + ScaleIntensityd(keys="img"), + RandCropByPosNegLabeld( + keys=["img", "seg"], label_key="seg", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4 + ), + RandRotate90d(keys=["img", "seg"], prob=0.5, spatial_axes=[0, 2]), + ToTensord(keys=["img", "seg"]), + ] + ) + + # partition dataset based on current rank number, every rank trains with its own data + data_part = partition_dataset(train_files, shuffle=True) + train_ds = SmartCacheDataset( + data=data_part, + transform=train_transforms, + replace_rate=0.2, + cache_num=15, # we suppose to use 2 ranks in this example, every rank has 20 training images + num_init_workers=2, + num_replace_workers=2, + ) + # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training + train_loader = DataLoader(train_ds, batch_size=2, shuffle=True, num_workers=2, pin_memory=True) + + # create UNet, DiceLoss and Adam optimizer + device = torch.device(f"cuda:{args.local_rank}") + model = monai.networks.nets.UNet( + dimensions=3, + in_channels=1, + out_channels=1, + channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + num_res_units=2, + ).to(device) + loss_function = monai.losses.DiceLoss(sigmoid=True).to(device) + optimizer 
= torch.optim.Adam(model.parameters(), 1e-3) + # wrap the model with DistributedDataParallel module + model = DistributedDataParallel(model, device_ids=[args.local_rank]) + + # start a typical PyTorch training + epoch_loss_values = list() + # start the replacement thread of SmartCache + train_ds.start() + + for epoch in range(5): + print("-" * 10) + print(f"epoch {epoch + 1}/{5}") + model.train() + epoch_loss = 0 + step = 0 + for batch_data in train_loader: + step += 1 + inputs, labels = batch_data["img"].to(device), batch_data["seg"].to(device) + optimizer.zero_grad() + outputs = model(inputs) + loss = loss_function(outputs, labels) + loss.backward() + optimizer.step() + epoch_loss += loss.item() + epoch_len = math.ceil(len(train_ds) / train_loader.batch_size) + print(f"{step}/{epoch_len}, train_loss: {loss.item():.4f}") + epoch_loss /= step + epoch_loss_values.append(epoch_loss) + # replace 20% of cache content for next epoch + train_ds.update_cache() + print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}") + # stop replacement thread of SmartCache + train_ds.shutdown() + print(f"train completed, epoch losses: {epoch_loss_values}") + if dist.get_rank() == 0: + # all processes should see same parameters as they all start from same + # random parameters and gradients are synchronized in backward passes, + # therefore, saving it in one process is sufficient + torch.save(model.state_dict(), "final_model.pth") + dist.destroy_process_group() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--dir", default="./testdata", type=str, help="directory to create random data") + # must parse the command-line argument: ``--local_rank=LOCAL_PROCESS_RANK``, which will be provided by DDP + parser.add_argument("--local_rank", type=int) + args = parser.parse_args() + + train(args=args) + + +# usage example(refer to https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py): + +# python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_PER_NODE +# --nnodes=NUM_NODES --node_rank=INDEX_CURRENT_NODE +# --master_addr="192.168.1.1" --master_port=1234 +# unet_training_smartcache.py -d DIR_OF_TESTDATA + +if __name__ == "__main__": + main() diff --git a/acceleration/distributed_training/unet_training_workflows.py b/acceleration/distributed_training/unet_training_workflows.py new file mode 100644 index 0000000000..713a2cd5ff --- /dev/null +++ b/acceleration/distributed_training/unet_training_workflows.py @@ -0,0 +1,206 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This example shows how to execute distributed training based on PyTorch native `DistributedDataParallel` module +and MONAI workflows. It can run on several nodes with multiple GPU devices on every node. +Main steps to set up the distributed training: + +- Execute `torch.distributed.launch` to create processes on every node for every GPU. 
+ It receives parameters as below: + `--nproc_per_node=NUM_GPUS_PER_NODE` + `--nnodes=NUM_NODES` + `--node_rank=INDEX_CURRENT_NODE` + `--master_addr="192.168.1.1"` + `--master_port=1234` + For more details, refer to https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py. + Alternatively, we can also use `torch.multiprocessing.spawn` to start program, but it that case, need to handle + all the above parameters and compute `rank` manually, then set to `init_process_group`, etc. + `torch.distributed.launch` is even more efficient than `torch.multiprocessing.spawn` during training. +- Use `init_process_group` to initialize every process, every GPU runs in a separate process with unique rank. + Here we use `NVIDIA NCCL` as the backend and must set `init_method="env://"` if use `torch.distributed.launch`. +- Wrap the model with `DistributedDataParallel` after moving to expected device. +- Wrap Dataset with `DistributedSampler`, and disable the `shuffle` in DataLoader. + Instead, `SupervisedTrainer` shuffles data by `train_sampler.set_epoch(epoch)` before every epoch. +- Add `StatsHandler` and `CheckpointHandler` to the master process which is `dist.get_rank() == 0`. +- ignite can automatically reduce metrics for distributed training, refer to: + https://github.com/pytorch/ignite/blob/v0.3.0/ignite/metrics/metric.py#L85 + +Note: + `torch.distributed.launch` will launch `nnodes * nproc_per_node = world_size` processes in total. + Suggest setting exactly the same software environment for every node, especially `PyTorch`, `nccl`, etc. + A good practice is to use the same MONAI docker image for all nodes directly. + Example script to execute this program on every node: + python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_PER_NODE + --nnodes=NUM_NODES --node_rank=INDEX_CURRENT_NODE + --master_addr="192.168.1.1" --master_port=1234 + unet_training_workflows.py -d DIR_OF_TESTDATA + + This example was tested with [Ubuntu 16.04/20.04], [NCCL 2.6.3]. 
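To spell out the relationship between the launch arguments and process ranks mentioned in the note above: this is the same arithmetic `torch.distributed.launch` performs when exporting `RANK`, and what you would have to reproduce yourself if launching with `torch.multiprocessing.spawn` instead (numbers below are illustrative only):

```python
nnodes, nproc_per_node = 2, 4                  # two nodes, four GPUs each
world_size = nnodes * nproc_per_node           # 8 processes in total
node_rank, local_rank = 1, 3                   # last GPU on the second node
global_rank = node_rank * nproc_per_node + local_rank
print(world_size, global_rank)                 # 8 7
```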
+ +Referring to: https://pytorch.org/tutorials/intermediate/ddp_tutorial.html + +""" + +import argparse +import logging +import os +import sys +from glob import glob + +import nibabel as nib +import numpy as np +import torch +import torch.distributed as dist +from ignite.metrics import Accuracy +from torch.nn.parallel import DistributedDataParallel +from torch.utils.data.distributed import DistributedSampler + +import monai +from monai.data import DataLoader, Dataset, create_test_image_3d +from monai.engines import SupervisedTrainer +from monai.handlers import CheckpointSaver, LrScheduleHandler, StatsHandler +from monai.inferers import SimpleInferer +from monai.transforms import ( + Activationsd, + AsChannelFirstd, + AsDiscreted, + Compose, + KeepLargestConnectedComponentd, + LoadNiftid, + RandCropByPosNegLabeld, + RandRotate90d, + ScaleIntensityd, + ToTensord, +) + + +def train(args): + if args.local_rank == 0 and not os.path.exists(args.dir): + # create 40 random image, mask paris for training + print(f"generating synthetic data to {args.dir} (this may take a while)") + os.makedirs(args.dir) + # set random seed to generate same random data for every node + np.random.seed(seed=0) + for i in range(40): + im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1) + n = nib.Nifti1Image(im, np.eye(4)) + nib.save(n, os.path.join(args.dir, f"img{i:d}.nii.gz")) + n = nib.Nifti1Image(seg, np.eye(4)) + nib.save(n, os.path.join(args.dir, f"seg{i:d}.nii.gz")) + + # initialize the distributed training process, every GPU runs in a process + dist.init_process_group(backend="nccl", init_method="env://") + + images = sorted(glob(os.path.join(args.dir, "img*.nii.gz"))) + segs = sorted(glob(os.path.join(args.dir, "seg*.nii.gz"))) + train_files = [{"image": img, "label": seg} for img, seg in zip(images, segs)] + + # define transforms for image and segmentation + train_transforms = Compose( + [ + LoadNiftid(keys=["image", "label"]), + AsChannelFirstd(keys=["image", "label"], channel_dim=-1), + ScaleIntensityd(keys="image"), + RandCropByPosNegLabeld( + keys=["image", "label"], label_key="label", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4 + ), + RandRotate90d(keys=["image", "label"], prob=0.5, spatial_axes=[0, 2]), + ToTensord(keys=["image", "label"]), + ] + ) + + # create a training data loader + train_ds = Dataset(data=train_files, transform=train_transforms) + # create a training data sampler + train_sampler = DistributedSampler(train_ds) + # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training + train_loader = DataLoader( + train_ds, + batch_size=2, + shuffle=False, + num_workers=2, + pin_memory=True, + sampler=train_sampler, + ) + + # create UNet, DiceLoss and Adam optimizer + device = torch.device(f"cuda:{args.local_rank}") + net = monai.networks.nets.UNet( + dimensions=3, + in_channels=1, + out_channels=1, + channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + num_res_units=2, + ).to(device) + loss = monai.losses.DiceLoss(sigmoid=True).to(device) + opt = torch.optim.Adam(net.parameters(), 1e-3) + lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=2, gamma=0.1) + # wrap the model with DistributedDataParallel module + net = DistributedDataParallel(net, device_ids=[args.local_rank]) + + train_post_transforms = Compose( + [ + Activationsd(keys="pred", sigmoid=True), + AsDiscreted(keys="pred", threshold_values=True), + KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]), + ] + ) + 
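As a quick illustration of what the `train_post_transforms` chain above does to raw network output, the toy snippet below runs a two-voxel logits tensor through the activation and discretization steps (the connected-component step is omitted for brevity; the tensor values are made up):

```python
import torch
from monai.transforms import Activationsd, AsDiscreted, Compose

toy = {"pred": torch.tensor([[2.0, -3.0]])}   # raw logits for two voxels
chain = Compose([
    Activationsd(keys="pred", sigmoid=True),
    AsDiscreted(keys="pred", threshold_values=True),
])
print(chain(toy)["pred"])                     # tensor([[1., 0.]]) -- sigmoid, then 0.5 threshold
```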
train_handlers = [ + LrScheduleHandler(lr_scheduler=lr_scheduler, print_lr=True), + ] + if dist.get_rank() == 0: + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + train_handlers.extend( + [ + StatsHandler(tag_name="train_loss", output_transform=lambda x: x["loss"]), + CheckpointSaver(save_dir="./runs/", save_dict={"net": net, "opt": opt}, save_interval=2), + ] + ) + + trainer = SupervisedTrainer( + device=device, + max_epochs=5, + train_data_loader=train_loader, + network=net, + optimizer=opt, + loss_function=loss, + inferer=SimpleInferer(), + # if no FP16 support in GPU or PyTorch version < 1.6, will not enable AMP evaluation + amp=True if monai.config.get_torch_version_tuple() >= (1, 6) else False, + post_transform=train_post_transforms, + key_train_metric={"train_acc": Accuracy(output_transform=lambda x: (x["pred"], x["label"]), device=device)}, + train_handlers=train_handlers, + ) + trainer.run() + dist.destroy_process_group() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--dir", default="./testdata", type=str, help="directory to create random data") + # must parse the command-line argument: ``--local_rank=LOCAL_PROCESS_RANK``, which will be provided by DDP + parser.add_argument("--local_rank", type=int) + args = parser.parse_args() + + train(args=args) + + +# usage example(refer to https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py): + +# python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_PER_NODE +# --nnodes=NUM_NODES --node_rank=INDEX_CURRENT_NODE +# --master_addr="192.168.1.1" --master_port=1234 +# unet_training_workflows.py -d DIR_OF_TESTDATA + +if __name__ == "__main__": + main() diff --git a/fast_training_tutorial.ipynb b/acceleration/fast_training_tutorial.ipynb similarity index 99% rename from fast_training_tutorial.ipynb rename to acceleration/fast_training_tutorial.ipynb index 88bf432797..962d4b74cd 100644 --- a/fast_training_tutorial.ipynb +++ b/acceleration/fast_training_tutorial.ipynb @@ -18,7 +18,7 @@ "\n", "It's modified from the Spleen 3D segmentation tutorial notebook, the Spleen dataset can be downloaded from http://medicaldecathlon.com/.\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/fast_training_tutorial.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/acceleration/fast_training_tutorial.ipynb)" ] }, { diff --git a/multi_gpu_test.ipynb b/acceleration/multi_gpu_test.ipynb similarity index 98% rename from multi_gpu_test.ipynb rename to acceleration/multi_gpu_test.ipynb index 73ebdfe8ab..292aea2f64 100644 --- a/multi_gpu_test.ipynb +++ b/acceleration/multi_gpu_test.ipynb @@ -6,7 +6,7 @@ "source": [ "# Multi GPU Test\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/multi_gpu_test.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/acceleration/multi_gpu_test.ipynb)" ] }, { @@ -270,7 +270,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/transform_speed.ipynb b/acceleration/transform_speed.ipynb similarity index 99% 
rename from transform_speed.ipynb rename to acceleration/transform_speed.ipynb index 3246c87daf..f723da0dbb 100644 --- a/transform_speed.ipynb +++ b/acceleration/transform_speed.ipynb @@ -8,7 +8,7 @@ "\n", "The purpose of this notebook is to illustrate reading Nifti files and test speed of different methods.\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/transform_speed.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/acceleration/transform_speed.ipynb)" ] }, { @@ -489,7 +489,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/images/brats_tasks.png b/figures/brats_tasks.png similarity index 100% rename from images/brats_tasks.png rename to figures/brats_tasks.png diff --git a/images/models_ensemble.png b/figures/models_ensemble.png similarity index 100% rename from images/models_ensemble.png rename to figures/models_ensemble.png diff --git a/images/multi_transform_chains.png b/figures/multi_transform_chains.png similarity index 100% rename from images/multi_transform_chains.png rename to figures/multi_transform_chains.png diff --git a/3d_image_transforms.ipynb b/modules/3d_image_transforms.ipynb similarity index 99% rename from 3d_image_transforms.ipynb rename to modules/3d_image_transforms.ipynb index 221342eb1d..526db53cfb 100644 --- a/3d_image_transforms.ipynb +++ b/modules/3d_image_transforms.ipynb @@ -8,7 +8,7 @@ "\n", "This notebook introduces you MONAI's transformation module for 3D images.\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/3d_image_transforms.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/acceleration/3d_image_transforms.ipynb)" ] }, { @@ -811,7 +811,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/dynunet_tutorial.ipynb b/modules/dynunet_tutorial.ipynb similarity index 99% rename from dynunet_tutorial.ipynb rename to modules/dynunet_tutorial.ipynb index 415c6b0839..d5568344ca 100644 --- a/dynunet_tutorial.ipynb +++ b/modules/dynunet_tutorial.ipynb @@ -13,7 +13,7 @@ "\n", "`nnU-Net: Self-adapting Framework for U-Net-Based Medical Image Segmentation `\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/nnunet_tutorial.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/acceleration/nnunet_tutorial.ipynb)" ] }, { diff --git a/integrate_3rd_party_transforms.ipynb b/modules/integrate_3rd_party_transforms.ipynb similarity index 99% rename from integrate_3rd_party_transforms.ipynb rename to modules/integrate_3rd_party_transforms.ipynb index 79aab6225a..9e7e0445c5 100644 --- a/integrate_3rd_party_transforms.ipynb +++ b/modules/integrate_3rd_party_transforms.ipynb @@ -9,7 +9,7 @@ "This tutorial shows how to integrate 3rd party transforms into a MONAI program. 
\n", "Mainly showing transforms from `BatchGenerator`, `TorchIO`, `Rising` and `ITK`.\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/integrate_3rd_party_transforms.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/acceleration/integrate_3rd_party_transforms.ipynb)" ] }, { @@ -521,7 +521,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/load_medical_images.ipynb b/modules/load_medical_images.ipynb similarity index 99% rename from load_medical_images.ipynb rename to modules/load_medical_images.ipynb index f0af97b1b0..ae88a15597 100644 --- a/load_medical_images.ipynb +++ b/modules/load_medical_images.ipynb @@ -8,7 +8,7 @@ "\n", "This notebook introduces how to easily load different formats of medical images in MONAI and execute many additional operations.\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/load_medical_images.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/acceleration/load_medical_images.ipynb)" ] }, { diff --git a/mednist_GAN_tutorial.ipynb b/modules/mednist_GAN_tutorial.ipynb similarity index 99% rename from mednist_GAN_tutorial.ipynb rename to modules/mednist_GAN_tutorial.ipynb index d9dafb5f9a..6335195bfb 100644 --- a/mednist_GAN_tutorial.ipynb +++ b/modules/mednist_GAN_tutorial.ipynb @@ -14,7 +14,7 @@ "* Defining the networks\n", "* Training and evaluation\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/mednist_GAN_tutorial.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/acceleration/mednist_GAN_tutorial.ipynb)" ] }, { @@ -497,7 +497,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/mednist_GAN_workflow.ipynb b/modules/mednist_GAN_workflow.ipynb similarity index 99% rename from mednist_GAN_workflow.ipynb rename to modules/mednist_GAN_workflow.ipynb index 5979eb598f..ffe8c13885 100644 --- a/mednist_GAN_workflow.ipynb +++ b/modules/mednist_GAN_workflow.ipynb @@ -23,7 +23,7 @@ "3. Run Training\n", "4. 
Evaluate Results\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/mednist_GAN_workflow.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/acceleration/mednist_GAN_workflow.ipynb)" ] }, { @@ -649,7 +649,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/models_ensemble.ipynb b/modules/models_ensemble.ipynb similarity index 99% rename from models_ensemble.ipynb rename to modules/models_ensemble.ipynb index c9fd989626..75b8cc47dc 100644 --- a/models_ensemble.ipynb +++ b/modules/models_ensemble.ipynb @@ -13,13 +13,13 @@ "* Execute inference on the test data with all the K models.\n", "* Compute the average values with weights or vote the most common value as the final result.\n", "

\n", - "models_ensemble\n", + "models_ensemble\n", "

\n", "\n", "MONAI provides `EnsembleEvaluator` and `MeanEnsemble`, `VoteEnsemble` post transforms. \n", "This tutorial shows how to leverage ensemble modules in MONAI to set up ensemble program.\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/models_ensemble.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/acceleration/models_ensemble.ipynb)" ] }, { @@ -579,7 +579,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/nifti_read_example.ipynb b/modules/nifti_read_example.ipynb similarity index 98% rename from nifti_read_example.ipynb rename to modules/nifti_read_example.ipynb index ba2029855b..78eacca151 100644 --- a/nifti_read_example.ipynb +++ b/modules/nifti_read_example.ipynb @@ -8,7 +8,7 @@ "\n", "The purpose of this notebook is to illustrate reading Nifti files and iterating over patches of the volumes loaded from them.\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/nifti_read_example.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/acceleration/nifti_read_example.ipynb)" ] }, { @@ -291,7 +291,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/post_transforms.ipynb b/modules/post_transforms.ipynb similarity index 99% rename from post_transforms.ipynb rename to modules/post_transforms.ipynb index 03cf16527c..36e0c09c74 100644 --- a/post_transforms.ipynb +++ b/modules/post_transforms.ipynb @@ -17,12 +17,12 @@ "\n", "A typical usage is to scale and concatenate 3 different intensity ranges of an input image:\n", "

\n", - "multi_transform_chains\n", + "multi_transform_chains\n", "

\n", "\n", "This tutorial shows several of above post transforms based on the model output of spleen segmentation.\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/post_transforms.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/acceleration/post_transforms.ipynb)" ] }, { @@ -641,7 +641,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/public_datasets.ipynb b/modules/public_datasets.ipynb similarity index 99% rename from public_datasets.ipynb rename to modules/public_datasets.ipynb index 92075981d7..252e114f33 100644 --- a/public_datasets.ipynb +++ b/modules/public_datasets.ipynb @@ -15,7 +15,7 @@ "* Create training experiment with DecathlonDataset and workflow\n", "* Share other public data and add Dataset in MONAI\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/public_datasets.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/acceleration/public_datasets.ipynb)" ] }, { @@ -743,7 +743,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/transforms_demo_2d.ipynb b/modules/transforms_demo_2d.ipynb similarity index 99% rename from transforms_demo_2d.ipynb rename to modules/transforms_demo_2d.ipynb index c8393d5338..eac2624bd7 100644 --- a/transforms_demo_2d.ipynb +++ b/modules/transforms_demo_2d.ipynb @@ -13,7 +13,7 @@ " \n", "Find out more in MONAI's wiki page: https://github.com/Project-MONAI/MONAI/wiki\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/Tutorials/blob/master/transforms_demo_2d.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/master/acceleration/transforms_demo_2d.ipynb)" ] }, { @@ -431,7 +431,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/modules/workflows/gan_evaluation.py b/modules/workflows/gan_evaluation.py new file mode 100644 index 0000000000..8f7ebcfa0e --- /dev/null +++ b/modules/workflows/gan_evaluation.py @@ -0,0 +1,67 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +MONAI GAN Evaluation Example + Generate fake images from trained generator file. 
+ +""" + +import logging +import os +import sys +from glob import glob + +import torch + +import monai +from monai.data import png_writer +from monai.engines.utils import default_make_latent as make_latent +from monai.networks.nets import Generator +from monai.utils.misc import set_determinism + + +def save_generator_fakes(run_folder, g_output_tensor): + for i, image in enumerate(g_output_tensor): + filename = "gen-fake-%d.png" % i + save_path = os.path.join(run_folder, filename) + img_array = image[0].cpu().data.numpy() + png_writer.write_png(img_array, save_path, scale=255) + + +def main(): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + set_determinism(12345) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # load generator + network_filepath = glob("./model_out/*.pth")[0] + data = torch.load(network_filepath) + latent_size = 64 + gen_net = Generator( + latent_shape=latent_size, start_shape=(latent_size, 8, 8), channels=[32, 16, 8, 1], strides=[2, 2, 2, 1] + ) + gen_net.conv.add_module("activation", torch.nn.Sigmoid()) + gen_net.load_state_dict(data["g_net"]) + gen_net = gen_net.to(device) + + # create fakes + output_dir = "./generated_images" + if not os.path.isdir(output_dir): + os.mkdir(output_dir) + num_fakes = 10 + print("Generating %d fakes and saving in %s" % (num_fakes, output_dir)) + fake_latents = make_latent(num_fakes, latent_size).to(device) + save_generator_fakes(output_dir, gen_net(fake_latents)) + + +if __name__ == "__main__": + main() diff --git a/modules/workflows/gan_training.py b/modules/workflows/gan_training.py new file mode 100644 index 0000000000..b745db8da9 --- /dev/null +++ b/modules/workflows/gan_training.py @@ -0,0 +1,203 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +MONAI Generative Adversarial Networks Workflow Example + Sample script using MONAI to train a GAN to synthesize images from a latent code. + +## Get the dataset + MedNIST.tar.gz link: https://www.dropbox.com/s/5wwskxctvcxiuea/MedNIST.tar.gz + Extract tarball and set input_dir variable. GAN script trains using hand CT scan jpg images. 
+ + Dataset information available in MedNIST Tutorial + https://github.com/Project-MONAI/Tutorials/blob/master/mednist_tutorial.ipynb +""" + +import logging +import os +import sys + +import torch + +import monai +from monai.apps.utils import download_and_extract +from monai.data import CacheDataset, DataLoader, png_writer +from monai.engines import GanTrainer +from monai.engines.utils import GanKeys as Keys +from monai.engines.utils import default_make_latent as make_latent +from monai.handlers import CheckpointSaver, StatsHandler +from monai.networks import normal_init +from monai.networks.nets import Discriminator, Generator +from monai.transforms import ( + AddChannelD, + Compose, + LoadPNGD, + RandFlipD, + RandRotateD, + RandZoomD, + ScaleIntensityD, + ToTensorD, +) +from monai.utils.misc import set_determinism + + +def main(): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + set_determinism(12345) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # load real data + mednist_url = "https://www.dropbox.com/s/5wwskxctvcxiuea/MedNIST.tar.gz?dl=1" + md5_value = "0bc7306e7427e00ad1c5526a6677552d" + extract_dir = "data" + tar_save_path = os.path.join(extract_dir, "MedNIST.tar.gz") + download_and_extract(mednist_url, tar_save_path, extract_dir, md5_value) + hand_dir = os.path.join(extract_dir, "MedNIST", "Hand") + real_data = [{"hand": os.path.join(hand_dir, filename)} for filename in os.listdir(hand_dir)] + + # define real data transforms + train_transforms = Compose( + [ + LoadPNGD(keys=["hand"]), + AddChannelD(keys=["hand"]), + ScaleIntensityD(keys=["hand"]), + RandRotateD(keys=["hand"], range_x=15, prob=0.5, keep_size=True), + RandFlipD(keys=["hand"], spatial_axis=0, prob=0.5), + RandZoomD(keys=["hand"], min_zoom=0.9, max_zoom=1.1, prob=0.5), + ToTensorD(keys=["hand"]), + ] + ) + + # create dataset and dataloader + real_dataset = CacheDataset(real_data, train_transforms) + batch_size = 300 + real_dataloader = DataLoader(real_dataset, batch_size=batch_size, shuffle=True, num_workers=10) + + # define function to process batchdata for input into discriminator + def prepare_batch(batchdata): + """ + Process Dataloader batchdata dict object and return image tensors for D Inferer + """ + return batchdata["hand"] + + # define networks + disc_net = Discriminator( + in_shape=(1, 64, 64), channels=(8, 16, 32, 64, 1), strides=(2, 2, 2, 2, 1), num_res_units=1, kernel_size=5 + ).to(device) + + latent_size = 64 + gen_net = Generator( + latent_shape=latent_size, start_shape=(latent_size, 8, 8), channels=[32, 16, 8, 1], strides=[2, 2, 2, 1] + ) + + # initialize both networks + disc_net.apply(normal_init) + gen_net.apply(normal_init) + + # input images are scaled to [0,1] so enforce the same of generated outputs + gen_net.conv.add_module("activation", torch.nn.Sigmoid()) + gen_net = gen_net.to(device) + + # create optimizers and loss functions + learning_rate = 2e-4 + betas = (0.5, 0.999) + disc_opt = torch.optim.Adam(disc_net.parameters(), learning_rate, betas=betas) + gen_opt = torch.optim.Adam(gen_net.parameters(), learning_rate, betas=betas) + + disc_loss_criterion = torch.nn.BCELoss() + gen_loss_criterion = torch.nn.BCELoss() + real_label = 1 + fake_label = 0 + + def discriminator_loss(gen_images, real_images): + """ + The discriminator loss is calculated by comparing D + prediction for real and generated images. 
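A tiny illustration of the real/fake label convention used by the two loss functions here; the prediction values are made up, and the only point is that confident discriminator outputs on real images yield a small BCE loss:

```python
import torch

criterion = torch.nn.BCELoss()
pred_real = torch.tensor([[0.9], [0.8]])   # discriminator outputs for two real images
target = torch.full_like(pred_real, 1.0)   # real_label
print(criterion(pred_real, target))        # small loss: D is confident on real data
```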
+ + """ + real = real_images.new_full((real_images.shape[0], 1), real_label) + gen = gen_images.new_full((gen_images.shape[0], 1), fake_label) + + realloss = disc_loss_criterion(disc_net(real_images), real) + genloss = disc_loss_criterion(disc_net(gen_images.detach()), gen) + + return (genloss + realloss) / 2 + + def generator_loss(gen_images): + """ + The generator loss is calculated by determining how realistic + the discriminator classifies the generated images. + + """ + output = disc_net(gen_images) + cats = output.new_full(output.shape, real_label) + return gen_loss_criterion(output, cats) + + # initialize current run dir + run_dir = "model_out" + print("Saving model output to: %s " % run_dir) + + # create workflow handlers + handlers = [ + StatsHandler( + name="batch_training_loss", + output_transform=lambda x: {Keys.GLOSS: x[Keys.GLOSS], Keys.DLOSS: x[Keys.DLOSS]}, + ), + CheckpointSaver( + save_dir=run_dir, + save_dict={"g_net": gen_net, "d_net": disc_net}, + save_interval=10, + save_final=True, + epoch_level=True, + ), + ] + + # define key metric + key_train_metric = None + + # create adversarial trainer + disc_train_steps = 5 + num_epochs = 50 + + trainer = GanTrainer( + device, + num_epochs, + real_dataloader, + gen_net, + gen_opt, + generator_loss, + disc_net, + disc_opt, + discriminator_loss, + d_prepare_batch=prepare_batch, + d_train_steps=disc_train_steps, + latent_shape=latent_size, + key_train_metric=key_train_metric, + train_handlers=handlers, + ) + + # run GAN training + trainer.run() + + # Training completed, save a few random generated images. + print("Saving trained generator sample output.") + test_img_count = 10 + test_latents = make_latent(test_img_count, latent_size).to(device) + fakes = gen_net(test_latents) + for i, image in enumerate(fakes): + filename = "gen-fake-final-%d.png" % i + save_path = os.path.join(run_dir, filename) + img_array = image[0].cpu().data.numpy() + png_writer.write_png(img_array, save_path, scale=255) + + +if __name__ == "__main__": + main() diff --git a/modules/workflows/unet_evaluation_dict.py b/modules/workflows/unet_evaluation_dict.py new file mode 100644 index 0000000000..48f195ed01 --- /dev/null +++ b/modules/workflows/unet_evaluation_dict.py @@ -0,0 +1,121 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import logging
+import os
+import sys
+import tempfile
+from glob import glob
+
+import nibabel as nib
+import numpy as np
+import torch
+from ignite.metrics import Accuracy
+
+import monai
+from monai.data import create_test_image_3d
+from monai.engines import SupervisedEvaluator
+from monai.handlers import CheckpointLoader, MeanDice, SegmentationSaver, StatsHandler
+from monai.inferers import SlidingWindowInferer
+from monai.transforms import (
+    Activationsd,
+    AsChannelFirstd,
+    AsDiscreted,
+    Compose,
+    KeepLargestConnectedComponentd,
+    LoadNiftid,
+    ScaleIntensityd,
+    ToTensord,
+)
+
+
+def main(tempdir):
+    monai.config.print_config()
+    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+    # create a temporary directory and 5 random image, mask pairs
+    print(f"generating synthetic data to {tempdir} (this may take a while)")
+    for i in range(5):
+        im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1)
+        n = nib.Nifti1Image(im, np.eye(4))
+        nib.save(n, os.path.join(tempdir, f"im{i:d}.nii.gz"))
+        n = nib.Nifti1Image(seg, np.eye(4))
+        nib.save(n, os.path.join(tempdir, f"seg{i:d}.nii.gz"))
+
+    images = sorted(glob(os.path.join(tempdir, "im*.nii.gz")))
+    segs = sorted(glob(os.path.join(tempdir, "seg*.nii.gz")))
+    val_files = [{"image": img, "label": seg} for img, seg in zip(images, segs)]
+
+    # model file path
+    model_file = glob("./runs/net_key_metric*")[0]
+
+    # define transforms for image and segmentation
+    val_transforms = Compose(
+        [
+            LoadNiftid(keys=["image", "label"]),
+            AsChannelFirstd(keys=["image", "label"], channel_dim=-1),
+            ScaleIntensityd(keys="image"),
+            ToTensord(keys=["image", "label"]),
+        ]
+    )
+
+    # create a validation data loader
+    val_ds = monai.data.Dataset(data=val_files, transform=val_transforms)
+    val_loader = monai.data.DataLoader(val_ds, batch_size=1, num_workers=4)
+
+    # create UNet
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    net = monai.networks.nets.UNet(
+        dimensions=3,
+        in_channels=1,
+        out_channels=1,
+        channels=(16, 32, 64, 128, 256),
+        strides=(2, 2, 2, 2),
+        num_res_units=2,
+    ).to(device)
+
+    val_post_transforms = Compose(
+        [
+            Activationsd(keys="pred", sigmoid=True),
+            AsDiscreted(keys="pred", threshold_values=True),
+            KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]),
+        ]
+    )
+    val_handlers = [
+        StatsHandler(output_transform=lambda x: None),
+        CheckpointLoader(load_path=model_file, load_dict={"net": net}),
+        SegmentationSaver(
+            output_dir="./runs/",
+            batch_transform=lambda batch: batch["image_meta_dict"],
+            output_transform=lambda output: output["pred"],
+        ),
+    ]
+
+    evaluator = SupervisedEvaluator(
+        device=device,
+        val_data_loader=val_loader,
+        network=net,
+        inferer=SlidingWindowInferer(roi_size=(96, 96, 96), sw_batch_size=4, overlap=0.5),
+        post_transform=val_post_transforms,
+        key_val_metric={
+            "val_mean_dice": MeanDice(include_background=True, output_transform=lambda x: (x["pred"], x["label"]))
+        },
+        additional_metrics={"val_acc": Accuracy(output_transform=lambda x: (x["pred"], x["label"]))},
+        val_handlers=val_handlers,
+        # if no FP16 support in GPU or PyTorch version < 1.6, will not enable AMP evaluation
+        amp=True if monai.config.get_torch_version_tuple() >= (1, 6) else False,
+    )
+    evaluator.run()
+
+
+if __name__ == "__main__":
+    with tempfile.TemporaryDirectory() as tempdir:
+        main(tempdir)
diff --git a/modules/workflows/unet_training_dict.py b/modules/workflows/unet_training_dict.py
new file mode 100644
index 
0000000000..1f5dbb4c09 --- /dev/null +++ b/modules/workflows/unet_training_dict.py @@ -0,0 +1,179 @@ +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import sys +import tempfile +from glob import glob + +import nibabel as nib +import numpy as np +import torch +from ignite.metrics import Accuracy + +import monai +from monai.data import create_test_image_3d +from monai.engines import SupervisedEvaluator, SupervisedTrainer +from monai.handlers import ( + CheckpointSaver, + LrScheduleHandler, + MeanDice, + StatsHandler, + TensorBoardImageHandler, + TensorBoardStatsHandler, + ValidationHandler, +) +from monai.inferers import SimpleInferer, SlidingWindowInferer +from monai.transforms import ( + Activationsd, + AsChannelFirstd, + AsDiscreted, + Compose, + KeepLargestConnectedComponentd, + LoadNiftid, + RandCropByPosNegLabeld, + RandRotate90d, + ScaleIntensityd, + ToTensord, +) + + +def main(tempdir): + monai.config.print_config() + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + # create a temporary directory and 40 random image, mask pairs + print(f"generating synthetic data to {tempdir} (this may take a while)") + for i in range(40): + im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1) + n = nib.Nifti1Image(im, np.eye(4)) + nib.save(n, os.path.join(tempdir, f"img{i:d}.nii.gz")) + n = nib.Nifti1Image(seg, np.eye(4)) + nib.save(n, os.path.join(tempdir, f"seg{i:d}.nii.gz")) + + images = sorted(glob(os.path.join(tempdir, "img*.nii.gz"))) + segs = sorted(glob(os.path.join(tempdir, "seg*.nii.gz"))) + train_files = [{"image": img, "label": seg} for img, seg in zip(images[:20], segs[:20])] + val_files = [{"image": img, "label": seg} for img, seg in zip(images[-20:], segs[-20:])] + + # define transforms for image and segmentation + train_transforms = Compose( + [ + LoadNiftid(keys=["image", "label"]), + AsChannelFirstd(keys=["image", "label"], channel_dim=-1), + ScaleIntensityd(keys="image"), + RandCropByPosNegLabeld( + keys=["image", "label"], label_key="label", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4 + ), + RandRotate90d(keys=["image", "label"], prob=0.5, spatial_axes=[0, 2]), + ToTensord(keys=["image", "label"]), + ] + ) + val_transforms = Compose( + [ + LoadNiftid(keys=["image", "label"]), + AsChannelFirstd(keys=["image", "label"], channel_dim=-1), + ScaleIntensityd(keys="image"), + ToTensord(keys=["image", "label"]), + ] + ) + + # create a training data loader + train_ds = monai.data.CacheDataset(data=train_files, transform=train_transforms, cache_rate=0.5) + # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training + train_loader = monai.data.DataLoader(train_ds, batch_size=2, shuffle=True, num_workers=4) + # create a validation data loader + val_ds = monai.data.CacheDataset(data=val_files, transform=val_transforms, cache_rate=1.0) + val_loader = monai.data.DataLoader(val_ds, batch_size=1, num_workers=4) + + # create UNet, DiceLoss and 
Adam optimizer + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + net = monai.networks.nets.UNet( + dimensions=3, + in_channels=1, + out_channels=1, + channels=(16, 32, 64, 128, 256), + strides=(2, 2, 2, 2), + num_res_units=2, + ).to(device) + loss = monai.losses.DiceLoss(sigmoid=True) + opt = torch.optim.Adam(net.parameters(), 1e-3) + lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=2, gamma=0.1) + + val_post_transforms = Compose( + [ + Activationsd(keys="pred", sigmoid=True), + AsDiscreted(keys="pred", threshold_values=True), + KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]), + ] + ) + val_handlers = [ + StatsHandler(output_transform=lambda x: None), + TensorBoardStatsHandler(log_dir="./runs/", output_transform=lambda x: None), + TensorBoardImageHandler( + log_dir="./runs/", + batch_transform=lambda x: (x["image"], x["label"]), + output_transform=lambda x: x["pred"], + ), + CheckpointSaver(save_dir="./runs/", save_dict={"net": net}, save_key_metric=True), + ] + + evaluator = SupervisedEvaluator( + device=device, + val_data_loader=val_loader, + network=net, + inferer=SlidingWindowInferer(roi_size=(96, 96, 96), sw_batch_size=4, overlap=0.5), + post_transform=val_post_transforms, + key_val_metric={ + "val_mean_dice": MeanDice(include_background=True, output_transform=lambda x: (x["pred"], x["label"])) + }, + additional_metrics={"val_acc": Accuracy(output_transform=lambda x: (x["pred"], x["label"]))}, + val_handlers=val_handlers, + # if no FP16 support in GPU or PyTorch version < 1.6, will not enable AMP evaluation + amp=True if monai.config.get_torch_version_tuple() >= (1, 6) else False, + ) + + train_post_transforms = Compose( + [ + Activationsd(keys="pred", sigmoid=True), + AsDiscreted(keys="pred", threshold_values=True), + KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]), + ] + ) + train_handlers = [ + LrScheduleHandler(lr_scheduler=lr_scheduler, print_lr=True), + ValidationHandler(validator=evaluator, interval=2, epoch_level=True), + StatsHandler(tag_name="train_loss", output_transform=lambda x: x["loss"]), + TensorBoardStatsHandler(log_dir="./runs/", tag_name="train_loss", output_transform=lambda x: x["loss"]), + CheckpointSaver(save_dir="./runs/", save_dict={"net": net, "opt": opt}, save_interval=2, epoch_level=True), + ] + + trainer = SupervisedTrainer( + device=device, + max_epochs=5, + train_data_loader=train_loader, + network=net, + optimizer=opt, + loss_function=loss, + inferer=SimpleInferer(), + post_transform=train_post_transforms, + key_train_metric={"train_acc": Accuracy(output_transform=lambda x: (x["pred"], x["label"]))}, + train_handlers=train_handlers, + # if no FP16 support in GPU or PyTorch version < 1.6, will not enable AMP training + amp=True if monai.config.get_torch_version_tuple() >= (1, 6) else False, + ) + trainer.run() + + +if __name__ == "__main__": + with tempfile.TemporaryDirectory() as tempdir: + main(tempdir) diff --git a/runexamples.sh b/runexamples.sh new file mode 100755 index 0000000000..7035a79b5a --- /dev/null +++ b/runexamples.sh @@ -0,0 +1,142 @@ +#!/bin/bash + +# Copyright 2020 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e +# script for running the examples + + +# install necessary packages +pip install numpy +pip install torch +pip install 'monai[itk, nibabel, pillow]' + + +# home directory +homedir="$( cd -P "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +TEMP_LOG="temp.txt" + +cd "$homedir" +find "$homedir" -type f -name $TEMP_LOG -delete + + +# download data to specific directory +if [ -e "./testing_ixi_t1.tar.gz" ] && [ -d "./workspace/" ]; then + echo "1" >> $TEMP_LOG +else + wget https://www.dropbox.com/s/y890gb6axzzqff5/testing_ixi_t1.tar.gz?dl=1 + mv testing_ixi_t1.tar.gz?dl=1 testing_ixi_t1.tar.gz + mkdir -p ./workspace/data/medical/ixi/IXI-T1/ + tar -C ./workspace/data/medical/ixi/IXI-T1/ -xf testing_ixi_t1.tar.gz +fi + + +# run training files in 3d_classification/torch +for file in "3d_classification/torch"/*train* +do + python "$file" +done + +# check training files generated from 3d_classification/torch +[ -e "./best_metric_model_classification3d_array.pth" ] && echo "1" >> $TEMP_LOG || (echo "examples 3d classification torch: model file not generated" | tee $TEMP_LOG && exit 0) +[ -e "./best_metric_model_classification3d_dict.pth" ] && echo "1" >> $TEMP_LOG || (echo "examples 3d classification torch: model file not generated" | tee $TEMP_LOG && exit 0) + +# run eval files in 3d_classification/torch +for file in "3d_classification/torch"/*eval* +do + python "$file" +done + + +# run training files in 3d_classification/ignite +for file in "3d_classification/ignite"/*train* +do + python "$file" +done + +# check training files generated from 3d_classification/ignite +[ -e "./runs_array/net_checkpoint_20.pth" ] && echo "1" >> $TEMP_LOG || (echo "examples 3d classification ignite: model file not generated" | tee $TEMP_LOG && exit 0) +[ -e "./runs_dict/net_checkpoint_20.pth" ] && echo "1" >> $TEMP_LOG || (echo "examples 3d classification ignite: model file not generated" | tee $TEMP_LOG && exit 0) + +# run eval files in 3d_classification/ignite +for file in "3d_classification/ignite"/*eval* +do + python "$file" +done + + +# run training files in 2d_segmentation/torch +for file in "2d_segmentation/torch"/*train* +do + python "$file" +done + +# check training files generated from 2d_segmentation/torch +[ -e "./best_metric_model_segmentation2d_array.pth" ] && echo "1" >> $TEMP_LOG || (echo "examples 2d segmentation torch: model file not generated" | tee $TEMP_LOG && exit 0) +[ -e "./best_metric_model_segmentation2d_dict.pth" ] && echo "1" >> $TEMP_LOG || (echo "examples 2d segmentation torch: model file not generated" | tee $TEMP_LOG && exit 0) + +# run eval files in 2d_segmentation/torch +for file in "2d_segmentation/torch"/*eval* +do + python "$file" +done + + +# run training files in 3d_segmentation/torch +for file in "3d_segmentation/torch"/*train* +do + python "$file" +done + +# check training files generated from 3d_segmentation/torch +[ -e "./best_metric_model_segmentation3d_array.pth" ] && echo "1" >> $TEMP_LOG || (echo "examples 3d segmentation torch: model file not generated" | tee $TEMP_LOG && exit 0) +[ -e "./best_metric_model_segmentation3d_dict.pth" ] && echo "1" >> $TEMP_LOG || 
(echo "examples 3d segmentation torch: model file not generated" | tee $TEMP_LOG && exit 0)
+
+# run eval files in 3d_segmentation/torch
+for file in "3d_segmentation/torch"/*eval*
+do
+    python "$file"
+done
+
+
+# run training files in 3d_segmentation/ignite
+for file in "3d_segmentation/ignite"/*train*
+do
+    python "$file"
+done
+
+# check training files generated from 3d_segmentation/ignite
+[ -e "./runs_array/net_checkpoint_100.pth" ] && echo "1" >> $TEMP_LOG || (echo "examples 3d segmentation ignite: model file not generated" | tee $TEMP_LOG && exit 0)
+[ -e "./runs_dict/net_checkpoint_50.pth" ] && echo "1" >> $TEMP_LOG || (echo "examples 3d segmentation ignite: model file not generated" | tee $TEMP_LOG && exit 0)
+
+# run eval files in 3d_segmentation/ignite
+for file in "3d_segmentation/ignite"/*eval*
+do
+    python "$file"
+done
+
+
+# run training file in modules/workflows
+for file in "modules/workflows"/*train*
+do
+    python "$file"
+done
+
+# check training file generated from modules/workflows
+# use compgen so the glob is expanded; [ -e "pattern*" ] would only test for a file literally named with "*"
+compgen -G "./runs/net_key_metric*.pth" > /dev/null && echo "1" >> $TEMP_LOG || (echo "examples supervised workflows: model file not generated" | tee $TEMP_LOG && exit 0)
+compgen -G "./model_out/*.pth" > /dev/null && echo "1" >> $TEMP_LOG || (echo "examples GAN workflows: model file not generated" | tee $TEMP_LOG && exit 0)
+
+# run eval file in modules/workflows
+for file in "modules/workflows"/*eval*
+do
+    python "$file"
+done
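
For context (not part of the diff above): a minimal way to exercise runexamples.sh locally might look like the sketch below. The virtual-environment name and the captured log file name are illustrative assumptions; only temp.txt (the script's TEMP_LOG) comes from the script itself.

# hypothetical local invocation -- ".venv" and "runexamples_full.log" are assumed names, not part of the PR
python3 -m venv .venv && source .venv/bin/activate
bash ./runexamples.sh 2>&1 | tee runexamples_full.log
# each passing check appends a "1" to temp.txt (TEMP_LOG in the script), so a quick tally of passed checks is:
grep -c "^1$" temp.txt

Because the script installs packages and downloads the IXI test archive before running every example, a GPU machine with network access is the intended environment; on CPU-only machines the same commands should still run, just slowly.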